From bb244d736b3db516a9068f1682bcb82ee4200970 Mon Sep 17 00:00:00 2001 From: Licai Guo Date: Tue, 4 Mar 2014 16:49:34 +0800 Subject: [PATCH] Partly add arm asm code to encoder. --- .../common/common.xcodeproj/project.pbxproj | 4 + .../welsenc/welsenc.xcodeproj/project.pbxproj | 32 + codec/common/deblocking_neon.S | 228 ++ codec/common/expand_picture.S | 137 ++ codec/common/expand_picture_common.h | 4 + codec/encoder/core/arm/.DS_Store | Bin 0 -> 6148 bytes codec/encoder/core/arm/intra_pred_neon.S | 648 ++++++ .../core/arm/intra_pred_sad_3_opt_neon.S | 793 +++++++ codec/encoder/core/arm/mc_neon.S | 1963 +++++++++++++++++ codec/encoder/core/arm/memory_neon.S | 63 + codec/encoder/core/arm/pixel_neon.S | 880 ++++++++ codec/encoder/core/arm/reconstruct_neon.S | 1312 +++++++++++ codec/encoder/core/inc/deblocking.h | 10 + codec/encoder/core/inc/decode_mb_aux.h | 10 + codec/encoder/core/inc/encode_mb_aux.h | 22 + codec/encoder/core/inc/encoder.h | 2 + codec/encoder/core/src/deblocking.cpp | 40 +- codec/encoder/core/src/decode_mb_aux.cpp | 12 + codec/encoder/core/src/encode_mb_aux.cpp | 30 + codec/encoder/core/src/encoder.cpp | 8 + codec/encoder/core/src/encoder_ext.cpp | 7 + codec/encoder/core/src/expand_pic.cpp | 7 + .../iOS/processing.xcodeproj/project.pbxproj | 34 + codec/processing/src/arm/.DS_Store | Bin 0 -> 6148 bytes .../src/arm/adaptive_quantization.S | 120 + codec/processing/src/arm/down_sample_neon.S | 342 +++ codec/processing/src/arm/pixel_sad_neon.S | 68 + codec/processing/src/arm/vaa_calc_neon.S | 1143 ++++++++++ 28 files changed, 7916 insertions(+), 3 deletions(-) create mode 100755 codec/common/expand_picture.S create mode 100644 codec/encoder/core/arm/.DS_Store create mode 100755 codec/encoder/core/arm/intra_pred_neon.S create mode 100755 codec/encoder/core/arm/intra_pred_sad_3_opt_neon.S create mode 100755 codec/encoder/core/arm/mc_neon.S create mode 100755 codec/encoder/core/arm/memory_neon.S create mode 100755 codec/encoder/core/arm/pixel_neon.S create mode 100755 codec/encoder/core/arm/reconstruct_neon.S create mode 100644 codec/processing/src/arm/.DS_Store create mode 100755 codec/processing/src/arm/adaptive_quantization.S create mode 100755 codec/processing/src/arm/down_sample_neon.S create mode 100755 codec/processing/src/arm/pixel_sad_neon.S create mode 100755 codec/processing/src/arm/vaa_calc_neon.S diff --git a/codec/build/iOS/common/common.xcodeproj/project.pbxproj b/codec/build/iOS/common/common.xcodeproj/project.pbxproj index 5926d8cc..13d72ed6 100644 --- a/codec/build/iOS/common/common.xcodeproj/project.pbxproj +++ b/codec/build/iOS/common/common.xcodeproj/project.pbxproj @@ -7,6 +7,7 @@ objects = { /* Begin PBXBuildFile section */ + 4C34067D18C5C94C00DFA14A /* expand_picture.S in Sources */ = {isa = PBXBuildFile; fileRef = 4C34067C18C5C94C00DFA14A /* expand_picture.S */; }; 4CE443D918B722CD0017DF25 /* Foundation.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 4CE443D818B722CD0017DF25 /* Foundation.framework */; }; 4CE443E718B722CD0017DF25 /* XCTest.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 4CE443E618B722CD0017DF25 /* XCTest.framework */; }; 4CE443E818B722CD0017DF25 /* Foundation.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 4CE443D818B722CD0017DF25 /* Foundation.framework */; }; @@ -46,6 +47,7 @@ /* End PBXCopyFilesBuildPhase section */ /* Begin PBXFileReference section */ + 4C34067C18C5C94C00DFA14A /* expand_picture.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = 
expand_picture.S; sourceTree = ""; }; 4CE443D518B722CD0017DF25 /* libcommon.a */ = {isa = PBXFileReference; explicitFileType = archive.ar; includeInIndex = 0; path = libcommon.a; sourceTree = BUILT_PRODUCTS_DIR; }; 4CE443D818B722CD0017DF25 /* Foundation.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = Foundation.framework; path = System/Library/Frameworks/Foundation.framework; sourceTree = SDKROOT; }; 4CE443E518B722CD0017DF25 /* commonTests.xctest */ = {isa = PBXFileReference; explicitFileType = wrapper.cfbundle; includeInIndex = 0; path = commonTests.xctest; sourceTree = BUILT_PRODUCTS_DIR; }; @@ -148,6 +150,7 @@ 4CE4472F18BC61650017DF25 /* common */ = { isa = PBXGroup; children = ( + 4C34067C18C5C94C00DFA14A /* expand_picture.S */, 4CE447BE18C085900017DF25 /* arm_arch_common_macro.S */, 4CE447BC18C085320017DF25 /* deblocking_neon.S */, 4CE4473118BC61650017DF25 /* cpu.cpp */, @@ -257,6 +260,7 @@ 4CE4475018BC61650017DF25 /* deblocking_common.cpp in Sources */, 4CE4474C18BC61650017DF25 /* cpu.cpp in Sources */, 4CE4475218BC61650017DF25 /* logging.cpp in Sources */, + 4C34067D18C5C94C00DFA14A /* expand_picture.S in Sources */, 4CE447BD18C085320017DF25 /* deblocking_neon.S in Sources */, 4CE4475818BC61650017DF25 /* WelsThreadLib.cpp in Sources */, 4CE4474E18BC61650017DF25 /* crt_util_safe_x.cpp in Sources */, diff --git a/codec/build/iOS/enc/welsenc/welsenc.xcodeproj/project.pbxproj b/codec/build/iOS/enc/welsenc/welsenc.xcodeproj/project.pbxproj index 6f126bf9..43f03d74 100644 --- a/codec/build/iOS/enc/welsenc/welsenc.xcodeproj/project.pbxproj +++ b/codec/build/iOS/enc/welsenc/welsenc.xcodeproj/project.pbxproj @@ -7,6 +7,12 @@ objects = { /* Begin PBXBuildFile section */ + 4C34066D18C57D0400DFA14A /* intra_pred_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = 4C34066618C57D0400DFA14A /* intra_pred_neon.S */; }; + 4C34066E18C57D0400DFA14A /* intra_pred_sad_3_opt_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = 4C34066718C57D0400DFA14A /* intra_pred_sad_3_opt_neon.S */; }; + 4C34066F18C57D0400DFA14A /* mc_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = 4C34066818C57D0400DFA14A /* mc_neon.S */; }; + 4C34067018C57D0400DFA14A /* memory_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = 4C34066918C57D0400DFA14A /* memory_neon.S */; }; + 4C34067118C57D0400DFA14A /* pixel_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = 4C34066A18C57D0400DFA14A /* pixel_neon.S */; }; + 4C34067218C57D0400DFA14A /* reconstruct_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = 4C34066B18C57D0400DFA14A /* reconstruct_neon.S */; }; 4CE4431518B6FFA00017DF25 /* Foundation.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 4CE4431418B6FFA00017DF25 /* Foundation.framework */; }; 4CE4432318B6FFA00017DF25 /* XCTest.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 4CE4432218B6FFA00017DF25 /* XCTest.framework */; }; 4CE4432418B6FFA00017DF25 /* Foundation.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 4CE4431418B6FFA00017DF25 /* Foundation.framework */; }; @@ -71,6 +77,12 @@ /* End PBXCopyFilesBuildPhase section */ /* Begin PBXFileReference section */ + 4C34066618C57D0400DFA14A /* intra_pred_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = intra_pred_neon.S; sourceTree = ""; }; + 4C34066718C57D0400DFA14A /* intra_pred_sad_3_opt_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = intra_pred_sad_3_opt_neon.S; sourceTree = ""; }; + 
4C34066818C57D0400DFA14A /* mc_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = mc_neon.S; sourceTree = ""; }; + 4C34066918C57D0400DFA14A /* memory_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = memory_neon.S; sourceTree = ""; }; + 4C34066A18C57D0400DFA14A /* pixel_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = pixel_neon.S; sourceTree = ""; }; + 4C34066B18C57D0400DFA14A /* reconstruct_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = reconstruct_neon.S; sourceTree = ""; }; 4CE4431118B6FFA00017DF25 /* libwelsenc.a */ = {isa = PBXFileReference; explicitFileType = archive.ar; includeInIndex = 0; path = libwelsenc.a; sourceTree = BUILT_PRODUCTS_DIR; }; 4CE4431418B6FFA00017DF25 /* Foundation.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = Foundation.framework; path = System/Library/Frameworks/Foundation.framework; sourceTree = SDKROOT; }; 4CE4432118B6FFA00017DF25 /* welsencTests.xctest */ = {isa = PBXFileReference; explicitFileType = wrapper.cfbundle; includeInIndex = 0; path = welsencTests.xctest; sourceTree = BUILT_PRODUCTS_DIR; }; @@ -189,6 +201,19 @@ /* End PBXFrameworksBuildPhase section */ /* Begin PBXGroup section */ + 4C34066418C57D0400DFA14A /* arm */ = { + isa = PBXGroup; + children = ( + 4C34066618C57D0400DFA14A /* intra_pred_neon.S */, + 4C34066718C57D0400DFA14A /* intra_pred_sad_3_opt_neon.S */, + 4C34066818C57D0400DFA14A /* mc_neon.S */, + 4C34066918C57D0400DFA14A /* memory_neon.S */, + 4C34066A18C57D0400DFA14A /* pixel_neon.S */, + 4C34066B18C57D0400DFA14A /* reconstruct_neon.S */, + ); + path = arm; + sourceTree = ""; + }; 4CE4430818B6FFA00017DF25 = { isa = PBXGroup; children = ( @@ -249,6 +274,7 @@ 4CE446A118BC605B0017DF25 /* core */ = { isa = PBXGroup; children = ( + 4C34066418C57D0400DFA14A /* arm */, 4CE446A918BC605C0017DF25 /* inc */, 4CE446DC18BC605C0017DF25 /* src */, ); @@ -466,14 +492,18 @@ 4CE4472A18BC605C0017DF25 /* utils.cpp in Sources */, 4CE4471018BC605C0017DF25 /* decode_mb_aux.cpp in Sources */, 4CE4472018BC605C0017DF25 /* sample.cpp in Sources */, + 4C34066F18C57D0400DFA14A /* mc_neon.S in Sources */, 4CE4472D18BC605C0017DF25 /* welsCodecTrace.cpp in Sources */, 4CE4471318BC605C0017DF25 /* encoder_data_tables.cpp in Sources */, + 4C34067118C57D0400DFA14A /* pixel_neon.S in Sources */, 4CE4471F18BC605C0017DF25 /* ref_list_mgr_svc.cpp in Sources */, 4CE4472218BC605C0017DF25 /* slice_multi_threading.cpp in Sources */, 4CE4471518BC605C0017DF25 /* expand_pic.cpp in Sources */, + 4C34067018C57D0400DFA14A /* memory_neon.S in Sources */, 4CE4470F18BC605C0017DF25 /* deblocking.cpp in Sources */, 4CE4472518BC605C0017DF25 /* svc_encode_mb.cpp in Sources */, 4CE4471A18BC605C0017DF25 /* mv_pred.cpp in Sources */, + 4C34066E18C57D0400DFA14A /* intra_pred_sad_3_opt_neon.S in Sources */, 4CE4472B18BC605C0017DF25 /* wels_preprocess.cpp in Sources */, 4CE4470E18BC605C0017DF25 /* au_set.cpp in Sources */, 4CE4471718BC605C0017DF25 /* mc.cpp in Sources */, @@ -484,12 +514,14 @@ 4CE4472418BC605C0017DF25 /* svc_enc_slice_segment.cpp in Sources */, 4CE4472318BC605C0017DF25 /* svc_base_layer_md.cpp in Sources */, 4CE4471E18BC605C0017DF25 /* ratectl.cpp in Sources */, + 4C34066D18C57D0400DFA14A /* intra_pred_neon.S in Sources */, 4CE4471C18BC605C0017DF25 /* picture_handle.cpp in Sources */, 4CE4472618BC605C0017DF25 /* svc_encode_slice.cpp in Sources */, 
4CE4471218BC605C0017DF25 /* encoder.cpp in Sources */, 4CE4471618BC605C0017DF25 /* get_intra_predictor.cpp in Sources */, 4CE4472E18BC605C0017DF25 /* welsEncoderExt.cpp in Sources */, 4CE4471418BC605C0017DF25 /* encoder_ext.cpp in Sources */, + 4C34067218C57D0400DFA14A /* reconstruct_neon.S in Sources */, ); runOnlyForDeploymentPostprocessing = 0; }; diff --git a/codec/common/deblocking_neon.S b/codec/common/deblocking_neon.S index f1b038dd..176c641e 100644 --- a/codec/common/deblocking_neon.S +++ b/codec/common/deblocking_neon.S @@ -809,4 +809,232 @@ WELS_ASM_FUNC_BEGIN enc_avc_non_zero_count_neon vst1.64 {d0-d2}, [r0] WELS_ASM_FUNC_END +#ifdef APPLE_IOS + +.macro BS_NZC_CHECK + vld1.8 {d0,d1}, [$0] + /* Arrenge the input data --- TOP */ + ands r6, $1, #2 + beq bs_nzc_check_jump0 + + sub r6, $0, $2, lsl #4 + sub r6, $2, lsl #3 + add r6, #12 + vld1.32 d3[1], [r6] + +bs_nzc_check_jump0: + vext.8 q1, q1, q0, #12 + vadd.u8 $3, q0, q1 + + + /* Arrenge the input data --- LEFT */ + ands r6, $1, #1 + beq bs_nzc_check_jump1 + + sub r6, $0, #21 + add r7, r6, #4 + vld1.8 d3[4], [r6] + add r6, r7, #4 + vld1.8 d3[5], [r7] + add r7, r6, #4 + vld1.8 d3[6], [r6] + vld1.8 d3[7], [r7] + +bs_nzc_check_jump1: + vzip.8 d0, d1 + vzip.8 d0, d1 + vext.8 q1, q1, q0, #12 + vadd.u8 $4, q0, q1 +.endm + +.macro BS_COMPARE_MV //in: $0,$1(const),$2(const),$3(const),$4(const); out:$5, $6 + mov r6, #4 + vabd.s16 q5, $0, $1 + vabd.s16 q6, $1, $2 + vdup.s16 $0, r6 + vabd.s16 q7, $2, $3 + vabd.s16 q8, $3, $4 + + vcge.s16 q5, $0 + vcge.s16 q6, $0 + vcge.s16 q7, $0 + vcge.s16 q8, $0 + + vpadd.i16 d10, d10, d11 + vpadd.i16 d11, d12, d13 + vpadd.i16 d12, d14, d15 + vpadd.i16 d13, d16, d17 + + vaddhn.i16 $5, q5, q5 + vaddhn.i16 $6, q6, q6 +.endm + +.macro BS_MV_CHECK + vldm $0, {q0,q1,q2,q3} + + /* Arrenge the input data --- TOP */ + ands r6, $1, #2 + beq bs_mv_check_jump0 + + sub r6, $0, $2, lsl #6 + add r6, #48 + vld1.8 {d8, d9}, [r6] + +bs_mv_check_jump0: + BS_COMPARE_MV q4, q0, q1, q2, q3, $3, $4 + + /* Arrenge the input data --- LEFT */ + ands r6, $1, #1 + beq bs_mv_check_jump1 + + sub r6, $0, #52 + add r7, r6, #16 + vld1.32 d8[0], [r6] + add r6, r7, #16 + vld1.32 d8[1], [r7] + add r7, r6, #16 + vld1.32 d9[0], [r6] + vld1.32 d9[1], [r7] + +bs_mv_check_jump1: + vzip.32 q0, q2 + vzip.32 q1, q3 + vzip.32 q0, q1 + vzip.32 q2, q3 + BS_COMPARE_MV q4, q0, q1, q2, q3, $5, $6 +.endm +#else + +.macro BS_NZC_CHECK arg0, arg1, arg2, arg3, arg4 + vld1.8 {d0,d1}, [\arg0] + /* Arrenge the input data --- TOP */ + ands r6, \arg1, #2 + beq bs_nzc_check_jump0 + + sub r6, \arg0, \arg2, lsl #4 + sub r6, \arg2, lsl #3 + add r6, #12 + vld1.32 d3[1], [r6] + + bs_nzc_check_jump0: + vext.8 q1, q1, q0, #12 + vadd.u8 \arg3, q0, q1 + + + /* Arrenge the input data --- LEFT */ + ands r6, \arg1, #1 + beq bs_nzc_check_jump1 + + sub r6, \arg0, #21 + add r7, r6, #4 + vld1.8 d3[4], [r6] + add r6, r7, #4 + vld1.8 d3[5], [r7] + add r7, r6, #4 + vld1.8 d3[6], [r6] + vld1.8 d3[7], [r7] + + bs_nzc_check_jump1: + vzip.8 d0, d1 + vzip.8 d0, d1 + vext.8 q1, q1, q0, #12 + vadd.u8 \arg4, q0, q1 +.endm + +.macro BS_COMPARE_MV arg0, arg1, arg2, arg3, arg4, arg5, arg6 //in: $0,$1(const),$2(const),$3(const),$4(const); out:$5, $6 + mov r6, #4 + vabd.s16 q5, \arg0, \arg1 + vabd.s16 q6, \arg1, \arg2 + vdup.s16 \arg0, r6 + vabd.s16 q7, \arg2, \arg3 + vabd.s16 q8, \arg3, \arg4 + + vcge.s16 q5, \arg0 + vcge.s16 q6, \arg0 + vcge.s16 q7, \arg0 + vcge.s16 q8, \arg0 + + vpadd.i16 d10, d10, d11 + vpadd.i16 d11, d12, d13 + vpadd.i16 d12, d14, d15 + vpadd.i16 d13, d16, d17 + + 
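+ //note: vpadd merges the x/y compare masks and vaddhn narrows them to one byte per 4x4 edge, 0xFF when either MV component differs by 4 or more (one integer pel)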
vaddhn.i16 \arg5, q5, q5 + vaddhn.i16 \arg6, q6, q6 +.endm + +.macro BS_MV_CHECK arg0, arg1, arg2, arg3, arg4, arg5, arg6 + vldm \arg0, {q0,q1,q2,q3} + + /* Arrenge the input data --- TOP */ + ands r6, \arg1, #2 + beq bs_mv_check_jump0 + + sub r6, \arg0, \arg2, lsl #6 + add r6, #48 + vld1.8 {d8, d9}, [r6] + + bs_mv_check_jump0: + BS_COMPARE_MV q4, q0, q1, q2, q3, \arg3, \arg4 + + /* Arrenge the input data --- LEFT */ + ands r6, \arg1, #1 + beq bs_mv_check_jump1 + + sub r6, \arg0, #52 + add r7, r6, #16 + vld1.32 d8[0], [r6] + add r6, r7, #16 + vld1.32 d8[1], [r7] + add r7, r6, #16 + vld1.32 d9[0], [r6] + vld1.32 d9[1], [r7] + + bs_mv_check_jump1: + vzip.32 q0, q2 + vzip.32 q1, q3 + vzip.32 q0, q1 + vzip.32 q2, q3 + BS_COMPARE_MV q4, q0, q1, q2, q3, \arg5, \arg6 +.endm +#endif + + +WELS_ASM_FUNC_BEGIN DeblockingBSCalcEnc_neon + + stmdb sp!, {r5-r7} + + ldr r5, [sp, #12] //Save BS to r5 + + /* Checking the nzc status */ + BS_NZC_CHECK r0, r2, r3, q14, q15 //q14,q15 save the nzc status + + /* For checking bS[I] = 2 */ + mov r6, #2 + vcgt.s8 q14, q14, #0 + vdup.u8 q0, r6 + vcgt.s8 q15, q15, #0 + + vand.u8 q14, q14, q0 //q14 save the nzc check result all the time --- for dir is top + vand.u8 q15, q15, q0 //q15 save the nzc check result all the time --- for dir is left + + /* Checking the mv status*/ + BS_MV_CHECK r1, r2, r3, d24, d25, d26, d27//q12, q13 save the mv status + + /* For checking bS[I] = 1 */ + mov r6, #1 + vdup.u8 q0, r6 + + vand.u8 q12, q12, q0 //q12 save the nzc check result all the time --- for dir is top + vand.u8 q13, q13, q0 //q13 save the nzc check result all the time --- for dir is left + + + /* Check bS[I] is '1' or '2' */ + vmax.u8 q1, q12, q14 + vmax.u8 q0, q13, q15 + + //vstm r5, {q0, q1} + vst1.32 {q0, q1}, [r5] + ldmia sp!, {r5-r7} +WELS_ASM_FUNC_END #endif diff --git a/codec/common/expand_picture.S b/codec/common/expand_picture.S new file mode 100755 index 00000000..a0425dfd --- /dev/null +++ b/codec/common/expand_picture.S @@ -0,0 +1,137 @@ +/*! + * \copy + * Copyright (c) 2013, Cisco Systems + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + */ + +#ifdef HAVE_NEON +.text +#include "arm_arch_common_macro.S" + + +WELS_ASM_FUNC_BEGIN ExpandPictureLuma_neon + stmdb sp!, {r4-r8} + //Save the dst + mov r7, r0 + mov r8, r3 + + add r4, r7, r2 + sub r4, #1 + //For the left and right expand +_expand_picture_luma_loop2: + sub r5, r7, #32 + add r6, r4, #1 + + vld1.8 {d0[], d1[]}, [r7], r1 + vld1.8 {d2[], d3[]}, [r4], r1 + + vst1.8 {q0}, [r5]! + vst1.8 {q0}, [r5] + vst1.8 {q1}, [r6]! + vst1.8 {q1}, [r6] + subs r8, #1 + bne _expand_picture_luma_loop2 + + //for the top and bottom expand + add r2, #64 + sub r0, #32 + mla r4, r1, r3, r0 + sub r4, r1 +_expand_picture_luma_loop0: + mov r5, #32 + mls r5, r5, r1, r0 + add r6, r4, r1 + vld1.8 {q0}, [r0]! + vld1.8 {q1}, [r4]! + + mov r8, #32 +_expand_picture_luma_loop1: + vst1.8 {q0}, [r5], r1 + vst1.8 {q1}, [r6], r1 + subs r8, #1 + bne _expand_picture_luma_loop1 + + subs r2, #16 + bne _expand_picture_luma_loop0 + + //vldreq.32 d0, [r0] + + ldmia sp!, {r4-r8} +WELS_ASM_FUNC_END + + +WELS_ASM_FUNC_BEGIN ExpandPictureChroma_neon + stmdb sp!, {r4-r8} + //Save the dst + mov r7, r0 + mov r8, r3 + + add r4, r7, r2 + sub r4, #1 + //For the left and right expand +_expand_picture_chroma_loop2: + sub r5, r7, #16 + add r6, r4, #1 + + vld1.8 {d0[], d1[]}, [r7], r1 + vld1.8 {d2[], d3[]}, [r4], r1 + + vst1.8 {q0}, [r5] + vst1.8 {q1}, [r6] + subs r8, #1 + bne _expand_picture_chroma_loop2 + + //for the top and bottom expand + add r2, #32 + sub r0, #16 + mla r4, r1, r3, r0 + sub r4, r1 +_expand_picture_chroma_loop0: + mov r5, #16 + mls r5, r5, r1, r0 + add r6, r4, r1 + vld1.8 {q0}, [r0]! + vld1.8 {q1}, [r4]! + + mov r8, #16 +_expand_picture_chroma_loop1: + vst1.8 {q0}, [r5], r1 + vst1.8 {q1}, [r6], r1 + subs r8, #1 + bne _expand_picture_chroma_loop1 + + subs r2, #16 + bne _expand_picture_chroma_loop0 + + //vldreq.32 d0, [r0] + + ldmia sp!, {r4-r8} +WELS_ASM_FUNC_END + +#endif \ No newline at end of file diff --git a/codec/common/expand_picture_common.h b/codec/common/expand_picture_common.h index e1748c91..f205abef 100644 --- a/codec/common/expand_picture_common.h +++ b/codec/common/expand_picture_common.h @@ -61,6 +61,10 @@ void ExpandPictureChromaUnalign_sse2 (uint8_t* pDst, const int32_t kiPicH); #endif//X86_ASM +#if defined(HAVE_NEON) +void ExpandPictureLuma_neon(uint8_t *pDst, const int32_t kiStride, const int32_t kiPicW, const int32_t kiPicH); +void ExpandPictureChroma_neon(uint8_t *pDst, const int32_t kiStride, const int32_t kiPicW, const int32_t kiPicH); +#endif #if defined(__cplusplus) } #endif//__cplusplus diff --git a/codec/encoder/core/arm/.DS_Store b/codec/encoder/core/arm/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..5008ddfcf53c02e82d7eee2e57c38e5672ef89f6 GIT binary patch literal 6148 zcmeH~Jr2S!425mzP>H1@V-^m;4Wg<&0T*E43hX&L&p$$qDprKhvt+--jT7}7np#A3 zem<@ulZcFPQ@L2!n>{z**++&mCkOWA81W14cNZlEfg7;MkzE(HCqgga^y>{tEnwC%0;vJ&^%eQ zLs35+`xjp>T0> 5;" + vmla.s16 q2, q0, q3 + vmla.s16 q2, q1, d6[0] + vqrshrun.s16 d0, q2, #5 + + //Set a line of chroma MB + vst1.u32 {d0}, [r0]! + + //Do the same processing for each line. + mov r3, #7 +loop_0_get_i_chroma_pred_plane: + vadd.s16 q2, q1 + vqrshrun.s16 d0, q2, #5 + vst1.u32 {d0}, [r0]! + subs r3, #1 + bne loop_0_get_i_chroma_pred_plane + +WELS_ASM_FUNC_END + +#endif diff --git a/codec/encoder/core/arm/intra_pred_sad_3_opt_neon.S b/codec/encoder/core/arm/intra_pred_sad_3_opt_neon.S new file mode 100755 index 00000000..8cc9e7ef --- /dev/null +++ b/codec/encoder/core/arm/intra_pred_sad_3_opt_neon.S @@ -0,0 +1,793 @@ +/*! 
+ * \copy + * Copyright (c) 2013, Cisco Systems + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + */ + +#ifdef HAVE_NEON +.text +#include "arm_arch_common_macro.S" + + +#ifdef APPLE_IOS + //The data sequence will be used +.macro GET_8BYTE_DATA_L0 + vld1.8 {$0[0]}, [$1], $2 + vld1.8 {$0[1]}, [$1], $2 + vld1.8 {$0[2]}, [$1], $2 + vld1.8 {$0[3]}, [$1], $2 + vld1.8 {$0[4]}, [$1], $2 + vld1.8 {$0[5]}, [$1], $2 + vld1.8 {$0[6]}, [$1], $2 + vld1.8 {$0[7]}, [$1], $2 +.endm + + +.macro HDM_TRANSFORM_4X4_L0 + + //Do the vertical transform + vaddl.u8 q0, $0, $1 //{0,4,8,12,1,5,9,13} + vsubl.u8 q1, $0, $1 //{2,6,10,14,3,7,11,15} + vswp d1, d2 + vadd.s16 q2, q0, q1 //{0,1,2,3,4,5,6,7} + vsub.s16 q1, q0, q1 //{12,13,14,15,8,9,10,11} + + //Do the horizontal transform + vtrn.32 q2, q1 + vadd.s16 q0, q2, q1 + vsub.s16 q1, q2, q1 + + vtrn.16 q0, q1 + vadd.s16 q2, q0, q1 + vsub.s16 q1, q0, q1 + + vmov.s16 d0, d4 + vmov.s16 d1, d2 + + vabs.s16 d3, d3 + + //16x16_v + vtrn.32 d0, d1 //{0,1,3,2} + vaba.s16 $5, d0, $2 //16x16_v + vaba.s16 $5, d1, $8 + vaba.s16 $5, d5, $8 + vadd.u16 $5, d3 + + //16x16_h + vtrn.16 d4, d5 //{0,4,12,8} + vaba.s16 $6, d4, $3 //16x16_h + vabs.s16 d2, d2 + vabs.s16 d5, d5 + vadd.u16 d2, d3 + vadd.u16 d2, d5 + vadd.u16 $6, d2 + + //16x16_dc_both + vaba.s16 $7, d4, $4 //16x16_dc_both + vadd.u16 $7, d2 + +.endm + +#else + //The data sequence will be used +.macro GET_8BYTE_DATA_L0 arg0, arg1, arg2 + vld1.8 {\arg0[0]}, [\arg1], \arg2 + vld1.8 {\arg0[1]}, [\arg1], \arg2 + vld1.8 {\arg0[2]}, [\arg1], \arg2 + vld1.8 {\arg0[3]}, [\arg1], \arg2 + vld1.8 {\arg0[4]}, [\arg1], \arg2 + vld1.8 {\arg0[5]}, [\arg1], \arg2 + vld1.8 {\arg0[6]}, [\arg1], \arg2 + vld1.8 {\arg0[7]}, [\arg1], \arg2 +.endm + +.macro HDM_TRANSFORM_4X4_L0 arg0, arg1, arg2,arg3, arg4, arg5, arg6, arg7, arg8 + + //Do the vertical transform + vaddl.u8 q0, \arg0, \arg1 //{0,4,8,12,1,5,9,13} + vsubl.u8 q1, \arg0, \arg1 //{2,6,10,14,3,7,11,15} + vswp d1, d2 + vadd.s16 q2, q0, q1 //{0,1,2,3,4,5,6,7} + vsub.s16 q1, q0, q1 //{12,13,14,15,8,9,10,11} + + //Do the horizontal transform + vtrn.32 q2, q1 + vadd.s16 q0, q2, q1 + vsub.s16 q1, q2, q1 + + vtrn.16 q0, q1 + vadd.s16 q2, q0, q1 + vsub.s16 q1, q0, q1 + + vmov.s16 d0, d4 + vmov.s16 d1, d2 + + 
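+ //the 4x4 block is now Hadamard transformed; the vaba/vabs below accumulate absolute differences against the pre-transformed V/H/DC predictions into the three per-mode SATD registers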
vabs.s16 d3, d3 + + //16x16_v + vtrn.32 d0, d1 //{0,1,3,2} + vaba.s16 \arg5, d0, \arg2 //16x16_v + vaba.s16 \arg5, d1, \arg8 + vaba.s16 \arg5, d5, \arg8 + vadd.u16 \arg5, d3 + + //16x16_h + vtrn.16 d4, d5 //{0,4,12,8} + vaba.s16 \arg6, d4, \arg3 //16x16_h + vabs.s16 d2, d2 + vabs.s16 d5, d5 + vadd.u16 d2, d3 + vadd.u16 d2, d5 + vadd.u16 \arg6, d2 + + //16x16_dc_both + vaba.s16 \arg7, d4, \arg4 //16x16_dc_both + vadd.u16 \arg7, d2 +.endm +#endif + +WELS_ASM_FUNC_BEGIN satd_intra_16x16_x3_opt_neon + stmdb sp!, {r4-r7, lr} + + //Get the top line data to 'q15'(16 bytes) + sub r7, r0, r1 + vld1.8 {q15}, [r7] + + //Get the left colume data to 'q14' (16 bytes) + sub r7, r0, #1 + GET_8BYTE_DATA_L0 d28, r7, r1 + GET_8BYTE_DATA_L0 d29, r7, r1 + + //Calculate the mean value and save to 'q13->d27(reserve the d26)' (2 bytes) + //Calculate the 16x16_dc_both mode SATD + vaddl.u8 q0, d30, d31 + vaddl.u8 q1, d28, d29 + vadd.u16 q0, q1 + vadd.u16 d0, d1 + vpaddl.u16 d0, d0 + vpaddl.u32 d0, d0 + + //Calculate the mean value + vrshr.u16 d0, #5 + vshl.u16 d27, d0, #4 + + + //Calculate the 16x16_v mode SATD and save to "q11, 12" + vshll.u8 q0, d30, #2 + vshll.u8 q1, d31, #2 + vtrn.32 q0, q1 + vadd.s16 q2, q0, q1 + vsub.s16 q1, q0, q1 + vtrn.16 q2, q1 + vadd.s16 q12, q2, q1 + vsub.s16 q11, q2, q1 + vtrn.32 q12, q11 //{0,1,3,2, 4,5,7,6} q12 + //{8,9,11,10, 12,13,15,14} q11 + //Calculate the 16x16_h mode SATD and save to "q9, q10" + vshll.u8 q0, d28, #2 + vshll.u8 q1, d29, #2 + vtrn.32 q0, q1 + vadd.s16 q2, q0, q1 + vsub.s16 q1, q0, q1 + vtrn.16 q2, q1 + vadd.s16 q10, q2, q1 + vsub.s16 q9, q2, q1 + vtrn.32 q10, q9 //{0,1,3,2, 4,5,7,6} q10 + //{8,9,11,10, 12,13,15,14} q9 + + vmov.i32 d17, #0//Save the SATD of DC_BOTH + vmov.i32 d16, #0//Save the SATD of H + vmov.i32 d15, #0//Save the SATD of V + vmov.i32 d14, #0//For zero D register + //Load the p_enc data and save to "q3 ~ q6"--- 16X4 bytes + vld1.32 {q3}, [r2], r3 + vld1.32 {q4}, [r2], r3 + vld1.32 {q5}, [r2], r3 + vld1.32 {q6}, [r2], r3 + vtrn.32 q3, q4 + vtrn.32 q5, q6 + + HDM_TRANSFORM_4X4_L0 d6, d10, d24, d20, d27, d15, d16, d17, d14 + HDM_TRANSFORM_4X4_L0 d7, d11, d22, d20, d27, d15, d16, d17, d14 + HDM_TRANSFORM_4X4_L0 d8, d12, d25, d20, d27, d15, d16, d17, d14 + HDM_TRANSFORM_4X4_L0 d9, d13, d23, d20, d27, d15, d16, d17, d14 + + //Load the p_enc data and save to "q3 ~ q6"--- 16X4 bytes + vld1.32 {q3}, [r2], r3 + vld1.32 {q4}, [r2], r3 + vld1.32 {q5}, [r2], r3 + vld1.32 {q6}, [r2], r3 + vtrn.32 q3, q4 + vtrn.32 q5, q6 + + HDM_TRANSFORM_4X4_L0 d6, d10, d24, d21, d27, d15, d16, d17, d14 + HDM_TRANSFORM_4X4_L0 d7, d11, d22, d21, d27, d15, d16, d17, d14 + HDM_TRANSFORM_4X4_L0 d8, d12, d25, d21, d27, d15, d16, d17, d14 + HDM_TRANSFORM_4X4_L0 d9, d13, d23, d21, d27, d15, d16, d17, d14 + + //Load the p_enc data and save to "q3 ~ q6"--- 16X4 bytes + vld1.32 {q3}, [r2], r3 + vld1.32 {q4}, [r2], r3 + vld1.32 {q5}, [r2], r3 + vld1.32 {q6}, [r2], r3 + vtrn.32 q3, q4 + vtrn.32 q5, q6 + + HDM_TRANSFORM_4X4_L0 d6, d10, d24, d18, d27, d15, d16, d17, d14 + HDM_TRANSFORM_4X4_L0 d7, d11, d22, d18, d27, d15, d16, d17, d14 + HDM_TRANSFORM_4X4_L0 d8, d12, d25, d18, d27, d15, d16, d17, d14 + HDM_TRANSFORM_4X4_L0 d9, d13, d23, d18, d27, d15, d16, d17, d14 + + //Load the p_enc data and save to "q3 ~ q6"--- 16X4 bytes + vld1.32 {q3}, [r2], r3 + vld1.32 {q4}, [r2], r3 + vld1.32 {q5}, [r2], r3 + vld1.32 {q6}, [r2], r3 + vtrn.32 q3, q4 + vtrn.32 q5, q6 + + HDM_TRANSFORM_4X4_L0 d6, d10, d24, d19, d27, d15, d16, d17, d14 + HDM_TRANSFORM_4X4_L0 d7, d11, d22, d19, d27, d15, d16, d17, d14 + 
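+ //each HDM_TRANSFORM_4X4_L0 call costs one 4x4 sub-block of the 16x16 MB; d15/d16/d17 accumulate the V/H/DC_BOTH SATD totals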
HDM_TRANSFORM_4X4_L0 d8, d12, d25, d19, d27, d15, d16, d17, d14 + HDM_TRANSFORM_4X4_L0 d9, d13, d23, d19, d27, d15, d16, d17, d14 + + //Get the data from stack + ldr r5, [sp, #20] //the addr of Best_mode + ldr r6, [sp, #24] //the value of i_lambda + + //vadd.u16 d24, d25 + vrshr.u16 d15, #1 + vpaddl.u16 d15, d15 + vpaddl.u32 d15, d15 + vmov.u32 r0, d15[0] + + //vadd.u16 d22, d23 + vrshr.u16 d16, #1 + vpaddl.u16 d16, d16 + vpaddl.u32 d16, d16 + vmov.u32 r1, d16[0] + add r1, r6, lsl #1 + + //vadd.u16 d20, d21 + vrshr.u16 d17, #1 + vpaddl.u16 d17, d17 + vpaddl.u32 d17, d17 + vmov.u32 r2, d17[0] + add r2, r6, lsl #1 + + mov r4, #0 + cmp r1, r0 + movcc r0, r1 + movcc r4, #1 + cmp r2, r0 + movcc r0, r2 + movcc r4, #2 + + str r4, [r5] + + ldmia sp!, {r4-r7, lr} +WELS_ASM_FUNC_END + + +WELS_ASM_FUNC_BEGIN sad_intra_16x16_x3_opt_neon + stmdb sp!, {r4-r7, lr} + + //Get the top line data to 'q15'(16 bytes) + sub r4, r0, r1 + vld1.8 {q15}, [r4] + + //Get the left colume data to 'q14' (16 bytes) + sub r4, r0, #1 + GET_8BYTE_DATA_L0 d28, r4, r1 + GET_8BYTE_DATA_L0 d29, r4, r1 + + //Calculate the mean value and save to 'q13' (8 bytes) + //Calculate the 16x16_dc_both mode SATD + vaddl.u8 q0, d30, d31 + vaddl.u8 q1, d28, d29 + vadd.u16 q0, q1 + vadd.u16 d0, d1 + vpaddl.u16 d0, d0 + vpaddl.u32 d0, d0 + + //Calculate the mean value + vrshr.u16 d0, d0, #5 + vdup.8 q13, d0[0] + + sub r4, r0, #1 + + vmov.i32 q12, #0//Save the SATD of DC_BOTH + vmov.i32 q11, #0//Save the SATD of H + vmov.i32 q10, #0//Save the SATD of V + + mov lr, #16 +sad_intra_16x16_x3_opt_loop0: + //Get the left colume data to 'd0' (16 bytes) + vld1.8 {d0[]}, [r4], r1 + + //Load the p_enc data and save to "q1 ~ q2"--- 16X4 bytes + vld1.8 {q1}, [r2], r3 + + subs lr, #1 + //Do the SAD for top colume + vabal.u8 q12, d30, d2 + vabal.u8 q12, d31, d3 + + //Do the SAD for left colume + vabal.u8 q11, d0, d2 + vabal.u8 q11, d0, d3 + + //Do the SAD for mean value + vabal.u8 q10, d26, d2 + vabal.u8 q10, d26, d3 + + bne sad_intra_16x16_x3_opt_loop0 + + //Get the data from stack + ldr r5, [sp, #20] //the addr of Best_mode + ldr r6, [sp, #24] //the value of i_lambda + + vadd.u16 d24, d25 + vpaddl.u16 d24, d24 + vpaddl.u32 d24, d24 + vmov.u32 r0, d24[0] + + vadd.u16 d22, d23 + vpaddl.u16 d22, d22 + vpaddl.u32 d22, d22 + vmov.u32 r1, d22[0] + add r1, r6, lsl #1 + + vadd.u16 d20, d21 + vpaddl.u16 d20, d20 + vpaddl.u32 d20, d20 + vmov.u32 r2, d20[0] + add r2, r6, lsl #1 + + mov r4, #0 + cmp r1, r0 + movcc r0, r1 + movcc r4, #1 + cmp r2, r0 + movcc r0, r2 + movcc r4, #2 + + str r4, [r5] + + ldmia sp!, {r4-r7, lr} +WELS_ASM_FUNC_END + + +WELS_ASM_FUNC_BEGIN sad_intra_8x8_x3_opt_neon + stmdb sp!, {r4-r7, lr} + + //Get the data from stack + ldr r4, [sp, #32] //p_dec_cr + ldr r5, [sp, #36] //p_enc_cr + + //Get the left colume data to 'd28(cb), d30(cr)' (16 bytes) + sub r6, r0, #1 + GET_8BYTE_DATA_L0 d28, r6, r1 + sub r6, r4, #1 + GET_8BYTE_DATA_L0 d30, r6, r1 + + //Get the top line data to 'd29(cb), d31(cr)'(16 bytes) + sub r6, r0, r1 + vld1.8 {d29}, [r6] + sub r6, r4, r1 + vld1.8 {d31}, [r6] + + //Calculate the sum of left column and top row + vmov.i32 q0, q14 + vpaddl.u8 q0, q0 + vpaddl.u16 q0, q0 + vadd.u32 d2, d0, d1 //'m1' save to d2 + vrshr.u32 q0, q0, #2 //calculate 'm2','m3' + vrshr.u32 d2, d2, #3 //calculate 'm4' + + //duplicate the 'mx' to a vector line + vdup.8 d27, d2[0] + vdup.8 d26, d1[4] + vtrn.32 d27, d26 + + vdup.8 d26, d0[4] + vdup.8 d25, d2[4] + vtrn.32 d26, d25 //Save to "d27, d26" + + vmov.i32 q0, q15 + vpaddl.u8 q0, q0 + vpaddl.u16 q0, q0 + 
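+ //d0 holds two 4-pixel sums of the cb left column, d1 two 4-pixel sums of the cb top row; these feed the four chroma DC means computed below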
vadd.u32 d2, d0, d1 //'m1' save to d2 + vrshr.u32 q0, q0, #2 //calculate 'm2','m3' + vrshr.u32 d2, d2, #3 //calculate 'm4' + + //duplicate the 'mx' to a vector line + vdup.8 d25, d2[0] + vdup.8 d24, d1[4] + vtrn.32 d25, d24 + + vdup.8 d24, d0[4] + vdup.8 d23, d2[4] + vtrn.32 d24, d23 //Save to "d25, d24" + + vmov.i32 q11, #0//Save the SATD of DC_BOTH + vmov.i32 q10, #0//Save the SATD of H + vmov.i32 q9 , #0//Save the SATD of V + sub r6, r0, #1 + sub r7, r4, #1 + mov lr, #4 +sad_intra_8x8_x3_opt_loop0: + + //Load the p_enc data and save to "q1 ~ q2"--- 16X4 bytes + vld1.8 {d0}, [r2], r3 + vld1.8 {d1}, [r5], r3 + + //Get the left colume data to 'd0' (16 bytes) + vld1.8 {d2[]}, [r6], r1 + vld1.8 {d3[]}, [r7], r1 + + subs lr, #1 + + + //Do the SAD for top colume + vabal.u8 q11, d29, d0 + vabal.u8 q11, d31, d1 + + //Do the SAD for left colume + vabal.u8 q10, d2, d0 + vabal.u8 q10, d3, d1 + + //Do the SAD for mean value + vabal.u8 q9, d27, d0 + vabal.u8 q9, d25, d1 + + + bne sad_intra_8x8_x3_opt_loop0 + + mov lr, #4 +sad_intra_8x8_x3_opt_loop1: + + //Load the p_enc data and save to "q1 ~ q2"--- 16X4 bytes + vld1.8 {d0}, [r2], r3 + vld1.8 {d1}, [r5], r3 + + //Get the left colume data to 'd0' (16 bytes) + vld1.8 {d2[]}, [r6], r1 + vld1.8 {d3[]}, [r7], r1 + + subs lr, #1 + + + //Do the SAD for top colume + vabal.u8 q11, d29, d0 + vabal.u8 q11, d31, d1 + + //Do the SAD for left colume + vabal.u8 q10, d2, d0 + vabal.u8 q10, d3, d1 + + //Do the SAD for mean value + vabal.u8 q9, d26, d0 + vabal.u8 q9, d24, d1 + + + bne sad_intra_8x8_x3_opt_loop1 + //Get the data from stack + ldr r5, [sp, #20] //the addr of Best_mode + ldr r6, [sp, #24] //the value of i_lambda + + vadd.u16 d22, d23 + vpaddl.u16 d22, d22 + vpaddl.u32 d22, d22 + vmov.u32 r0, d22[0] + add r0, r6, lsl #1 + + vadd.u16 d20, d21 + vpaddl.u16 d20, d20 + vpaddl.u32 d20, d20 + vmov.u32 r1, d20[0] + add r1, r6, lsl #1 + + vadd.u16 d18, d19 + vpaddl.u16 d18, d18 + vpaddl.u32 d18, d18 + vmov.u32 r2, d18[0] + + mov r4, #2 + cmp r1, r0 + movcc r0, r1 + movcc r4, #1 + cmp r2, r0 + movcc r0, r2 + movcc r4, #0 + + str r4, [r5] + + ldmia sp!, {r4-r7, lr} +WELS_ASM_FUNC_END + + +WELS_ASM_FUNC_BEGIN satd_intra_8x8_x3_opt_neon + stmdb sp!, {r4-r7, lr} + + //Get the data from stack + ldr r4, [sp, #32] //p_dec_cr + ldr r5, [sp, #36] //p_enc_cr + + //Get the top line data to 'd29(cb), d31(cr)'(16 bytes) + sub r6, r0, r1 + vld1.8 {d29}, [r6] + sub r6, r4, r1 + vld1.8 {d31}, [r6] + + //Get the left colume data to 'd28(cb), d30(cr)' (16 bytes) + sub r6, r0, #1 + GET_8BYTE_DATA_L0 d28, r6, r1 + sub r6, r4, #1 + GET_8BYTE_DATA_L0 d30, r6, r1 + + //Calculate the 16x16_v mode SATD and save to "q12, 13" + vshll.u8 q0, d29, #2 + vshll.u8 q1, d31, #2 + vtrn.32 q0, q1 + vadd.s16 q2, q0, q1 + vsub.s16 q1, q0, q1 + vtrn.16 q2, q1 + vadd.s16 q13, q2, q1 + vsub.s16 q12, q2, q1 + vtrn.32 q13, q12 //{0,1,3,2, 4,5,7,6} q13 + //{8,9,11,10, 12,13,15,14} q12 + //Calculate the 16x16_h mode SATD and save to "q10, q11" + vshll.u8 q0, d28, #2 + vshll.u8 q1, d30, #2 + vtrn.32 q0, q1 + vadd.s16 q2, q0, q1 + vsub.s16 q1, q0, q1 + vtrn.16 q2, q1 + vadd.s16 q11, q2, q1 + vsub.s16 q10, q2, q1 + vtrn.32 q11, q10 //{0,1,3,2, 4,5,7,6} q11 + //{8,9,11,10, 12,13,15,14} q10 + + //Calculate the sum of left column and top row + //vmov.i32 q0, q14 + vpaddl.u8 q0, q14 + vpaddl.u16 q0, q0 + vadd.u32 d2, d0, d1 + + vpaddl.u8 q2, q15 + vpaddl.u16 q2, q2 + vadd.u32 d3, d4, d5 + + vtrn.32 q0, q2 + vrshr.u32 q1, #3 + vrshr.u32 q2, #2 + vshll.u32 q9, d4, #4 // {2cb, 2cr} q9 + vshll.u32 q8, d5, #4 // {1cb, 
1cr} q8 + vshll.u32 q7, d2, #4 // {0cb, 3cb} q7 + vshll.u32 q6, d3, #4 // {0cr, 3cr} q6 + + + vmov.i32 d28, #0//Save the SATD of DC_BOTH + vmov.i32 d10, #0//Save the SATD of H + vmov.i32 d11, #0//Save the SATD of V + vmov.i32 d30, #0//For zero D register + //Load the p_enc data and save to "q3 ~ q6"--- 16X4 bytes + vld1.32 {d6}, [r2], r3 + vld1.32 {d7}, [r2], r3 + vld1.32 {d8}, [r2], r3 + vld1.32 {d9}, [r2], r3 + vtrn.32 d6, d7 + vtrn.32 d8, d9 + HDM_TRANSFORM_4X4_L0 d6, d8, d26, d22, d14, d11, d10, d28, d30 + HDM_TRANSFORM_4X4_L0 d7, d9, d27, d22, d16, d11, d10, d28, d30 + + vld1.32 {d6}, [r5], r3 + vld1.32 {d7}, [r5], r3 + vld1.32 {d8}, [r5], r3 + vld1.32 {d9}, [r5], r3 + vtrn.32 d6, d7 + vtrn.32 d8, d9 + HDM_TRANSFORM_4X4_L0 d6, d8, d24, d20, d12, d11, d10, d28, d30 + HDM_TRANSFORM_4X4_L0 d7, d9, d25, d20, d17, d11, d10, d28, d30 + + //Load the p_enc data and save to "q3 ~ q6"--- 16X4 bytes + vld1.32 {d6}, [r2], r3 + vld1.32 {d7}, [r2], r3 + vld1.32 {d8}, [r2], r3 + vld1.32 {d9}, [r2], r3 + vtrn.32 d6, d7 + vtrn.32 d8, d9 + HDM_TRANSFORM_4X4_L0 d6, d8, d26, d23, d18, d11, d10, d28, d30 + HDM_TRANSFORM_4X4_L0 d7, d9, d27, d23, d15, d11, d10, d28, d30 + + vld1.32 {d6}, [r5], r3 + vld1.32 {d7}, [r5], r3 + vld1.32 {d8}, [r5], r3 + vld1.32 {d9}, [r5], r3 + vtrn.32 d6, d7 + vtrn.32 d8, d9 + HDM_TRANSFORM_4X4_L0 d6, d8, d24, d21, d19, d11, d10, d28, d30 + HDM_TRANSFORM_4X4_L0 d7, d9, d25, d21, d13, d11, d10, d28, d30 + + //Get the data from stack + ldr r5, [sp, #20] //the addr of Best_mode + ldr r6, [sp, #24] //the value of i_lambda + + vrshr.u16 d11, #1 + vpaddl.u16 d11, d11 + vpaddl.u32 d11, d11 + vmov.u32 lr, d11[0] + add lr, r6, lsl #1 + + vrshr.u16 d10, #1 + vpaddl.u16 d10, d10 + vpaddl.u32 d10, d10 + vmov.u32 r3, d10[0] + add r3, r6, lsl #1 + + vrshr.u16 d28, #1 + vpaddl.u16 d28, d28 + vpaddl.u32 d28, d28 + vmov.u32 r2, d28[0] + + mov r6, #2 + cmp r3, lr + movcc lr, r3 + movcc r6, #1 + cmp r2, lr + movcc lr, r2 + movcc r6, #0 + + str r6, [r5] + mov r0, lr + + ldmia sp!, {r4-r7, lr} +WELS_ASM_FUNC_END + + +WELS_ASM_FUNC_BEGIN satd_intra_4x4_x3_opt_neon + stmdb sp!, {r4-r7, lr} + + //Get the top line data to 'd31[0~3]'(4 bytes) + sub r7, r0, r1 + vld1.32 {d31[0]}, [r7] + + //Get the left colume data to 'd31[4~7]' (4 bytes) + sub r7, r0, #1 + vld1.8 {d31[4]}, [r7], r1 + vld1.8 {d31[5]}, [r7], r1 + vld1.8 {d31[6]}, [r7], r1 + vld1.8 {d31[7]}, [r7], r1 + + //Calculate the mean value and save to 'd30' (2 bytes) + vpaddl.u8 d0, d31 + vpaddl.u16 d0, d0 + vpaddl.u32 d0, d0 + //Calculate the mean value + vrshr.u16 d0, #3 + vshl.u16 d30, d0, #4 + + //Calculate the 16x16_v mode SATD and save to "d29" + //Calculate the 16x16_h mode SATD and save to "d28" + vshll.u8 q0, d31, #2 + vtrn.32 d0, d1 + vadd.s16 d2, d0, d1 + vsub.s16 d1, d0, d1 + vtrn.16 d2, d1 + vadd.s16 d29, d2, d1 + vsub.s16 d28, d2, d1 + vtrn.32 d29, d28 //{0,1,3,2 top} d29 + //{0,1,3,2 left} d28 + + vmov.i32 d27, #0//Save the SATD of DC_BOTH + vmov.i32 d26, #0//Save the SATD of H + vmov.i32 d25, #0//Save the SATD of V + vmov.i32 d24, #0//For zero D register + + //Load the p_enc data and save to "d22,d23"--- 4X4 bytes + vld1.32 {d23[0]}, [r2], r3 + vld1.32 {d23[1]}, [r2], r3 + vld1.32 {d22[0]}, [r2], r3 + vld1.32 {d22[1]}, [r2], r3 + + HDM_TRANSFORM_4X4_L0 d23, d22, d29, d28, d30, d25, d26, d27, d24 + + //Get the data from stack + ldr r5, [sp, #28] //the value of lambda2 + ldr r6, [sp, #32] //the value of lambda1 + ldr r7, [sp, #36] //the value of lambda0 + + vrshr.u16 d25, #1 + vpaddl.u16 d25, d25 + vpaddl.u32 d25, d25 + vmov.u32 r0, 
d25[0] + add r0, r7 + + vrshr.u16 d26, #1 + vpaddl.u16 d26, d26 + vpaddl.u32 d26, d26 + vmov.u32 r1, d26[0] + add r1, r6 + + vrshr.u16 d27, #1 + vpaddl.u16 d27, d27 + vpaddl.u32 d27, d27 + vmov.u32 r2, d27[0] + add r2, r5 + + ldr r5, [sp, #20] //p_dst + ldr r6, [sp, #24] //the addr of Best_mode + + mov r4, r0 + cmp r1, r4 + movcc r4, r1 + cmp r2, r4 + movcc r4, r2 + + //The compare sequence affect the resule + cmp r4, r2 + bne satd_intra_4x4_x3_opt_jump0 + mov r0, #2 + str r0, [r6] + vshr.u32 d0, d30, #4 // {2cb, 2cr} q9 + vdup.8 q1, d0[0] + vst1.8 {q1}, [r5] + //... + bl satd_intra_4x4_x3_opt_end +satd_intra_4x4_x3_opt_jump0: + + cmp r4, r1 + bne satd_intra_4x4_x3_opt_jump1 + mov r0, #1 + str r0, [r6] + vdup.8 d0, d31[4] + vdup.8 d1, d31[5] + vdup.8 d2, d31[6] + vdup.8 d3, d31[7] + vst4.32 {d0[0],d1[0],d2[0],d3[0]}, [r5] + + bl satd_intra_4x4_x3_opt_end +satd_intra_4x4_x3_opt_jump1: + + mov r0, #0 + str r0, [r6] + vst1.32 {d31[0]}, [r5]! + vst1.32 {d31[0]}, [r5]! + vst1.32 {d31[0]}, [r5]! + vst1.32 {d31[0]}, [r5]! + + +satd_intra_4x4_x3_opt_end: + mov r0, r4 + + ldmia sp!, {r4-r7, lr} +WELS_ASM_FUNC_END + +#endif \ No newline at end of file diff --git a/codec/encoder/core/arm/mc_neon.S b/codec/encoder/core/arm/mc_neon.S new file mode 100755 index 00000000..c81940d8 --- /dev/null +++ b/codec/encoder/core/arm/mc_neon.S @@ -0,0 +1,1963 @@ +/*! + * \copy + * Copyright (c) 2013, Cisco Systems + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + */ + +#ifdef HAVE_NEON +.text +#include "arm_arch_common_macro.S" + +#ifdef APPLE_IOS +.macro AVERAGE_TWO_8BITS +// { // input:dst_d, src_d A and B; working: q13 + vaddl.u8 q13, $2, $1 + vrshrn.u16 $0, q13, #1 +// } +.endm + +.macro FILTER_6TAG_8BITS +// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; + vaddl.u8 q12, $0, $5 //q12=src[-2]+src[3] + vaddl.u8 q13, $2, $3 //src[0]+src[1] + vmla.u16 q12, q13, $7 //q12 += 20*(src[0]+src[1]), 2 cycles + vaddl.u8 q13, $1, $4 //src[-1]+src[2] + vmls.s16 q12, q13, $8 //q12 -= 5*(src[-1]+src[2]), 2 cycles + vqrshrun.s16 $6, q12, #5 +// } +.endm + +.macro FILTER_SINGLE_TAG_8BITS // when width=17/9, used +// { // input: src_d{Y[0][1][2][3][4][5]X, the even of working_q2}, + vrev64.8 $2, $0 // X[5][4][3][2][1][0]O + vaddl.u8 $3, $0, $2 // each 16bits, *[50][41][32][23][14][05]* + vmul.s16 $0, $2, $1 // 0+1*[50]-5*[41]+20[32] + vpadd.s16 $0, $0, $0 + vpadd.s16 $0, $0, $0 + vqrshrun.s16 $0, $4, #5 +// } +.endm + +.macro FILTER_6TAG_8BITS_AVERAGE_WITH_0 +// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; + vaddl.u8 q12, $0, $5 //q12=src[-2]+src[3] + vaddl.u8 q13, $2, $3 //src[0]+src[1] + vmla.u16 q12, q13, $7 //q12 += 20*(src[0]+src[1]), 2 cycles + vaddl.u8 q13, $1, $4 //src[-1]+src[2] + vmls.s16 q12, q13, $8 //q12 -= 5*(src[-1]+src[2]), 2 cycles + vqrshrun.s16 $6, q12, #5 + vaddl.u8 q13, $2, $6 + vrshrn.u16 $6, q13, #1 +// } +.endm + +.macro FILTER_6TAG_8BITS_AVERAGE_WITH_1 +// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; + vaddl.u8 q12, $0, $5 //q12=src[-2]+src[3] + vaddl.u8 q13, $2, $3 //src[0]+src[1] + vmla.u16 q12, q13, $7 //q12 += 20*(src[0]+src[1]), 2 cycles + vaddl.u8 q13, $1, $4 //src[-1]+src[2] + vmls.s16 q12, q13, $8 //q12 -= 5*(src[-1]+src[2]), 2 cycles + vqrshrun.s16 $6, q12, #5 + vaddl.u8 q13, $3, $6 + vrshrn.u16 $6, q13, #1 +// } +.endm + +.macro FILTER_6TAG_8BITS_TO_16BITS +// { // input:d_src[-2], d_src[-1], d_src[0], d_src[1], d_src[2], d_src[3], dst_q, + vaddl.u8 $6, $0, $5 //dst_q=src[-2]+src[3] + vaddl.u8 q13, $2, $3 //src[0]+src[1] + vmla.u16 $6, q13, $7 //dst_q += 20*(src[0]+src[1]), 2 cycles + vaddl.u8 q13, $1, $4 //src[-1]+src[2] + vmls.s16 $6, q13, $8 //dst_q -= 5*(src[-1]+src[2]), 2 cycles +// } +.endm + +.macro FILTER_3_IN_16BITS_TO_8BITS +// { // input:a, b, c, dst_d; + vsub.s16 $0, $0, $1 //a-b + vshr.s16 $0, $0, #2 //(a-b)/4 + vsub.s16 $0, $0, $1 //(a-b)/4-b + vadd.s16 $0, $0, $2 //(a-b)/4-b+c + vshr.s16 $0, $0, #2 //((a-b)/4-b+c)/4 + vadd.s16 $0, $0, $2 //((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16 + vqrshrun.s16 $3, $0, #6 //(+32)>>6 +// } +.endm + +.macro UNPACK_2_16BITS_TO_ABC +// { // input:q_src[-2:5], q_src[6:13](avail 8+5)/q_src[6:**](avail 4+5), dst_a + vext.16 $4, $0, $1, #2 //src[0] + vext.16 $3, $0, $1, #3 //src[1] + vadd.s16 $4, $3 //c=src[0]+src[1] + + vext.16 $3, $0, $1, #1 //src[-1] + vext.16 $2, $0, $1, #4 //src[2] + vadd.s16 $3, $2 //b=src[-1]+src[2] + + vext.16 $2, $0, $1, #5 //src[3] + vadd.s16 $2, $0 //a=src[-2]+src[3] +// } +.endm + +.macro UNPACK_1_IN_8x16BITS_TO_8BITS +// { // each 16bits; input: d_dst, d_src[0:3] (even), d_src[4:5]+%% (odd) + vext.16 $3, $3, $3, #7 // 0x????, [0][1][2][3][4][5], + vrev64.16 $1, $1 + vadd.u16 $2, $1 // C[2+3],B[1+4],A[0+5], + vshr.s64 $1, $2, #16 + vshr.s64 $0, $2, #32 // Output: C $2, B $1, A $0 + + vsub.s16 $0, $0, $1 //a-b + vshr.s16 $0, $0, #2 //(a-b)/4 + vsub.s16 $0, $0, $1 //(a-b)/4-b + vadd.s16 $0, $0, $2 //(a-b)/4-b+c + vshr.s16 $0, $0, #2 
//((a-b)/4-b+c)/4 + vadd.s16 $1, $0, $2 //((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16 + vqrshrun.s16 $0, $3, #6 //(+32)>>6 +// } +.endm +#else +.macro AVERAGE_TWO_8BITS arg0, arg1,arg2 +// { // input:dst_d, src_d A and B; working: q13 + vaddl.u8 q13, \arg2, \arg1 + vrshrn.u16 \arg0, q13, #1 +// } +.endm + +.macro FILTER_6TAG_8BITS arg0, arg1,arg2, arg3, arg4,arg5, arg6, arg7,arg8 +// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b + vaddl.u8 q12, \arg0, \arg5 //q12=src[-2]+src[3] + vaddl.u8 q13, \arg2, \arg3 //src[0]+src[1] + vmla.u16 q12, q13, \arg7 //q12 += 20*(src[0]+src[1]), 2 cycles + vaddl.u8 q13, \arg1, \arg4 //src[-1]+src[2] + vmls.s16 q12, q13, \arg8 //q12 -= 5*(src[-1]+src[2]), 2 cycles + vqrshrun.s16 \arg6, q12, #5 +// } +.endm + +.macro FILTER_SINGLE_TAG_8BITS arg0, arg1,arg2, arg3, arg4,arg5 // when width=17/9, used +// { // input: src_d{Y[0][1][2][3][4][5]X, the even of working_q2} + vrev64.8 \arg2, \arg0 // X[5][4][3][2][1][0]O + vaddl.u8 \arg3, \arg0, \arg2 // each 16bits, *[50][41][32][23][14][05]* + vmul.s16 \arg0, \arg2, \arg1 // 0+1*[50]-5*[41]+20[32] + vpadd.s16 \arg0, \arg0, \arg0 + vpadd.s16 \arg0, \arg0, \arg0 + vqrshrun.s16 \arg0, \arg4, #5 +// } +.endm + +.macro FILTER_6TAG_8BITS_AVERAGE_WITH_0 arg0, arg1,arg2, arg3, arg4,arg5, arg6, arg7,arg8 +// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d + vaddl.u8 q12, \arg0, \arg5 //q12=src[-2]+src[3] + vaddl.u8 q13, \arg2, \arg3 //src[0]+src[1] + vmla.u16 q12, q13, \arg7 //q12 += 20*(src[0]+src[1]), 2 cycles + vaddl.u8 q13, \arg1, \arg4 //src[-1]+src[2] + vmls.s16 q12, q13, \arg8 //q12 -= 5*(src[-1]+src[2]), 2 cycles + vqrshrun.s16 \arg6, q12, #5 + vaddl.u8 q13, \arg2, \arg6 + vrshrn.u16 \arg6, q13, #1 +// } +.endm + +.macro FILTER_6TAG_8BITS_AVERAGE_WITH_1 arg0, arg1,arg2, arg3, arg4,arg5, arg6, arg7,arg8 +// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d + vaddl.u8 q12, \arg0, \arg5 //q12=src[-2]+src[3] + vaddl.u8 q13, \arg2, \arg3 //src[0]+src[1] + vmla.u16 q12, q13, \arg7 //q12 += 20*(src[0]+src[1]), 2 cycles + vaddl.u8 q13, \arg1, \arg4 //src[-1]+src[2] + vmls.s16 q12, q13, \arg8 //q12 -= 5*(src[-1]+src[2]), 2 cycles + vqrshrun.s16 \arg6, q12, #5 + vaddl.u8 q13, \arg3, \arg6 + vrshrn.u16 \arg6, q13, #1 +// } +.endm + +.macro FILTER_6TAG_8BITS_TO_16BITS arg0, arg1,arg2, arg3, arg4,arg5, arg6, arg7,arg8 +// { // input:d_src[-2], d_src[-1], d_src[0], d_src[1], d_src[2], d_src[3] + vaddl.u8 \arg6, \arg0, \arg5 //dst_q=src[-2]+src[3] + vaddl.u8 q13, \arg2, \arg3 //src[0]+src[1] + vmla.u16 \arg6, q13, \arg7 //dst_q += 20*(src[0]+src[1]), 2 cycles + vaddl.u8 q13, \arg1, \arg4 //src[-1]+src[2] + vmls.s16 \arg6, q13, \arg8 //dst_q -= 5*(src[-1]+src[2]), 2 cycles +// } +.endm + +.macro FILTER_3_IN_16BITS_TO_8BITS arg0, arg1,arg2, arg3 +// { // input:a, b, c, dst_d; + vsub.s16 \arg0, \arg0, \arg1 //a-b + vshr.s16 \arg0, \arg0, #2 //(a-b)/4 + vsub.s16 \arg0, \arg0, \arg1 //(a-b)/4-b + vadd.s16 \arg0, \arg0, \arg2 //(a-b)/4-b+c + vshr.s16 \arg0, \arg0, #2 //((a-b)/4-b+c)/4 + vadd.s16 \arg0, \arg0, \arg2 //((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16 + vqrshrun.s16 \arg3, \arg0, #6 //(+32)>>6 +// } +.endm + +.macro UNPACK_2_16BITS_TO_ABC arg0, arg1,arg2, arg3, arg4 +// { // input:q_src[-2:5], q_src[6:13](avail 8+5)/q_src[6:**](avail 4+5) + vext.16 \arg4, \arg0, \arg1, #2 //src[0] + vext.16 \arg3, \arg0, \arg1, #3 //src[1] + vadd.s16 \arg4, \arg3 //c=src[0]+src[1] + + vext.16 \arg3, \arg0, \arg1, #1 //src[-1] + vext.16 \arg2, \arg0, \arg1, #4 //src[2] + vadd.s16 \arg3, \arg2 
//b=src[-1]+src[2] + + vext.16 \arg2, \arg0, \arg1, #5 //src[3] + vadd.s16 \arg2, \arg0 //a=src[-2]+src[3] +// } +.endm + +.macro UNPACK_1_IN_8x16BITS_TO_8BITS arg0, arg1,arg2, arg3 +// { // each 16bits; input: d_dst, d_src[0:3] (even), d_src[4:5]+%% (odd) + vext.16 \arg3, \arg3, \arg3, #7 // 0x????, [0][1][2][3][4][5] + vrev64.16 \arg1, \arg1 + vadd.u16 \arg2, \arg1 // C[2+3],B[1+4],A[0+5] + vshr.s64 \arg1, \arg2, #16 + vshr.s64 \arg0, \arg2, #32 // Output: C \arg2, B \arg1, A \arg0 + + vsub.s16 \arg0, \arg0, \arg1 //a-b + vshr.s16 \arg0, \arg0, #2 //(a-b)/4 + vsub.s16 \arg0, \arg0, \arg1 //(a-b)/4-b + vadd.s16 \arg0, \arg0, \arg2 //(a-b)/4-b+c + vshr.s16 \arg0, \arg0, #2 //((a-b)/4-b+c)/4 + vadd.s16 \arg1, \arg0, \arg2 //((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16 + vqrshrun.s16 \arg0, \arg3, #6 //(+32)>>6 +// } +.endm +#endif + +WELS_ASM_FUNC_BEGIN enc_mc_luma_w16_h_neon + push {r4} + ldr r4, [sp, #4] + + sub r0, #2 + vmov.u16 q14, #0x0014 // 20 + vshr.u16 q15, q14, #2 // 5 + +w16_h_mc_luma_loop: + vld1.u8 {d0,d1,d2}, [r0], r1 //only use 21(16+5); q0=src[-2] + pld [r0] + pld [r0, #16] + + vext.8 q2, q0, q1, #1 //q2=src[-1] + vext.8 q3, q0, q1, #2 //q3=src[0] + vext.8 q4, q0, q1, #3 //q4=src[1] + vext.8 q5, q0, q1, #4 //q5=src[2] + vext.8 q6, q0, q1, #5 //q6=src[3] + + FILTER_6TAG_8BITS d0, d4, d6, d8, d10, d12, d2, q14, q15 + + FILTER_6TAG_8BITS d1, d5, d7, d9, d11, d13, d3, q14, q15 + + sub r4, #1 + vst1.u8 {d2, d3}, [r2], r3 //write 16Byte + + cmp r4, #0 + bne w16_h_mc_luma_loop + pop {r4} +WELS_ASM_FUNC_END + + +WELS_ASM_FUNC_BEGIN enc_mc_luma_w17_h_neon + push {r4-r5} + mov r4, #20 + mov r5, #1 + sub r4, r4, r4, lsl #(16-2) + lsl r5, #16 + ror r4, #16 + vmov d3, r5, r4 // 0x0014FFFB00010000 + + sub r3, #16 + ldr r4, [sp, #8] + + sub r0, #2 + vmov.u16 q14, #0x0014 // 20 + vshr.u16 q15, q14, #2 // 5 + +w17_h_mc_luma_loop: + vld1.u8 {d0,d1,d2}, [r0], r1 //only use 22(17+5); q0=src[-2] + + vext.8 q2, q0, q1, #1 //q2=src[-1] + vext.8 q3, q0, q1, #2 //q3=src[0] + vext.8 q4, q0, q1, #3 //q4=src[1] + vext.8 q5, q0, q1, #4 //q5=src[2] + vext.8 q6, q0, q1, #5 //q6=src[3] + + FILTER_6TAG_8BITS d0, d4, d6, d8, d10, d12, d14, q14, q15 + + FILTER_6TAG_8BITS d1, d5, d7, d9, d11, d13, d15, q14, q15 + + vst1.u8 {d14, d15}, [r2]! //write [0:15] Byte + + vsli.64 d2, d2, #8 // [0][1][2][3][4][5]XO-->O[0][1][2][3][4][5]X + FILTER_SINGLE_TAG_8BITS d2, d3, d14, q7, q1 + + vst1.u8 {d2[0]}, [r2], r3 //write 16th Byte + + sub r4, #1 + cmp r4, #0 + bne w17_h_mc_luma_loop + pop {r4-r5} +WELS_ASM_FUNC_END + + +WELS_ASM_FUNC_BEGIN enc_mc_luma_w9_h_neon + push {r4-r5} + mov r4, #20 + mov r5, #1 + sub r4, r4, r4, lsl #(16-2) + lsl r5, #16 + ror r4, #16 + vmov d7, r5, r4 // 0x0014FFFB00010000 + + sub r3, #8 + ldr r4, [sp, #8] + + sub r0, #2 + vmov.u16 q14, #0x0014 // 20 + vshr.u16 q15, q14, #2 // 5 + +w9_h_mc_luma_loop: + vld1.u8 {d0,d1}, [r0], r1 //only use 14(9+5); q0=src[-2] + pld [r0] + + vext.8 d2, d0, d1, #1 //d2=src[-1] + vext.8 d3, d0, d1, #2 //d3=src[0] + vext.8 d4, d0, d1, #3 //d4=src[1] + vext.8 d5, d0, d1, #4 //d5=src[2] + vext.8 d6, d0, d1, #5 //d6=src[3] + + FILTER_6TAG_8BITS d0, d2, d3, d4, d5, d6, d8, q14, q15 + + sub r4, #1 + vst1.u8 {d8}, [r2]! 
//write [0:7] Byte + + vsli.64 d2, d1, #8 // [0][1][2][3][4][5]XO-->O[0][1][2][3][4][5]X + FILTER_SINGLE_TAG_8BITS d2, d7, d14, q7, q1 + vst1.u8 {d2[0]}, [r2], r3 //write 8th Byte + + cmp r4, #0 + bne w9_h_mc_luma_loop + pop {r4-r5} +WELS_ASM_FUNC_END + + +WELS_ASM_FUNC_BEGIN enc_mc_luma_w4_h_neon + push {r4, r5, r6} + ldr r6, [sp, #12] + + sub r0, #2 + vmov.u16 q14, #0x0014 // 20 + vshr.u16 q15, q14, #2 // 5 + +w4_h_mc_luma_loop: + vld1.u8 {d0, d1}, [r0], r1 //only use 9(4+5);d0: 1st row src[-2:5] + pld [r0] + vld1.u8 {d2, d3}, [r0], r1 //d2: 2nd row src[-2:5] + pld [r0] + + vext.8 d4, d0, d1, #1 //d4: 1st row src[-1:6] + vext.8 d5, d2, d3, #1 //d5: 2nd row src[-1:6] + vext.8 q3, q2, q2, #1 //src[0:6 *] + vext.8 q4, q2, q2, #2 //src[1:6 * *] + + vtrn.32 q3, q4 //q3::d6:1st row [0:3]+[1:4]; d7:2nd row [0:3]+[1:4] + vtrn.32 d6, d7 //d6:[0:3]; d7[1:4] + vtrn.32 d0, d2 //d0:[-2:1]; d2[2:5] + vtrn.32 d4, d5 //d4:[-1:2]; d5[3:6] + + FILTER_6TAG_8BITS d0, d4, d6, d7, d2, d5, d1, q14, q15 + + vmov r4, r5, d1 + str r4, [r2], r3 + str r5, [r2], r3 + + sub r6, #2 + cmp r6, #0 + bne w4_h_mc_luma_loop + + pop {r4, r5, r6} +WELS_ASM_FUNC_END + + +WELS_ASM_FUNC_BEGIN enc_mc_luma_w16_xy_10_neon + push {r4} + ldr r4, [sp, #4] + + sub r0, #2 + vmov.u16 q14, #0x0014 // 20 + vshr.u16 q15, q14, #2 // 5 + +w16_xy_10_mc_luma_loop: + vld1.u8 {d0,d1,d2}, [r0], r1 //only use 21(16+5); q0=src[-2] + pld [r0] + pld [r0, #16] + + vext.8 q2, q0, q1, #1 //q2=src[-1] + vext.8 q3, q0, q1, #2 //q3=src[0] + vext.8 q4, q0, q1, #3 //q4=src[1] + vext.8 q5, q0, q1, #4 //q5=src[2] + vext.8 q6, q0, q1, #5 //q6=src[3] + + FILTER_6TAG_8BITS_AVERAGE_WITH_0 d0, d4, d6, d8, d10, d12, d2, q14, q15 + + FILTER_6TAG_8BITS_AVERAGE_WITH_0 d1, d5, d7, d9, d11, d13, d3, q14, q15 + + sub r4, #1 + vst1.u8 {d2, d3}, [r2], r3 //write 16Byte + + cmp r4, #0 + bne w16_xy_10_mc_luma_loop + pop {r4} +WELS_ASM_FUNC_END + + +WELS_ASM_FUNC_BEGIN enc_mc_luma_w8_xy_10_neon + push {r4} + ldr r4, [sp, #4] + + sub r0, #2 + vmov.u16 q14, #0x0014 // 20 + vshr.u16 q15, q14, #2 // 5 + +w8_xy_10_mc_luma_loop: + vld1.u8 {d0,d1}, [r0], r1 //only use 13(8+5); q0=src[-2] + pld [r0] + + vext.8 d2, d0, d1, #1 //d2=src[-1] + vext.8 d3, d0, d1, #2 //d3=src[0] + vext.8 d4, d0, d1, #3 //d4=src[1] + vext.8 d5, d0, d1, #4 //d5=src[2] + vext.8 d6, d0, d1, #5 //d6=src[3] + + FILTER_6TAG_8BITS_AVERAGE_WITH_0 d0, d2, d3, d4, d5, d6, d1, q14, q15 + + sub r4, #1 + vst1.u8 {d1}, [r2], r3 + + cmp r4, #0 + bne w8_xy_10_mc_luma_loop + pop {r4} +WELS_ASM_FUNC_END + + +WELS_ASM_FUNC_BEGIN enc_mc_luma_w4_xy_10_neon + push {r4, r5, r6} + ldr r6, [sp, #12] + + sub r0, #2 + vmov.u16 q14, #0x0014 // 20 + vshr.u16 q15, q14, #2 // 5 + +w4_xy_10_mc_luma_loop: + vld1.u8 {d0, d1}, [r0], r1 //only use 9(4+5);d0: 1st row src[-2:5] + pld [r0] + vld1.u8 {d2, d3}, [r0], r1 //d2: 2nd row src[-2:5] + pld [r0] + + vext.8 d4, d0, d1, #1 //d4: 1st row src[-1:6] + vext.8 d5, d2, d3, #1 //d5: 2nd row src[-1:6] + vext.8 q3, q2, q2, #1 //src[0:6 *] + vext.8 q4, q2, q2, #2 //src[1:6 * *] + + vtrn.32 q3, q4 //q3::d6:1st row [0:3]+[1:4]; d7:2nd row [0:3]+[1:4] + vtrn.32 d6, d7 //d6:[0:3]; d7[1:4] + vtrn.32 d0, d2 //d0:[-2:1]; d2[2:5] + vtrn.32 d4, d5 //d4:[-1:2]; d5[3:6] + + FILTER_6TAG_8BITS_AVERAGE_WITH_0 d0, d4, d6, d7, d2, d5, d1, q14, q15 + + vmov r4, r5, d1 + str r4, [r2], r3 + str r5, [r2], r3 + + sub r6, #2 + cmp r6, #0 + bne w4_xy_10_mc_luma_loop + + pop {r4, r5, r6} +WELS_ASM_FUNC_END + + +WELS_ASM_FUNC_BEGIN enc_mc_luma_w16_xy_30_neon + push {r4} + ldr r4, [sp, #4] + + sub r0, #2 + vmov.u16 q14, #0x0014 
// 20 + vshr.u16 q15, q14, #2 // 5 + +w16_xy_30_mc_luma_loop: + vld1.u8 {d0,d1,d2}, [r0], r1 //only use 21(16+5); q0=src[-2] + pld [r0] + pld [r0, #16] + + vext.8 q2, q0, q1, #1 //q2=src[-1] + vext.8 q3, q0, q1, #2 //q3=src[0] + vext.8 q4, q0, q1, #3 //q4=src[1] + vext.8 q5, q0, q1, #4 //q5=src[2] + vext.8 q6, q0, q1, #5 //q6=src[3] + + FILTER_6TAG_8BITS_AVERAGE_WITH_1 d0, d4, d6, d8, d10, d12, d2, q14, q15 + + FILTER_6TAG_8BITS_AVERAGE_WITH_1 d1, d5, d7, d9, d11, d13, d3, q14, q15 + + sub r4, #1 + vst1.u8 {d2, d3}, [r2], r3 //write 16Byte + + cmp r4, #0 + bne w16_xy_30_mc_luma_loop + pop {r4} +WELS_ASM_FUNC_END + + +WELS_ASM_FUNC_BEGIN enc_mc_luma_w8_xy_30_neon + push {r4} + ldr r4, [sp, #4] + + sub r0, #2 + vmov.u16 q14, #0x0014 // 20 + vshr.u16 q15, q14, #2 // 5 + +w8_xy_30_mc_luma_loop: + vld1.u8 {d0,d1}, [r0], r1 //only use 13(8+5); q0=src[-2] + pld [r0] + + vext.8 d2, d0, d1, #1 //d2=src[-1] + vext.8 d3, d0, d1, #2 //d3=src[0] + vext.8 d4, d0, d1, #3 //d4=src[1] + vext.8 d5, d0, d1, #4 //d5=src[2] + vext.8 d6, d0, d1, #5 //d6=src[3] + + FILTER_6TAG_8BITS_AVERAGE_WITH_1 d0, d2, d3, d4, d5, d6, d1, q14, q15 + + sub r4, #1 + vst1.u8 {d1}, [r2], r3 + + cmp r4, #0 + bne w8_xy_30_mc_luma_loop + pop {r4} +WELS_ASM_FUNC_END + + +WELS_ASM_FUNC_BEGIN enc_mc_luma_w4_xy_30_neon + push {r4, r5, r6} + ldr r6, [sp, #12] + + sub r0, #2 + vmov.u16 q14, #0x0014 // 20 + vshr.u16 q15, q14, #2 // 5 + +w4_xy_30_mc_luma_loop: + vld1.u8 {d0, d1}, [r0], r1 //only use 9(4+5);d0: 1st row src[-2:5] + pld [r0] + vld1.u8 {d2, d3}, [r0], r1 //d2: 2nd row src[-2:5] + pld [r0] + + vext.8 d4, d0, d1, #1 //d4: 1st row src[-1:6] + vext.8 d5, d2, d3, #1 //d5: 2nd row src[-1:6] + vext.8 q3, q2, q2, #1 //src[0:6 *] + vext.8 q4, q2, q2, #2 //src[1:6 * *] + + vtrn.32 q3, q4 //q3::d6:1st row [0:3]+[1:4]; d7:2nd row [0:3]+[1:4] + vtrn.32 d6, d7 //d6:[0:3]; d7[1:4] + vtrn.32 d0, d2 //d0:[-2:1]; d2[2:5] + vtrn.32 d4, d5 //d4:[-1:2]; d5[3:6] + + FILTER_6TAG_8BITS_AVERAGE_WITH_1 d0, d4, d6, d7, d2, d5, d1, q14, q15 + + vmov r4, r5, d1 + str r4, [r2], r3 + str r5, [r2], r3 + + sub r6, #2 + cmp r6, #0 + bne w4_xy_30_mc_luma_loop + + pop {r4, r5, r6} +WELS_ASM_FUNC_END + + +WELS_ASM_FUNC_BEGIN enc_mc_luma_w16_xy_01_neon + push {r4} + ldr r4, [sp, #4] + + sub r0, r1, lsl #1 //src[-2*src_stride] + pld [r0] + pld [r0, r1] + vmov.u16 q14, #0x0014 // 20 + vld1.u8 {q0}, [r0], r1 //q0=src[-2] + vld1.u8 {q1}, [r0], r1 //q1=src[-1] + + pld [r0] + pld [r0, r1] + vshr.u16 q15, q14, #2 // 5 + vld1.u8 {q2}, [r0], r1 //q2=src[0] + vld1.u8 {q3}, [r0], r1 //q3=src[1] + vld1.u8 {q4}, [r0], r1 //q4=src[2] + +w16_xy_01_luma_loop: + + vld1.u8 {q5}, [r0], r1 //q5=src[3] + + FILTER_6TAG_8BITS_AVERAGE_WITH_0 d0, d2, d4, d6, d8, d10, d12, q14, q15 + pld [r0] + FILTER_6TAG_8BITS_AVERAGE_WITH_0 d1, d3, d5, d7, d9, d11, d13, q14, q15 + vld1.u8 {q0}, [r0], r1 //read 2nd row + vst1.u8 {q6}, [r2], r3 //write 1st 16Byte + + FILTER_6TAG_8BITS_AVERAGE_WITH_0 d2, d4, d6, d8, d10, d0, d12, q14, q15 + pld [r0] + FILTER_6TAG_8BITS_AVERAGE_WITH_0 d3, d5, d7, d9, d11, d1, d13, q14, q15 + vld1.u8 {q1}, [r0], r1 //read 3rd row + vst1.u8 {q6}, [r2], r3 //write 2nd 16Byte + + FILTER_6TAG_8BITS_AVERAGE_WITH_0 d4, d6, d8, d10, d0, d2, d12, q14, q15 + pld [r0] + FILTER_6TAG_8BITS_AVERAGE_WITH_0 d5, d7, d9, d11, d1, d3, d13, q14, q15 + vld1.u8 {q2}, [r0], r1 //read 4th row + vst1.u8 {q6}, [r2], r3 //write 3rd 16Byte + + FILTER_6TAG_8BITS_AVERAGE_WITH_0 d6, d8, d10, d0, d2, d4, d12, q14, q15 + pld [r0] + FILTER_6TAG_8BITS_AVERAGE_WITH_0 d7, d9, d11, d1, d3, d5, d13, q14, q15 + 
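+ //each output row: vertical 6-tap half-pel filter, then averaged with src[0] to form the (0,1) quarter-pel sample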
vld1.u8 {q3}, [r0], r1 //read 5th row + vst1.u8 {q6}, [r2], r3 //write 4th 16Byte + + FILTER_6TAG_8BITS_AVERAGE_WITH_0 d8, d10, d0, d2, d4, d6, d12, q14, q15 + pld [r0] + FILTER_6TAG_8BITS_AVERAGE_WITH_0 d9, d11, d1, d3, d5, d7, d13, q14, q15 + vld1.u8 {q4}, [r0], r1 //read 6th row + vst1.u8 {q6}, [r2], r3 //write 5th 16Byte + + FILTER_6TAG_8BITS_AVERAGE_WITH_0 d10, d0, d2, d4, d6, d8, d12, q14, q15 + pld [r0] + FILTER_6TAG_8BITS_AVERAGE_WITH_0 d11, d1, d3, d5, d7, d9, d13, q14, q15 + vld1.u8 {q5}, [r0], r1 //read 7th row + vst1.u8 {q6}, [r2], r3 //write 6th 16Byte + + FILTER_6TAG_8BITS_AVERAGE_WITH_0 d0, d2, d4, d6, d8, d10, d12, q14, q15 + pld [r0] + FILTER_6TAG_8BITS_AVERAGE_WITH_0 d1, d3, d5, d7, d9, d11, d13, q14, q15 + vld1.u8 {q0}, [r0], r1 //read 8th row + vst1.u8 {q6}, [r2], r3 //write 7th 16Byte + + FILTER_6TAG_8BITS_AVERAGE_WITH_0 d2, d4, d6, d8, d10, d0, d12, q14, q15 + pld [r0] + FILTER_6TAG_8BITS_AVERAGE_WITH_0 d3, d5, d7, d9, d11, d1, d13, q14, q15 + vst1.u8 {q6}, [r2], r3 //write 8th 16Byte + + //q2, q3, q4, q5, q0 --> q0~q4 + vswp q0, q4 + vswp q0, q2 + vmov q1, q3 + vmov q3, q5 //q0~q4 + + sub r4, #8 + cmp r4, #0 + bne w16_xy_01_luma_loop + pop {r4} +WELS_ASM_FUNC_END + + +WELS_ASM_FUNC_BEGIN enc_mc_luma_w8_xy_01_neon + push {r4} + ldr r4, [sp, #4] + + sub r0, r1, lsl #1 //src[-2*src_stride] + pld [r0] + pld [r0, r1] + vmov.u16 q14, #0x0014 // 20 + vld1.u8 {d0}, [r0], r1 //d0=src[-2] + vld1.u8 {d1}, [r0], r1 //d1=src[-1] + + pld [r0] + pld [r0, r1] + vshr.u16 q15, q14, #2 // 5 + vld1.u8 {d2}, [r0], r1 //d2=src[0] + vld1.u8 {d3}, [r0], r1 //d3=src[1] + + vld1.u8 {d4}, [r0], r1 //d4=src[2] + vld1.u8 {d5}, [r0], r1 //d5=src[3] + +w8_xy_01_mc_luma_loop: + + pld [r0] + FILTER_6TAG_8BITS_AVERAGE_WITH_0 d0, d1, d2, d3, d4, d5, d12, q14, q15 + vld1.u8 {d0}, [r0], r1 //read 2nd row + vst1.u8 {d12}, [r2], r3 //write 1st 8Byte + + pld [r0] + FILTER_6TAG_8BITS_AVERAGE_WITH_0 d1, d2, d3, d4, d5, d0, d12, q14, q15 + vld1.u8 {d1}, [r0], r1 //read 3rd row + vst1.u8 {d12}, [r2], r3 //write 2nd 8Byte + + pld [r0] + FILTER_6TAG_8BITS_AVERAGE_WITH_0 d2, d3, d4, d5, d0, d1, d12, q14, q15 + vld1.u8 {d2}, [r0], r1 //read 4th row + vst1.u8 {d12}, [r2], r3 //write 3rd 8Byte + + pld [r0] + FILTER_6TAG_8BITS_AVERAGE_WITH_0 d3, d4, d5, d0, d1, d2, d12, q14, q15 + vld1.u8 {d3}, [r0], r1 //read 5th row + vst1.u8 {d12}, [r2], r3 //write 4th 8Byte + + //d4, d5, d0, d1, d2, d3 --> d0, d1, d2, d3, d4, d5 + vswp q0, q2 + vswp q1, q2 + + sub r4, #4 + cmp r4, #0 + bne w8_xy_01_mc_luma_loop + + pop {r4} +WELS_ASM_FUNC_END + + +WELS_ASM_FUNC_BEGIN enc_mc_luma_w4_xy_01_neon + push {r4, r5, r6, r7} + sub r0, r1, lsl #1 //src[-2*src_stride] + pld [r0] + pld [r0, r1] + vmov.u16 q14, #0x0014 // 20 + ldr r4, [r0], r1 //r4=src[-2] + ldr r5, [r0], r1 //r5=src[-1] + + pld [r0] + pld [r0, r1] + vshr.u16 q15, q14, #2 // 5 + ldr r6, [r0], r1 //r6=src[0] + ldr r7, [r0], r1 //r7=src[1] + + vmov d0, r4, r5 + vmov d1, r5, r6 + vmov d2, r6, r7 + + ldr r4, [r0], r1 //r4=src[2] + vmov d3, r7, r4 + ldr r7, [sp, #16] + +w4_xy_01_mc_luma_loop: + + //using reserving r4 + ldr r5, [r0], r1 //r5=src[3] + ldr r6, [r0], r1 //r6=src[0] + vmov d4, r4, r5 + vmov d5, r5, r6 //reserved r6 + + FILTER_6TAG_8BITS_AVERAGE_WITH_0 d0, d1, d2, d3, d4, d5, d12, q14, q15 + vmov r4, r5, d12 + str r4, [r2], r3 //write 1st 4Byte + str r5, [r2], r3 //write 2nd 4Byte + + ldr r5, [r0], r1 //r5=src[1] + ldr r4, [r0], r1 //r4=src[2] + vmov d0, r6, r5 + vmov d1, r5, r4 //reserved r4 + + FILTER_6TAG_8BITS_AVERAGE_WITH_0 d2, d3, d4, d5, d0, d1, d12, q14, q15 + 
vmov r5, r6, d12 + str r5, [r2], r3 //write 3rd 4Byte + str r6, [r2], r3 //write 4th 4Byte + + //d4, d5, d0, d1 --> d0, d1, d2, d3 + vmov q1, q0 + vmov q0, q2 + + sub r7, #4 + cmp r7, #0 + bne w4_xy_01_mc_luma_loop + + pop {r4, r5, r6, r7} +WELS_ASM_FUNC_END + + +WELS_ASM_FUNC_BEGIN enc_mc_luma_w16_xy_03_neon + push {r4} + ldr r4, [sp, #4] + + sub r0, r1, lsl #1 //src[-2*src_stride] + pld [r0] + pld [r0, r1] + vmov.u16 q14, #0x0014 // 20 + vld1.u8 {q0}, [r0], r1 //q0=src[-2] + vld1.u8 {q1}, [r0], r1 //q1=src[-1] + + pld [r0] + pld [r0, r1] + vshr.u16 q15, q14, #2 // 5 + vld1.u8 {q2}, [r0], r1 //q2=src[0] + vld1.u8 {q3}, [r0], r1 //q3=src[1] + vld1.u8 {q4}, [r0], r1 //q4=src[2] + +w16_xy_03_luma_loop: + + vld1.u8 {q5}, [r0], r1 //q5=src[3] + + FILTER_6TAG_8BITS_AVERAGE_WITH_1 d0, d2, d4, d6, d8, d10, d12, q14, q15 + pld [r0] + FILTER_6TAG_8BITS_AVERAGE_WITH_1 d1, d3, d5, d7, d9, d11, d13, q14, q15 + vld1.u8 {q0}, [r0], r1 //read 2nd row + vst1.u8 {q6}, [r2], r3 //write 1st 16Byte + + FILTER_6TAG_8BITS_AVERAGE_WITH_1 d2, d4, d6, d8, d10, d0, d12, q14, q15 + pld [r0] + FILTER_6TAG_8BITS_AVERAGE_WITH_1 d3, d5, d7, d9, d11, d1, d13, q14, q15 + vld1.u8 {q1}, [r0], r1 //read 3rd row + vst1.u8 {q6}, [r2], r3 //write 2nd 16Byte + + FILTER_6TAG_8BITS_AVERAGE_WITH_1 d4, d6, d8, d10, d0, d2, d12, q14, q15 + pld [r0] + FILTER_6TAG_8BITS_AVERAGE_WITH_1 d5, d7, d9, d11, d1, d3, d13, q14, q15 + vld1.u8 {q2}, [r0], r1 //read 4th row + vst1.u8 {q6}, [r2], r3 //write 3rd 16Byte + + FILTER_6TAG_8BITS_AVERAGE_WITH_1 d6, d8, d10, d0, d2, d4, d12, q14, q15 + pld [r0] + FILTER_6TAG_8BITS_AVERAGE_WITH_1 d7, d9, d11, d1, d3, d5, d13, q14, q15 + vld1.u8 {q3}, [r0], r1 //read 5th row + vst1.u8 {q6}, [r2], r3 //write 4th 16Byte + + FILTER_6TAG_8BITS_AVERAGE_WITH_1 d8, d10, d0, d2, d4, d6, d12, q14, q15 + pld [r0] + FILTER_6TAG_8BITS_AVERAGE_WITH_1 d9, d11, d1, d3, d5, d7, d13, q14, q15 + vld1.u8 {q4}, [r0], r1 //read 6th row + vst1.u8 {q6}, [r2], r3 //write 5th 16Byte + + FILTER_6TAG_8BITS_AVERAGE_WITH_1 d10, d0, d2, d4, d6, d8, d12, q14, q15 + pld [r0] + FILTER_6TAG_8BITS_AVERAGE_WITH_1 d11, d1, d3, d5, d7, d9, d13, q14, q15 + vld1.u8 {q5}, [r0], r1 //read 7th row + vst1.u8 {q6}, [r2], r3 //write 6th 16Byte + + FILTER_6TAG_8BITS_AVERAGE_WITH_1 d0, d2, d4, d6, d8, d10, d12, q14, q15 + pld [r0] + FILTER_6TAG_8BITS_AVERAGE_WITH_1 d1, d3, d5, d7, d9, d11, d13, q14, q15 + vld1.u8 {q0}, [r0], r1 //read 8th row + vst1.u8 {q6}, [r2], r3 //write 7th 16Byte + + FILTER_6TAG_8BITS_AVERAGE_WITH_1 d2, d4, d6, d8, d10, d0, d12, q14, q15 + pld [r0] + FILTER_6TAG_8BITS_AVERAGE_WITH_1 d3, d5, d7, d9, d11, d1, d13, q14, q15 + vst1.u8 {q6}, [r2], r3 //write 8th 16Byte + + //q2, q3, q4, q5, q0 --> q0~q4 + vswp q0, q4 + vswp q0, q2 + vmov q1, q3 + vmov q3, q5 //q0~q4 + + sub r4, #8 + cmp r4, #0 + bne w16_xy_03_luma_loop + pop {r4} +WELS_ASM_FUNC_END + + +WELS_ASM_FUNC_BEGIN enc_mc_luma_w8_xy_03_neon + push {r4} + ldr r4, [sp, #4] + + sub r0, r1, lsl #1 //src[-2*src_stride] + pld [r0] + pld [r0, r1] + vmov.u16 q14, #0x0014 // 20 + vld1.u8 {d0}, [r0], r1 //d0=src[-2] + vld1.u8 {d1}, [r0], r1 //d1=src[-1] + + pld [r0] + pld [r0, r1] + vshr.u16 q15, q14, #2 // 5 + vld1.u8 {d2}, [r0], r1 //d2=src[0] + vld1.u8 {d3}, [r0], r1 //d3=src[1] + + vld1.u8 {d4}, [r0], r1 //d4=src[2] + vld1.u8 {d5}, [r0], r1 //d5=src[3] + +w8_xy_03_mc_luma_loop: + + pld [r0] + FILTER_6TAG_8BITS_AVERAGE_WITH_1 d0, d1, d2, d3, d4, d5, d12, q14, q15 + vld1.u8 {d0}, [r0], r1 //read 2nd row + vst1.u8 {d12}, [r2], r3 //write 1st 8Byte + + pld [r0] + 
FILTER_6TAG_8BITS_AVERAGE_WITH_1 d1, d2, d3, d4, d5, d0, d12, q14, q15 + vld1.u8 {d1}, [r0], r1 //read 3rd row + vst1.u8 {d12}, [r2], r3 //write 2nd 8Byte + + pld [r0] + FILTER_6TAG_8BITS_AVERAGE_WITH_1 d2, d3, d4, d5, d0, d1, d12, q14, q15 + vld1.u8 {d2}, [r0], r1 //read 4th row + vst1.u8 {d12}, [r2], r3 //write 3rd 8Byte + + pld [r0] + FILTER_6TAG_8BITS_AVERAGE_WITH_1 d3, d4, d5, d0, d1, d2, d12, q14, q15 + vld1.u8 {d3}, [r0], r1 //read 5th row + vst1.u8 {d12}, [r2], r3 //write 4th 8Byte + + //d4, d5, d0, d1, d2, d3 --> d0, d1, d2, d3, d4, d5 + vswp q0, q2 + vswp q1, q2 + + sub r4, #4 + cmp r4, #0 + bne w8_xy_03_mc_luma_loop + + pop {r4} + WELS_ASM_FUNC_END + + WELS_ASM_FUNC_BEGIN enc_mc_luma_w4_xy_03_neon + push {r4, r5, r6, r7} + sub r0, r1, lsl #1 //src[-2*src_stride] + pld [r0] + pld [r0, r1] + vmov.u16 q14, #0x0014 // 20 + ldr r4, [r0], r1 //r4=src[-2] + ldr r5, [r0], r1 //r5=src[-1] + + pld [r0] + pld [r0, r1] + vshr.u16 q15, q14, #2 // 5 + ldr r6, [r0], r1 //r6=src[0] + ldr r7, [r0], r1 //r7=src[1] + + vmov d0, r4, r5 + vmov d1, r5, r6 + vmov d2, r6, r7 + + ldr r4, [r0], r1 //r4=src[2] + vmov d3, r7, r4 + ldr r7, [sp, #16] + +w4_xy_03_mc_luma_loop: + + //using reserving r4 + ldr r5, [r0], r1 //r5=src[3] + ldr r6, [r0], r1 //r6=src[0] + vmov d4, r4, r5 + vmov d5, r5, r6 //reserved r6 + + FILTER_6TAG_8BITS_AVERAGE_WITH_1 d0, d1, d2, d3, d4, d5, d12, q14, q15 + vmov r4, r5, d12 + str r4, [r2], r3 //write 1st 4Byte + str r5, [r2], r3 //write 2nd 4Byte + + ldr r5, [r0], r1 //r5=src[1] + ldr r4, [r0], r1 //r4=src[2] + vmov d0, r6, r5 + vmov d1, r5, r4 //reserved r4 + + FILTER_6TAG_8BITS_AVERAGE_WITH_1 d2, d3, d4, d5, d0, d1, d12, q14, q15 + vmov r5, r6, d12 + str r5, [r2], r3 //write 3rd 4Byte + str r6, [r2], r3 //write 4th 4Byte + + //d4, d5, d0, d1 --> d0, d1, d2, d3 + vmov q1, q0 + vmov q0, q2 + + sub r7, #4 + cmp r7, #0 + bne w4_xy_03_mc_luma_loop + + pop {r4, r5, r6, r7} +WELS_ASM_FUNC_END + + +WELS_ASM_FUNC_BEGIN enc_mc_luma_w16_v_neon + push {r4} + ldr r4, [sp, #4] + + sub r0, r1, lsl #1 //src[-2*src_stride] + pld [r0] + pld [r0, r1] + vmov.u16 q14, #0x0014 // 20 + vld1.u8 {q0}, [r0], r1 //q0=src[-2] + vld1.u8 {q1}, [r0], r1 //q1=src[-1] + + pld [r0] + pld [r0, r1] + vshr.u16 q15, q14, #2 // 5 + vld1.u8 {q2}, [r0], r1 //q2=src[0] + vld1.u8 {q3}, [r0], r1 //q3=src[1] + vld1.u8 {q4}, [r0], r1 //q4=src[2] + +w16_v_mc_luma_loop: + + vld1.u8 {q5}, [r0], r1 //q5=src[3] + + FILTER_6TAG_8BITS d0, d2, d4, d6, d8, d10, d12, q14, q15 + pld [r0] + FILTER_6TAG_8BITS d1, d3, d5, d7, d9, d11, d13, q14, q15 + vld1.u8 {q0}, [r0], r1 //read 2nd row + vst1.u8 {q6}, [r2], r3 //write 1st 16Byte + + FILTER_6TAG_8BITS d2, d4, d6, d8, d10, d0, d12, q14, q15 + pld [r0] + FILTER_6TAG_8BITS d3, d5, d7, d9, d11, d1, d13, q14, q15 + vld1.u8 {q1}, [r0], r1 //read 3rd row + vst1.u8 {q6}, [r2], r3 //write 2nd 16Byte + + FILTER_6TAG_8BITS d4, d6, d8, d10, d0, d2, d12, q14, q15 + pld [r0] + FILTER_6TAG_8BITS d5, d7, d9, d11, d1, d3, d13, q14, q15 + vld1.u8 {q2}, [r0], r1 //read 4th row + vst1.u8 {q6}, [r2], r3 //write 3rd 16Byte + + FILTER_6TAG_8BITS d6, d8, d10, d0, d2, d4, d12, q14, q15 + pld [r0] + FILTER_6TAG_8BITS d7, d9, d11, d1, d3, d5, d13, q14, q15 + vld1.u8 {q3}, [r0], r1 //read 5th row + vst1.u8 {q6}, [r2], r3 //write 4th 16Byte + + FILTER_6TAG_8BITS d8, d10, d0, d2, d4, d6, d12, q14, q15 + pld [r0] + FILTER_6TAG_8BITS d9, d11, d1, d3, d5, d7, d13, q14, q15 + vld1.u8 {q4}, [r0], r1 //read 6th row + vst1.u8 {q6}, [r2], r3 //write 5th 16Byte + + FILTER_6TAG_8BITS d10, d0, d2, d4, d6, d8, d12, q14, q15 + 
pld [r0] + FILTER_6TAG_8BITS d11, d1, d3, d5, d7, d9, d13, q14, q15 + vld1.u8 {q5}, [r0], r1 //read 7th row + vst1.u8 {q6}, [r2], r3 //write 6th 16Byte + + FILTER_6TAG_8BITS d0, d2, d4, d6, d8, d10, d12, q14, q15 + pld [r0] + FILTER_6TAG_8BITS d1, d3, d5, d7, d9, d11, d13, q14, q15 + vld1.u8 {q0}, [r0], r1 //read 8th row + vst1.u8 {q6}, [r2], r3 //write 7th 16Byte + + FILTER_6TAG_8BITS d2, d4, d6, d8, d10, d0, d12, q14, q15 + pld [r0] + FILTER_6TAG_8BITS d3, d5, d7, d9, d11, d1, d13, q14, q15 + vst1.u8 {q6}, [r2], r3 //write 8th 16Byte + + //q2, q3, q4, q5, q0 --> q0~q4 + vswp q0, q4 + vswp q0, q2 + vmov q1, q3 + vmov q3, q5 //q0~q4 + + sub r4, #8 + cmp r4, #0 + bne w16_v_mc_luma_loop + pop {r4} + WELS_ASM_FUNC_END + + WELS_ASM_FUNC_BEGIN enc_mc_luma_w17_v_neon + push {r4} + ldr r4, [sp, #4] + + sub r0, r1, lsl #1 //src[-2*src_stride] + pld [r0] + pld [r0, r1] + vmov.u16 q14, #0x0014 // 20 + vld1.u8 {q0}, [r0], r1 //q0=src[-2] + vld1.u8 {q1}, [r0], r1 //q1=src[-1] + + pld [r0] + pld [r0, r1] + vshr.u16 q15, q14, #2 // 5 + vld1.u8 {q2}, [r0], r1 //q2=src[0] + vld1.u8 {q3}, [r0], r1 //q3=src[1] + vld1.u8 {q4}, [r0], r1 //q4=src[2] + +w17_v_mc_luma_loop: + + vld1.u8 {q5}, [r0], r1 //q5=src[3] + + FILTER_6TAG_8BITS d0, d2, d4, d6, d8, d10, d12, q14, q15 + pld [r0] + FILTER_6TAG_8BITS d1, d3, d5, d7, d9, d11, d13, q14, q15 + vld1.u8 {q0}, [r0], r1 //read 2nd row + vst1.u8 {q6}, [r2], r3 //write 1st 16Byte + + FILTER_6TAG_8BITS d2, d4, d6, d8, d10, d0, d12, q14, q15 + pld [r0] + FILTER_6TAG_8BITS d3, d5, d7, d9, d11, d1, d13, q14, q15 + vld1.u8 {q1}, [r0], r1 //read 3rd row + vst1.u8 {q6}, [r2], r3 //write 2nd 16Byte + + FILTER_6TAG_8BITS d4, d6, d8, d10, d0, d2, d12, q14, q15 + pld [r0] + FILTER_6TAG_8BITS d5, d7, d9, d11, d1, d3, d13, q14, q15 + vld1.u8 {q2}, [r0], r1 //read 4th row + vst1.u8 {q6}, [r2], r3 //write 3rd 16Byte + + FILTER_6TAG_8BITS d6, d8, d10, d0, d2, d4, d12, q14, q15 + pld [r0] + FILTER_6TAG_8BITS d7, d9, d11, d1, d3, d5, d13, q14, q15 + vld1.u8 {q3}, [r0], r1 //read 5th row + vst1.u8 {q6}, [r2], r3 //write 4th 16Byte + + FILTER_6TAG_8BITS d8, d10, d0, d2, d4, d6, d12, q14, q15 + pld [r0] + FILTER_6TAG_8BITS d9, d11, d1, d3, d5, d7, d13, q14, q15 + vld1.u8 {q4}, [r0], r1 //read 6th row + vst1.u8 {q6}, [r2], r3 //write 5th 16Byte + + FILTER_6TAG_8BITS d10, d0, d2, d4, d6, d8, d12, q14, q15 + pld [r0] + FILTER_6TAG_8BITS d11, d1, d3, d5, d7, d9, d13, q14, q15 + vld1.u8 {q5}, [r0], r1 //read 7th row + vst1.u8 {q6}, [r2], r3 //write 6th 16Byte + + FILTER_6TAG_8BITS d0, d2, d4, d6, d8, d10, d12, q14, q15 + pld [r0] + FILTER_6TAG_8BITS d1, d3, d5, d7, d9, d11, d13, q14, q15 + vld1.u8 {q0}, [r0], r1 //read 8th row + vst1.u8 {q6}, [r2], r3 //write 7th 16Byte + + FILTER_6TAG_8BITS d2, d4, d6, d8, d10, d0, d12, q14, q15 + pld [r0] + FILTER_6TAG_8BITS d3, d5, d7, d9, d11, d1, d13, q14, q15 + vst1.u8 {q6}, [r2], r3 //write 8th 16Byte + + //q2, q3, q4, q5, q0 --> q0~q4 + vswp q0, q4 + vswp q0, q2 + vmov q1, q3 + vmov q3, q5 //q0~q4 + + sub r4, #8 + cmp r4, #1 + bne w17_v_mc_luma_loop + // the last 16Bytes + vld1.u8 {q5}, [r0], r1 //q5=src[3] + FILTER_6TAG_8BITS d0, d2, d4, d6, d8, d10, d12, q14, q15 + FILTER_6TAG_8BITS d1, d3, d5, d7, d9, d11, d13, q14, q15 + vst1.u8 {q6}, [r2], r3 //write 1st 16Byte + + pop {r4} +WELS_ASM_FUNC_END + + +WELS_ASM_FUNC_BEGIN enc_mc_luma_w9_v_neon + push {r4} + ldr r4, [sp, #4] + + sub r0, r1, lsl #1 //src[-2*src_stride] + pld [r0] + pld [r0, r1] + vmov.u16 q14, #0x0014 // 20 + vld1.u8 {d0}, [r0], r1 //d0=src[-2] + vld1.u8 {d1}, [r0], r1 //d1=src[-1] + + 
pld [r0] + pld [r0, r1] + vshr.u16 q15, q14, #2 // 5 + vld1.u8 {d2}, [r0], r1 //d2=src[0] + vld1.u8 {d3}, [r0], r1 //d3=src[1] + + vld1.u8 {d4}, [r0], r1 //d4=src[2] + vld1.u8 {d5}, [r0], r1 //d5=src[3] + +w9_v_mc_luma_loop: + + pld [r0] + FILTER_6TAG_8BITS d0, d1, d2, d3, d4, d5, d12, q14, q15 + vld1.u8 {d0}, [r0], r1 //read 2nd row + vst1.u8 {d12}, [r2], r3 //write 1st 8Byte + + pld [r0] + FILTER_6TAG_8BITS d1, d2, d3, d4, d5, d0, d12, q14, q15 + vld1.u8 {d1}, [r0], r1 //read 3rd row + vst1.u8 {d12}, [r2], r3 //write 2nd 8Byte + + pld [r0] + FILTER_6TAG_8BITS d2, d3, d4, d5, d0, d1, d12, q14, q15 + vld1.u8 {d2}, [r0], r1 //read 4th row + vst1.u8 {d12}, [r2], r3 //write 3rd 8Byte + + pld [r0] + FILTER_6TAG_8BITS d3, d4, d5, d0, d1, d2, d12, q14, q15 + vld1.u8 {d3}, [r0], r1 //read 5th row + vst1.u8 {d12}, [r2], r3 //write 4th 8Byte + + //d4, d5, d0, d1, d2, d3 --> d0, d1, d2, d3, d4, d5 + vswp q0, q2 + vswp q1, q2 + + sub r4, #4 + cmp r4, #1 + bne w9_v_mc_luma_loop + + FILTER_6TAG_8BITS d0, d1, d2, d3, d4, d5, d12, q14, q15 + vst1.u8 {d12}, [r2], r3 //write last 8Byte + + pop {r4} +WELS_ASM_FUNC_END + + +WELS_ASM_FUNC_BEGIN enc_mc_luma_w4_v_neon + push {r4, r5, r6, r7} + sub r0, r1, lsl #1 //src[-2*src_stride] + pld [r0] + pld [r0, r1] + vmov.u16 q14, #0x0014 // 20 + ldr r4, [r0], r1 //r4=src[-2] + ldr r5, [r0], r1 //r5=src[-1] + + pld [r0] + pld [r0, r1] + vshr.u16 q15, q14, #2 // 5 + ldr r6, [r0], r1 //r6=src[0] + ldr r7, [r0], r1 //r7=src[1] + + vmov d0, r4, r5 + vmov d1, r5, r6 + vmov d2, r6, r7 + + ldr r4, [r0], r1 //r4=src[2] + vmov d3, r7, r4 + ldr r7, [sp, #16] + +w4_v_mc_luma_loop: + +// pld [r0] + //using reserving r4 + ldr r5, [r0], r1 //r5=src[3] + ldr r6, [r0], r1 //r6=src[0] + vmov d4, r4, r5 + vmov d5, r5, r6 //reserved r6 + + FILTER_6TAG_8BITS d0, d1, d2, d3, d4, d5, d12, q14, q15 + vmov r4, r5, d12 + str r4, [r2], r3 //write 1st 4Byte + str r5, [r2], r3 //write 2nd 4Byte + + ldr r5, [r0], r1 //r5=src[1] + ldr r4, [r0], r1 //r4=src[2] + vmov d0, r6, r5 + vmov d1, r5, r4 //reserved r4 + + FILTER_6TAG_8BITS d2, d3, d4, d5, d0, d1, d12, q14, q15 + vmov r5, r6, d12 + str r5, [r2], r3 //write 3rd 4Byte + str r6, [r2], r3 //write 4th 4Byte + + //d4, d5, d0, d1 --> d0, d1, d2, d3 + vmov q1, q0 + vmov q0, q2 + + sub r7, #4 + cmp r7, #0 + bne w4_v_mc_luma_loop + + pop {r4, r5, r6, r7} +WELS_ASM_FUNC_END + + +WELS_ASM_FUNC_BEGIN enc_mc_luma_w16_hv_neon + push {r4} + ldr r4, [sp, #4] + + sub r0, #2 //src[-2] + sub r0, r1, lsl #1 //src[-2*src_stride-2] + pld [r0] + pld [r0, r1] + + vmov.u16 q14, #0x0014 // 20 + vld1.u8 {d0-d2}, [r0], r1 //use 21(16+5), =src[-2] + vld1.u8 {d3-d5}, [r0], r1 //use 21(16+5), =src[-1] + + pld [r0] + pld [r0, r1] + vshr.u16 q15, q14, #2 // 5 + + vld1.u8 {d6-d8}, [r0], r1 //use 21(16+5), =src[0] + vld1.u8 {d9-d11}, [r0], r1 //use 21(16+5), =src[1] + pld [r0] + pld [r0, r1] + vld1.u8 {d12-d14}, [r0], r1 //use 21(16+5), =src[2] + +w16_hv_mc_luma_loop: + + vld1.u8 {d15-d17}, [r0], r1 //use 21(16+5), =src[3] + //the 1st row + pld [r0] + // vertical filtered into q9/q10 + FILTER_6TAG_8BITS_TO_16BITS d0, d3, d6, d9, d12, d15, q9, q14, q15 // 8 avail + FILTER_6TAG_8BITS_TO_16BITS d1, d4, d7,d10, d13, d16,q10, q14, q15 // 8 avail + // horizon filtered + UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13 + FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d0 //output to q0[0] + // vertical filtered into q10/q11 + FILTER_6TAG_8BITS_TO_16BITS d2, d5, d8,d11, d14, d17,q11, q14, q15 // only 5 avail + // horizon filtered + UNPACK_2_16BITS_TO_ABC q10, q11, q9, q12, q13 + 
FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d1 //output to q0[1] + vst1.u8 {q0}, [r2], r3 //write 16Byte + + + vld1.u8 {d0-d2}, [r0], r1 //read 2nd row + //the 2nd row + pld [r0] + // vertical filtered into q9/q10 + FILTER_6TAG_8BITS_TO_16BITS d3, d6, d9, d12, d15, d0, q9, q14, q15 // 8 avail + FILTER_6TAG_8BITS_TO_16BITS d4, d7,d10, d13, d16, d1,q10, q14, q15 // 8 avail + // horizon filtered + UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13 + FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d3 //output to d3 + // vertical filtered into q10/q11 + FILTER_6TAG_8BITS_TO_16BITS d5, d8,d11, d14, d17, d2,q11, q14, q15 // only 5 avail + // horizon filtered + UNPACK_2_16BITS_TO_ABC q10, q11, q9, q12, q13 + FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d4 //output to d4 + vst1.u8 {d3, d4}, [r2], r3 //write 16Byte + + vld1.u8 {d3-d5}, [r0], r1 //read 3rd row + //the 3rd row + pld [r0] + // vertical filtered into q9/q10 + FILTER_6TAG_8BITS_TO_16BITS d6, d9, d12, d15, d0, d3, q9, q14, q15 // 8 avail + FILTER_6TAG_8BITS_TO_16BITS d7,d10, d13, d16, d1, d4,q10, q14, q15 // 8 avail + // horizon filtered + UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13 + FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d6 //output to d6 + // vertical filtered into q10/q11 + FILTER_6TAG_8BITS_TO_16BITS d8,d11, d14, d17, d2, d5,q11, q14, q15 // only 5 avail + // horizon filtered + UNPACK_2_16BITS_TO_ABC q10, q11, q9, q12, q13 + FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d7 //output to d7 + vst1.u8 {d6, d7}, [r2], r3 //write 16Byte + + vld1.u8 {d6-d8}, [r0], r1 //read 4th row + //the 4th row + pld [r0] + // vertical filtered into q9/q10 + FILTER_6TAG_8BITS_TO_16BITS d9, d12, d15, d0, d3, d6, q9, q14, q15 // 8 avail + FILTER_6TAG_8BITS_TO_16BITS d10, d13, d16, d1, d4, d7,q10, q14, q15 // 8 avail + // horizon filtered + UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13 + FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d9 //output to d9 + // vertical filtered into q10/q11 + FILTER_6TAG_8BITS_TO_16BITS d11, d14, d17, d2, d5, d8,q11, q14, q15 // only 5 avail + // horizon filtered + UNPACK_2_16BITS_TO_ABC q10, q11, q9, q12, q13 + FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d10 //output to d10 + vst1.u8 {d9, d10}, [r2], r3 //write 16Byte + + //d12~d17(q6~q8), d0~d8(q0~q3+d8), --> d0~d14 + vswp q0, q6 + vswp q6, q3 + vmov q5, q2 + vmov q2, q8 + + vmov d20,d8 + vmov q4, q1 + vmov q1, q7 + vmov d14,d20 + + sub r4, #4 + cmp r4, #0 + bne w16_hv_mc_luma_loop + pop {r4} +WELS_ASM_FUNC_END + + +WELS_ASM_FUNC_BEGIN enc_mc_luma_w17_hv_neon + push {r4} + ldr r4, [sp, #4] + + sub r0, #2 //src[-2] + sub r0, r1, lsl #1 //src[-2*src_stride-2] + pld [r0] + pld [r0, r1] + + vmov.u16 q14, #0x0014 // 20 + vld1.u8 {d0-d2}, [r0], r1 //use 21(17+5), =src[-2] + vld1.u8 {d3-d5}, [r0], r1 //use 21(17+5), =src[-1] + + pld [r0] + pld [r0, r1] + vshr.u16 q15, q14, #2 // 5 + + vld1.u8 {d6-d8}, [r0], r1 //use 21(17+5), =src[0] + vld1.u8 {d9-d11}, [r0], r1 //use 21(17+5), =src[1] + pld [r0] + pld [r0, r1] + vld1.u8 {d12-d14}, [r0], r1 //use 21(17+5), =src[2] + sub r3, #16 + +w17_hv_mc_luma_loop: + + vld1.u8 {d15-d17}, [r0], r1 //use 21(17+5), =src[3] + //the 1st row + pld [r0] + // vertical filtered into q9/q10 + FILTER_6TAG_8BITS_TO_16BITS d0, d3, d6, d9, d12, d15, q9, q14, q15 // 8 avail + FILTER_6TAG_8BITS_TO_16BITS d1, d4, d7,d10, d13, d16,q10, q14, q15 // 8 avail + // horizon filtered + UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13 + FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d0 //output to q0[0] + // vertical filtered into q10/q11 + FILTER_6TAG_8BITS_TO_16BITS d2, d5, d8,d11, d14, d17,q11, 
q14, q15 // only 6 avail + // horizon filtered + UNPACK_2_16BITS_TO_ABC q10, q11, q9, q12, q13 + FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d1 //output to q0[1] + vst1.u8 {d0, d1}, [r2]! //write 16Byte + UNPACK_1_IN_8x16BITS_TO_8BITS d2, d22, d23, q11 //output to d2[0] + vst1.u8 {d2[0]}, [r2], r3 //write 16th Byte + + vld1.u8 {d0-d2}, [r0], r1 //read 2nd row + //the 2nd row + pld [r0] + // vertical filtered into q9/q10 + FILTER_6TAG_8BITS_TO_16BITS d3, d6, d9, d12, d15, d0, q9, q14, q15 // 8 avail + FILTER_6TAG_8BITS_TO_16BITS d4, d7,d10, d13, d16, d1,q10, q14, q15 // 8 avail + // horizon filtered + UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13 + FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d3 //output to d3 + // vertical filtered into q10/q11 + FILTER_6TAG_8BITS_TO_16BITS d5, d8,d11, d14, d17, d2,q11, q14, q15 // only 6 avail + // horizon filtered + UNPACK_2_16BITS_TO_ABC q10, q11, q9, q12, q13 + FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d4 //output to d4 + vst1.u8 {d3, d4}, [r2]! //write 16Byte + UNPACK_1_IN_8x16BITS_TO_8BITS d5, d22, d23, q11 //output to d5[0] + vst1.u8 {d5[0]}, [r2], r3 //write 16th Byte + + vld1.u8 {d3-d5}, [r0], r1 //read 3rd row + //the 3rd row + pld [r0] + // vertical filtered into q9/q10 + FILTER_6TAG_8BITS_TO_16BITS d6, d9, d12, d15, d0, d3, q9, q14, q15 // 8 avail + FILTER_6TAG_8BITS_TO_16BITS d7,d10, d13, d16, d1, d4,q10, q14, q15 // 8 avail + // horizon filtered + UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13 + FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d6 //output to d6 + // vertical filtered into q10/q11 + FILTER_6TAG_8BITS_TO_16BITS d8,d11, d14, d17, d2, d5,q11, q14, q15 // only 6 avail + // horizon filtered + UNPACK_2_16BITS_TO_ABC q10, q11, q9, q12, q13 + FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d7 //output to d7 + vst1.u8 {d6, d7}, [r2]! 
//write 16Byte + UNPACK_1_IN_8x16BITS_TO_8BITS d8, d22, d23, q11 //output to d8[0] + vst1.u8 {d8[0]}, [r2], r3 //write 16th Byte + + vld1.u8 {d6-d8}, [r0], r1 //read 4th row + //the 4th row + pld [r0] + // vertical filtered into q9/q10 + FILTER_6TAG_8BITS_TO_16BITS d9, d12, d15, d0, d3, d6, q9, q14, q15 // 8 avail + FILTER_6TAG_8BITS_TO_16BITS d10, d13, d16, d1, d4, d7,q10, q14, q15 // 8 avail + // horizon filtered + UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13 + FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d9 //output to d9 + // vertical filtered into q10/q11 + FILTER_6TAG_8BITS_TO_16BITS d11, d14, d17, d2, d5, d8,q11, q14, q15 // only 6 avail + // horizon filtered + UNPACK_2_16BITS_TO_ABC q10, q11, q9, q12, q13 + FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d10 //output to d10 + vst1.u8 {d9, d10}, [r2], r3 //write 16Byte + UNPACK_1_IN_8x16BITS_TO_8BITS d11, d22, d23, q11 //output to d11[0] + vst1.u8 {d11[0]}, [r2], r3 //write 16th Byte + + //d12~d17(q6~q8), d0~d8(q0~q3+d8), --> d0~d14 + vswp q0, q6 + vswp q6, q3 + vmov q5, q2 + vmov q2, q8 + + vmov d20,d8 + vmov q4, q1 + vmov q1, q7 + vmov d14,d20 + + sub r4, #4 + cmp r4, #1 + bne w17_hv_mc_luma_loop + //the last row + vld1.u8 {d15-d17}, [r0], r1 //use 21(17+5), =src[3] + // vertical filtered into q9/q10 + FILTER_6TAG_8BITS_TO_16BITS d0, d3, d6, d9, d12, d15, q9, q14, q15 // 8 avail + FILTER_6TAG_8BITS_TO_16BITS d1, d4, d7,d10, d13, d16,q10, q14, q15 // 8 avail + // horizon filtered + UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13 + FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d0 //output to q0[0] + // vertical filtered into q10/q11 + FILTER_6TAG_8BITS_TO_16BITS d2, d5, d8,d11, d14, d17,q11, q14, q15 // only 6 avail + // horizon filtered + UNPACK_2_16BITS_TO_ABC q10, q11, q9, q12, q13 + FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d1 //output to q0[1] + vst1.u8 {q0}, [r2]! //write 16Byte + UNPACK_1_IN_8x16BITS_TO_8BITS d2, d22, d23, q11 //output to d2[0] + vst1.u8 {d2[0]}, [r2], r3 //write 16th Byte + + pop {r4} +WELS_ASM_FUNC_END + + +WELS_ASM_FUNC_BEGIN enc_mc_luma_w9_hv_neon + push {r4} + ldr r4, [sp, #4] + + sub r0, #2 //src[-2] + sub r0, r1, lsl #1 //src[-2*src_stride-2] + pld [r0] + pld [r0, r1] + + vmov.u16 q14, #0x0014 // 20 + vld1.u8 {q0}, [r0], r1 //use 14(9+5), =src[-2] + vld1.u8 {q1}, [r0], r1 //use 14(9+5), =src[-1] + + pld [r0] + pld [r0, r1] + vshr.u16 q15, q14, #2 // 5 + + vld1.u8 {q2}, [r0], r1 //use 14(9+5), =src[0] + vld1.u8 {q3}, [r0], r1 //use 14(9+5), =src[1] + pld [r0] + pld [r0, r1] + vld1.u8 {q4}, [r0], r1 //use 14(9+5), =src[2] + sub r3, #8 + +w9_hv_mc_luma_loop: + + vld1.u8 {q5}, [r0], r1 //use 14(9+5), =src[3] + //the 1st row + pld [r0] + // vertical filtered into q6/q7 + FILTER_6TAG_8BITS_TO_16BITS d0, d2, d4, d6, d8, d10, q6, q14, q15 // 8 avail + FILTER_6TAG_8BITS_TO_16BITS d1, d3, d5, d7, d9, d11, q7, q14, q15 // 6 avail + // horizon filtered + UNPACK_2_16BITS_TO_ABC q6, q7, q11, q12, q13 + FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d12 //output to q6[0] + vst1.u8 d12, [r2]! //write 8Byte + UNPACK_1_IN_8x16BITS_TO_8BITS d13, d14, d15, q7 //output to d13[0] + vst1.u8 {d13[0]}, [r2], r3 //write 8th Byte + + vld1.u8 {q0}, [r0], r1 //read 2nd row + //the 2nd row + pld [r0] + // vertical filtered into q6/q7 + FILTER_6TAG_8BITS_TO_16BITS d2, d4, d6, d8, d10, d0, q6, q14, q15 // 8 avail + FILTER_6TAG_8BITS_TO_16BITS d3, d5, d7, d9, d11, d1, q7, q14, q15 // 6 avail + // horizon filtered + UNPACK_2_16BITS_TO_ABC q6, q7, q11, q12, q13 + FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d12 //output to q6[0] + vst1.u8 d12, [r2]! 
//write 8Byte + UNPACK_1_IN_8x16BITS_TO_8BITS d13, d14, d15, q7 //output to d13[0] + vst1.u8 {d13[0]}, [r2], r3 //write 8th Byte + + vld1.u8 {q1}, [r0], r1 //read 3rd row + //the 3rd row + pld [r0] + // vertical filtered into q6/q7 + FILTER_6TAG_8BITS_TO_16BITS d4, d6, d8, d10, d0, d2, q6, q14, q15 // 8 avail + FILTER_6TAG_8BITS_TO_16BITS d5, d7, d9, d11, d1, d3, q7, q14, q15 // 6 avail + // horizon filtered + UNPACK_2_16BITS_TO_ABC q6, q7, q11, q12, q13 + FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d12 //output to q6[0] + vst1.u8 d12, [r2]! //write 8Byte + UNPACK_1_IN_8x16BITS_TO_8BITS d13, d14, d15, q7 //output to d13[0] + vst1.u8 {d13[0]}, [r2], r3 //write 8th Byte + + vld1.u8 {q2}, [r0], r1 //read 4th row + //the 4th row + pld [r0] + // vertical filtered into q6/q7 + FILTER_6TAG_8BITS_TO_16BITS d6, d8, d10, d0, d2, d4, q6, q14, q15 // 8 avail + FILTER_6TAG_8BITS_TO_16BITS d7, d9, d11, d1, d3, d5, q7, q14, q15 // 6 avail + // horizon filtered + UNPACK_2_16BITS_TO_ABC q6, q7, q11, q12, q13 + FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d12 //output to q6[0] + vst1.u8 d12, [r2]! //write 8Byte + UNPACK_1_IN_8x16BITS_TO_8BITS d13, d14, d15, q7 //output to d13[0] + vst1.u8 {d13[0]}, [r2], r3 //write 8th Byte + + //q4~q5, q0~q2, --> q0~q4 + vswp q0, q4 + vswp q2, q4 + vmov q3, q1 + vmov q1, q5 + + sub r4, #4 + cmp r4, #1 + bne w9_hv_mc_luma_loop + //the last row + vld1.u8 {q5}, [r0], r1 //use 14(9+5), =src[3] + // vertical filtered into q6/q7 + FILTER_6TAG_8BITS_TO_16BITS d0, d2, d4, d6, d8, d10, q6, q14, q15 // 8 avail + FILTER_6TAG_8BITS_TO_16BITS d1, d3, d5, d7, d9, d11, q7, q14, q15 // 6 avail + // horizon filtered + UNPACK_2_16BITS_TO_ABC q6, q7, q11, q12, q13 + FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d12 //output to q6[0] + vst1.u8 d12, [r2]! //write 8Byte + UNPACK_1_IN_8x16BITS_TO_8BITS d13, d14, d15, q7 //output to d13[0] + vst1.u8 {d13[0]}, [r2], r3 //write 8th Byte + pop {r4} +WELS_ASM_FUNC_END + + +WELS_ASM_FUNC_BEGIN enc_mc_luma_w4_hv_neon + push {r4 ,r5, r6} + ldr r6, [sp, #12] + + sub r0, #2 //src[-2] + sub r0, r1, lsl #1 //src[-2*src_stride-2] + pld [r0] + pld [r0, r1] + + vmov.u16 q14, #0x0014 // 20 + vld1.u8 {q0}, [r0], r1 //use 9(4+5), =src[-2] + vld1.u8 {q1}, [r0], r1 //use 9(4+5), =src[-1] + + pld [r0] + pld [r0, r1] + vshr.u16 q15, q14, #2 // 5 + + vld1.u8 {q2}, [r0], r1 //use 9(4+5), =src[0] + vld1.u8 {q3}, [r0], r1 //use 9(4+5), =src[1] + pld [r0] + pld [r0, r1] + vld1.u8 {q4}, [r0], r1 //use 9(4+5), =src[2] + +w4_hv_mc_luma_loop: + + vld1.u8 {q5}, [r0], r1 //use 9(4+5), =src[3] + vld1.u8 {q6}, [r0], r1 //use 9(4+5), =src[4] + + //the 1st&2nd row + pld [r0] + pld [r0, r1] + // vertical filtered + FILTER_6TAG_8BITS_TO_16BITS d0, d2, d4, d6, d8, d10, q7, q14, q15 // 8 avail + FILTER_6TAG_8BITS_TO_16BITS d1, d3, d5, d7, d9, d11, q8, q14, q15 // 1 avail + + FILTER_6TAG_8BITS_TO_16BITS d2, d4, d6, d8,d10, d12, q9, q14, q15 // 8 avail + FILTER_6TAG_8BITS_TO_16BITS d3, d5, d7, d9,d11, d13,q10, q14, q15 // 1 avail + // horizon filtered + UNPACK_2_16BITS_TO_ABC q7, q8, q11, q12, q13 //4 avail + UNPACK_2_16BITS_TO_ABC q9,q10, q0, q7, q8 //4 avail + + vmov d23, d0 + vmov d25, d14 + vmov d27, d16 + + FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d22 //output to q11[0] + vmov r4 ,r5, d22 + str r4, [r2], r3 //write 4Byte + str r5, [r2], r3 //write 4Byte + + //the 3rd&4th row + vld1.u8 {q0}, [r0], r1 //use 9(4+5), =src[3] + vld1.u8 {q1}, [r0], r1 //use 9(4+5), =src[4] + pld [r0] + pld [r0, r1] + // vertical filtered + FILTER_6TAG_8BITS_TO_16BITS d4, d6, d8, d10, d12, d0, q7, q14, q15 // 
8 avail + FILTER_6TAG_8BITS_TO_16BITS d5, d7, d9, d11, d13, d1, q8, q14, q15 // 1 avail + + FILTER_6TAG_8BITS_TO_16BITS d6, d8,d10, d12, d0, d2, q9, q14, q15 // 8 avail + FILTER_6TAG_8BITS_TO_16BITS d7, d9,d11, d13, d1, d3,q10, q14, q15 // 1 avail + // horizon filtered + UNPACK_2_16BITS_TO_ABC q7, q8, q11, q12, q13 //4 avail + UNPACK_2_16BITS_TO_ABC q9,q10, q2, q7, q8 //4 avail + + vmov d23, d4 + vmov d25, d14 + vmov d27, d16 + + FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d22 //output to q11[0] + vmov r4 ,r5, d22 + str r4, [r2], r3 //write 4Byte + str r5, [r2], r3 //write 4Byte + + //q4~q6, q0~q1, --> q0~q4 + vswp q4, q0 + vmov q3, q4 + vmov q4, q1 + vmov q1, q5 + vmov q2, q6 + + sub r6, #4 + cmp r6, #0 + bne w4_hv_mc_luma_loop + + pop {r4, r5, r6} +WELS_ASM_FUNC_END + + +WELS_ASM_FUNC_BEGIN enc_mc_copy_w16_neon + push {r4} + ldr r4, [sp, #4] +w16_copy_loop: + vld1.u8 {q0}, [r0], r1 + vld1.u8 {q1}, [r0], r1 + vst1.u8 {q0}, [r2], r3 + vst1.u8 {q1}, [r2], r3 + sub r4, #2 + cmp r4, #0 + bne w16_copy_loop + + pop {r4} +WELS_ASM_FUNC_END + + +WELS_ASM_FUNC_BEGIN enc_mc_copy_w8_neon + push {r4} + ldr r4, [sp, #4] +w8_copy_loop: + vld1.u8 {d0}, [r0], r1 + vld1.u8 {d1}, [r0], r1 + vst1.u8 {d0}, [r2], r3 + vst1.u8 {d1}, [r2], r3 + sub r4, #2 + cmp r4, #0 + bne w8_copy_loop + + pop {r4} +WELS_ASM_FUNC_END + + +WELS_ASM_FUNC_BEGIN enc_mc_copy_w4_neon + push {r4, r5, r6} + ldr r4, [sp, #12] +w4_copy_loop: + ldr r5, [r0], r1 + ldr r6, [r0], r1 + str r5, [r2], r3 + str r6, [r2], r3 + + sub r4, #2 + cmp r4, #0 + bne w4_copy_loop + + pop {r4, r5, r6} +WELS_ASM_FUNC_END + + +WELS_ASM_FUNC_BEGIN enc_pixel_avg_w16_neon + push {r4} + ldr r4, [sp, #4] +w16_pix_avg_loop: + vld1.u8 {q0}, [r2]! + vld1.u8 {q1}, [r3]! + vld1.u8 {q2}, [r2]! + vld1.u8 {q3}, [r3]! + + vld1.u8 {q4}, [r2]! + vld1.u8 {q5}, [r3]! + vld1.u8 {q6}, [r2]! + vld1.u8 {q7}, [r3]! 
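+    // average each pair of rows loaded above (q0/q1, q2/q3, q4/q5, q6/q7) and store four 16-byte rows per iteration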
+ + AVERAGE_TWO_8BITS d0, d0, d2 + AVERAGE_TWO_8BITS d1, d1, d3 + vst1.u8 {q0}, [r0], r1 + + AVERAGE_TWO_8BITS d4, d4, d6 + AVERAGE_TWO_8BITS d5, d5, d7 + vst1.u8 {q2}, [r0], r1 + + AVERAGE_TWO_8BITS d8, d8, d10 + AVERAGE_TWO_8BITS d9, d9, d11 + vst1.u8 {q4}, [r0], r1 + + AVERAGE_TWO_8BITS d12, d12, d14 + AVERAGE_TWO_8BITS d13, d13, d15 + vst1.u8 {q6}, [r0], r1 + + sub r4, #4 + cmp r4, #0 + bne w16_pix_avg_loop + + pop {r4} +WELS_ASM_FUNC_END + + +WELS_ASM_FUNC_BEGIN enc_pix_avg_w16_neon + push {r4, r5, r6} + ldr r4, [sp, #12] + ldr r5, [sp, #16] + ldr r6, [sp, #20] + +enc_w16_pix_avg_loop: + vld1.u8 {q0}, [r2], r3 + vld1.u8 {q1}, [r4], r5 + vld1.u8 {q2}, [r2], r3 + vld1.u8 {q3}, [r4], r5 + + vld1.u8 {q4}, [r2], r3 + vld1.u8 {q5}, [r4], r5 + vld1.u8 {q6}, [r2], r3 + vld1.u8 {q7}, [r4], r5 + + AVERAGE_TWO_8BITS d0, d0, d2 + AVERAGE_TWO_8BITS d1, d1, d3 + vst1.u8 {q0}, [r0], r1 + + AVERAGE_TWO_8BITS d4, d4, d6 + AVERAGE_TWO_8BITS d5, d5, d7 + vst1.u8 {q2}, [r0], r1 + + AVERAGE_TWO_8BITS d8, d8, d10 + AVERAGE_TWO_8BITS d9, d9, d11 + vst1.u8 {q4}, [r0], r1 + + AVERAGE_TWO_8BITS d12, d12, d14 + AVERAGE_TWO_8BITS d13, d13, d15 + vst1.u8 {q6}, [r0], r1 + + sub r6, #4 + cmp r6, #0 + bne enc_w16_pix_avg_loop + + pop {r4, r5, r6} +WELS_ASM_FUNC_END + + +WELS_ASM_FUNC_BEGIN enc_pix_avg_w8_neon + push {r4, r5, r6} + ldr r4, [sp, #12] + ldr r5, [sp, #16] + ldr r6, [sp, #20] +enc_w8_pix_avg_loop: + + vld1.u8 {d0}, [r2], r3 + vld1.u8 {d2}, [r4], r5 + vld1.u8 {d1}, [r2], r3 + vld1.u8 {d3}, [r4], r5 + + AVERAGE_TWO_8BITS d0, d0, d2 + AVERAGE_TWO_8BITS d1, d1, d3 + vst1.u8 {d0}, [r0], r1 + vst1.u8 {d1}, [r0], r1 + + vld1.u8 {d4}, [r2], r3 + vld1.u8 {d6}, [r4], r5 + vld1.u8 {d5}, [r2], r3 + vld1.u8 {d7}, [r4], r5 + + AVERAGE_TWO_8BITS d4, d4, d6 + AVERAGE_TWO_8BITS d5, d5, d7 + vst1.u8 {d4}, [r0], r1 + vst1.u8 {d5}, [r0], r1 + + sub r6, #4 + cmp r6, #0 + bne enc_w8_pix_avg_loop + + pop {r4, r5, r6} +WELS_ASM_FUNC_END + + +WELS_ASM_FUNC_BEGIN enc_mc_chroma_w8_neon + + push {r4, r5} + ldr r4, [sp, #8] + ldr r5, [sp, #12] + vld1.u8 {d31}, [r4] //load A/B/C/D + vld1.u8 {q0}, [r0], r1 //src[x] + + vdup.u8 d28, d31[0] //A + vdup.u8 d29, d31[1] //B + vdup.u8 d30, d31[2] //C + vdup.u8 d31, d31[3] //D + + vext.u8 d1, d0, d1, #1 //src[x+1] + +w8_mc_chroma_loop: // each two pxl row + vld1.u8 {q1}, [r0], r1 //src[x+stride] + vld1.u8 {q2}, [r0], r1 //src[x+2*stride] + vext.u8 d3, d2, d3, #1 //src[x+stride+1] + vext.u8 d5, d4, d5, #1 //src[x+2*stride+1] + + vmull.u8 q3, d0, d28 //(src[x] * A) + vmlal.u8 q3, d1, d29 //+=(src[x+1] * B) + vmlal.u8 q3, d2, d30 //+=(src[x+stride] * C) + vmlal.u8 q3, d3, d31 //+=(src[x+stride+1] * D) + vrshrn.u16 d6, q3, #6 + vst1.u8 d6, [r2], r3 + + vmull.u8 q3, d2, d28 //(src[x] * A) + vmlal.u8 q3, d3, d29 //+=(src[x+1] * B) + vmlal.u8 q3, d4, d30 //+=(src[x+stride] * C) + vmlal.u8 q3, d5, d31 //+=(src[x+stride+1] * D) + vrshrn.u16 d6, q3, #6 + vst1.u8 d6, [r2], r3 + + vmov q0, q2 + sub r5, #2 + cmp r5, #0 + bne w8_mc_chroma_loop + + pop {r4, r5} +WELS_ASM_FUNC_END + + +WELS_ASM_FUNC_BEGIN enc_mc_chroma_w4_neon + + push {r4, r5, r6} + ldr r4, [sp, #12] + ldr r6, [sp, #16] + vld1.u8 {d31}, [r4] //load A/B/C/D + + vdup.u8 d28, d31[0] //A + vdup.u8 d29, d31[1] //B + vdup.u8 d30, d31[2] //C + vdup.u8 d31, d31[3] //D + +w4_mc_chroma_loop: // each two pxl row + vld1.u8 {d0}, [r0], r1 //a::src[x] + vld1.u8 {d2}, [r0], r1 //b::src[x+stride] + vld1.u8 {d4}, [r0] //c::src[x+2*stride] + + vshr.u64 d1, d0, #8 + vshr.u64 d3, d2, #8 + vshr.u64 d5, d4, #8 + + vmov q3, q1 //b::[0:7]+b::[1~8] + vtrn.32 q0, q1 
//d0{a::[0:3]+b::[0:3]}; d1{a::[1:4]+b::[1:4]} + vtrn.32 q3, q2 //d6{b::[0:3]+c::[0:3]}; d7{b::[1:4]+c::[1:4]} + + vmull.u8 q1, d0, d28 //(src[x] * A) + vmlal.u8 q1, d1, d29 //+=(src[x+1] * B) + vmlal.u8 q1, d6, d30 //+=(src[x+stride] * C) + vmlal.u8 q1, d7, d31 //+=(src[x+stride+1] * D) + + vrshrn.u16 d2, q1, #6 + vmov r4, r5, d2 + str r4, [r2], r3 + str r5, [r2], r3 + + sub r6, #2 + cmp r6, #0 + bne w4_mc_chroma_loop + + pop {r4, r5, r6} +WELS_ASM_FUNC_END +#endif diff --git a/codec/encoder/core/arm/memory_neon.S b/codec/encoder/core/arm/memory_neon.S new file mode 100755 index 00000000..7b65d490 --- /dev/null +++ b/codec/encoder/core/arm/memory_neon.S @@ -0,0 +1,63 @@ +/*! + * \copy + * Copyright (c) 2013, Cisco Systems + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + */ + +#ifdef HAVE_NEON +.text +#include "arm_arch_common_macro.S" + + +WELS_ASM_FUNC_BEGIN WelsSetMemZero_neon + veor q0, q0 + cmp r1, #32 + beq mem_zero_32_neon_start + blt mem_zero_24_neon_start + +mem_zero_loop: + subs r1, r1, #64 + vst1.64 {q0}, [r0]! + vst1.64 {q0}, [r0]! + vst1.64 {q0}, [r0]! + vst1.64 {q0}, [r0]! + bne mem_zero_loop +WELS_ASM_FUNC_END + +mem_zero_32_neon_start: + vst1.64 {q0}, [r0]! + vst1.64 {q0}, [r0]! +WELS_ASM_FUNC_END + +mem_zero_24_neon_start: + vst1.64 {q0}, [r0]! + vst1.64 {d0}, [r0]! +WELS_ASM_FUNC_END + +#endif \ No newline at end of file diff --git a/codec/encoder/core/arm/pixel_neon.S b/codec/encoder/core/arm/pixel_neon.S new file mode 100755 index 00000000..792fba34 --- /dev/null +++ b/codec/encoder/core/arm/pixel_neon.S @@ -0,0 +1,880 @@ +/*! + * \copy + * Copyright (c) 2013, Cisco Systems + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + */ + +#ifdef HAVE_NEON +.text +#include "arm_arch_common_macro.S" + +.macro SATD_16x4 + vld1.64 {q0}, [r0,:128], r1 + vld1.64 {q1}, [r2], r3 + + vsubl.u8 q4, d0, d2 + vld1.64 {q2}, [r0,:128], r1 + + vsubl.u8 q6, d1, d3 + vld1.64 {q3}, [r2], r3 + + vsubl.u8 q5, d4, d6 + vld1.64 {q0}, [r0,:128], r1 + + vsubl.u8 q7, d5, d7 + vld1.64 {q1}, [r2], r3 + + vsubl.u8 q8, d0, d2 + vld1.64 {q2}, [r0,:128], r1 + + vsubl.u8 q10, d1, d3 + vadd.s16 q0, q4, q5 + + vld1.64 {q3}, [r2], r3 + vsub.s16 q1, q4, q5 + + vsubl.u8 q9, d4, d6 + vsubl.u8 q11, d5, d7 + + vadd.s16 q2, q8, q9 + vsub.s16 q3, q8, q9 + + vadd.s16 q4, q6, q7 + vsub.s16 q5, q6, q7 + + vadd.s16 q6, q10, q11 + vsub.s16 q7, q10, q11 + + vadd.s16 q8, q0, q2 + vsub.s16 q10, q0, q2 + + vadd.s16 q9, q4, q6 + vsub.s16 q11, q4, q6 + + vsub.s16 q0, q1, q3 + vadd.s16 q2, q1, q3 + + vsub.s16 q1, q5, q7 + vadd.s16 q3, q5, q7 + + vtrn.16 q8, q10 + vtrn.16 q9, q11 + + vadd.s16 q4, q8, q10 + vabd.s16 q6, q8, q10 + + vadd.s16 q5, q9, q11 + vabd.s16 q7, q9, q11 + + vabs.s16 q4, q4 + vabs.s16 q5, q5 + + vtrn.16 q0, q2 + vtrn.16 q1, q3 + + vadd.s16 q8, q0, q2 + vabd.s16 q10, q0, q2 + + vadd.s16 q9, q1, q3 + vabd.s16 q11, q1, q3 + + vabs.s16 q8, q8 + vabs.s16 q9, q9 + + vtrn.32 q4, q6 + vtrn.32 q5, q7 + + vtrn.32 q8, q10 + vtrn.32 q9, q11 + + vmax.s16 q0, q4, q6 + vmax.s16 q1, q5, q7 + vmax.s16 q2, q8, q10 + vmax.s16 q3, q9, q11 + + vadd.u16 q0, q0, q1 + vadd.u16 q2, q2, q3 +.endm + +.macro SATD_8x4 + + vld1.64 {d0}, [r0,:64], r1 + vld1.64 {d1}, [r2], r3 + + vld1.64 {d2}, [r0,:64], r1 + vsubl.u8 q4, d0, d1 + + vld1.64 {d3}, [r2], r3 + vsubl.u8 q5, d2, d3 + + vld1.64 {d4}, [r0,:64], r1 + vld1.64 {d5}, [r2], r3 + + vadd.s16 q8, q4, q5 + vsubl.u8 q6, d4, d5 + + vld1.64 {d6}, [r0,:64], r1 + vld1.64 {d7}, [r2], r3 + + vsubl.u8 q7, d6, d7 + vsub.s16 q9, q4, q5 + + vadd.s16 q10, q6, q7 + vsub.s16 q11, q6, q7 + + vadd.s16 q0, q8, q10 + vsub.s16 q1, q8, q10 + + vsub.s16 q2, q9, q11 + vadd.s16 q3, q9, q11 + + vtrn.16 q0, q1 + vtrn.16 q2, q3 + + vadd.s16 q4, q0, q1 + vabd.s16 q5, q0, q1 + + vabs.s16 q4, q4 + vadd.s16 q6, q2, q3 + + vabd.s16 q7, q2, q3 + vabs.s16 q6, q6 + + vtrn.32 q4, q5 + vtrn.32 q6, q7 + + vmax.s16 q0, q4, q5 + vmax.s16 q1, q6, q7 +.endm + +.macro SAD_16x4 + vld1.64 {q6}, [r0, :128], r1 + vabal.u8 q10, d8, d10 + + vld1.64 {q7}, [r2], r3 + vabal.u8 q11, d9, d11 + + vld1.64 {q0}, [r0, :128], r1 + vabal.u8 q12, d12, d14 + + vld1.64 {q1}, [r2], r3 + vabal.u8 q13, d13, d15 + + vld1.64 {q2}, [r0, :128], r1 + vabal.u8 q10, d0, d2 + + vld1.64 {q3}, [r2], r3 + vabal.u8 q11, d1, d3 + + vld1.64 {q4}, [r0, :128], r1 + vabal.u8 q12, d4, d6 + + vld1.64 {q5}, [r2], r3 + vabal.u8 q13, d5, d7 +.endm + +.macro SAD_8x4 + vld1.64 {d0}, [r0, :64], r1 + vld1.64 {d1}, [r2], r3 + 
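+    // accumulate |pix1 - pix2| for four 8-byte row pairs into q10-q13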
+ vabal.u8 q10, d0, d1 + vld1.64 {d2}, [r0, :64], r1 + + vld1.64 {d3}, [r2], r3 + vabal.u8 q11, d2, d3 + + vld1.64 {d4}, [r0, :64], r1 + vld1.64 {d5}, [r2], r3 + + vabal.u8 q12, d4, d5 + vld1.64 {d6}, [r0, :64], r1 + + vld1.64 {d7}, [r2], r3 + vabal.u8 q13, d6, d7 +.endm + + +WELS_ASM_FUNC_BEGIN pixel_sad_16x16_neon + + vld1.64 {q0}, [r0, :128], r1 + vld1.64 {q1}, [r2], r3 + + vabdl.u8 q10, d0, d2 + vld1.64 {q2}, [r0, :128], r1 + + vabdl.u8 q11, d1, d3 + vld1.64 {q3}, [r2], r3 + + vld1.64 {q4}, [r0, :128], r1 + vabdl.u8 q12, d4, d6 + vld1.64 {q5}, [r2], r3 + vabdl.u8 q13, d5, d7 + + SAD_16x4 + SAD_16x4 + SAD_16x4 + + vld1.64 {q6}, [r0, :128], r1 + vabal.u8 q10, d8, d10 + + vld1.64 {q7}, [r2], r3 + vabal.u8 q11, d9, d11 + + vabal.u8 q12, d12, d14 + vabal.u8 q13, d13, d15 + + vadd.u16 q14, q10, q11 + vadd.u16 q15, q12, q13 + + vadd.u16 q15, q14, q15 + vadd.u16 d0, d30, d31 + vpaddl.u16 d0, d0 + vpaddl.u32 d0, d0 + vmov.u32 r0, d0[0] +WELS_ASM_FUNC_END + + +WELS_ASM_FUNC_BEGIN pixel_sad_16x8_neon + + vld1.64 {q0}, [r0, :128], r1 + vld1.64 {q1}, [r2], r3 + + vabdl.u8 q10, d0, d2 + vld1.64 {q2}, [r0, :128], r1 + + vabdl.u8 q11, d1, d3 + vld1.64 {q3}, [r2], r3 + + vld1.64 {q4}, [r0, :128], r1 + vabdl.u8 q12, d4, d6 + vld1.64 {q5}, [r2], r3 + vabdl.u8 q13, d5, d7 + + SAD_16x4 + + vld1.64 {q6}, [r0, :128], r1 + vabal.u8 q10, d8, d10 + + vld1.64 {q7}, [r2], r3 + vabal.u8 q11, d9, d11 + + vabal.u8 q12, d12, d14 + vabal.u8 q13, d13, d15 + + vadd.u16 q14, q10, q11 + vadd.u16 q15, q12, q13 + + vadd.u16 q15, q14, q15 + vadd.u16 d0, d30, d31 + vpaddl.u16 d0, d0 + vpaddl.u32 d0, d0 + vmov.u32 r0, d0[0] +WELS_ASM_FUNC_END + + +WELS_ASM_FUNC_BEGIN pixel_sad_8x16_neon + + vld1.64 {d0}, [r0, :64], r1 + vld1.64 {d1}, [r2], r3 + + vabdl.u8 q10, d0, d1 + vld1.64 {d2}, [r0, :64], r1 + + vld1.64 {d3}, [r2], r3 + vabdl.u8 q11, d2, d3 + + vld1.64 {d4}, [r0, :64], r1 + vld1.64 {d5}, [r2], r3 + + vabdl.u8 q12, d4, d5 + vld1.64 {d6}, [r0, :64], r1 + + vld1.64 {d7}, [r2], r3 + vabdl.u8 q13, d6, d7 + + SAD_8x4 + SAD_8x4 + SAD_8x4 + + vadd.u16 q14, q10, q11 + vadd.u16 q15, q12, q13 + vadd.u16 q15, q15, q14 + vadd.u16 d0, d30, d31 + vpaddl.u16 d0, d0 + vpaddl.u32 d0, d0 + vmov.u32 r0, d0[0] +WELS_ASM_FUNC_END + + +WELS_ASM_FUNC_BEGIN pixel_sad_8x8_neon + + vld1.64 {d0}, [r0, :64], r1 + vld1.64 {d1}, [r2], r3 + + vabdl.u8 q10, d0, d1 + vld1.64 {d2}, [r0, :64], r1 + + vld1.64 {d3}, [r2], r3 + vabdl.u8 q11, d2, d3 + + vld1.64 {d4}, [r0, :64], r1 + vld1.64 {d5}, [r2], r3 + + vabdl.u8 q12, d4, d5 + vld1.64 {d6}, [r0, :64], r1 + + vld1.64 {d7}, [r2], r3 + vabdl.u8 q13, d6, d7 + + SAD_8x4 + + vadd.u16 q14, q10, q11 + vadd.u16 q15, q12, q13 + vadd.u16 q15, q15, q14 + vadd.u16 d0, d30, d31 + vpaddl.u16 d0, d0 + vpaddl.u32 d0, d0 + vmov.u32 r0, d0[0] +WELS_ASM_FUNC_END + + +WELS_ASM_FUNC_BEGIN pixel_sad_4x4_neon + stmdb sp!, {r4-r5, lr} + + //Loading a horizontal line data (4 bytes) + //line 0 + ldr r4, [r0], r1 + ldr r5, [r2], r3 + usad8 lr, r4, r5 + + //line 1 + ldr r4, [r0], r1 + ldr r5, [r2], r3 + usada8 lr, r4, r5, lr + + //line 2 + ldr r4, [r0], r1 + ldr r5, [r2], r3 + usada8 lr, r4, r5, lr + + //line 3 + ldr r4, [r0] + ldr r5, [r2] + usada8 r0, r4, r5, lr + + ldmia sp!, {r4-r5, lr} +WELS_ASM_FUNC_END + + +WELS_ASM_FUNC_BEGIN pixel_sad_4_16x16_neon + + stmdb sp!, {r4-r5, lr} + + //Generate the pix2 start addr + sub r4, r2, #1 + add r5, r2, #1 + sub r2, r3 + + //Loading a horizontal line data (16 bytes) + vld1.8 {q0}, [r0], r1 //save pix1 + + vld1.8 {q1}, [r2], r3 //save pix2 - stride + vld1.8 {q6}, [r2], r3 //save pix2 + 
vld1.8 {q2}, [r2], r3 //save pix2 + stride + + vld1.8 {q3}, [r4], r3 //save pix2 - 1 + vld1.8 {q4}, [r5], r3 //save pix2 + 1 + + //Do the SAD for 16 bytes + vabdl.u8 q15, d0, d2 + vabal.u8 q15, d1, d3 + + vabdl.u8 q13, d0, d4 + vabal.u8 q13, d1, d5 + + vabdl.u8 q11, d0, d6 + vabal.u8 q11, d1, d7 + + vabdl.u8 q9, d0, d8 + vabal.u8 q9, d1, d9 + + mov lr, #15 +pixel_sad_4_16x16_loop_0: + + //Loading a horizontal line data (16 bytes) + vld1.8 {q0}, [r0], r1 //save pix1 + vmov.8 q1, q6 //save pix2 - stride + vmov.8 q6, q2 + vabal.u8 q15, d0, d2 + vld1.8 {q2}, [r2], r3 //save pix2 + stride + vabal.u8 q15, d1, d3 + vld1.8 {q3}, [r4], r3 //save pix2 - 1 + vabal.u8 q13, d0, d4 + vld1.8 {q4}, [r5], r3 //save pix2 + 1 + vabal.u8 q13, d1, d5 + subs lr, #1 + + vabal.u8 q11, d0, d6 + vabal.u8 q11, d1, d7 + + vabal.u8 q9, d0, d8 + vabal.u8 q9, d1, d9 + + bne pixel_sad_4_16x16_loop_0 + + + //Save SAD to 'r0' + ldr r0, [sp, #12] + + vadd.u16 d0, d30, d31 + vadd.u16 d1, d26, d27 + vadd.u16 d2, d22, d23 + vadd.u16 d3, d18, d19 + + vpaddl.u16 q0, q0 + vpaddl.u16 q1, q1 + + vpaddl.u32 q0, q0 + vpaddl.u32 q1, q1 + + vshl.u32 q0, #4 + vshl.u32 q1, #4 + vst4.32 {d0[0],d1[0],d2[0],d3[0]}, [r0] + + ldmia sp!, {r4-r5, lr} +WELS_ASM_FUNC_END + + +WELS_ASM_FUNC_BEGIN pixel_sad_4_16x8_neon + stmdb sp!, {r4-r5, lr} + + //Generate the pix2 start addr + sub r4, r2, #1 + add r5, r2, #1 + sub r2, r3 + + //Loading a horizontal line data (16 bytes) + vld1.8 {q0}, [r0], r1 //save pix1 + + vld1.8 {q1}, [r2], r3 //save pix2 - stride + vld1.8 {q6}, [r2], r3 //save pix2 + vld1.8 {q2}, [r2], r3 //save pix2 + stride + + vld1.8 {q3}, [r4], r3 //save pix2 - 1 + vld1.8 {q4}, [r5], r3 //save pix2 + 1 + + //Do the SAD for 16 bytes + vabdl.u8 q15, d0, d2 + vabal.u8 q15, d1, d3 + + vabdl.u8 q13, d0, d4 + vabal.u8 q13, d1, d5 + + vabdl.u8 q11, d0, d6 + vabal.u8 q11, d1, d7 + + vabdl.u8 q9, d0, d8 + vabal.u8 q9, d1, d9 + + mov lr, #7 +pixel_sad_4_16x8_loop_0: + + //Loading a horizontal line data (16 bytes) + vld1.8 {q0}, [r0], r1 //save pix1 + vmov.8 q1, q6 //save pix2 - stride + vmov.8 q6, q2 + vabal.u8 q15, d0, d2 + vld1.8 {q2}, [r2], r3 //save pix2 + stride + vabal.u8 q15, d1, d3 + vld1.8 {q3}, [r4], r3 //save pix2 - 1 + vabal.u8 q13, d0, d4 + vld1.8 {q4}, [r5], r3 //save pix2 + 1 + vabal.u8 q13, d1, d5 + subs lr, #1 + + vabal.u8 q11, d0, d6 + vabal.u8 q11, d1, d7 + + vabal.u8 q9, d0, d8 + vabal.u8 q9, d1, d9 + + bne pixel_sad_4_16x8_loop_0 + + //Save SAD to 'r0' + ldr r0, [sp, #12] + + vadd.u16 d0, d30, d31 + vadd.u16 d1, d26, d27 + vadd.u16 d2, d22, d23 + vadd.u16 d3, d18, d19 + + vpaddl.u16 q0, q0 + vpaddl.u16 q1, q1 + + vpaddl.u32 q0, q0 + vpaddl.u32 q1, q1 + + vshl.u32 q0, #4 + vshl.u32 q1, #4 + vst4.32 {d0[0],d1[0],d2[0],d3[0]}, [r0] + + ldmia sp!, {r4-r5, lr} +WELS_ASM_FUNC_END + + +WELS_ASM_FUNC_BEGIN pixel_sad_4_8x16_neon + stmdb sp!, {r4-r5, lr} + + //Generate the pix2 start addr + sub r4, r2, #1 + add r5, r2, #1 + sub r2, r3 + + //Loading a horizontal line data (8 bytes) + vld1.8 {d0}, [r0], r1 //save pix1 + + vld1.8 {d1}, [r2], r3 //save pix2 - stride + vld1.8 {d6}, [r2], r3 //save pix2 + vld1.8 {d2}, [r2], r3 //save pix2 + stride + + vld1.8 {d3}, [r4], r3 //save pix2 - 1 + vld1.8 {d4}, [r5], r3 //save pix2 + 1 + + //Do the SAD for 8 bytes + vabdl.u8 q15, d0, d1 + vabdl.u8 q14, d0, d2 + vabdl.u8 q13, d0, d3 + vabdl.u8 q12, d0, d4 + + mov lr, #15 +pixel_sad_4_8x16_loop_0: + + //Loading a horizontal line data (8 bytes) + vld1.8 {d0}, [r0], r1 //save pix1 + vmov.8 d1, d6 //save pix2 - stride + vmov.8 d6, d2 + vld1.8 {d2}, [r2], 
r3 //save pix2 + stride + vld1.8 {d3}, [r4], r3 //save pix2 - 1 + vabal.u8 q15, d0, d1 + + vld1.8 {d4}, [r5], r3 //save pix2 + 1 + //Do the SAD for 8 bytes + vabal.u8 q14, d0, d2 + vabal.u8 q13, d0, d3 + vabal.u8 q12, d0, d4 + subs lr, #1 + + bne pixel_sad_4_8x16_loop_0 + + //Save SAD to 'r0' + ldr r0, [sp, #12] + + vadd.u16 d0, d30, d31 + vadd.u16 d1, d28, d29 + vadd.u16 d2, d26, d27 + vadd.u16 d3, d24, d25 + + vpaddl.u16 q0, q0 + vpaddl.u16 q1, q1 + + vpaddl.u32 q0, q0 + vpaddl.u32 q1, q1 + + vshl.u32 q0, #4 + vshl.u32 q1, #4 + vst4.32 {d0[0],d1[0],d2[0],d3[0]}, [r0] + + ldmia sp!, {r4-r5, lr} +WELS_ASM_FUNC_END + + +WELS_ASM_FUNC_BEGIN pixel_sad_4_8x8_neon + stmdb sp!, {r4-r5, lr} + + //Generate the pix2 start addr + sub r4, r2, #1 + add r5, r2, #1 + sub r2, r3 + + //Loading a horizontal line data (8 bytes) + vld1.8 {d0}, [r0], r1 //save pix1 + + vld1.8 {d1}, [r2], r3 //save pix2 - stride + vld1.8 {d6}, [r2], r3 //save pix2 + vld1.8 {d2}, [r2], r3 //save pix2 + stride + + vld1.8 {d3}, [r4], r3 //save pix2 - 1 + vld1.8 {d4}, [r5], r3 //save pix2 + 1 + + //Do the SAD for 8 bytes + vabdl.u8 q15, d0, d1 + vabdl.u8 q14, d0, d2 + vabdl.u8 q13, d0, d3 + vabdl.u8 q12, d0, d4 + + mov lr, #7 +pixel_sad_4_8x8_loop_0: + + //Loading a horizontal line data (8 bytes) + vld1.8 {d0}, [r0], r1 //save pix1 + vmov.8 d1, d6 //save pix2 - stride + vmov.8 d6, d2 + vld1.8 {d2}, [r2], r3 //save pix2 + stride + vld1.8 {d3}, [r4], r3 //save pix2 - 1 + vabal.u8 q15, d0, d1 + + vld1.8 {d4}, [r5], r3 //save pix2 + 1 + //Do the SAD for 8 bytes + vabal.u8 q14, d0, d2 + vabal.u8 q13, d0, d3 + vabal.u8 q12, d0, d4 + subs lr, #1 + bne pixel_sad_4_8x8_loop_0 + + //Save SAD to 'r0' + ldr r0, [sp, #12] + + vadd.u16 d0, d30, d31 + vadd.u16 d1, d28, d29 + vadd.u16 d2, d26, d27 + vadd.u16 d3, d24, d25 + + vpaddl.u16 q0, q0 + vpaddl.u16 q1, q1 + + vpaddl.u32 q0, q0 + vpaddl.u32 q1, q1 + + vshl.u32 q0, #4 + vshl.u32 q1, #4 + vst4.32 {d0[0],d1[0],d2[0],d3[0]}, [r0] + + ldmia sp!, {r4-r5, lr} +WELS_ASM_FUNC_END + + +WELS_ASM_FUNC_BEGIN pixel_sad_4_4x4_neon + + vld1.32 {d0[0]}, [r0], r1 + vld1.32 {d0[1]}, [r0], r1 + vld1.32 {d1[0]}, [r0], r1 + vld1.32 {d1[1]}, [r0] + + + sub r0, r2, r3 + vld1.32 {d2[0]}, [r0], r3 + vld1.32 {d2[1]}, [r0], r3 + vld1.32 {d3[0]}, [r0], r3 + vld1.32 {d3[1]}, [r0], r3 + vld1.32 {d4[0]}, [r0], r3 + vld1.32 {d4[1]}, [r0] + + sub r0, r2, #1 + vld1.32 {d5[0]}, [r0], r3 + vld1.32 {d5[1]}, [r0], r3 + vld1.32 {d6[0]}, [r0], r3 + vld1.32 {d6[1]}, [r0] + + add r0, r2, #1 + vld1.32 {d7[0]}, [r0], r3 + vld1.32 {d7[1]}, [r0], r3 + vld1.32 {d8[0]}, [r0], r3 + vld1.32 {d8[1]}, [r0] + + vabdl.u8 q15, d0, d2 + vabdl.u8 q14, d1, d3 + + vabdl.u8 q13, d0, d3 + vabdl.u8 q12, d1, d4 + + vabdl.u8 q11, d0, d5 + vabdl.u8 q10, d1, d6 + + vabdl.u8 q9, d0, d7 + vabdl.u8 q8, d1, d8 + + //Save SAD to 'r4' + ldr r0, [sp] + vadd.u16 q0, q14, q15 + vadd.u16 q1, q12, q13 + vadd.u16 q2, q10, q11 + vadd.u16 q3, q8 , q9 + + vadd.u16 d0, d1 + vadd.u16 d1, d2, d3 + vadd.u16 d2, d4, d5 + vadd.u16 d3, d6, d7 + + vpaddl.u16 q0, q0 + vpaddl.u16 q1, q1 + + vpaddl.u32 q0, q0 + vpaddl.u32 q1, q1 + + vshl.u32 q0, #4 + vshl.u32 q1, #4 + vst4.32 {d0[0],d1[0],d2[0],d3[0]}, [r0] + +WELS_ASM_FUNC_END + + +WELS_ASM_FUNC_BEGIN pixel_satd_16x16_neon + + SATD_16x4 + vadd.u16 q15, q0, q2 + + SATD_16x4 + vadd.u16 q15, q15, q0 + vadd.u16 q15, q15, q2 + + SATD_16x4 + vadd.u16 q15, q15, q0 + vadd.u16 q15, q15, q2 + + SATD_16x4 + vadd.u16 q15, q15, q0 + vadd.u16 q15, q15, q2 + + vadd.u16 d0, d30, d31 + vpaddl.u16 d0, d0 + vpaddl.u32 d0, d0 + + vmov.32 r0, d0[0] 
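+    // r0 now holds the summed 16x16 SATD result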
+WELS_ASM_FUNC_END + + +WELS_ASM_FUNC_BEGIN pixel_satd_16x8_neon + + SATD_16x4 + vadd.u16 q15, q0, q2 + + SATD_16x4 + vadd.u16 q15, q15, q0 + vadd.u16 q15, q15, q2 + + vadd.u16 d0, d30, d31 + vpaddl.u16 d0, d0 + vpaddl.u32 d0, d0 + + vmov.32 r0, d0[0] +WELS_ASM_FUNC_END + + +WELS_ASM_FUNC_BEGIN pixel_satd_8x16_neon + + SATD_8x4 + vadd.u16 q15, q0, q1 + + SATD_8x4 + vadd.u16 q15, q15, q0 + vadd.u16 q15, q15, q1 + + SATD_8x4 + vadd.u16 q15, q15, q0 + vadd.u16 q15, q15, q1 + + SATD_8x4 + vadd.u16 q15, q15, q0 + vadd.u16 q15, q15, q1 + + vadd.u16 d0, d30, d31 + vpaddl.u16 d0, d0 + vpaddl.u32 d0, d0 + + vmov.32 r0, d0[0] +WELS_ASM_FUNC_END + + +WELS_ASM_FUNC_BEGIN pixel_satd_8x8_neon + + SATD_8x4 + vadd.u16 q15, q0, q1 + + SATD_8x4 + vadd.u16 q15, q15, q0 + vadd.u16 q15, q15, q1 + + vadd.u16 d0, d30, d31 + vpaddl.u16 d0, d0 + vpaddl.u32 d0, d0 + + vmov.32 r0, d0[0] +WELS_ASM_FUNC_END + + +WELS_ASM_FUNC_BEGIN pixel_satd_4x4_neon + + //Load the pix1 data --- 16 bytes + vld1.32 {d0[0]}, [r0], r1 + vld1.32 {d0[1]}, [r0], r1 + vld1.32 {d1[0]}, [r0], r1 + vld1.32 {d1[1]}, [r0] + + //Load the pix2 data --- 16 bytes + vld1.32 {d2[0]}, [r2], r3 + vld1.32 {d2[1]}, [r2], r3 + vld1.32 {d3[0]}, [r2], r3 + vld1.32 {d3[1]}, [r2] + + //Get the difference + vsubl.u8 q15, d0, d2 //{0,1,2,3,4,5,6,7} + vsubl.u8 q14, d1, d3 //{8,9,10,11,12,13,14,15} + + //Do the vertical transform + vadd.s16 q13, q15, q14 //{0,4,8,12,1,5,9,13} + vsub.s16 q12, q15, q14 //{2,6,10,14,3,7,11,15} + vswp d27, d24 + vadd.s16 q15, q13, q12 //{0,1,2,3,4,5,6,7} + vsub.s16 q14, q13, q12 //{12,13,14,15,8,9,10,11} + + //Do the horizontal transform + vtrn.32 q15, q14 + vadd.s16 q13, q15, q14 + vsub.s16 q12, q15, q14 + + vtrn.16 q13, q12 + vadd.s16 q15, q13, q12 + + //Do the SAD + vabs.s16 q15, q15 + vabd.s16 q14, q13, q12 + + vadd.u16 q0, q15, q14 + + vrhadd.u16 d0, d1 + vpaddl.u16 d0, d0 + vpaddl.u32 d0, d0 + + vmov.u32 r0, d0[0] + +WELS_ASM_FUNC_END + +#endif + + diff --git a/codec/encoder/core/arm/reconstruct_neon.S b/codec/encoder/core/arm/reconstruct_neon.S new file mode 100755 index 00000000..3a5964ae --- /dev/null +++ b/codec/encoder/core/arm/reconstruct_neon.S @@ -0,0 +1,1312 @@ +/*! + * \copy + * Copyright (c) 2013, Cisco Systems + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE + * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + */ + +#ifdef HAVE_NEON +.text +#include "arm_arch_common_macro.S" + +#ifdef APPLE_IOS +.macro LORD_ALIGNED_DATA_WITH_STRIDE +// { // input: $0~$3, src*, src_stride + vld1.64 {$0}, [$4,:128], $5 + vld1.64 {$1}, [$4,:128], $5 + vld1.64 {$2}, [$4,:128], $5 + vld1.64 {$3}, [$4,:128], $5 +// } +.endm + +.macro STORE_ALIGNED_DATA_WITH_STRIDE +// { // input: $0~$3, dst*, dst_stride + vst1.64 {$0}, [$4,:128], $5 + vst1.64 {$1}, [$4,:128], $5 + vst1.64 {$2}, [$4,:128], $5 + vst1.64 {$3}, [$4,:128], $5 +// } +.endm + +.macro LORD_UNALIGNED_DATA_WITH_STRIDE +// { // input: $0~$3, src*, src_stride + vld1.64 {$0}, [$4], $5 + vld1.64 {$1}, [$4], $5 + vld1.64 {$2}, [$4], $5 + vld1.64 {$3}, [$4], $5 +// } +.endm + +.macro STORE_UNALIGNED_DATA_WITH_STRIDE +// { // input: $0~$3, dst*, dst_stride + vst1.64 {$0}, [$4], $5 + vst1.64 {$1}, [$4], $5 + vst1.64 {$2}, [$4], $5 + vst1.64 {$3}, [$4], $5 +// } +.endm + +.macro LOAD_4x4_DATA_FOR_DCT +// { // input: $0~$3, src1*, src1_stride, src2*, src2_stride + vld2.16 {$0[0],$1[0]}, [$4], $5 + vld2.16 {$2[0],$3[0]}, [$6], $7 + vld2.16 {$0[1],$1[1]}, [$4], $5 + vld2.16 {$2[1],$3[1]}, [$6], $7 + + vld2.16 {$0[2],$1[2]}, [$4], $5 + vld2.16 {$2[2],$3[2]}, [$6], $7 + vld2.16 {$0[3],$1[3]}, [$4], $5 + vld2.16 {$2[3],$3[3]}, [$6], $7 +// } +.endm + +.macro LOAD_8x8_DATA_FOR_DCT +// { // input: $0~$3, src1*, src2*; untouched r2:src1_stride &r4:src2_stride + vld1.64 {$0}, [$8], r2 + vld1.64 {$4}, [$9], r4 + vld1.64 {$1}, [$8], r2 + vld1.64 {$5}, [$9], r4 + + vld1.64 {$2}, [$8], r2 + vld1.64 {$6}, [$9], r4 + vld1.64 {$3}, [$8], r2 + vld1.64 {$7}, [$9], r4 +// } +.endm + +.macro DCT_ROW_TRANSFORM_TOTAL_16BITS +// { // input: src_d[0]~[3], working: [4]~[7] + vadd.s16 $4, $0, $3 //int16 s[0] = data[i] + data[i3]; + vsub.s16 $7, $0, $3 //int16 s[3] = data[i] - data[i3]; + vadd.s16 $5, $1, $2 //int16 s[1] = data[i1] + data[i2]; + vsub.s16 $6, $1, $2 //int16 s[2] = data[i1] - data[i2]; + + vadd.s16 $0, $4, $5 //int16 dct[i ] = s[0] + s[1]; + vsub.s16 $2, $4, $5 //int16 dct[i2] = s[0] - s[1]; + vshl.s16 $1, $7, #1 + vshl.s16 $3, $6, #1 + vadd.s16 $1, $1, $6 //int16 dct[i1] = (s[3] << 1) + s[2]; + vsub.s16 $3, $7, $3 //int16 dct[i3] = s[3] - (s[2] << 1); +// } +.endm + +.macro MATRIX_TRANSFORM_EACH_16BITS +// { // input & output: src_d[0]~[3];[0 1 2 3]+[4 5 6 7]+[8 9 10 11]+[12 13 14 15] + vtrn.s16 $0, $1 //[0 1 2 3]+[4 5 6 7]-->[0 4 2 6]+[1 5 3 7] + vtrn.s16 $2, $3 //[8 9 10 11]+[12 13 14 15]-->[8 12 10 14]+[9 13 11 15] + vtrn.32 $0, $2 //[0 4 2 6]+[8 12 10 14]-->[0 4 8 12]+[2 6 10 14] + vtrn.32 $1, $3 //[1 5 3 7]+[9 13 11 15]-->[1 5 9 13]+[3 7 11 15] +// } +.endm + +.macro NEWQUANT_COEF_EACH_16BITS // if coef <= 0, - coef; else , coef; +// { // input: coef, ff (dst), ff_d0, ff_d1, mf_d0, md_d1 + veor.s16 $6, $6 // init 0 , and keep 0; + vaba.s16 $1, $0, $6 // f + abs(coef - 0) + vmull.s16 $7, $2, $4 + vmull.s16 $8, $3, $5 + vshr.s32 $7, #16 + vshr.s32 $8, #16 + vmovn.s32 $2, $7 + vmovn.s32 $3, $8 + + vcgt.s16 $7, $0, #0 // if true, location 
of coef == 11111111 + vbif.s16 $6, $1, $7 // if (x<0) reserved part; else keep 0 untouched + vshl.s16 $6, #1 + vsub.s16 $1, $1, $6 // if x > 0, -= 0; else x-= 2x +// } +.endm + +.macro NEWQUANT_COEF_EACH_16BITS_MAX // if coef <= 0, - coef; else , coef; +// { // input: coef, ff (dst), ff_d0, ff_d1, mf_d0(max), md_d1 + veor.s16 $6, $6 // init 0 , and keep 0; + vaba.s16 $1, $0, $6 // f + abs(coef - 0) + vmull.s16 $7, $2, $4 + vmull.s16 $8, $3, $5 + vshr.s32 $7, #16 + vshr.s32 $8, #16 + vmovn.s32 $2, $7 + vmovn.s32 $3, $8 + + vcgt.s16 $7, $0, #0 // if true, location of coef == 11111111 + vbif.s16 $6, $1, $7 // if (x<0) reserved part; else keep 0 untouched + vshl.s16 $6, #1 + vmax.s16 $9, $2, $3 + vsub.s16 $1, $1, $6 // if x > 0, -= 0; else x-= 2x +// } +.endm + +.macro QUANT_DUALWORD_COEF_EACH_16BITS // if coef <= 0, - coef; else , coef; +// { // input: coef, ff (dst), mf , working_d (all 0), working_q + vaba.s16 $1, $0, $3 // f + abs(coef - 0) + vmull.s16 $4, $1, $2 // *= mf + vshr.s32 $4, #16 + vmovn.s32 $1, $4 // >> 16 + + vcgt.s16 $2, $0, #0 // if true, location of coef == 11111111 + vbif.s16 $3, $1, $2 // if (x<0) reserved part; else keep 0 untouched + vshl.s16 $3, #1 + vsub.s16 $1, $1, $3 // if x > 0, -= 0; else x-= 2x +// } +.endm + +.macro DC_ZERO_COUNT_IN_DUALWORD +// { // input: coef, dst_d, working_d (all 0x01) + vceq.s16 $1, $0, #0 + vand.s16 $1, $2 + vpadd.s16 $1, $1, $1 + vpadd.s16 $1, $1, $1 +// } +.endm + +.macro SELECT_MAX_IN_ABS_COEF +// { // input: coef_0, coef_1, max_q (identy to follow two) + vmax.s16 $2, $0, $1 // max 1st in $3 & max 2nd in $4 + vpmax.s16 $3, $3, $4 // max 1st in $3[0][1] & max 2nd in $3[2][3] + vpmax.s16 $3, $3, $4 // max 1st in $3[0][1] +// } +.endm + +.macro ZERO_COUNT_IN_2_QUARWORD +// { // input: coef_0 (identy to $3 $4), coef_1(identy to $5 $6), mask_q + vceq.s16 $0, #0 + vceq.s16 $1, #0 + vand.s16 $0, $2 + vand.s16 $1, $2 + + vpadd.s16 $3, $3, $5 + vpadd.s16 $4, $4, $6 + vpadd.s16 $3, $3, $4 // 8-->4 + vpadd.s16 $3, $3, $3 + vpadd.s16 $3, $3, $3 +// } +.endm + +.macro HDM_QUANT_2x2_TOTAL_16BITS +// { // input: src_d[0]~[3], working_d, dst_d + vshr.s64 $1, $0, #32 + vadd.s16 $2, $0, $1 // [0] = rs[0] + rs[32];[1] = rs[16] + rs[48]; + vsub.s16 $1, $0, $1 // [0] = rs[0] - rs[32];[1] = rs[16] - rs[48]; + vtrn.s16 $2, $1 + vtrn.s32 $2, $1 +// } +.endm + +.macro IHDM_4x4_TOTAL_16BITS +// { // input: each src_d[0]~[3](dst), working_q0, working_q1, working_q2 + vshr.s64 $1, $0, #32 + vadd.s16 $2, $0, $1 // [0] = rs[0] + rs[2];[1] = rs[1] + rs[3]; + vsub.s16 $1, $0, $1 // [0] = rs[0] - rs[2];[1] = rs[1] - rs[3]; + vtrn.s16 $2, $1 + vrev32.16 $1, $1 + vtrn.s32 $2, $1 // [0] = rs[0] + rs[2];[1] = rs[0] - rs[2];[2] = rs[1] - rs[3];[3] = rs[1] + rs[3]; + + vrev64.16 $1, $2 + vadd.s16 $0, $2, $1 // [0] = rs[0] + rs[3];[1] = rs[1] + rs[2]; + vsub.s16 $1, $2, $1 + vrev32.16 $1, $1 // [0] = rs[1] - rs[2];[1] = rs[0] - rs[3]; + vtrn.s32 $0, $1 // [0] = rs[0] + rs[3];[1] = rs[1] + rs[2];[2] = rs[1] - rs[2];[3] = rs[0] - rs[3]; +// } +.endm + +.macro MB_PRED_8BITS_ADD_DCT_16BITS_CLIP +// { // input: pred_d[0]/[1](output), dct_q0/1, working_q0/1; + vmovl.u8 $4,$0 + vmovl.u8 $5,$1 + vadd.s16 $4,$2 + vadd.s16 $5,$3 + vqmovun.s16 $0,$4 + vqmovun.s16 $1,$5 +// } +.endm + +.macro ROW_TRANSFORM_1_STEP_TOTAL_16BITS +// { // input: src_d[0]~[3], output: e_d[0]~[3]; + vadd.s16 $4, $0, $2 //int16 e[i][0] = src[0] + src[2]; + vsub.s16 $5, $0, $2 //int16 e[i][1] = src[0] - src[2]; + vshr.s16 $6, $1, #1 + vshr.s16 $7, $3, #1 + vsub.s16 $6, $6, $3 //int16 e[i][2] = 
(src[1]>>1)-src[3]; + vadd.s16 $7, $1, $7 //int16 e[i][3] = src[1] + (src[3]>>1); +// } +.endm + +.macro TRANSFORM_TOTAL_16BITS // both row & col transform used +// { // output: f_q[0]~[3], input: e_q[0]~[3]; + vadd.s16 $0, $4, $7 //int16 f[i][0] = e[i][0] + e[i][3]; + vadd.s16 $1, $5, $6 //int16 f[i][1] = e[i][1] + e[i][2]; + vsub.s16 $2, $5, $6 //int16 f[i][2] = e[i][1] - e[i][2]; + vsub.s16 $3, $4, $7 //int16 f[i][3] = e[i][0] - e[i][3]; +// } +.endm + + +.macro ROW_TRANSFORM_0_STEP +// { // input: src_d[0]~[3], output: e_q[0]~[3]; + vaddl.s16 $4, $0, $2 //int32 e[i][0] = src[0] + src[2]; + vsubl.s16 $5, $0, $2 //int32 e[i][1] = src[0] - src[2]; + vsubl.s16 $6, $1, $3 //int32 e[i][2] = src[1] - src[3]; + vaddl.s16 $7, $1, $3 //int32 e[i][3] = src[1] + src[3]; +// } +.endm + +.macro ROW_TRANSFORM_1_STEP +// { // input: src_d[0]~[3], output: e_q[0]~[3]; working: $8 $9 + vaddl.s16 $4, $0, $2 //int32 e[i][0] = src[0] + src[2]; + vsubl.s16 $5, $0, $2 //int32 e[i][1] = src[0] - src[2]; + vshr.s16 $8, $1, #1 + vshr.s16 $9, $3, #1 + vsubl.s16 $6, $8, $3 //int32 e[i][2] = (src[1]>>1)-src[3]; + vaddl.s16 $7, $1, $9 //int32 e[i][3] = src[1] + (src[3]>>1); +// } +.endm + +.macro TRANSFORM_4BYTES // both row & col transform used +// { // output: f_q[0]~[3], input: e_q[0]~[3]; + vadd.s32 $0, $4, $7 //int16 f[i][0] = e[i][0] + e[i][3]; + vadd.s32 $1, $5, $6 //int16 f[i][1] = e[i][1] + e[i][2]; + vsub.s32 $2, $5, $6 //int16 f[i][2] = e[i][1] - e[i][2]; + vsub.s32 $3, $4, $7 //int16 f[i][3] = e[i][0] - e[i][3]; +// } +.endm + +.macro COL_TRANSFORM_0_STEP +// { // input: src_q[0]~[3], output: e_q[0]~[3]; + vadd.s32 $4, $0, $2 //int32 e[0][j] = f[0][j] + f[2][j]; + vsub.s32 $5, $0, $2 //int32 e[1][j] = f[0][j] - f[2][j]; + vsub.s32 $6, $1, $3 //int32 e[2][j] = (f[1][j]>>1) - f[3][j]; + vadd.s32 $7, $1, $3 //int32 e[3][j] = f[1][j] + (f[3][j]>>1); +// } +.endm + +.macro COL_TRANSFORM_1_STEP +// { // input: src_q[0]~[3], output: e_q[0]~[3]; + vadd.s32 $4, $0, $2 //int32 e[0][j] = f[0][j] + f[2][j]; + vsub.s32 $5, $0, $2 //int32 e[1][j] = f[0][j] - f[2][j]; + vshr.s32 $6, $1, #1 + vshr.s32 $7, $3, #1 + vsub.s32 $6, $6, $3 //int32 e[2][j] = (f[1][j]>>1) - f[3][j]; + vadd.s32 $7, $1, $7 //int32 e[3][j] = f[1][j] + (f[3][j]>>1); +// } +.endm +#else +.macro LORD_ALIGNED_DATA_WITH_STRIDE arg0, arg1, arg2, arg3, arg4, arg5 +// { // input: \arg0~\arg3, src*, src_stride + vld1.64 {\arg0}, [\arg4,:128], \arg5 + vld1.64 {\arg1}, [\arg4,:128], \arg5 + vld1.64 {\arg2}, [\arg4,:128], \arg5 + vld1.64 {\arg3}, [\arg4,:128], \arg5 +// } +.endm + +.macro STORE_ALIGNED_DATA_WITH_STRIDE arg0, arg1, arg2, arg3, arg4, arg5 +// { // input: \arg0~\arg3, dst*, dst_stride + vst1.64 {\arg0}, [\arg4,:128], \arg5 + vst1.64 {\arg1}, [\arg4,:128], \arg5 + vst1.64 {\arg2}, [\arg4,:128], \arg5 + vst1.64 {\arg3}, [\arg4,:128], \arg5 +// } +.endm + +.macro LORD_UNALIGNED_DATA_WITH_STRIDE arg0, arg1, arg2, arg3, arg4, arg5 +// { // input: \arg0~\arg3, src*, src_stride + vld1.64 {\arg0}, [\arg4], \arg5 + vld1.64 {\arg1}, [\arg4], \arg5 + vld1.64 {\arg2}, [\arg4], \arg5 + vld1.64 {\arg3}, [\arg4], \arg5 +// } +.endm + +.macro STORE_UNALIGNED_DATA_WITH_STRIDE arg0, arg1, arg2, arg3, arg4, arg5 +// { // input: \arg0~\arg3, dst*, dst_stride + vst1.64 {\arg0}, [\arg4], \arg5 + vst1.64 {\arg1}, [\arg4], \arg5 + vst1.64 {\arg2}, [\arg4], \arg5 + vst1.64 {\arg3}, [\arg4], \arg5 +// } +.endm + +.macro LOAD_4x4_DATA_FOR_DCT arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7 +// { // input: \arg0~\arg3, src1*, src1_stride, src2*, src2_stride + vld2.16 
{\arg0[0],\arg1[0]}, [\arg4], \arg5 + vld2.16 {\arg2[0],\arg3[0]}, [\arg6], \arg7 + vld2.16 {\arg0[1],\arg1[1]}, [\arg4], \arg5 + vld2.16 {\arg2[1],\arg3[1]}, [\arg6], \arg7 + + vld2.16 {\arg0[2],\arg1[2]}, [\arg4], \arg5 + vld2.16 {\arg2[2],\arg3[2]}, [\arg6], \arg7 + vld2.16 {\arg0[3],\arg1[3]}, [\arg4], \arg5 + vld2.16 {\arg2[3],\arg3[3]}, [\arg6], \arg7 +// } +.endm + +.macro LOAD_8x8_DATA_FOR_DCT arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9 +// { // input: \arg0~\arg3, src1*, src2*; untouched r2:src1_stride &r4:src2_stride + vld1.64 {\arg0}, [\arg8], r2 + vld1.64 {\arg4}, [\arg9], r4 + vld1.64 {\arg1}, [\arg8], r2 + vld1.64 {\arg5}, [\arg9], r4 + + vld1.64 {\arg2}, [\arg8], r2 + vld1.64 {\arg6}, [\arg9], r4 + vld1.64 {\arg3}, [\arg8], r2 + vld1.64 {\arg7}, [\arg9], r4 +// } +.endm + +.macro DCT_ROW_TRANSFORM_TOTAL_16BITS arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7 +// { // input: src_d[0]~[3], working: [4]~[7] + vadd.s16 \arg4, \arg0, \arg3 //int16 s[0] = data[i] + data[i3]; + vsub.s16 \arg7, \arg0, \arg3 //int16 s[3] = data[i] - data[i3]; + vadd.s16 \arg5, \arg1, \arg2 //int16 s[1] = data[i1] + data[i2]; + vsub.s16 \arg6, \arg1, \arg2 //int16 s[2] = data[i1] - data[i2]; + + vadd.s16 \arg0, \arg4, \arg5 //int16 dct[i ] = s[0] + s[1]; + vsub.s16 \arg2, \arg4, \arg5 //int16 dct[i2] = s[0] - s[1]; + vshl.s16 \arg1, \arg7, #1 + vshl.s16 \arg3, \arg6, #1 + vadd.s16 \arg1, \arg1, \arg6 //int16 dct[i1] = (s[3] << 1) + s[2]; + vsub.s16 \arg3, \arg7, \arg3 //int16 dct[i3] = s[3] - (s[2] << 1); +// } +.endm + +.macro MATRIX_TRANSFORM_EACH_16BITS arg0, arg1, arg2, arg3 +// { // input & output: src_d[0]~[3];[0 1 2 3]+[4 5 6 7]+[8 9 10 11]+[12 13 14 15] + vtrn.s16 \arg0, \arg1 //[0 1 2 3]+[4 5 6 7]-->[0 4 2 6]+[1 5 3 7] + vtrn.s16 \arg2, \arg3 //[8 9 10 11]+[12 13 14 15]-->[8 12 10 14]+[9 13 11 15] + vtrn.32 \arg0, \arg2 //[0 4 2 6]+[8 12 10 14]-->[0 4 8 12]+[2 6 10 14] + vtrn.32 \arg1, \arg3 //[1 5 3 7]+[9 13 11 15]-->[1 5 9 13]+[3 7 11 15] +// } +.endm + +.macro NEWQUANT_COEF_EACH_16BITS arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8 +// { // input: coef, ff (dst), ff_d0, ff_d1, mf_d0, md_d1 + veor.s16 \arg6, \arg6 // init 0 , and keep 0; + vaba.s16 \arg1, \arg0, \arg6 // f + abs(coef - 0) + vmull.s16 \arg7, \arg2, \arg4 + vmull.s16 \arg8, \arg3, \arg5 + vshr.s32 \arg7, #16 + vshr.s32 \arg8, #16 + vmovn.s32 \arg2, \arg7 + vmovn.s32 \arg3, \arg8 + + vcgt.s16 \arg7, \arg0, #0 // if true, location of coef == 11111111 + vbif.s16 \arg6, \arg1, \arg7 // if (x<0) reserved part; else keep 0 untouched + vshl.s16 \arg6, #1 + vsub.s16 \arg1, \arg1, \arg6 // if x > 0, -= 0; else x-= 2x +// } +.endm + +.macro NEWQUANT_COEF_EACH_16BITS_MAX arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9 +// { // input: coef, ff (dst), ff_d0, ff_d1, mf_d0(max), md_d1 + veor.s16 \arg6, \arg6 // init 0 , and keep 0; + vaba.s16 \arg1, \arg0, \arg6 // f + abs(coef - 0) + vmull.s16 \arg7, \arg2, \arg4 + vmull.s16 \arg8, \arg3, \arg5 + vshr.s32 \arg7, #16 + vshr.s32 \arg8, #16 + vmovn.s32 \arg2, \arg7 + vmovn.s32 \arg3, \arg8 + + vcgt.s16 \arg7, \arg0, #0 // if true, location of coef == 11111111 + vbif.s16 \arg6, \arg1, \arg7 // if (x<0) reserved part; else keep 0 untouched + vshl.s16 \arg6, #1 + vmax.s16 \arg9, \arg2, \arg3 + vsub.s16 \arg1, \arg1, \arg6 // if x > 0, -= 0; else x-= 2x +// } +.endm + +.macro QUANT_DUALWORD_COEF_EACH_16BITS arg0, arg1, arg2, arg3, arg4 +// { // input: coef, ff (dst), mf , working_d (all 0), working_q + vaba.s16 \arg1, \arg0, \arg3 // f + abs(coef - 0) + vmull.s16 
\arg4, \arg1, \arg2 // *= mf + vshr.s32 \arg4, #16 + vmovn.s32 \arg1, \arg4 // >> 16 + + vcgt.s16 \arg2, \arg0, #0 // if true, location of coef == 11111111 + vbif.s16 \arg3, \arg1, \arg2 // if (x<0) reserved part; else keep 0 untouched + vshl.s16 \arg3, #1 + vsub.s16 \arg1, \arg1, \arg3 // if x > 0, -= 0; else x-= 2x +// } +.endm + +.macro DC_ZERO_COUNT_IN_DUALWORD arg0, arg1, arg2 +// { // input: coef, dst_d, working_d (all 0x01) + vceq.s16 \arg1, \arg0, #0 + vand.s16 \arg1, \arg2 + vpadd.s16 \arg1, \arg1, \arg1 + vpadd.s16 \arg1, \arg1, \arg1 +// } +.endm + +.macro SELECT_MAX_IN_ABS_COEF arg0, arg1, arg2, arg3, arg4 +// { // input: coef_0, coef_1, max_q (identy to follow two), output: max_d0, max_d1 + vmax.s16 \arg2, \arg0, \arg1 // max 1st in \arg3 & max 2nd in \arg4 + vpmax.s16 \arg3, \arg3, \arg4 // max 1st in \arg3[0][1] & max 2nd in \arg3[2][3] + vpmax.s16 \arg3, \arg3, \arg4 // max 1st in \arg3[0][1] +// } +.endm + +.macro ZERO_COUNT_IN_2_QUARWORD arg0, arg1, arg2, arg3, arg4, arg5, arg6 +// { // input: coef_0 (identy to \arg3 \arg4), coef_1(identy to \arg5 \arg6), mask_q + vceq.s16 \arg0, #0 + vceq.s16 \arg1, #0 + vand.s16 \arg0, \arg2 + vand.s16 \arg1, \arg2 + + vpadd.s16 \arg3, \arg3, \arg5 + vpadd.s16 \arg4, \arg4, \arg6 + vpadd.s16 \arg3, \arg3, \arg4 // 8-->4 + vpadd.s16 \arg3, \arg3, \arg3 + vpadd.s16 \arg3, \arg3, \arg3 +// } +.endm + +.macro HDM_QUANT_2x2_TOTAL_16BITS arg0, arg1, arg2 +// { // input: src_d[0]~[3], working_d, dst_d + vshr.s64 \arg1, \arg0, #32 + vadd.s16 \arg2, \arg0, \arg1 // [0] = rs[0] + rs[32];[1] = rs[16] + rs[48]; + vsub.s16 \arg1, \arg0, \arg1 // [0] = rs[0] - rs[32];[1] = rs[16] - rs[48]; + vtrn.s16 \arg2, \arg1 + vtrn.s32 \arg2, \arg1 +// } +.endm + +.macro IHDM_4x4_TOTAL_16BITS arg0, arg1, arg2 +// { // input: each src_d[0]~[3](dst), working_q0, working_q1, working_q2 + vshr.s64 \arg1, \arg0, #32 + vadd.s16 \arg2, \arg0, \arg1 // [0] = rs[0] + rs[2];[1] = rs[1] + rs[3]; + vsub.s16 \arg1, \arg0, \arg1 // [0] = rs[0] - rs[2];[1] = rs[1] - rs[3]; + vtrn.s16 \arg2, \arg1 + vrev32.16 \arg1, \arg1 + vtrn.s32 \arg2, \arg1 // [0] = rs[0] + rs[2];[1] = rs[0] - rs[2];[2] = rs[1] - rs[3];[3] = rs[1] + rs[3]; + + vrev64.16 \arg1, \arg2 + vadd.s16 \arg0, \arg2, \arg1 // [0] = rs[0] + rs[3];[1] = rs[1] + rs[2]; + vsub.s16 \arg1, \arg2, \arg1 + vrev32.16 \arg1, \arg1 // [0] = rs[1] - rs[2];[1] = rs[0] - rs[3]; + vtrn.s32 \arg0, \arg1 // [0] = rs[0] + rs[3];[1] = rs[1] + rs[2];[2] = rs[1] - rs[2];[3] = rs[0] - rs[3]; +// } +.endm + +.macro MB_PRED_8BITS_ADD_DCT_16BITS_CLIP arg0, arg1, arg2, arg3, arg4, arg5 +// { // input: pred_d[0]/[1](output), dct_q0/1, working_q0/1; + vmovl.u8 \arg4,\arg0 + vmovl.u8 \arg5,\arg1 + vadd.s16 \arg4,\arg2 + vadd.s16 \arg5,\arg3 + vqmovun.s16 \arg0,\arg4 + vqmovun.s16 \arg1,\arg5 +// } +.endm + +.macro ROW_TRANSFORM_1_STEP_TOTAL_16BITS arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7 +// { // input: src_d[0]~[3], output: e_d[0]~[3]; + vadd.s16 \arg4, \arg0, \arg2 //int16 e[i][0] = src[0] + src[2]; + vsub.s16 \arg5, \arg0, \arg2 //int16 e[i][1] = src[0] - src[2]; + vshr.s16 \arg6, \arg1, #1 + vshr.s16 \arg7, \arg3, #1 + vsub.s16 \arg6, \arg6, \arg3 //int16 e[i][2] = (src[1]>>1)-src[3]; + vadd.s16 \arg7, \arg1, \arg7 //int16 e[i][3] = src[1] + (src[3]>>1); +// } +.endm + +.macro TRANSFORM_TOTAL_16BITS arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7 // both row & col transform used +// { // output: f_q[0]~[3], input: e_q[0]~[3]; + vadd.s16 \arg0, \arg4, \arg7 //int16 f[i][0] = e[i][0] + e[i][3]; + vadd.s16 \arg1, \arg5, \arg6 //int16 
f[i][1] = e[i][1] + e[i][2]; + vsub.s16 \arg2, \arg5, \arg6 //int16 f[i][2] = e[i][1] - e[i][2]; + vsub.s16 \arg3, \arg4, \arg7 //int16 f[i][3] = e[i][0] - e[i][3]; +// } +.endm + + +.macro ROW_TRANSFORM_0_STEP arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7 +// { // input: src_d[0]~[3], output: e_q[0]~[3]; + vaddl.s16 \arg4, \arg0, \arg2 //int32 e[i][0] = src[0] + src[2]; + vsubl.s16 \arg5, \arg0, \arg2 //int32 e[i][1] = src[0] - src[2]; + vsubl.s16 \arg6, \arg1, \arg3 //int32 e[i][2] = src[1] - src[3]; + vaddl.s16 \arg7, \arg1, \arg3 //int32 e[i][3] = src[1] + src[3]; +// } +.endm + +.macro ROW_TRANSFORM_1_STEP arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9 +// { // input: src_d[0]~[3], output: e_q[0]~[3]; working: \arg8 \arg9 + vaddl.s16 \arg4, \arg0, \arg2 //int32 e[i][0] = src[0] + src[2]; + vsubl.s16 \arg5, \arg0, \arg2 //int32 e[i][1] = src[0] - src[2]; + vshr.s16 \arg8, \arg1, #1 + vshr.s16 \arg9, \arg3, #1 + vsubl.s16 \arg6, \arg8, \arg3 //int32 e[i][2] = (src[1]>>1)-src[3]; + vaddl.s16 \arg7, \arg1, \arg9 //int32 e[i][3] = src[1] + (src[3]>>1); +// } +.endm + +.macro TRANSFORM_4BYTES arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7 // both row & col transform used +// { // output: f_q[0]~[3], input: e_q[0]~[3]; + vadd.s32 \arg0, \arg4, \arg7 //int16 f[i][0] = e[i][0] + e[i][3]; + vadd.s32 \arg1, \arg5, \arg6 //int16 f[i][1] = e[i][1] + e[i][2]; + vsub.s32 \arg2, \arg5, \arg6 //int16 f[i][2] = e[i][1] - e[i][2]; + vsub.s32 \arg3, \arg4, \arg7 //int16 f[i][3] = e[i][0] - e[i][3]; +// } +.endm + +.macro COL_TRANSFORM_0_STEP arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7 +// { // input: src_q[0]~[3], output: e_q[0]~[3]; + vadd.s32 \arg4, \arg0, \arg2 //int32 e[0][j] = f[0][j] + f[2][j]; + vsub.s32 \arg5, \arg0, \arg2 //int32 e[1][j] = f[0][j] - f[2][j]; + vsub.s32 \arg6, \arg1, \arg3 //int32 e[2][j] = (f[1][j]>>1) - f[3][j]; + vadd.s32 \arg7, \arg1, \arg3 //int32 e[3][j] = f[1][j] + (f[3][j]>>1); +// } +.endm + +.macro COL_TRANSFORM_1_STEP arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7 +// { // input: src_q[0]~[3], output: e_q[0]~[3]; + vadd.s32 \arg4, \arg0, \arg2 //int32 e[0][j] = f[0][j] + f[2][j]; + vsub.s32 \arg5, \arg0, \arg2 //int32 e[1][j] = f[0][j] - f[2][j]; + vshr.s32 \arg6, \arg1, #1 + vshr.s32 \arg7, \arg3, #1 + vsub.s32 \arg6, \arg6, \arg3 //int32 e[2][j] = (f[1][j]>>1) - f[3][j]; + vadd.s32 \arg7, \arg1, \arg7 //int32 e[3][j] = f[1][j] + (f[3][j]>>1); +// } +.endm +#endif + + +WELS_ASM_FUNC_BEGIN WelsCopy8x8_neon + + LORD_UNALIGNED_DATA_WITH_STRIDE d0, d1, d2, d3, r2, r3 + + STORE_UNALIGNED_DATA_WITH_STRIDE d0, d1, d2, d3, r0, r1 + + LORD_UNALIGNED_DATA_WITH_STRIDE d4, d5, d6, d7, r2, r3 + + STORE_UNALIGNED_DATA_WITH_STRIDE d4, d5, d6, d7, r0, r1 + +WELS_ASM_FUNC_END + + +WELS_ASM_FUNC_BEGIN WelsCopy16x16_neon + + LORD_ALIGNED_DATA_WITH_STRIDE q0, q1, q2, q3, r2, r3 + + STORE_ALIGNED_DATA_WITH_STRIDE q0, q1, q2, q3, r0, r1 + + LORD_ALIGNED_DATA_WITH_STRIDE q4, q5, q6, q7, r2, r3 + + STORE_ALIGNED_DATA_WITH_STRIDE q4, q5, q6, q7, r0, r1 + + LORD_ALIGNED_DATA_WITH_STRIDE q0, q1, q2, q3, r2, r3 + + STORE_ALIGNED_DATA_WITH_STRIDE q0, q1, q2, q3, r0, r1 + + LORD_ALIGNED_DATA_WITH_STRIDE q4, q5, q6, q7, r2, r3 + + STORE_ALIGNED_DATA_WITH_STRIDE q4, q5, q6, q7, r0, r1 + +WELS_ASM_FUNC_END + + +WELS_ASM_FUNC_BEGIN WelsCopy16x16NotAligned_neon + + LORD_UNALIGNED_DATA_WITH_STRIDE q0, q1, q2, q3, r2, r3 + + STORE_UNALIGNED_DATA_WITH_STRIDE q0, q1, q2, q3, r0, r1 + + LORD_UNALIGNED_DATA_WITH_STRIDE q4, q5, q6, q7, r2, r3 + + STORE_UNALIGNED_DATA_WITH_STRIDE q4, q5, q6, 
q7, r0, r1 + + LORD_UNALIGNED_DATA_WITH_STRIDE q0, q1, q2, q3, r2, r3 + + STORE_UNALIGNED_DATA_WITH_STRIDE q0, q1, q2, q3, r0, r1 + + LORD_UNALIGNED_DATA_WITH_STRIDE q4, q5, q6, q7, r2, r3 + + STORE_UNALIGNED_DATA_WITH_STRIDE q4, q5, q6, q7, r0, r1 + +WELS_ASM_FUNC_END + + +WELS_ASM_FUNC_BEGIN WelsCopy16x8NotAligned_neon + + LORD_UNALIGNED_DATA_WITH_STRIDE q0, q1, q2, q3, r2, r3 + + STORE_UNALIGNED_DATA_WITH_STRIDE q0, q1, q2, q3, r0, r1 + + LORD_UNALIGNED_DATA_WITH_STRIDE q4, q5, q6, q7, r2, r3 + + STORE_UNALIGNED_DATA_WITH_STRIDE q4, q5, q6, q7, r0, r1 + +WELS_ASM_FUNC_END + + +WELS_ASM_FUNC_BEGIN WelsCopy8x16_neon + + LORD_UNALIGNED_DATA_WITH_STRIDE d0, d1, d2, d3, r2, r3 + + STORE_UNALIGNED_DATA_WITH_STRIDE d0, d1, d2, d3, r0, r1 + + LORD_UNALIGNED_DATA_WITH_STRIDE d4, d5, d6, d7, r2, r3 + + STORE_UNALIGNED_DATA_WITH_STRIDE d4, d5, d6, d7, r0, r1 + + LORD_UNALIGNED_DATA_WITH_STRIDE d0, d1, d2, d3, r2, r3 + + STORE_UNALIGNED_DATA_WITH_STRIDE d0, d1, d2, d3, r0, r1 + + LORD_UNALIGNED_DATA_WITH_STRIDE d4, d5, d6, d7, r2, r3 + + STORE_UNALIGNED_DATA_WITH_STRIDE d4, d5, d6, d7, r0, r1 + +WELS_ASM_FUNC_END + + + +WELS_ASM_FUNC_BEGIN WelsDctT4_neon + push {r4} + ldr r4, [sp, #4] + + LOAD_4x4_DATA_FOR_DCT d4, d5, d6, d7, r1, r2, r3, r4 + + vsubl.u8 q0, d4, d6 + vsubl.u8 q1, d5, d7 + vtrn.s32 q0, q1 + vswp d1, d2 + + // horizontal transform + DCT_ROW_TRANSFORM_TOTAL_16BITS d0, d1, d2, d3, d4, d5, d6, d7 + + // transform element + MATRIX_TRANSFORM_EACH_16BITS d0, d1, d2, d3 + + // vertical transform + DCT_ROW_TRANSFORM_TOTAL_16BITS d0, d1, d2, d3, d4, d5, d6, d7 + + // transform element + MATRIX_TRANSFORM_EACH_16BITS d0, d1, d2, d3 + + vst1.s16 {q0, q1}, [r0]! + + pop {r4} +WELS_ASM_FUNC_END + + +WELS_ASM_FUNC_BEGIN WelsDctFourT4_neon + push {r4} + ldr r4, [sp, #4] + + LOAD_8x8_DATA_FOR_DCT d8, d9, d10, d11, d12, d13, d14, d15, r1, r3 + + vsubl.u8 q0, d8, d12 + vsubl.u8 q1, d9, d13 + vsubl.u8 q2, d10, d14 + vsubl.u8 q3, d11, d15 + MATRIX_TRANSFORM_EACH_16BITS q0, q1, q2, q3 + + // horizontal transform + DCT_ROW_TRANSFORM_TOTAL_16BITS q0, q1, q2, q3, q4, q5, q6, q7 + + // transform element + MATRIX_TRANSFORM_EACH_16BITS q0, q1, q2, q3 + + // vertical transform + DCT_ROW_TRANSFORM_TOTAL_16BITS q0, q1, q2, q3, q4, q5, q6, q7 + + vswp d1, d2 + vswp d5, d6 + vswp q1, q2 + vst1.s16 {q0, q1}, [r0]! + vst1.s16 {q2, q3}, [r0]! + + //////////////// + LOAD_8x8_DATA_FOR_DCT d8, d9, d10, d11, d12, d13, d14, d15, r1, r3 + + vsubl.u8 q0, d8, d12 + vsubl.u8 q1, d9, d13 + vsubl.u8 q2, d10, d14 + vsubl.u8 q3, d11, d15 + MATRIX_TRANSFORM_EACH_16BITS q0, q1, q2, q3 + + // horizontal transform + DCT_ROW_TRANSFORM_TOTAL_16BITS q0, q1, q2, q3, q4, q5, q6, q7 + + // transform element + MATRIX_TRANSFORM_EACH_16BITS q0, q1, q2, q3 + + // vertical transform + DCT_ROW_TRANSFORM_TOTAL_16BITS q0, q1, q2, q3, q4, q5, q6, q7 + + vswp d1, d2 + vswp d5, d6 + vswp q1, q2 + vst1.s16 {q0, q1}, [r0]! + vst1.s16 {q2, q3}, [r0]! + + pop {r4} +WELS_ASM_FUNC_END + + +WELS_ASM_FUNC_BEGIN WelsQuant4x4_neon + vld1.s16 {q2}, [r1] + vld1.s16 {q0, q1}, [r0] + vld1.s16 {q3}, [r2] + + vmov q4, q2 + + NEWQUANT_COEF_EACH_16BITS q0, q2, d4, d5, d6, d7, q5, q6, q7 + vst1.s16 {q2}, [r0]! + + NEWQUANT_COEF_EACH_16BITS q1, q4, d8, d9, d6, d7, q5, q6, q7 + vst1.s16 {q4}, [r0]! + +WELS_ASM_FUNC_END + + +WELS_ASM_FUNC_BEGIN WelsQuant4x4Dc_neon + + vld1.s16 {q0, q1}, [r0] + vdup.s16 q2, r1 // even ff range [0, 768] + vdup.s16 q3, r2 + + vmov q4, q2 + + NEWQUANT_COEF_EACH_16BITS q0, q2, d4, d5, d6, d7, q5, q6, q7 + vst1.s16 {q2}, [r0]! 
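+ // Scalar reference for what NEWQUANT_COEF_EACH_16BITS computes per int16 coefficient
+ // (illustrative sketch inferred from the vector sequence; variable names are hypothetical):
+ //   int32_t t     = ((int32_t)abs(coef) + ff) * mf;
+ //   int16_t level = (int16_t)(t >> 16);
+ //   out           = (coef > 0) ? level : (int16_t)-level;   // vcgt/vbif/vshl/vsub restore the sign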
+ + NEWQUANT_COEF_EACH_16BITS q1, q4, d8, d9, d6, d7, q5, q6, q7 + vst1.s16 {q4}, [r0]! + +WELS_ASM_FUNC_END + + +WELS_ASM_FUNC_BEGIN WelsQuantFour4x4_neon + vld1.s16 {q2}, [r1] + vld1.s16 {q3}, [r2] + mov r1, r0 + + vld1.s16 {q0, q1}, [r0]! + vmov q4, q2 + NEWQUANT_COEF_EACH_16BITS q0, q4, d8, d9, d6, d7, q5, q6, q7 + vst1.s16 {q4}, [r1]! + vmov q4, q2 + NEWQUANT_COEF_EACH_16BITS q1, q4, d8, d9, d6, d7, q5, q6, q7 + vst1.s16 {q4}, [r1]! + + vld1.s16 {q0, q1}, [r0]! + vmov q4, q2 + NEWQUANT_COEF_EACH_16BITS q0, q4, d8, d9, d6, d7, q5, q6, q7 + vst1.s16 {q4}, [r1]! + vmov q4, q2 + NEWQUANT_COEF_EACH_16BITS q1, q4, d8, d9, d6, d7, q5, q6, q7 + vst1.s16 {q4}, [r1]! + + vld1.s16 {q0, q1}, [r0]! + vmov q4, q2 + NEWQUANT_COEF_EACH_16BITS q0, q4, d8, d9, d6, d7, q5, q6, q7 + vst1.s16 {q4}, [r1]! + vmov q4, q2 + NEWQUANT_COEF_EACH_16BITS q1, q4, d8, d9, d6, d7, q5, q6, q7 + vst1.s16 {q4}, [r1]! + + vld1.s16 {q0, q1}, [r0]! + vmov q4, q2 + NEWQUANT_COEF_EACH_16BITS q0, q4, d8, d9, d6, d7, q5, q6, q7 + vst1.s16 {q4}, [r1]! + vmov q4, q2 + NEWQUANT_COEF_EACH_16BITS q1, q4, d8, d9, d6, d7, q5, q6, q7 + vst1.s16 {q4}, [r1]! + +WELS_ASM_FUNC_END + + +WELS_ASM_FUNC_BEGIN WelsQuantFour4x4Max_neon + vld1.s16 {q2}, [r1] + vld1.s16 {q3}, [r2] + mov r1, r0 + + vld1.s16 {q0, q1}, [r0]! + vmov q4, q2 + NEWQUANT_COEF_EACH_16BITS_MAX q0, q4, d8, d9, d6, d7, q5, q6, q7, d18 + vst1.s16 {q4}, [r1]! + vmov q8, q2 + NEWQUANT_COEF_EACH_16BITS_MAX q1, q8, d16, d17, d6, d7, q5, q6, q7, d20 + vst1.s16 {q8}, [r1]! // then 1st 16 elem in d18 & d20 + + vld1.s16 {q0, q1}, [r0]! + vmov q4, q2 + NEWQUANT_COEF_EACH_16BITS_MAX q0, q4, d8, d9, d6, d7, q5, q6, q7, d19 + vst1.s16 {q4}, [r1]! + vmov q8, q2 + NEWQUANT_COEF_EACH_16BITS_MAX q1, q8, d16, d17, d6, d7, q5, q6, q7, d21 + vst1.s16 {q8}, [r1]! // then 2nd 16 elem in d19 & d21 + + SELECT_MAX_IN_ABS_COEF q9, q10, q0, d0, d1 + vst1.s32 {d0[0]}, [r3]! + + /////////// + vld1.s16 {q0, q1}, [r0]! + vmov q4, q2 + NEWQUANT_COEF_EACH_16BITS_MAX q0, q4, d8, d9, d6, d7, q5, q6, q7, d18 + vst1.s16 {q4}, [r1]! + vmov q8, q2 + NEWQUANT_COEF_EACH_16BITS_MAX q1, q8, d16, d17, d6, d7, q5, q6, q7, d20 + vst1.s16 {q8}, [r1]! // then 3rd 16 elem in d18 & d20 + + vld1.s16 {q0, q1}, [r0]! + vmov q4, q2 + NEWQUANT_COEF_EACH_16BITS_MAX q0, q4, d8, d9, d6, d7, q5, q6, q7, d19 + vst1.s16 {q4}, [r1]! + vmov q8, q2 + NEWQUANT_COEF_EACH_16BITS_MAX q1, q8, d16, d17, d6, d7, q5, q6, q7, d21 + vst1.s16 {q8}, [r1]! // then 4th 16 elem in d19 & d21 + + SELECT_MAX_IN_ABS_COEF q9, q10, q0, d0, d1 + vst1.s32 {d0[0]}, [r3]! 
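+ // Net effect of this routine per 4x4 block (scalar sketch; the pMax semantics are
+ // inferred from the d18~d21 max tracking and the two {d0[0]} stores above, and are
+ // presumably used by the caller for coefficient-skip decisions):
+ //   int16_t max_abs = 0;
+ //   for (i = 0; i < 16; i++) {
+ //     int16_t level = (int16_t)((((int32_t)abs(dct[i]) + ff[i]) * mf[i]) >> 16);
+ //     if (level > max_abs) max_abs = level;
+ //     dct[i] = (dct[i] > 0) ? level : (int16_t)-level;
+ //   }
+ //   pMax[blk] = max_abs;   // one max per 4x4 block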
+ +WELS_ASM_FUNC_END + + +WELS_ASM_FUNC_BEGIN WelsHadamardT4Dc_neon + push {r2,r3} + mov r2, #64 // 2*16*sizeof(int16_t) + add r3, r1, #32 + + vld1.s16 {d0}, [r1], r2 + vld1.s16 {d1}, [r3], r2 + vld1.s16 {d4}, [r1], r2 + vld1.s16 {d5}, [r3], r2 + vld1.s16 {d2}, [r1], r2 + vld1.s16 {d3}, [r3], r2 + vld1.s16 {d6}, [r1], r2 + vld1.s16 {d7}, [r3], r2 + vtrn.16 q0, q2 // d0[0 4], d1[1 5] + vtrn.16 q1, q3 // d2[2 6], d3[3 7] + + vld1.s16 {d8}, [r1], r2 + vld1.s16 {d9}, [r3], r2 + vld1.s16 {d12}, [r1], r2 + vld1.s16 {d13}, [r3], r2 + vld1.s16 {d10}, [r1], r2 + vld1.s16 {d11}, [r3], r2 + vld1.s16 {d14}, [r1], r2 + vld1.s16 {d15}, [r3], r2 + vtrn.16 q4, q6 // d8[08 12], d9[09 13] + vtrn.16 q5, q7 //d10[10 14],d11[11 15] + + vtrn.32 q0, q4 // d0 [0 4 08 12] = dct[idx], d1[1 5 09 13] = dct[idx+16] + vtrn.32 q1, q5 // d2 [2 6 10 14] = dct[idx+64], d3[3 7 11 15] = dct[idx+80] + + ROW_TRANSFORM_0_STEP d0, d1, d3, d2, q4, q7, q6, q5 + + TRANSFORM_4BYTES q0, q1, q3, q2, q4, q7, q6, q5 + + // transform element 32bits + vtrn.s32 q0, q1 //[0 1 2 3]+[4 5 6 7]-->[0 4 2 6]+[1 5 3 7] + vtrn.s32 q2, q3 //[8 9 10 11]+[12 13 14 15]-->[8 12 10 14]+[9 13 11 15] + vswp d1, d4 //[0 4 2 6]+[8 12 10 14]-->[0 4 8 12]+[2 6 10 14] + vswp d3, d6 //[1 5 3 7]+[9 13 11 15]-->[1 5 9 13]+[3 7 11 15] + + COL_TRANSFORM_0_STEP q0, q1, q3, q2, q4, q7, q6, q5 + + TRANSFORM_4BYTES q0, q1, q3, q2, q4, q7, q6, q5 + + vrshrn.s32 d8, q0, #1 + vrshrn.s32 d9, q1, #1 + vrshrn.s32 d10, q2, #1 + vrshrn.s32 d11, q3, #1 + vst1.16 {q4, q5}, [r0] //store + + pop {r2,r3} +WELS_ASM_FUNC_END + + +WELS_ASM_FUNC_BEGIN WelsHadamardQuant2x2_neon + + vdup.s16 d1, r1 //ff + vdup.s16 d2, r2 //mf + veor d3, d3 + + mov r1, #32 + mov r2, r0 + + vld1.s16 {d0[0]}, [r0], r1 //rs[00] + vst1.s16 {d3[0]}, [r2], r1 //rs[00]=0 + vld1.s16 {d0[1]}, [r0], r1 //rs[16] + vst1.s16 {d3[0]}, [r2], r1 //rs[16]=0 + vld1.s16 {d0[2]}, [r0], r1 //rs[32] + vst1.s16 {d3[0]}, [r2], r1 //rs[32]=0 + vld1.s16 {d0[3]}, [r0], r1 //rs[48] + vst1.s16 {d3[0]}, [r2], r1 //rs[48]=0 + + HDM_QUANT_2x2_TOTAL_16BITS d0, d4, d5 // output d5 + + HDM_QUANT_2x2_TOTAL_16BITS d5, d4, d0 // output d0 + + QUANT_DUALWORD_COEF_EACH_16BITS d0, d1, d2, d3, q2 + + vst1.s16 d1, [r3] // store to dct + ldr r2, [sp, #0] + vst1.s16 d1, [r2] // store to block + + mov r1, #1 + vdup.s16 d3, r1 + DC_ZERO_COUNT_IN_DUALWORD d1, d0, d3 + + vmov r0, r1, d0 + and r0, #0x07 // range [0~4] + rsb r0, #4 +WELS_ASM_FUNC_END + + +WELS_ASM_FUNC_BEGIN WelsHadamardQuant2x2SkipKernel_neon + + vdup.s16 d3, r1 + mov r1, #32 + vld1.s16 {d0[0]}, [r0], r1 //rs[00] + vld1.s16 {d0[1]}, [r0], r1 //rs[16] + vld1.s16 {d0[2]}, [r0], r1 //rs[32] + vld1.s16 {d0[3]}, [r0], r1 //rs[48] + + HDM_QUANT_2x2_TOTAL_16BITS d0, d1, d2 // output d2 + + HDM_QUANT_2x2_TOTAL_16BITS d2, d1, d0 // output d0 + + vabs.s16 d1, d0 + vcgt.s16 d1, d1, d3 // abs(dct[i])>threshold; + vmov r0, r1, d1 + orr r0, r1 +WELS_ASM_FUNC_END + + +WELS_ASM_FUNC_BEGIN WelsGetNoneZeroCount_neon + push {r1} + vld1.s16 {q0, q1}, [r0] + vmov.s16 q8, #1 + + ZERO_COUNT_IN_2_QUARWORD q0, q1, q8, d0, d1, d2, d3 + vmov r0, r1, d0 + and r0, #0x1F // range [0~16] + rsb r0, #16 + pop {r1} +WELS_ASM_FUNC_END + + +WELS_ASM_FUNC_BEGIN WelsDequant4x4_neon + vld1.s16 {q0, q1}, [r0] + vld1.u16 {q2}, [r1] + + vmul.s16 q4, q0, q2 + vmul.s16 q5, q1, q2 + + vst1.s16 {q4, q5}, [r0] +WELS_ASM_FUNC_END + + +WELS_ASM_FUNC_BEGIN WelsDequantFour4x4_neon + vld1.u16 {q8}, [r1] + mov r1, r0 + vld1.s16 {q0, q1}, [r0]! + vld1.s16 {q2, q3}, [r0]! + vmul.s16 q0, q0, q8 + vld1.s16 {q4, q5}, [r0]! 
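+ // Scalar view of this dequant pass (sketch; the single {q8} load of kpMF implies the
+ // 8-entry factor pattern repeats over every row pair of all four 4x4 blocks):
+ //   for (i = 0; i < 64; i++)
+ //     pDct[i] = (int16_t)(pDct[i] * kpMF[i & 7]);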
+ vmul.s16 q1, q1, q8 + vld1.s16 {q6, q7}, [r0]! + + vst1.s16 {q0, q1}, [r1]! + + vmul.s16 q2, q2, q8 + vmul.s16 q3, q3, q8 + vmul.s16 q4, q4, q8 + vst1.s16 {q2, q3}, [r1]! + + vmul.s16 q5, q5, q8 + vmul.s16 q6, q6, q8 + vmul.s16 q7, q7, q8 + vst1.s16 {q4, q5}, [r1]! + vst1.s16 {q6, q7}, [r1]! + +WELS_ASM_FUNC_END + + +WELS_ASM_FUNC_BEGIN WelsDequantIHadamard4x4_neon + + vld1.s16 {q0, q1}, [r0] + vdup.s16 q4, r1 + + IHDM_4x4_TOTAL_16BITS q0, q2, q3 + IHDM_4x4_TOTAL_16BITS q1, q2, q3 + + MATRIX_TRANSFORM_EACH_16BITS d0, d1, d2, d3 + + IHDM_4x4_TOTAL_16BITS q0, q2, q3 + vmul.s16 q0, q4 + + IHDM_4x4_TOTAL_16BITS q1, q2, q3 + vmul.s16 q1, q4 + + MATRIX_TRANSFORM_EACH_16BITS d0, d1, d2, d3 + vst1.s16 {q0, q1}, [r0] +WELS_ASM_FUNC_END + + +WELS_ASM_FUNC_BEGIN WelsIDctT4Rec_neon + vld1.u32 {d14[0]}, [r2], r3 + push {r4} + ldr r4, [sp, #4] + vld1.u32 {d14[1]}, [r2], r3 + + vld4.s16 {d0, d1, d2, d3}, [r4] // cost 3 cycles! + vld1.u32 {d15[0]}, [r2], r3 + vld1.u32 {d15[1]}, [r2], r3 // q7 is pred + + ROW_TRANSFORM_1_STEP_TOTAL_16BITS d0, d1, d2, d3, d4, d5, d6, d7 + + TRANSFORM_TOTAL_16BITS d0, d1, d2, d3, d4, d5, d6, d7 + + MATRIX_TRANSFORM_EACH_16BITS d0, d1, d2, d3 + + ROW_TRANSFORM_1_STEP_TOTAL_16BITS d0, d1, d2, d3, d4, d5, d6, d7 + + TRANSFORM_TOTAL_16BITS d0, d1, d2, d3, d4, d5, d6, d7 + vrshr.s16 d0, d0, #6 + vrshr.s16 d1, d1, #6 + vrshr.s16 d2, d2, #6 + vrshr.s16 d3, d3, #6 + + //after rounding 6, clip into [0, 255] + vmovl.u8 q2,d14 + vadd.s16 q0,q2 + vqmovun.s16 d14,q0 + vst1.32 {d14[0]},[r0],r1 + vst1.32 {d14[1]},[r0],r1 + + vmovl.u8 q2,d15 + vadd.s16 q1,q2 + vqmovun.s16 d15,q1 + vst1.32 {d15[0]},[r0],r1 + vst1.32 {d15[1]},[r0] + + pop {r4} +WELS_ASM_FUNC_END + + +WELS_ASM_FUNC_BEGIN WelsIDctFourT4Rec_neon + + vld1.u64 {d16}, [r2], r3 + push {r4} + ldr r4, [sp, #4] + vld1.u64 {d17}, [r2], r3 + + vld4.s16 {d0, d1, d2, d3}, [r4]! // cost 3 cycles! + vld1.u64 {d18}, [r2], r3 + vld1.u64 {d19}, [r2], r3 + vld4.s16 {d4, d5, d6, d7}, [r4]! // cost 3 cycles! + vswp d1, d4 + vswp d3, d6 + vswp q1, q2 // q0~q3 + + ROW_TRANSFORM_1_STEP_TOTAL_16BITS q0, q1, q2, q3, q4, q5, q6, q7 + + TRANSFORM_TOTAL_16BITS q0, q1, q2, q3, q4, q5, q6, q7 + + MATRIX_TRANSFORM_EACH_16BITS q0, q1, q2, q3 + + ROW_TRANSFORM_1_STEP_TOTAL_16BITS q0, q1, q2, q3, q4, q5, q6, q7 + + TRANSFORM_TOTAL_16BITS q0, q1, q2, q3, q4, q5, q6, q7 + vrshr.s16 q0, q0, #6 + vrshr.s16 q1, q1, #6 + vrshr.s16 q2, q2, #6 + vrshr.s16 q3, q3, #6 + + //after rounding 6, clip into [0, 255] + vmovl.u8 q4,d16 + vadd.s16 q0,q4 + vqmovun.s16 d16,q0 + vst1.u8 {d16},[r0],r1 + + vmovl.u8 q4,d17 + vadd.s16 q1,q4 + vqmovun.s16 d17,q1 + vst1.u8 {d17},[r0],r1 + + vmovl.u8 q4,d18 + vadd.s16 q2,q4 + vqmovun.s16 d18,q2 + vst1.u8 {d18},[r0],r1 + + vmovl.u8 q4,d19 + vadd.s16 q3,q4 + vqmovun.s16 d19,q3 + vst1.u8 {d19},[r0],r1 + + vld1.u64 {d16}, [r2], r3 + vld1.u64 {d17}, [r2], r3 + + vld4.s16 {d0, d1, d2, d3}, [r4]! // cost 3 cycles! + vld1.u64 {d18}, [r2], r3 + vld1.u64 {d19}, [r2], r3 + vld4.s16 {d4, d5, d6, d7}, [r4]! // cost 3 cycles! 
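+ // Per 4x4 block, the ROW_TRANSFORM/TRANSFORM macro passes in this routine implement the
+ // standard H.264 inverse transform plus reconstruction (scalar sketch matching the macro comments):
+ //   e0 = d0 + d2;  e1 = d0 - d2;  e2 = (d1 >> 1) - d3;  e3 = d1 + (d3 >> 1);
+ //   f0 = e0 + e3;  f1 = e1 + e2;  f2 = e1 - e2;  f3 = e0 - e3;    // rows, then columns
+ //   rec[i] = clip255(pred[i] + ((f[i] + 32) >> 6));               // vrshr #6 + vqmovun.s16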
+ vswp d1, d4 + vswp d3, d6 + vswp q1, q2 // q0~q3 + + ROW_TRANSFORM_1_STEP_TOTAL_16BITS q0, q1, q2, q3, q4, q5, q6, q7 + + TRANSFORM_TOTAL_16BITS q0, q1, q2, q3, q4, q5, q6, q7 + + MATRIX_TRANSFORM_EACH_16BITS q0, q1, q2, q3 + + ROW_TRANSFORM_1_STEP_TOTAL_16BITS q0, q1, q2, q3, q4, q5, q6, q7 + + TRANSFORM_TOTAL_16BITS q0, q1, q2, q3, q4, q5, q6, q7 + vrshr.s16 q0, q0, #6 + vrshr.s16 q1, q1, #6 + vrshr.s16 q2, q2, #6 + vrshr.s16 q3, q3, #6 + + //after rounding 6, clip into [0, 255] + vmovl.u8 q4,d16 + vadd.s16 q0,q4 + vqmovun.s16 d16,q0 + vst1.u8 {d16},[r0],r1 + + vmovl.u8 q4,d17 + vadd.s16 q1,q4 + vqmovun.s16 d17,q1 + vst1.u8 {d17},[r0],r1 + + vmovl.u8 q4,d18 + vadd.s16 q2,q4 + vqmovun.s16 d18,q2 + vst1.u8 {d18},[r0],r1 + + vmovl.u8 q4,d19 + vadd.s16 q3,q4 + vqmovun.s16 d19,q3 + vst1.u8 {d19},[r0],r1 + + pop {r4} +WELS_ASM_FUNC_END + + +WELS_ASM_FUNC_BEGIN WelsIDctRecI16x16Dc_neon + push {r4} + ldr r4, [sp, #4] + + vld1.s16 {q8,q9}, [r4] + vrshr.s16 q8, q8, #6 + vrshr.s16 q9, q9, #6 + + vdup.s16 d20, d16[0] + vdup.s16 d21, d16[1] + vdup.s16 d22, d16[2] + vdup.s16 d23, d16[3] + + vld1.u8 {q0}, [r2], r3 + MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13 + vst1.u8 {q0}, [r0], r1 + + vld1.u8 {q0}, [r2], r3 + MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13 + vst1.u8 {q0}, [r0], r1 + + vld1.u8 {q0}, [r2], r3 + MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13 + vst1.u8 {q0}, [r0], r1 + + vld1.u8 {q0}, [r2], r3 + MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13 + vst1.u8 {q0}, [r0], r1 + + vdup.s16 d20, d17[0] + vdup.s16 d21, d17[1] + vdup.s16 d22, d17[2] + vdup.s16 d23, d17[3] + + vld1.u8 {q0}, [r2], r3 + MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13 + vst1.u8 {q0}, [r0], r1 + + vld1.u8 {q0}, [r2], r3 + MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13 + vst1.u8 {q0}, [r0], r1 + + vld1.u8 {q0}, [r2], r3 + MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13 + vst1.u8 {q0}, [r0], r1 + + vld1.u8 {q0}, [r2], r3 + MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13 + vst1.u8 {q0}, [r0], r1 + + vdup.s16 d20, d18[0] + vdup.s16 d21, d18[1] + vdup.s16 d22, d18[2] + vdup.s16 d23, d18[3] + + vld1.u8 {q0}, [r2], r3 + MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13 + vst1.u8 {q0}, [r0], r1 + + vld1.u8 {q0}, [r2], r3 + MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13 + vst1.u8 {q0}, [r0], r1 + + vld1.u8 {q0}, [r2], r3 + MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13 + vst1.u8 {q0}, [r0], r1 + + vld1.u8 {q0}, [r2], r3 + MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13 + vst1.u8 {q0}, [r0], r1 + + vdup.s16 d20, d19[0] + vdup.s16 d21, d19[1] + vdup.s16 d22, d19[2] + vdup.s16 d23, d19[3] + + vld1.u8 {q0}, [r2], r3 + MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13 + vst1.u8 {q0}, [r0], r1 + + vld1.u8 {q0}, [r2], r3 + MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13 + vst1.u8 {q0}, [r0], r1 + + vld1.u8 {q0}, [r2], r3 + MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13 + vst1.u8 {q0}, [r0], r1 + + vld1.u8 {q0}, [r2], r3 + MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13 + vst1.u8 {q0}, [r0], r1 + + pop {r4} +WELS_ASM_FUNC_END +#endif diff --git a/codec/encoder/core/inc/deblocking.h b/codec/encoder/core/inc/deblocking.h index f5b7beaf..25e6430c 100644 --- a/codec/encoder/core/inc/deblocking.h +++ b/codec/encoder/core/inc/deblocking.h @@ -61,6 +61,16 @@ uint8_t uiFilterIdc; uint8_t uiReserved; } SDeblockingFilter; +#if 
defined(__cplusplus) +extern "C" { +#endif//__cplusplus +#if defined(HAVE_NEON) +void WelsNonZeroCount_neon(int8_t * pNonZeroCount); +void DeblockingBSCalcEnc_neon(int8_t *pNzc, SMVUnitXY *pMv, int32_t iBoundryFlag, int32_t iMbStride, uint8_t (*pBS)[4][4]); +#endif +#if defined(__cplusplus) +} +#endif//__cplusplus void DeblockingInit (DeblockingFunc* pFunc, int32_t iCpu); void WelsNonZeroCount_c (int8_t* pNonZeroCount); diff --git a/codec/encoder/core/inc/decode_mb_aux.h b/codec/encoder/core/inc/decode_mb_aux.h index 6766ab03..2c141da5 100644 --- a/codec/encoder/core/inc/decode_mb_aux.h +++ b/codec/encoder/core/inc/decode_mb_aux.h @@ -70,6 +70,16 @@ void WelsIDctRecI16x16Dc_sse2 (uint8_t* pRec, int32_t iStride, uint8_t* pPredict int16_t* pDctDc); #endif//X86_ASM +#ifdef HAVE_NEON +void WelsDequantFour4x4_neon(int16_t* pDct, const uint16_t* kpMF); +void WelsDequant4x4_neon(int16_t* pDct, const uint16_t* kpMF); +void WelsDequantIHadamard4x4_neon(int16_t* pRes, const uint16_t kuiMF); + +void WelsIDctT4Rec_neon(uint8_t* pRec, int32_t iStride, uint8_t* pPrediction, int32_t iPredStride, int16_t* pDct); +void WelsIDctFourT4Rec_neon(uint8_t* pRec, int32_t iStride, uint8_t* pPrediction, int32_t iPredStride, int16_t* pDct); +void WelsIDctRecI16x16Dc_neon(uint8_t* pRec, int32_t iStride, uint8_t* pPrediction, int32_t iPredStride, int16_t* pDctDc); +#endif + #if defined(__cplusplus) } #endif//__cplusplus diff --git a/codec/encoder/core/inc/encode_mb_aux.h b/codec/encoder/core/inc/encode_mb_aux.h index ab878e1c..225e7c16 100644 --- a/codec/encoder/core/inc/encode_mb_aux.h +++ b/codec/encoder/core/inc/encode_mb_aux.h @@ -122,6 +122,28 @@ void WelsCopy16x16_sse2 (uint8_t* Dst, int32_t iStrideD, uint8_t* Src, int32_t void WelsCopy16x16NotAligned_sse2 (uint8_t* Dst, int32_t iStrideD, uint8_t* Src, int32_t iStrideS); #endif +#ifdef HAVE_NEON +void WelsCopy8x8_neon( uint8_t* pDst, int32_t iStrideD, uint8_t* pSrc, int32_t iStrideS ); +void WelsCopy16x16_neon( uint8_t* pDst, int32_t iStrideD, uint8_t* pSrc, int32_t iStrideS ); +void WelsCopy16x16NotAligned_neon( uint8_t* pDst, int32_t iStrideD, uint8_t* pSrc, int32_t iStrideS ); +void WelsCopy16x8NotAligned_neon( uint8_t* pDst, int32_t iStrideD, uint8_t* pSrc, int32_t iStrideS ); +void WelsCopy8x16_neon( uint8_t* pDst, int32_t iStrideD, uint8_t* pSrc, int32_t iStrideS ); + +void WelsHadamardT4Dc_neon(int16_t* pLumaDc, int16_t* pDct); +int32_t WelsHadamardQuant2x2_neon(int16_t* pRes, const int16_t kiFF, int16_t iMF, int16_t* pDct, int16_t* pBlock); +int32_t WelsHadamardQuant2x2Skip_neon(int16_t* pRes, int16_t iFF, int16_t iMF); +int32_t WelsHadamardQuant2x2SkipKernel_neon(int16_t *pRes, int16_t iThreshold);// avoid divide operator + +void WelsDctT4_neon(int16_t* pDct, uint8_t* pPixel1, int32_t iStride1, uint8_t* pPixel2, int32_t iStride2); +void WelsDctFourT4_neon(int16_t* pDct, uint8_t* pPixel1, int32_t iStride1, uint8_t* pPixel2, int32_t iStride2); + +int32_t WelsGetNoneZeroCount_neon(int16_t* pLevel); + +void WelsQuant4x4_neon(int16_t* pDct, const int16_t* pFF, const int16_t* pMF); +void WelsQuant4x4Dc_neon(int16_t* pDct, int16_t iFF, int16_t iMF); +void WelsQuantFour4x4_neon(int16_t* pDct, const int16_t* pFF, const int16_t* pMF); +void WelsQuantFour4x4Max_neon(int16_t* pDct, const int16_t* pFF, const int16_t* pMF, int16_t* pMax); +#endif #if defined(__cplusplus) } diff --git a/codec/encoder/core/inc/encoder.h b/codec/encoder/core/inc/encoder.h index f1cf1db3..a2f1e84c 100644 --- a/codec/encoder/core/inc/encoder.h +++ b/codec/encoder/core/inc/encoder.h @@ 
-124,6 +124,8 @@ void WelsSetMemZeroAligned64_sse2 (void* pDst, int32_t iSize); void WelsSetMemZeroSize64_mmx (void* pDst, int32_t iSize); void WelsSetMemZeroSize8_mmx (void* pDst, int32_t iSize); void WelsPrefetchZero_mmx (int8_t const* kpDst); +#elif defined(HAVE_NEON) +void WelsSetMemZero_neon(void* pDst, int32_t iSize); #endif #if defined(__cplusplus) diff --git a/codec/encoder/core/src/deblocking.cpp b/codec/encoder/core/src/deblocking.cpp index 3076ffde..de96bb72 100644 --- a/codec/encoder/core/src/deblocking.cpp +++ b/codec/encoder/core/src/deblocking.cpp @@ -605,6 +605,23 @@ void DeblockingMbAvcbase (SWelsFuncPtrList* pFunc, SMB* pCurMb, SDeblockingFilte DeblockingIntraMb (&pFunc->pfDeblocking, pCurMb, pFilter); break; default: +#if (defined(HAVE_NEON) && defined(SINGLE_REF_FRAME)) + DeblockingBSCalcEnc_neon(pCurMb->pNonZeroCount, pCurMb->sMv, pCurMb->uiNeighborAvail, iMbStride, uiBS); + if (iLeftFlag){ + if (IS_INTRA((pCurMb-1)->uiMbType)) { + *(uint32_t*)uiBS[0][0] = 0x04040404; + } + } else { + *(uint32_t*)uiBS[0][0] = 0; + } + if (iTopFlag) { + if (IS_INTRA((pCurMb-iMbStride)->uiMbType)) { + *(uint32_t*)uiBS[1][0] = 0x04040404; + } + } else { + *(uint32_t*)uiBS[1][0] = 0; + } +#else if (iLeftFlag) { * (uint32_t*)uiBS[0][0] = IS_INTRA ((pCurMb - 1)->uiMbType) ? 0x04040404 : DeblockingBSMarginalMBAvcbase (pCurMb, pCurMb - 1, 0); @@ -630,7 +647,7 @@ void DeblockingMbAvcbase (SWelsFuncPtrList* pFunc, SMB* pCurMb, SDeblockingFilte * (uint32_t*)uiBS[0][1] = * (uint32_t*)uiBS[0][2] = * (uint32_t*)uiBS[0][3] = * (uint32_t*)uiBS[1][1] = * (uint32_t*)uiBS[1][2] = * (uint32_t*)uiBS[1][3] = 0; } - +#endif DeblockingInterMb (&pFunc->pfDeblocking, pCurMb, pFilter, uiBS); break; } @@ -768,10 +785,13 @@ void WelsNonZeroCount_c (int8_t* pNonZeroCount) { } void WelsBlockFuncInit (PSetNoneZeroCountZeroFunc* pfSetNZCZero, int32_t iCpu) { *pfSetNZCZero = WelsNonZeroCount_c; +#ifdef HAVE_NEON + if( iCpu & WELS_CPU_NEON ) { + *pfSetNZCZero = WelsNonZeroCount_neon; + } +#endif } - - void DeblockingInit (DeblockingFunc* pFunc, int32_t iCpu) { pFunc->pfLumaDeblockingLT4Ver = DeblockLumaLt4V_c; pFunc->pfLumaDeblockingEQ4Ver = DeblockLumaEq4V_c; @@ -796,6 +816,20 @@ void DeblockingInit (DeblockingFunc* pFunc, int32_t iCpu) { pFunc->pfChromaDeblockingEQ4Hor = DeblockChromaEq4H_ssse3; } #endif + +#if defined(HAVE_NEON) + if (iCpu & WELS_CPU_NEON ) { + pFunc->pfLumaDeblockingLT4Ver = DeblockLumaLt4V_neon; + pFunc->pfLumaDeblockingEQ4Ver = DeblockLumaEq4V_neon; + pFunc->pfLumaDeblockingLT4Hor = DeblockLumaLt4H_neon; + pFunc->pfLumaDeblockingEQ4Hor = DeblockLumaEq4H_neon; + + pFunc->pfChromaDeblockingLT4Ver = DeblockChromaLt4V_neon; + pFunc->pfChromaDeblockingEQ4Ver = DeblockChromaEq4V_neon; + pFunc->pfChromaDeblockingLT4Hor = DeblockChromaLt4H_neon; + pFunc->pfChromaDeblockingEQ4Hor = DeblockChromaEq4H_neon; + } +#endif } diff --git a/codec/encoder/core/src/decode_mb_aux.cpp b/codec/encoder/core/src/decode_mb_aux.cpp index 5bdfba42..087ba9d3 100644 --- a/codec/encoder/core/src/decode_mb_aux.cpp +++ b/codec/encoder/core/src/decode_mb_aux.cpp @@ -270,5 +270,17 @@ void WelsInitReconstructionFuncs (SWelsFuncPtrList* pFuncList, uint32_t uiCpuFl pFuncList->pfIDctI16x16Dc = WelsIDctRecI16x16Dc_sse2; } #endif//X86_ASM + +#if defined(HAVE_NEON) + if (uiCpuFlag & WELS_CPU_NEON) { + pFuncList->pfDequantization4x4 = WelsDequant4x4_neon; + pFuncList->pfDequantizationFour4x4 = WelsDequantFour4x4_neon; + pFuncList->pfDequantizationIHadamard4x4 = WelsDequantIHadamard4x4_neon; + + pFuncList->pfIDctFourT4 = 
WelsIDctFourT4Rec_neon; + pFuncList->pfIDctT4 = WelsIDctT4Rec_neon; + pFuncList->pfIDctI16x16Dc = WelsIDctRecI16x16Dc_neon; + } +#endif } } diff --git a/codec/encoder/core/src/encode_mb_aux.cpp b/codec/encoder/core/src/encode_mb_aux.cpp index b595d99f..258d2248 100644 --- a/codec/encoder/core/src/encode_mb_aux.cpp +++ b/codec/encoder/core/src/encode_mb_aux.cpp @@ -508,6 +508,13 @@ int32_t WelsGetNoneZeroCount_c (int16_t* pLevel) { return (16 - iCnt); } +#ifdef HAVE_NEON +int32_t WelsHadamardQuant2x2Skip_neon(int16_t* pRes, int16_t iFF, int16_t iMF) { + int16_t iThreshold = ((1<<16)-1)/iMF - iFF; + return WelsHadamardQuant2x2SkipKernel_neon(pRes, iThreshold); +} +#endif + void WelsInitEncodingFuncs (SWelsFuncPtrList* pFuncList, uint32_t uiCpuFlag) { pFuncList->pfCopy8x8Aligned = WelsCopy8x8_c; pFuncList->pfCopy16x16Aligned = @@ -571,5 +578,28 @@ void WelsInitEncodingFuncs (SWelsFuncPtrList* pFuncList, uint32_t uiCpuFlag) { //#endif//MACOS #endif//X86_ASM + +#if defined(HAVE_NEON) + if (uiCpuFlag & WELS_CPU_NEON) { + pFuncList->pfQuantizationHadamard2x2 = WelsHadamardQuant2x2_neon; + pFuncList->pfQuantizationHadamard2x2Skip = WelsHadamardQuant2x2Skip_neon; + pFuncList->pfDctT4 = WelsDctT4_neon; + pFuncList->pfCopy8x8Aligned = WelsCopy8x8_neon; + pFuncList->pfCopy8x16Aligned = WelsCopy8x16_neon; + + pFuncList->pfGetNoneZeroCount = WelsGetNoneZeroCount_neon; + pFuncList->pfTransformHadamard4x4Dc = WelsHadamardT4Dc_neon; + + pFuncList->pfQuantization4x4 = WelsQuant4x4_neon; + pFuncList->pfQuantizationDc4x4 = WelsQuant4x4Dc_neon; + pFuncList->pfQuantizationFour4x4 = WelsQuantFour4x4_neon; + pFuncList->pfQuantizationFour4x4Max = WelsQuantFour4x4Max_neon; + + pFuncList->pfCopy16x16Aligned = WelsCopy16x16_neon; + pFuncList->pfCopy16x16NotAligned = WelsCopy16x16NotAligned_neon; + pFuncList->pfCopy16x8NotAligned = WelsCopy16x8NotAligned_neon; + pFuncList->pfDctFourT4 = WelsDctFourT4_neon; + } +#endif } } diff --git a/codec/encoder/core/src/encoder.cpp b/codec/encoder/core/src/encoder.cpp index c4e8e692..5a49372c 100644 --- a/codec/encoder/core/src/encoder.cpp +++ b/codec/encoder/core/src/encoder.cpp @@ -171,6 +171,14 @@ int32_t InitFunctionPointers (SWelsFuncPtrList* pFuncList, SWelsSvcCodingParam* } #endif//X86_ASM +#if defined(HAVE_NEON) + if (uiCpuFlag & WELS_CPU_NEON) { + pFuncList->pfSetMemZeroSize8 = WelsSetMemZero_neon; + pFuncList->pfSetMemZeroSize64Aligned16 = WelsSetMemZero_neon; + pFuncList->pfSetMemZeroSize64 = WelsSetMemZero_neon; + } +#endif + InitExpandPictureFunc (pFuncList, uiCpuFlag); /* Intra_Prediction_fn*/ diff --git a/codec/encoder/core/src/encoder_ext.cpp b/codec/encoder/core/src/encoder_ext.cpp index a7f17682..96fb49e7 100644 --- a/codec/encoder/core/src/encoder_ext.cpp +++ b/codec/encoder/core/src/encoder_ext.cpp @@ -1944,6 +1944,13 @@ int32_t WelsInitEncoderExt (sWelsEncCtx** ppCtx, SWelsSvcCodingParam* pCodingPar else if (uiCpuFeatureFlags & WELS_CPU_CACHELINE_16) iCacheLineSize = 16; OutputCpuFeaturesLog (uiCpuFeatureFlags, uiCpuCores, iCacheLineSize); +#elif defined(HAVE_NEON) +#if defined(ANDROID_NDK) + uiCpuFeatureFlags = WelsCPUFeatureDetectAndroid(); +#endif +#if defined(APPLE_IOS) + uiCpuFeatureFlags = WelsCPUFeatureDetectIOS(); +#endif #else iCacheLineSize = 16; // 16 bytes aligned in default #endif//X86_ASM diff --git a/codec/encoder/core/src/expand_pic.cpp b/codec/encoder/core/src/expand_pic.cpp index eec5476b..06333d14 100644 --- a/codec/encoder/core/src/expand_pic.cpp +++ b/codec/encoder/core/src/expand_pic.cpp @@ -129,6 +129,13 @@ void InitExpandPictureFunc 
(void* pL, const uint32_t kuiCPUFlag) { pFuncList->pfExpandChromaPicture[1] = ExpandPictureChromaAlign_sse2; } #endif//X86_ASM +#if defined(X86_ASM) + if (kuiCPUFlag & WELS_CPU_NEON) { + pFuncList->pfExpandLumaPicture = ExpandPictureLuma_neon; + pFuncList->pfExpandChromaPicture[0] = ExpandPictureChroma_c; + pFuncList->pfExpandChromaPicture[1] = ExpandPictureChroma_neon; + } +#endif//X86_ASM } diff --git a/codec/processing/build/iOS/processing.xcodeproj/project.pbxproj b/codec/processing/build/iOS/processing.xcodeproj/project.pbxproj index 6febc1a9..f59423cb 100644 --- a/codec/processing/build/iOS/processing.xcodeproj/project.pbxproj +++ b/codec/processing/build/iOS/processing.xcodeproj/project.pbxproj @@ -7,6 +7,10 @@ objects = { /* Begin PBXBuildFile section */ + 4C34067818C5A4AD00DFA14A /* adaptive_quantization.S in Sources */ = {isa = PBXBuildFile; fileRef = 4C34067418C5A4AD00DFA14A /* adaptive_quantization.S */; }; + 4C34067918C5A4AD00DFA14A /* down_sample_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = 4C34067518C5A4AD00DFA14A /* down_sample_neon.S */; }; + 4C34067A18C5A4AD00DFA14A /* pixel_sad_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = 4C34067618C5A4AD00DFA14A /* pixel_sad_neon.S */; }; + 4C34067B18C5A4AD00DFA14A /* vaa_calc_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = 4C34067718C5A4AD00DFA14A /* vaa_calc_neon.S */; }; 4CE4443518B724B60017DF25 /* Foundation.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 4CE4443418B724B60017DF25 /* Foundation.framework */; }; 4CE4444318B724B60017DF25 /* XCTest.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 4CE4444218B724B60017DF25 /* XCTest.framework */; }; 4CE4444418B724B60017DF25 /* Foundation.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 4CE4443418B724B60017DF25 /* Foundation.framework */; }; @@ -56,6 +60,10 @@ /* End PBXCopyFilesBuildPhase section */ /* Begin PBXFileReference section */ + 4C34067418C5A4AD00DFA14A /* adaptive_quantization.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = adaptive_quantization.S; sourceTree = ""; }; + 4C34067518C5A4AD00DFA14A /* down_sample_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = down_sample_neon.S; sourceTree = ""; }; + 4C34067618C5A4AD00DFA14A /* pixel_sad_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = pixel_sad_neon.S; sourceTree = ""; }; + 4C34067718C5A4AD00DFA14A /* vaa_calc_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = vaa_calc_neon.S; sourceTree = ""; }; 4CE4443118B724B60017DF25 /* libprocessing.a */ = {isa = PBXFileReference; explicitFileType = archive.ar; includeInIndex = 0; path = libprocessing.a; sourceTree = BUILT_PRODUCTS_DIR; }; 4CE4443418B724B60017DF25 /* Foundation.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = Foundation.framework; path = System/Library/Frameworks/Foundation.framework; sourceTree = SDKROOT; }; 4CE4444118B724B60017DF25 /* processingTests.xctest */ = {isa = PBXFileReference; explicitFileType = wrapper.cfbundle; includeInIndex = 0; path = processingTests.xctest; sourceTree = BUILT_PRODUCTS_DIR; }; @@ -122,6 +130,17 @@ /* End PBXFrameworksBuildPhase section */ /* Begin PBXGroup section */ + 4C34067318C5A4AD00DFA14A /* arm */ = { + isa = PBXGroup; + children = ( + 4C34067418C5A4AD00DFA14A /* adaptive_quantization.S */, + 4C34067518C5A4AD00DFA14A /* down_sample_neon.S */, + 
4C34067618C5A4AD00DFA14A /* pixel_sad_neon.S */, + 4C34067718C5A4AD00DFA14A /* vaa_calc_neon.S */, + ); + path = arm; + sourceTree = ""; + }; 4CE4442818B724B60017DF25 = { isa = PBXGroup; children = ( @@ -182,6 +201,7 @@ 4CE4475B18BC62960017DF25 /* src */ = { isa = PBXGroup; children = ( + 4C34067318C5A4AD00DFA14A /* arm */, 4CE4475C18BC62960017DF25 /* adaptivequantization */, 4CE4476318BC62960017DF25 /* backgrounddetection */, 4CE4476618BC62960017DF25 /* common */, @@ -372,6 +392,8 @@ isa = PBXSourcesBuildPhase; buildActionMask = 2147483647; files = ( + 4C34067918C5A4AD00DFA14A /* down_sample_neon.S in Sources */, + 4C34067818C5A4AD00DFA14A /* adaptive_quantization.S in Sources */, 4CE4479718BC62960017DF25 /* downsample.cpp in Sources */, 4CE4478B18BC62960017DF25 /* AdaptiveQuantization.cpp in Sources */, 4CE4479918BC62960017DF25 /* imagerotate.cpp in Sources */, @@ -380,6 +402,7 @@ 4CE4479A18BC62960017DF25 /* imagerotatefuncs.cpp in Sources */, 4CE4479518BC62960017DF25 /* denoise.cpp in Sources */, 4CE4479218BC62960017DF25 /* WelsFrameWork.cpp in Sources */, + 4C34067B18C5A4AD00DFA14A /* vaa_calc_neon.S in Sources */, 4CE4479B18BC62960017DF25 /* SceneChangeDetection.cpp in Sources */, 4CE4479D18BC62960017DF25 /* vaacalcfuncs.cpp in Sources */, 4CE4479818BC62960017DF25 /* downsamplefuncs.cpp in Sources */, @@ -387,6 +410,7 @@ 4CE4479418BC62960017DF25 /* ComplexityAnalysis.cpp in Sources */, 4CE4479E18BC62960017DF25 /* vaacalculation.cpp in Sources */, 4CE4479118BC62960017DF25 /* thread.cpp in Sources */, + 4C34067A18C5A4AD00DFA14A /* pixel_sad_neon.S in Sources */, 4CE4478F18BC62960017DF25 /* BackgroundDetection.cpp in Sources */, 4CE4479618BC62960017DF25 /* denoise_filter.cpp in Sources */, ); @@ -502,6 +526,11 @@ DSTROOT = /tmp/processing.dst; GCC_C_LANGUAGE_STANDARD = "compiler-default"; GCC_OPTIMIZATION_LEVEL = 3; + "GCC_PREPROCESSOR_DEFINITIONS[sdk=iphoneos*]" = ( + APPLE_IOS, + HAVE_NEON, + ); + "GCC_PREPROCESSOR_DEFINITIONS[sdk=iphonesimulator*]" = APPLE_IOS; HEADER_SEARCH_PATHS = ( /Applications/Xcode.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/include, "\"$(SRCROOT)/../../../common\"", @@ -526,6 +555,11 @@ CODE_SIGN_IDENTITY = "iPhone Developer"; DSTROOT = /tmp/processing.dst; GCC_C_LANGUAGE_STANDARD = "compiler-default"; + "GCC_PREPROCESSOR_DEFINITIONS[sdk=iphoneos*]" = ( + APPLE_IOS, + HAVE_NEON, + ); + "GCC_PREPROCESSOR_DEFINITIONS[sdk=iphonesimulator*]" = APPLE_IOS; HEADER_SEARCH_PATHS = ( /Applications/Xcode.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/include, "\"$(SRCROOT)/../../../common\"", diff --git a/codec/processing/src/arm/.DS_Store b/codec/processing/src/arm/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..5008ddfcf53c02e82d7eee2e57c38e5672ef89f6 GIT binary patch literal 6148 zcmeH~Jr2S!425mzP>H1@V-^m;4Wg<&0T*E43hX&L&p$$qDprKhvt+--jT7}7np#A3 zem<@ulZcFPQ@L2!n>{z**++&mCkOWA81W14cNZlEfg7;MkzE(HCqgga^y>{tEnwC%0;vJ&^%eQ zLs35+`xjp>T0