diff --git a/codec/build/iOS/common/common.xcodeproj/project.pbxproj b/codec/build/iOS/common/common.xcodeproj/project.pbxproj index e5167f8b..ff32a4e8 100644 --- a/codec/build/iOS/common/common.xcodeproj/project.pbxproj +++ b/codec/build/iOS/common/common.xcodeproj/project.pbxproj @@ -19,6 +19,8 @@ 4CE4475018BC61650017DF25 /* deblocking_common.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4CE4473818BC61650017DF25 /* deblocking_common.cpp */; }; 4CE4475218BC61650017DF25 /* logging.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4CE4473C18BC61650017DF25 /* logging.cpp */; }; 4CE4475818BC61650017DF25 /* WelsThreadLib.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4CE4474918BC61650017DF25 /* WelsThreadLib.cpp */; }; + 4CE447BD18C085320017DF25 /* deblocking_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = 4CE447BC18C085320017DF25 /* deblocking_neon.S */; }; + 4CE447BF18C085900017DF25 /* arm_arch_common_macro.S in Sources */ = {isa = PBXBuildFile; fileRef = 4CE447BE18C085900017DF25 /* arm_arch_common_macro.S */; }; /* End PBXBuildFile section */ /* Begin PBXContainerItemProxy section */ @@ -69,6 +71,8 @@ 4CE4474718BC61650017DF25 /* typedefs.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = typedefs.h; sourceTree = ""; }; 4CE4474918BC61650017DF25 /* WelsThreadLib.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = WelsThreadLib.cpp; sourceTree = ""; }; 4CE4474A18BC61650017DF25 /* WelsThreadLib.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = WelsThreadLib.h; sourceTree = ""; }; + 4CE447BC18C085320017DF25 /* deblocking_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = deblocking_neon.S; sourceTree = ""; }; + 4CE447BE18C085900017DF25 /* arm_arch_common_macro.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = arm_arch_common_macro.S; sourceTree = ""; }; /* End PBXFileReference section */ /* Begin PBXFrameworksBuildPhase section */ @@ -144,6 +148,8 @@ 4CE4472F18BC61650017DF25 /* common */ = { isa = PBXGroup; children = ( + 4CE447BE18C085900017DF25 /* arm_arch_common_macro.S */, + 4CE447BC18C085320017DF25 /* deblocking_neon.S */, 4CE4473118BC61650017DF25 /* cpu.cpp */, 4CE4473218BC61650017DF25 /* cpu.h */, 4CE4473318BC61650017DF25 /* cpu_core.h */, @@ -247,9 +253,11 @@ isa = PBXSourcesBuildPhase; buildActionMask = 2147483647; files = ( + 4CE447BF18C085900017DF25 /* arm_arch_common_macro.S in Sources */, 4CE4475018BC61650017DF25 /* deblocking_common.cpp in Sources */, 4CE4474C18BC61650017DF25 /* cpu.cpp in Sources */, 4CE4475218BC61650017DF25 /* logging.cpp in Sources */, + 4CE447BD18C085320017DF25 /* deblocking_neon.S in Sources */, 4CE4475818BC61650017DF25 /* WelsThreadLib.cpp in Sources */, 4CE4474E18BC61650017DF25 /* crt_util_safe_x.cpp in Sources */, ); diff --git a/codec/build/iOS/dec/welsdec/welsdec.xcodeproj/project.pbxproj b/codec/build/iOS/dec/welsdec/welsdec.xcodeproj/project.pbxproj index fa1fa4da..518aaddc 100644 --- a/codec/build/iOS/dec/welsdec/welsdec.xcodeproj/project.pbxproj +++ b/codec/build/iOS/dec/welsdec/welsdec.xcodeproj/project.pbxproj @@ -36,9 +36,7 @@ 4CE4469D18BC5EAB0017DF25 /* utils.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4CE4467A18BC5EAA0017DF25 /* utils.cpp */; }; 4CE4469E18BC5EAB0017DF25 /* welsCodecTrace.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4CE4468418BC5EAB0017DF25 /* welsCodecTrace.cpp */; }; 4CE4469F18BC5EAB0017DF25 /* welsDecoderExt.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4CE4468518BC5EAB0017DF25 /* welsDecoderExt.cpp */; }; - 4CE447AB18BC6BE90017DF25 /* arm_arch_common_macro.S in Sources */ = {isa = PBXBuildFile; fileRef = 4CE447A618BC6BE90017DF25 /* arm_arch_common_macro.S */; }; 4CE447AC18BC6BE90017DF25 /* block_add_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = 4CE447A718BC6BE90017DF25 /* block_add_neon.S */; }; - 4CE447AD18BC6BE90017DF25 /* deblocking_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = 4CE447A818BC6BE90017DF25 /* deblocking_neon.S */; }; 4CE447AE18BC6BE90017DF25 /* intra_pred_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = 4CE447A918BC6BE90017DF25 /* intra_pred_neon.S */; }; 4CE447AF18BC6BE90017DF25 /* mc_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = 4CE447AA18BC6BE90017DF25 /* mc_neon.S */; }; /* End PBXBuildFile section */ @@ -132,9 +130,7 @@ 4CE4468318BC5EAB0017DF25 /* wels_dec_export.def */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text; path = wels_dec_export.def; sourceTree = ""; }; 4CE4468418BC5EAB0017DF25 /* welsCodecTrace.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = welsCodecTrace.cpp; sourceTree = ""; }; 4CE4468518BC5EAB0017DF25 /* welsDecoderExt.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = welsDecoderExt.cpp; sourceTree = ""; }; - 4CE447A618BC6BE90017DF25 /* arm_arch_common_macro.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = arm_arch_common_macro.S; sourceTree = ""; }; 4CE447A718BC6BE90017DF25 /* block_add_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = block_add_neon.S; sourceTree = ""; }; - 4CE447A818BC6BE90017DF25 /* deblocking_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = deblocking_neon.S; sourceTree = ""; }; 4CE447A918BC6BE90017DF25 /* intra_pred_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = intra_pred_neon.S; sourceTree = ""; }; 4CE447AA18BC6BE90017DF25 /* mc_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = mc_neon.S; sourceTree = ""; }; /* End PBXFileReference section */ @@ -327,9 +323,7 @@ 4CE447A518BC6BE90017DF25 /* arm */ = { isa = PBXGroup; children = ( - 4CE447A618BC6BE90017DF25 /* arm_arch_common_macro.S */, 4CE447A718BC6BE90017DF25 /* block_add_neon.S */, - 4CE447A818BC6BE90017DF25 /* deblocking_neon.S */, 4CE447A918BC6BE90017DF25 /* intra_pred_neon.S */, 4CE447AA18BC6BE90017DF25 /* mc_neon.S */, ); @@ -424,7 +418,6 @@ 4CE4469D18BC5EAB0017DF25 /* utils.cpp in Sources */, 4CE4469118BC5EAB0017DF25 /* decoder_data_tables.cpp in Sources */, 4CE4469718BC5EAB0017DF25 /* mem_align.cpp in Sources */, - 4CE447AB18BC6BE90017DF25 /* arm_arch_common_macro.S in Sources */, 4CE4469518BC5EAB0017DF25 /* manage_dec_ref.cpp in Sources */, 4CE4468A18BC5EAB0017DF25 /* au_parser.cpp in Sources */, 4CE4469218BC5EAB0017DF25 /* expand_pic.cpp in Sources */, @@ -435,7 +428,6 @@ 4CE4469E18BC5EAB0017DF25 /* welsCodecTrace.cpp in Sources */, 4CE447AE18BC6BE90017DF25 /* intra_pred_neon.S in Sources */, 4CE4469618BC5EAB0017DF25 /* mc.cpp in Sources */, - 4CE447AD18BC6BE90017DF25 /* deblocking_neon.S in Sources */, 4CE4469C18BC5EAB0017DF25 /* rec_mb.cpp in Sources */, 4CE4468B18BC5EAB0017DF25 /* bit_stream.cpp in Sources */, 4CE4468D18BC5EAB0017DF25 /* decode_mb_aux.cpp in Sources */, diff --git a/codec/decoder/core/arm/arm_arch_common_macro.S b/codec/common/arm_arch_common_macro.S similarity index 100% rename from codec/decoder/core/arm/arm_arch_common_macro.S rename to codec/common/arm_arch_common_macro.S diff --git a/codec/common/cpu.cpp b/codec/common/cpu.cpp index 6cc85f38..03049094 100644 --- a/codec/common/cpu.cpp +++ b/codec/common/cpu.cpp @@ -38,7 +38,12 @@ ************************************************************************************* */ #include - +#ifdef ANDROID_NDK +#include +#endif +#ifdef APPLE_IOS +#include +#endif #include "cpu.h" #include "cpu_core.h" @@ -209,4 +214,53 @@ void WelsXmmRegEmptyOp(void * pSrc) { #endif +#if defined(HAVE_NEON)//For supporting both android platform and iOS platform +#if defined(ANDROID_NDK) +uint32_t WelsCPUFeatureDetectAndroid() +{ + uint32_t uiCPU = 0; + AndroidCpuFamily cpuFamily = ANDROID_CPU_FAMILY_UNKNOWN; + uint64_t uiFeatures = 0; + + cpuFamily = android_getCpuFamily(); + if (cpuFamily == ANDROID_CPU_FAMILY_ARM) + { + uiFeatures = android_getCpuFeatures(); + if (uiFeatures & ANDROID_CPU_ARM_FEATURE_ARMv7){ + uiCPU |= WELS_CPU_ARMv7; + } + if (uiFeatures & ANDROID_CPU_ARM_FEATURE_VFPv3){ + uiCPU |= WELS_CPU_VFPv3; + } + if (uiFeatures & ANDROID_CPU_ARM_FEATURE_NEON){ + uiCPU |= WELS_CPU_NEON; + } + } + return uiCPU; +} + +#endif + +#if defined(APPLE_IOS) +uint32_t WelsCPUFeatureDetectIOS() //Need to be updated for the new device of APPLE +{ + uint32_t uiCPU = 0; + struct utsname sSystemInfo; + + uname (&sSystemInfo); + + if ((0 != strcmp(sSystemInfo.machine, "iPhone1,1")) && //iPhone 2G + (0 != strcmp(sSystemInfo.machine, "iPhone1,2")) && //iPhone 3G + (0 != strcmp(sSystemInfo.machine, "iPod1,1")) && //iPod 1G + (0 != strcmp(sSystemInfo.machine, "iPod2,1"))) //iPod 2G + { + uiCPU |= WELS_CPU_ARMv7; + uiCPU |= WELS_CPU_VFPv3; + uiCPU |= WELS_CPU_NEON; + } + return uiCPU; +} +#endif +#endif + diff --git a/codec/common/cpu.h b/codec/common/cpu.h index a119833e..fc458ca2 100644 --- a/codec/common/cpu.h +++ b/codec/common/cpu.h @@ -78,6 +78,16 @@ void WelsXmmRegLoad(void * src); void WelsXmmRegEmptyOp(void * pSrc); +#if defined(HAVE_NEON) +#if defined(ANDROID_NDK) + uint32_t WelsCPUFeatureDetectAndroid(); +#endif + +#if defined(APPLE_IOS) + uint32_t WelsCPUFeatureDetectIOS(); +#endif +#endif + #if defined(__cplusplus) } #endif//__cplusplus diff --git a/codec/common/cpu_core.h b/codec/common/cpu_core.h index 27fa5245..babcab8a 100644 --- a/codec/common/cpu_core.h +++ b/codec/common/cpu_core.h @@ -73,6 +73,11 @@ #define WELS_CPU_CACHELINE_64 0x40000000 /* CacheLine Size 64 */ #define WELS_CPU_CACHELINE_128 0x80000000 /* CacheLine Size 128 */ +/* For the android OS */ +#define WELS_CPU_ARMv7 0x000001 /* ARMv7 */ +#define WELS_CPU_VFPv3 0x000002 /* VFPv3 */ +#define WELS_CPU_NEON 0x000004 /* NEON */ + /* * Interfaces for CPU core feature detection as below */ diff --git a/codec/decoder/core/arm/deblocking_neon.S b/codec/common/deblocking_neon.S similarity index 77% rename from codec/decoder/core/arm/deblocking_neon.S rename to codec/common/deblocking_neon.S index 276f799e..cdb225ab 100755 --- a/codec/decoder/core/arm/deblocking_neon.S +++ b/codec/common/deblocking_neon.S @@ -981,361 +981,21 @@ //eq4_end: WELS_ASM_FUNC_END - -#ifdef APPLE_IOS -//in: $0(const) $1 $2; out:$3 $4 -//used register: r6, r7, q0, q1 -.macro BS_NZC_CHECK - //vld1.8 {d0,d1}, [$0] - vld1.8 {d0,d1}, [$0, :64] - /* Arrenge the input data --- TOP */ - ands r6, $1, #2 - beq bs_nzc_check_jump0 - - sub r6, $0, $2, lsl #4 - sub r6, $2, lsl #3 - add r6, #12 - vld1.32 d3[1], [r6] - -bs_nzc_check_jump0: - vext.8 q1, q1, q0, #12 - vadd.u8 $3, q0, q1 - - - /* Arrenge the input data --- LEFT */ - ands r6, $1, #1 - beq bs_nzc_check_jump1 - - sub r6, $0, #21 - add r7, r6, #4 - vld1.8 d3[4], [r6] - add r6, r7, #4 - vld1.8 d3[5], [r7] - add r7, r6, #4 - vld1.8 d3[6], [r6] - vld1.8 d3[7], [r7] - -bs_nzc_check_jump1: - vzip.8 d0, d1 - vzip.8 d0, d1 - vext.8 q1, q1, q0, #12 - vadd.u8 $4, q0, q1 - -.endm -//in: $0(const) $1 $2; out:$3 $4 -//used register: r6, r7, q0, q1 -.macro BS_REF_INDEX_CHECK - //vld1.8 {d0,d1}, [$0] - vld1.8 {d0,d1}, [$0, :128] - /* Arrenge the input data --- TOP */ - ands r6, $1, #2 - beq bs_ref_index_check_jump0 +// r0 int8_t* non_zero_count, + WELS_ASM_FUNC_BEGIN enc_avc_non_zero_count_neon - sub r6, $0, $2, lsl #4 - add r6, #12 - vld1.32 d3[1], [r6] - -bs_ref_index_check_jump0: - vext.8 q1, q1, q0, #12 - vabd.u8 $3, q0, q1 - - - /* Arrenge the input data --- LEFT */ - ands r6, $1, #1 - beq bs_ref_index_check_jump1 - - sub r6, $0, #13 - add r7, r6, #4 - vld1.8 d3[4], [r6] - add r6, r7, #4 - vld1.8 d3[5], [r7] - add r7, r6, #4 - vld1.8 d3[6], [r6] - vld1.8 d3[7], [r7] - -bs_ref_index_check_jump1: - vzip.8 d0, d1 - vzip.8 d0, d1 - vext.8 q1, q1, q0, #12 - vabd.u8 $4, q0, q1 -.endmacro - -.macro BS_COMPARE_MV //in: $0,$1(const),$2(const),$3(const),$4(const); out:$5, $6 - mov r6, #4 - vabd.s16 q5, $0, $1 - vabd.s16 q6, $1, $2 - vdup.s16 $0, r6 - vabd.s16 q7, $2, $3 - vabd.s16 q8, $3, $4 - - vcge.s16 q5, $0 - vcge.s16 q6, $0 - vcge.s16 q7, $0 - vcge.s16 q8, $0 - - vpadd.i16 d10, d10, d11 - vpadd.i16 d11, d12, d13 - vpadd.i16 d12, d14, d15 - vpadd.i16 d13, d16, d17 - - vaddhn.i16 $5, q5, q5 - vaddhn.i16 $6, q6, q6 -.endmacro - -//in: $0(const) $1 $2; out:$3 $4 $5 $6 -//used register: r6, r7, q0, q1, q2, q3, q4 -.macro BS_MV_CHECK - //vldm $0, {q0,q1,q2,q3} - vld1.32 {q0,q1}, [$0, :128] - add r6, $0, #32 - vld1.32 {q2,q3}, [r6, :128] - - /* Arrenge the input data --- TOP */ - ands r6, $1, #2 - beq bs_mv_check_jump0 + vld1.64 {d0-d2}, [r0] - sub r6, $0, $2, lsl #6 - add r6, #48 - vld1.8 {d8, d9}, [r6] + vceq.s8 q0, q0, #0 + vceq.s8 d2, d2, #0 + vmvn q0, q0 + vmvn d2, d2 + vabs.s8 q0, q0 + vabs.s8 d2, d2 -bs_mv_check_jump0: - BS_COMPARE_MV q4, q0, q1, q2, q3, $3, $4 - - /* Arrenge the input data --- LEFT */ - ands r6, $1, #1 - beq bs_mv_check_jump1 - - sub r6, $0, #52 - //mov r7, #16 - add r7, r6, #16 - vld1.32 d8[0], [r6] - add r6, r7, #16 - vld1.32 d8[1], [r7] - add r7, r6, #16 - vld1.32 d9[0], [r6] - vld1.32 d9[1], [r7] - -bs_mv_check_jump1: - vzip.32 q0, q2 - vzip.32 q1, q3 - vzip.32 q0, q1 - vzip.32 q2, q3 - BS_COMPARE_MV q4, q0, q1, q2, q3, $5, $6 -.endmacro -#else -//in: $0(const) $1 $2; out:$3 $4 -//used register: r6, r7, q0, q1 -.macro BS_NZC_CHECK arg0, arg1, arg2, arg3, arg4 - //vld1.8 {d0,d1}, [\arg0] - vld1.8 {d0,d1}, [\arg0, :64] - /* Arrenge the input data --- TOP */ - ands r6, \arg1, #2 - beq bs_nzc_check_jump0 - - sub r6, \arg0, \arg2, lsl #4 - sub r6, \arg2, lsl #3 - add r6, #12 - vld1.32 d3[1], [r6] - -bs_nzc_check_jump0: - vext.8 q1, q1, q0, #12 - vadd.u8 \arg3, q0, q1 + vst1.64 {d0-d2}, [r0] + WELS_ASM_FUNC_END - - /* Arrenge the input data --- LEFT */ - ands r6, \arg1, #1 - beq bs_nzc_check_jump1 - - sub r6, \arg0, #21 - add r7, r6, #4 - vld1.8 d3[4], [r6] - add r6, r7, #4 - vld1.8 d3[5], [r7] - add r7, r6, #4 - vld1.8 d3[6], [r6] - vld1.8 d3[7], [r7] - -bs_nzc_check_jump1: - vzip.8 d0, d1 - vzip.8 d0, d1 - vext.8 q1, q1, q0, #12 - vadd.u8 \arg4, q0, q1 - -.endm - - -//in: \arg0(const) \arg1 \arg2; out:\arg3 \arg4 -//used register: r6, r7, q0, q1 -.macro BS_REF_INDEX_CHECK arg0, arg1, arg2, arg3, arg4 - //vld1.8 {d0,d1}, [\arg0] - vld1.8 {d0,d1}, [\arg0, :128] - /* Arrenge the input data --- TOP */ - ands r6, \arg1, #2 - beq bs_ref_index_check_jump0 - - sub r6, \arg0, \arg2, lsl #4 - add r6, #12 - vld1.32 d3[1], [r6] - -bs_ref_index_check_jump0: - vext.8 q1, q1, q0, #12 - vabd.u8 \arg3, q0, q1 - - - /* Arrenge the input data --- LEFT */ - ands r6, \arg1, #1 - beq bs_ref_index_check_jump1 - - sub r6, \arg0, #13 - add r7, r6, #4 - vld1.8 d3[4], [r6] - add r6, r7, #4 - vld1.8 d3[5], [r7] - add r7, r6, #4 - vld1.8 d3[6], [r6] - vld1.8 d3[7], [r7] - -bs_ref_index_check_jump1: - vzip.8 d0, d1 - vzip.8 d0, d1 - vext.8 q1, q1, q0, #12 - vabd.u8 \arg4, q0, q1 -.endm - -//in: \arg0,\arg1(const),\arg2(const),\arg3(const),\arg4(const); out:\arg5, \arg6 -.macro BS_COMPARE_MV arg0, arg1, arg2, arg3, arg4, arg5, arg6 - - mov r6, #4 - vabd.s16 q5, \arg0, \arg1 - vabd.s16 q6, \arg1, \arg2 - vdup.s16 \arg0, r6 - vabd.s16 q7, \arg2, \arg3 - vabd.s16 q8, \arg3, \arg4 - - vcge.s16 q5, \arg0 - vcge.s16 q6, \arg0 - vcge.s16 q7, \arg0 - vcge.s16 q8, \arg0 - - vpadd.i16 d10, d10, d11 - vpadd.i16 d11, d12, d13 - vpadd.i16 d12, d14, d15 - vpadd.i16 d13, d16, d17 - - vaddhn.i16 \arg5, q5, q5 - vaddhn.i16 \arg6, q6, q6 -.endm - -//in: \arg0(const) \arg1 \arg2; out:\arg3 \arg4 \arg5 \arg6 -//used register: r6, r7, q0, q1, q2, q3, q4 -.macro BS_MV_CHECK arg0, arg1, arg2, arg3, arg4, arg5, arg6 - //vldm \arg0, {q0,q1,q2,q3} - vld1.32 {q0,q1}, [\arg0, :128] - add r6, \arg0, #32 - vld1.32 {q2,q3}, [r6, :128] - - /* Arrenge the input data --- TOP */ - ands r6, \arg1, #2 - beq bs_mv_check_jump0 - - sub r6, \arg0, \arg2, lsl #6 - add r6, #48 - vld1.8 {d8, d9}, [r6] - -bs_mv_check_jump0: - BS_COMPARE_MV q4, q0, q1, q2, q3, \arg3, \arg4 - - /* Arrenge the input data --- LEFT */ - ands r6, \arg1, #1 - beq bs_mv_check_jump1 - - sub r6, \arg0, #52 - //mov r7, #16 - add r7, r6, #16 - vld1.32 d8[0], [r6] - add r6, r7, #16 - vld1.32 d8[1], [r7] - add r7, r6, #16 - vld1.32 d9[0], [r6] - vld1.32 d9[1], [r7] - -bs_mv_check_jump1: - vzip.32 q0, q2 - vzip.32 q1, q3 - vzip.32 q0, q1 - vzip.32 q2, q3 - BS_COMPARE_MV q4, q0, q1, q2, q3, \arg5, \arg6 -.endm -#endif -/* - * void deblocking_BS_calc_neon(int8_t *pNzc, - * int8_t *pRef_index, - * int16_t *pMv[], - * int32_t boundry_flag, - * int32_t mb_width, - * uint8_t *bS); - * - * r0 = cur_layer->nzc[cur_mb_xy] - * r1 = cur_layer->ref_index[0][cur_mb_xy] - * r2 = cur_layer->mv[0][cur_mb_xy] - * r3 = boundry_flag (LEFT_FLAG/TOP_FLAG) - * r4 = cur_layer->mb_width - * r5 = BS[8][4] save all of the BS value for whole MB(16*16) - */ - - WELS_ASM_FUNC_BEGIN deblocking_BS_calc_neon - - stmdb sp!, {r4-r7} - - ldr r4, [sp, #16] //Save mb_width to r4 - ldr r5, [sp, #20] //Save BS to r5 - - /* Checking the nzc status */ - BS_NZC_CHECK r0, r3, r4, q14, q15 //q14,q15 save the nzc status - - /* Checking the nzc_rs status */ - //BS_NZC_CHECK r1, r4, q12, q13 //q12,q13 save the mzc_rs status - - /* For checking bS[I] = 2 */ - mov r6, #2 - //vqadd.u8 q14, q12 - //vqadd.u8 q15, q13 - vcgt.s8 q14, q14, #0 - vdup.u8 q0, r6 - vcgt.s8 q15, q15, #0 - - vand.u8 q14, q14, q0 //q14 save the nzc check result all the time --- for dir is top - vand.u8 q15, q15, q0 //q15 save the nzc check result all the time --- for dir is left - - - /* Checking the ref_index status*/ - BS_REF_INDEX_CHECK r1, r3, r4, q12, q13 //q12,q13 save the ref_index status - - vcgt.s8 q12, q12, #0 - vcgt.s8 q13, q13, #0 - - /* Checking the mv status*/ - BS_MV_CHECK r2, r3, r4, d20, d21, d22, d23//q10, q11 save the mv status - - /* For checking bS[I] = 1 */ - mov r6, #1 - vqadd.u8 q12, q10 - vdup.u8 q0, r6 - vqadd.u8 q13, q11 - - vand.u8 q12, q12, q0 //q12 save the nzc check result all the time --- for dir is top - vand.u8 q13, q13, q0 //q13 save the nzc check result all the time --- for dir is left - - - /* Check bS[I] is '1' or '2' */ - vmax.u8 q1, q12, q14 - vmax.u8 q0, q13, q15 - - //vstm r5, {q0, q1} - vst1.32 {q0, q1}, [r5] - ldmia sp!, {r4-r7} - WELS_ASM_FUNC_END -/*====== deblocking_BS_calc_neon End ======*/ #endif diff --git a/codec/decoder/core/arm/block_add_neon.S b/codec/decoder/core/arm/block_add_neon.S index 5327ae5e..94a4713b 100755 --- a/codec/decoder/core/arm/block_add_neon.S +++ b/codec/decoder/core/arm/block_add_neon.S @@ -34,29 +34,6 @@ .text #include "arm_arch_common_macro.S" #ifdef APPLE_IOS -.macro ORR_32BYTES_TO_8BYTES -// { // input: smb#a q0 & q1, smb#b q2&q3, q0[0], q0[1], q2[0], q2[1], output: d_0, d_1 - vorr.s16 $0, $1 - vorr.s16 $2, $3 - vorr.s16 $8, $4, $5 - vorr.s16 $9, $6, $7 -// } -.endm - -.macro ADD_PRED_1BYTE_TO_RESID_2BYTES -// { // input: q0~q3, d0~d3, output: d0~d3; - - vaddw.u8 $0, $4 - vaddw.u8 $1, $5 - vaddw.u8 $2, $6 - vaddw.u8 $3, $7 - - vqmovun.s16 $4, $0 //saturation - vqmovun.s16 $6, $2 - vqmovun.s16 $5, $1 - vqmovun.s16 $7, $3 -// } -.endm .macro ROW_TRANSFORM_1_STEP // { // input: src_d[0]~[3], output: e_q[0]~[3]; working: $8 $9 @@ -89,39 +66,7 @@ // } .endm -.macro ADD_AND_CLIP_RS -// { // input: src_q[0]~[1], matrix_max~min, pred_q, working_d[0]:[1], output: q; - vrshrn.s32 $5, $0, #6 - vrshrn.s32 $6, $1, #6 - vqadd.s16 $7, $4 - vmin.s16 $7, $7, $2 - vmax.s16 $7, $7, $3 -// } -.endm #else -.macro ORR_32BYTES_TO_8BYTES arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9 -// { // input: smb#a q0 & q1, smb#b q2&q3, q0[0], q0[1], q2[0], q2[1], output: d_0, d_1 - vorr.s16 \arg0, \arg1 - vorr.s16 \arg2, \arg3 - vorr.s16 \arg8, \arg4, \arg5 - vorr.s16 \arg9, \arg6, \arg7 -// } -.endm - -.macro ADD_PRED_1BYTE_TO_RESID_2BYTES arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7 -// { // input: q0~q3, d0~d3, output: d0~d3; - - vaddw.u8 \arg0, \arg4 - vaddw.u8 \arg1, \arg5 - vaddw.u8 \arg2, \arg6 - vaddw.u8 \arg3, \arg7 - - vqmovun.s16 \arg4, \arg0 //saturation - vqmovun.s16 \arg6, \arg2 - vqmovun.s16 \arg5, \arg1 - vqmovun.s16 \arg7, \arg3 -// } -.endm .macro ROW_TRANSFORM_1_STEP arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9 // { // input: src_d[0]~[3], output: e_q[0]~[3]; working: $8 $9 @@ -153,16 +98,6 @@ vadd.s32 \arg7, \arg1, \arg7 //int32 e[3][j] = f[1][j] + (f[3][j]>>1); // } .endm - -.macro ADD_AND_CLIP_RS arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7 -// { // input: src_q[0]~[1], matrix_max~min, pred_q, working_d[0]:[1], output: q; - vrshrn.s32 \arg5, \arg0, #6 - vrshrn.s32 \arg6, \arg1, #6 - vqadd.s16 \arg7, \arg4 - vmin.s16 \arg7, \arg7, \arg2 - vmax.s16 \arg7, \arg7, \arg3 -// } -.endm #endif // r0 int16_t* block, // r1 int8_t* non_zero_count, @@ -180,157 +115,6 @@ vst1.64 {d0-d2}, [r1] WELS_ASM_FUNC_END -// r0 int16_t* block, -// r1 int8_t* non_zero_count, - WELS_ASM_FUNC_BEGIN svc_non_zero_count_neon - push {r2-r4} - mov r4, #3 - mov r3, #64 - add r2, r0, #32 - pld [r0, #512] -non_zero_count_two_8x8_loop: - - vld1.64 {q0, q1}, [r0,:128], r3 - vld1.64 {q2, q3}, [r2,:128], r3 - vld1.64 {q4, q5}, [r0,:128], r3 - vld1.64 {q6, q7}, [r2,:128], r3 - vld1.64 {q8, q9}, [r0,:128], r3 - vld1.64 {q10, q11}, [r2,:128], r3//load #0 8x8 block resi data, - vld1.64 {q12, q13}, [r0,:128], r3 - vld1.64 {q14, q15}, [r2,:128], r3//load #1 8x8 block resi data, - pld [r0, #512] - - ORR_32BYTES_TO_8BYTES q0, q1, q2, q3, d0, d1, d4, d5, d2, d3 // output q1 -// vceq.i16 q1, q1, #0 - - ORR_32BYTES_TO_8BYTES q8, q9,q10,q11,d16,d17,d20,d21,d4,d5 // output q2 -// vceq.i16 q2, q2, #0 - - ORR_32BYTES_TO_8BYTES q4, q5, q6, q7, d8, d9, d12, d13, d10, d11 // output q5 -// vceq.i16 q5, q5, #0 - - ORR_32BYTES_TO_8BYTES q12,q13,q14,q15,d24,d25, d28, d29, d12, d13 // output q6 -// vceq.i16 q6, q6, #0 - - vqmovn.u64 d0, q1 // 8bytes-->4bytes - vqmovn.u64 d8, q5 - vqmovn.u64 d1, q2 - vqmovn.u64 d9, q6 - - vqmovn.u32 d2, q0 // 4bytes-->2bytes - vqmovn.u32 d3, q4 - - vceq.i16 q0, q1, #0 - vmvn q0, q0 - vabs.s16 q2, q0 - vmovn.u16 d6, q2 // 2bytes-->1bytes - vst1.u8 {d6}, [r1]! - -// pld [r0] - subs r4, r4, #1 - bne non_zero_count_two_8x8_loop - - pop {r2-r4} - WELS_ASM_FUNC_END - -// r0 int16_t* block, -// r1 int8_t* non_zero_count, - WELS_ASM_FUNC_BEGIN svc_rs_non_zero_count_neon - - vld1.i16 {q0, q1}, [r0]! // block is unaligned!!! - vld1.i16 {q2, q3}, [r0]! - vld1.i16 {q4, q5}, [r0]! - vld1.i16 {q6, q7}, [r0]! - - vld1.i16 {q8, q9}, [r0]! - vld1.i16 {q10, q11}, [r0]! - vld1.i16 {q12, q13}, [r0]! - vld1.i16 {q14, q15}, [r0]! - - ORR_32BYTES_TO_8BYTES q0, q2, q1, q3, q4, q6, q5, q7, q4, q5 - vorr.s16 q0, q4 - vorr.s16 q1, q5 // output d0~d3 - ORR_32BYTES_TO_8BYTES q8, q10, q9, q11, q12, q14, q13, q15, q12, q13 - vorr.s16 q6, q8, q12 - vorr.s16 q7, q9, q13 // output d12~d15 - - vqmovn.u64 d4, q0 // 8bytes-->4bytes - vqmovn.u64 d6, q6 - vqmovn.u64 d5, q1 - vqmovn.u64 d7, q7 - - vqmovn.u32 d8, q2 // 4bytes-->2bytes - vqmovn.u32 d9, q3 - - vceq.i16 q5, q4, #0 - vmvn q5, q5 - vabs.s16 q5, q5 - vmovn.u16 d10, q5 // 2bytes-->1bytes - vst1.u8 {d10}, [r1]! - - vld1.i16 {q0, q1}, [r0]! - vld1.i16 {q2, q3}, [r0]! - vld1.i16 {q4, q5}, [r0]! - vld1.i16 {q6, q7}, [r0]! - - vld1.i16 {q8, q9}, [r0]! - vld1.i16 {q10, q11}, [r0]! - vld1.i16 {q12, q13}, [r0]! - vld1.i16 {q14, q15}, [r0]! - - ORR_32BYTES_TO_8BYTES q0, q2, q1, q3, q4, q6, q5, q7, q4, q5 - vorr.s16 q0, q4 - vorr.s16 q1, q5 // output d0~d3 - ORR_32BYTES_TO_8BYTES q8, q10, q9, q11, q12, q14, q13, q15, q12, q13 - vorr.s16 q6, q8, q12 - vorr.s16 q7, q9, q13 // output d12~d15 - - vqmovn.u64 d4, q0 // 8bytes-->4bytes - vqmovn.u64 d6, q6 - vqmovn.u64 d5, q1 - vqmovn.u64 d7, q7 - - vqmovn.u32 d8, q2 // 4bytes-->2bytes - vqmovn.u32 d9, q3 - - vceq.i16 q5, q4, #0 - vmvn q5, q5 - vabs.s16 q5, q5 - vmovn.u16 d10, q5 // 2bytes-->1bytes - vst1.u8 {d10}, [r1]! - -// Chroma - vld1.i16 {q0, q1}, [r0]! - vld1.i16 {q2, q3}, [r0]! - vld1.i16 {q4, q5}, [r0]! - vld1.i16 {q6, q7}, [r0]! //load Cb block, - - vld1.i16 {q8, q9}, [r0]! - vld1.i16 {q10, q11}, [r0]! - vld1.i16 {q12, q13}, [r0]! - vld1.i16 {q14, q15}, [r0]! //load Cr block, - - ORR_32BYTES_TO_8BYTES q0, q1, q2, q3, q4, q5, q6, q7, q4, q6 - vorr.s16 q0, q2 - vorr.s16 q1, q4, q6 // output d0~d3 - ORR_32BYTES_TO_8BYTES q8, q9, q10, q11, q12, q13, q14, q15, q12, q14 - vorr.s16 q2, q8, q10 - vorr.s16 q3, q12, q14 // output d4~d7 - - vqmovn.u64 d8, q0 // 8bytes-->4bytes - vqmovn.u64 d10, q2 - vqmovn.u64 d9, q1 - vqmovn.u64 d11, q3 - - vqmovn.u32 d12, q4 // 4bytes-->2bytes - vqmovn.u32 d13, q5 - - vceq.i16 q7, q6, #0 - vmvn q7, q7 - vabs.s16 q7, q7 - vmovn.u16 d10, q7 // 2bytes-->1bytes - vst1.u8 {d10}, [r1]! - WELS_ASM_FUNC_END // r0 int16_t * block, // r1 int32_t stride @@ -371,207 +155,6 @@ block_zero_8x8_chma_loop: pop {r2} WELS_ASM_FUNC_END -// r0 int8_t* dst_addr, -// r1 memset_value -// r2 int32_t bytes_nmb, - - WELS_ASM_FUNC_BEGIN svc_block_memset_neon// dst should continue - vdup.u8 q0, r1 - vdup.u8 q1, r1 - -block_memset_loop: - vst1.64 {q0, q1}, [r0,:64]! - subs r2, r2, #64 - vst1.64 {q0, q1}, [r0,:64]! - bne block_memset_loop - WELS_ASM_FUNC_END - -// int16_t* dst, -// int16_t* src, -// int32_t stride - WELS_ASM_FUNC_BEGIN svc_block_copy_16x16_neon - push {r3} - mov r3, #16 -// each element is sizeof(int16_t) - lsl r2, r2, #1 // r2 = 2*r2 - -block_copy_16x16_luma_loop: - vld1.i16 {q0, q1}, [r1], r2 - subs r3, r3, #1 - vst1.i16 {q0, q1}, [r0]! - bne block_copy_16x16_luma_loop - - pop {r3} - WELS_ASM_FUNC_END - - WELS_ASM_FUNC_BEGIN svc_block_copy_8x8_neon - push {r3} - mov r3, #8 -// each element is sizeof(int16_t) - lsl r2, r2, #1 // r2 = 2*r2 - -block_copy_8x8_chma_loop: - vld1.i16 {q0}, [r1], r2 - subs r3, r3, #1 - vst1.i16 {q0}, [r0]! - bne block_copy_8x8_chma_loop - - pop {r3} - WELS_ASM_FUNC_END - -// r0 uint8_t * dest, -// r1 uint8_t * pred, -// r2 int16_t * res, -// r3 int32_t stride, - WELS_ASM_FUNC_BEGIN svc_block_add_16x16_neon - push {r4} - mov r4, #16 - pld [r1] -block_recon_16x16_luma_loop: - - vld1.64 {d16,d17}, [r1,:64], r3 //load 16 pred data, update addr - vld1.s16 {q0, q1}, [r2]! //load 8+8 resi data, update addr - vld1.64 {d18,d19}, [r1,:64], r3 - vld1.s16 {q2, q3}, [r2]! - ADD_PRED_1BYTE_TO_RESID_2BYTES q0, q1, q2, q3, d16, d17, d18, d19 - pld [r1] - vst1.64 {q8}, [r0], r3 //store result - vst1.64 {q9}, [r0], r3 -//#ifdef DEBUG_NEON -// vst1.u8 {q8}, [r0]! -// vst1.u8 {q9}, [r0]! -//#endif - - vld1.64 {d20,d21}, [r1,:64], r3 //load 16 pred data, update addr - vld1.s16 {q4, q5}, [r2]! //load 8+8 resi data, update addr - vld1.64 {d22,d23}, [r1,:64], r3 - vld1.s16 {q6, q7}, [r2]! - ADD_PRED_1BYTE_TO_RESID_2BYTES q4, q5, q6, q7, d20, d21, d22, d23 - pld [r1] - vst1.64 {q10}, [r0], r3 - vst1.64 {q11}, [r0], r3 -//#ifdef DEBUG_NEON -// vst1.u8 {q10}, [r0]! -// vst1.u8 {q11}, [r0]! -//#endif - - subs r4, r4, #4 - bne block_recon_16x16_luma_loop - - pop {r4} - WELS_ASM_FUNC_END - - - WELS_ASM_FUNC_BEGIN svc_block_add_8x8_neon - - vld1.u8 {d24}, [r1], r3 //load 8 pred data - vld1.i16 {q8, q9}, [r2]! //load 8+8 resi data, update addr - vld1.u8 {d25}, [r1], r3 //load 8 pred data, q12 - vld1.i16 {q10, q11}, [r2]! //load 8+8 resi data, update addr - vld1.u8 {d26}, [r1], r3 //load 8 pred data - vld1.u8 {d27}, [r1], r3 //load 8 pred data, q13 - - ADD_PRED_1BYTE_TO_RESID_2BYTES q8, q9, q10, q11, d24, d25, d26, d27 - pld [r1] - vst1.u8 {d24}, [r0], r3 //store result - vst1.u8 {d25}, [r0], r3 //store result - vst1.u8 {d26}, [r0], r3 //store result - vst1.u8 {d27}, [r0], r3 //store result -//#ifdef DEBUG_NEON -// vst1.u8 {d24}, [r0]! -//#endif - - vld1.u8 {d24}, [r1], r3 //load 8 pred data - vld1.i16 {q8, q9}, [r2]! //load 8+8 resi data, update addr - vld1.u8 {d25}, [r1], r3 //load 8 pred data, q12 - vld1.i16 {q10, q11}, [r2]! //load 8+8 resi data, update addr - vld1.u8 {d26}, [r1], r3 //load 8 pred data - vld1.u8 {d27}, [r1], r3 //load 8 pred data, q13 - - ADD_PRED_1BYTE_TO_RESID_2BYTES q8, q9, q10, q11, d24, d25, d26, d27 - vst1.u8 {d24}, [r0], r3 //store result - vst1.u8 {d25}, [r0], r3 //store result - vst1.u8 {d26}, [r0], r3 //store result - vst1.u8 {d27}, [r0], r3 //store result -//#ifdef DEBUG_NEON -// vst1.u8 {d24}, [r0]! -//#endif - WELS_ASM_FUNC_END - - -// int16_t* dst, -// int16_t* src, -// int stride - WELS_ASM_FUNC_BEGIN svc_simple_idct4x4_neon - - vld4.s16 {d0, d1, d2, d3}, [r1] // cost 3 cycles! - lsl r2, r2, #1 - - ROW_TRANSFORM_1_STEP d0, d1, d2, d3, q4, q5, q6, q7, d4, d5 - - TRANSFORM_4BYTES q0, q1, q2, q3, q4, q5, q6, q7 - - // transform element 32bits - vtrn.s32 q0, q1 //[0 1 2 3]+[4 5 6 7]-->[0 4 2 6]+[1 5 3 7] - vtrn.s32 q2, q3 //[8 9 10 11]+[12 13 14 15]-->[8 12 10 14]+[9 13 11 15] - vswp d1, d4 //[0 4 2 6]+[8 12 10 14]-->[0 4 8 12]+[2 6 10 14] - vswp d3, d6 //[1 5 3 7]+[9 13 11 15]-->[1 5 9 13]+[3 7 11 15] - - COL_TRANSFORM_1_STEP q0, q1, q2, q3, q4, q5, q6, q7 - - TRANSFORM_4BYTES q0, q1, q2, q3, q4, q5, q6, q7 - - vrshrn.s32 d0, q0, #6 - vst1.s16 {d0}, [r0], r2 //store - vrshrn.s32 d1, q1, #6 - vst1.s16 {d1}, [r0], r2 //store - vrshrn.s32 d2, q2, #6 - vst1.s16 {d2}, [r0], r2 //store - vrshrn.s32 d3, q3, #6 - vst1.s16 {d3}, [r0], r2 //store - - WELS_ASM_FUNC_END -// int16_t* dst, -// int16_t* src, -// int stride - WELS_ASM_FUNC_BEGIN svc_idct4x4_add_neon - - vld4.s16 {d0, d1, d2, d3}, [r1] // cost 3 cycles! - lsl r2, r2, #1 - - ROW_TRANSFORM_1_STEP d0, d1, d2, d3, q4, q5, q6, q7, d4, d5 - - TRANSFORM_4BYTES q0, q1, q2, q3, q4, q5, q6, q7 - - // transform element 32bits - vtrn.s32 q0, q1 //[0 1 2 3]+[4 5 6 7]-->[0 4 2 6]+[1 5 3 7] - vtrn.s32 q2, q3 //[8 9 10 11]+[12 13 14 15]-->[8 12 10 14]+[9 13 11 15] - vswp d1, d4 //[0 4 2 6]+[8 12 10 14]-->[0 4 8 12]+[2 6 10 14] - vswp d3, d6 //[1 5 3 7]+[9 13 11 15]-->[1 5 9 13]+[3 7 11 15] - - COL_TRANSFORM_1_STEP q0, q1, q2, q3, q4, q5, q6, q7 - - TRANSFORM_4BYTES q0, q1, q2, q3, q4, q5, q6, q7 - - //see draft G.8.5.3 , after clip_rs() into [-255, 255] - vmov.i16 q10,#0xFF - veor q11, q11 - vsub.i16 q11, q11,q10 -// vmvn.i16 q11,#0xFF - - mov r1, r0 - vld1.s16 {d16}, [r0], r2 - vld1.s16 {d17}, [r0], r2 - ADD_AND_CLIP_RS q0, q1, q10, q11, q8, d8, d9, q4 - vst1.s16 {d8}, [r1], r2 //store - vst1.s16 {d9}, [r1], r2 //store - - vld1.s16 {d18}, [r0], r2 - vld1.s16 {d19}, [r0], r2 - ADD_AND_CLIP_RS q2, q3, q10, q11, q9, d10, d11, q5 - vst1.s16 {d10}, [r1], r2 //store - vst1.s16 {d11}, [r1], r2 //store - WELS_ASM_FUNC_END // uint8_t *pred, const int32_t stride, int16_t *rs WELS_ASM_FUNC_BEGIN IdctResAddPred_neon diff --git a/codec/decoder/core/src/deblocking.cpp b/codec/decoder/core/src/deblocking.cpp index 772ff48e..80ce7484 100644 --- a/codec/decoder/core/src/deblocking.cpp +++ b/codec/decoder/core/src/deblocking.cpp @@ -720,6 +720,7 @@ void DeblockingInit (SDeblockingFunc* pFunc, int32_t iCpu) { #endif #if defined(HAVE_NEON) + if ( iCpu & WELS_CPU_NEON ) { pFunc->pfLumaDeblockingLT4Ver = DeblockLumaLt4V_neon; pFunc->pfLumaDeblockingEQ4Ver = DeblockLumaEq4V_neon; diff --git a/codec/decoder/core/src/decode_slice.cpp b/codec/decoder/core/src/decode_slice.cpp index e8a6f4f4..f94c85f4 100644 --- a/codec/decoder/core/src/decode_slice.cpp +++ b/codec/decoder/core/src/decode_slice.cpp @@ -1150,9 +1150,11 @@ void WelsBlockFuncInit (SBlockFunc* pFunc, int32_t iCpu) { #endif #ifdef HAVE_NEON - pFunc->pWelsBlockZero16x16Func = WelsResBlockZero16x16_neon; - pFunc->pWelsBlockZero8x8Func = WelsResBlockZero8x8_neon; - pFunc->pWelsSetNonZeroCountFunc = SetNonZeroCount_neon; + if ( iCpu & WELS_CPU_NEON ) { + pFunc->pWelsBlockZero16x16Func = WelsResBlockZero16x16_neon; + pFunc->pWelsBlockZero8x8Func = WelsResBlockZero8x8_neon; + pFunc->pWelsSetNonZeroCountFunc = SetNonZeroCount_neon; + } #endif } void WelsBlockZero16x16_c (int16_t* pBlock, int32_t iStride) { diff --git a/codec/decoder/core/src/decoder.cpp b/codec/decoder/core/src/decoder.cpp index 2d183e5e..96d469d0 100644 --- a/codec/decoder/core/src/decoder.cpp +++ b/codec/decoder/core/src/decoder.cpp @@ -146,7 +146,14 @@ void WelsDecoderDefaults (PWelsDecoderContext pCtx) { #if defined(X86_ASM) pCtx->uiCpuFlag = WelsCPUFeatureDetect (&iCpuCores); -#endif//X86_ASM +#elif defined(HAVE_NEON) +#if defined(ANDROID_NDK) + pCtx->uiCpuFlag = WelsCPUFeatureDetectAndroid(); +#endif +#if defined(APPLE_IOS) + pCtx->uiCpuFlag = WelsCPUFeatureDetectIOS(); +#endif +#endif pCtx->iImgWidthInPixel = 0; pCtx->iImgHeightInPixel = 0; // alloc picture data when picture size is available @@ -657,26 +664,28 @@ void AssignFuncPointerForRec (PWelsDecoderContext pCtx) { pCtx->pIdctResAddPredFunc = IdctResAddPred_c; #if defined(HAVE_NEON) - pCtx->pIdctResAddPredFunc = IdctResAddPred_neon; + if ( pCtx->uiCpuFlag & WELS_CPU_NEON ) { + pCtx->pIdctResAddPredFunc = IdctResAddPred_neon; - pCtx->pGetI16x16LumaPredFunc[I16_PRED_DC] = WelsDecoderI16x16LumaPredDc_neon; - pCtx->pGetI16x16LumaPredFunc[I16_PRED_P] = WelsDecoderI16x16LumaPredPlane_neon; - pCtx->pGetI16x16LumaPredFunc[I16_PRED_H] = WelsDecoderI16x16LumaPredH_neon; - pCtx->pGetI16x16LumaPredFunc[I16_PRED_V] = WelsDecoderI16x16LumaPredV_neon; + pCtx->pGetI16x16LumaPredFunc[I16_PRED_DC] = WelsDecoderI16x16LumaPredDc_neon; + pCtx->pGetI16x16LumaPredFunc[I16_PRED_P] = WelsDecoderI16x16LumaPredPlane_neon; + pCtx->pGetI16x16LumaPredFunc[I16_PRED_H] = WelsDecoderI16x16LumaPredH_neon; + pCtx->pGetI16x16LumaPredFunc[I16_PRED_V] = WelsDecoderI16x16LumaPredV_neon; - pCtx->pGetI4x4LumaPredFunc[I4_PRED_V ] = WelsDecoderI4x4LumaPredV_neon; - pCtx->pGetI4x4LumaPredFunc[I4_PRED_H ] = WelsDecoderI4x4LumaPredH_neon; - pCtx->pGetI4x4LumaPredFunc[I4_PRED_DDL ] = WelsDecoderI4x4LumaPredDDL_neon; - pCtx->pGetI4x4LumaPredFunc[I4_PRED_DDR ] = WelsDecoderI4x4LumaPredDDR_neon; - pCtx->pGetI4x4LumaPredFunc[I4_PRED_VL ] = WelsDecoderI4x4LumaPredVL_neon; - pCtx->pGetI4x4LumaPredFunc[I4_PRED_VR ] = WelsDecoderI4x4LumaPredVR_neon; - pCtx->pGetI4x4LumaPredFunc[I4_PRED_HU ] = WelsDecoderI4x4LumaPredHU_neon; - pCtx->pGetI4x4LumaPredFunc[I4_PRED_HD ] = WelsDecoderI4x4LumaPredHD_neon; + pCtx->pGetI4x4LumaPredFunc[I4_PRED_V ] = WelsDecoderI4x4LumaPredV_neon; + pCtx->pGetI4x4LumaPredFunc[I4_PRED_H ] = WelsDecoderI4x4LumaPredH_neon; + pCtx->pGetI4x4LumaPredFunc[I4_PRED_DDL ] = WelsDecoderI4x4LumaPredDDL_neon; + pCtx->pGetI4x4LumaPredFunc[I4_PRED_DDR ] = WelsDecoderI4x4LumaPredDDR_neon; + pCtx->pGetI4x4LumaPredFunc[I4_PRED_VL ] = WelsDecoderI4x4LumaPredVL_neon; + pCtx->pGetI4x4LumaPredFunc[I4_PRED_VR ] = WelsDecoderI4x4LumaPredVR_neon; + pCtx->pGetI4x4LumaPredFunc[I4_PRED_HU ] = WelsDecoderI4x4LumaPredHU_neon; + pCtx->pGetI4x4LumaPredFunc[I4_PRED_HD ] = WelsDecoderI4x4LumaPredHD_neon; - pCtx->pGetIChromaPredFunc[C_PRED_H] = WelsDecoderIChromaPredH_neon; - pCtx->pGetIChromaPredFunc[C_PRED_V] = WelsDecoderIChromaPredV_neon; - pCtx->pGetIChromaPredFunc[C_PRED_P ] = WelsDecoderIChromaPredPlane_neon; - pCtx->pGetIChromaPredFunc[C_PRED_DC] = WelsDecoderIChromaPredDC_neon; + pCtx->pGetIChromaPredFunc[C_PRED_H] = WelsDecoderIChromaPredH_neon; + pCtx->pGetIChromaPredFunc[C_PRED_V] = WelsDecoderIChromaPredV_neon; + pCtx->pGetIChromaPredFunc[C_PRED_P ] = WelsDecoderIChromaPredPlane_neon; + pCtx->pGetIChromaPredFunc[C_PRED_DC] = WelsDecoderIChromaPredDC_neon; + } #endif//HAVE_NEON diff --git a/codec/decoder/core/src/mc.cpp b/codec/decoder/core/src/mc.cpp index ae2be16d..fa840512 100644 --- a/codec/decoder/core/src/mc.cpp +++ b/codec/decoder/core/src/mc.cpp @@ -971,8 +971,10 @@ void InitMcFunc (SMcFunc* pMcFunc, int32_t iCpu) { pMcFunc->pMcChromaFunc = McChroma_c; #ifdef HAVE_NEON - pMcFunc->pMcLumaFunc = McLuma_neon; - pMcFunc->pMcChromaFunc = McChroma_neon; + if ( iCpu & WELS_CPU_NEON ) { + pMcFunc->pMcLumaFunc = McLuma_neon; + pMcFunc->pMcChromaFunc = McChroma_neon; + } #endif #if defined (X86_ASM)