Move the deblocking ARM asm code to the common folder, add CPU feature detection for ARM, and clean up some code.

This commit is contained in:
Licai Guo
2014-02-28 17:08:24 +08:00
parent 0fd9db2878
commit b7a25df13f
12 changed files with 126 additions and 800 deletions

View File

@@ -19,6 +19,8 @@
4CE4475018BC61650017DF25 /* deblocking_common.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4CE4473818BC61650017DF25 /* deblocking_common.cpp */; }; 4CE4475018BC61650017DF25 /* deblocking_common.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4CE4473818BC61650017DF25 /* deblocking_common.cpp */; };
4CE4475218BC61650017DF25 /* logging.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4CE4473C18BC61650017DF25 /* logging.cpp */; }; 4CE4475218BC61650017DF25 /* logging.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4CE4473C18BC61650017DF25 /* logging.cpp */; };
4CE4475818BC61650017DF25 /* WelsThreadLib.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4CE4474918BC61650017DF25 /* WelsThreadLib.cpp */; }; 4CE4475818BC61650017DF25 /* WelsThreadLib.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4CE4474918BC61650017DF25 /* WelsThreadLib.cpp */; };
4CE447BD18C085320017DF25 /* deblocking_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = 4CE447BC18C085320017DF25 /* deblocking_neon.S */; };
4CE447BF18C085900017DF25 /* arm_arch_common_macro.S in Sources */ = {isa = PBXBuildFile; fileRef = 4CE447BE18C085900017DF25 /* arm_arch_common_macro.S */; };
/* End PBXBuildFile section */ /* End PBXBuildFile section */
/* Begin PBXContainerItemProxy section */ /* Begin PBXContainerItemProxy section */
@@ -69,6 +71,8 @@
4CE4474718BC61650017DF25 /* typedefs.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = typedefs.h; sourceTree = "<group>"; }; 4CE4474718BC61650017DF25 /* typedefs.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = typedefs.h; sourceTree = "<group>"; };
4CE4474918BC61650017DF25 /* WelsThreadLib.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = WelsThreadLib.cpp; sourceTree = "<group>"; }; 4CE4474918BC61650017DF25 /* WelsThreadLib.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = WelsThreadLib.cpp; sourceTree = "<group>"; };
4CE4474A18BC61650017DF25 /* WelsThreadLib.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = WelsThreadLib.h; sourceTree = "<group>"; }; 4CE4474A18BC61650017DF25 /* WelsThreadLib.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = WelsThreadLib.h; sourceTree = "<group>"; };
4CE447BC18C085320017DF25 /* deblocking_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = deblocking_neon.S; sourceTree = "<group>"; };
4CE447BE18C085900017DF25 /* arm_arch_common_macro.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = arm_arch_common_macro.S; sourceTree = "<group>"; };
/* End PBXFileReference section */ /* End PBXFileReference section */
/* Begin PBXFrameworksBuildPhase section */ /* Begin PBXFrameworksBuildPhase section */
@@ -144,6 +148,8 @@
4CE4472F18BC61650017DF25 /* common */ = { 4CE4472F18BC61650017DF25 /* common */ = {
isa = PBXGroup; isa = PBXGroup;
children = ( children = (
4CE447BE18C085900017DF25 /* arm_arch_common_macro.S */,
4CE447BC18C085320017DF25 /* deblocking_neon.S */,
4CE4473118BC61650017DF25 /* cpu.cpp */, 4CE4473118BC61650017DF25 /* cpu.cpp */,
4CE4473218BC61650017DF25 /* cpu.h */, 4CE4473218BC61650017DF25 /* cpu.h */,
4CE4473318BC61650017DF25 /* cpu_core.h */, 4CE4473318BC61650017DF25 /* cpu_core.h */,
@@ -247,9 +253,11 @@
isa = PBXSourcesBuildPhase; isa = PBXSourcesBuildPhase;
buildActionMask = 2147483647; buildActionMask = 2147483647;
files = ( files = (
4CE447BF18C085900017DF25 /* arm_arch_common_macro.S in Sources */,
4CE4475018BC61650017DF25 /* deblocking_common.cpp in Sources */, 4CE4475018BC61650017DF25 /* deblocking_common.cpp in Sources */,
4CE4474C18BC61650017DF25 /* cpu.cpp in Sources */, 4CE4474C18BC61650017DF25 /* cpu.cpp in Sources */,
4CE4475218BC61650017DF25 /* logging.cpp in Sources */, 4CE4475218BC61650017DF25 /* logging.cpp in Sources */,
4CE447BD18C085320017DF25 /* deblocking_neon.S in Sources */,
4CE4475818BC61650017DF25 /* WelsThreadLib.cpp in Sources */, 4CE4475818BC61650017DF25 /* WelsThreadLib.cpp in Sources */,
4CE4474E18BC61650017DF25 /* crt_util_safe_x.cpp in Sources */, 4CE4474E18BC61650017DF25 /* crt_util_safe_x.cpp in Sources */,
); );

View File

@@ -36,9 +36,7 @@
4CE4469D18BC5EAB0017DF25 /* utils.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4CE4467A18BC5EAA0017DF25 /* utils.cpp */; }; 4CE4469D18BC5EAB0017DF25 /* utils.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4CE4467A18BC5EAA0017DF25 /* utils.cpp */; };
4CE4469E18BC5EAB0017DF25 /* welsCodecTrace.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4CE4468418BC5EAB0017DF25 /* welsCodecTrace.cpp */; }; 4CE4469E18BC5EAB0017DF25 /* welsCodecTrace.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4CE4468418BC5EAB0017DF25 /* welsCodecTrace.cpp */; };
4CE4469F18BC5EAB0017DF25 /* welsDecoderExt.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4CE4468518BC5EAB0017DF25 /* welsDecoderExt.cpp */; }; 4CE4469F18BC5EAB0017DF25 /* welsDecoderExt.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4CE4468518BC5EAB0017DF25 /* welsDecoderExt.cpp */; };
4CE447AB18BC6BE90017DF25 /* arm_arch_common_macro.S in Sources */ = {isa = PBXBuildFile; fileRef = 4CE447A618BC6BE90017DF25 /* arm_arch_common_macro.S */; };
4CE447AC18BC6BE90017DF25 /* block_add_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = 4CE447A718BC6BE90017DF25 /* block_add_neon.S */; }; 4CE447AC18BC6BE90017DF25 /* block_add_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = 4CE447A718BC6BE90017DF25 /* block_add_neon.S */; };
4CE447AD18BC6BE90017DF25 /* deblocking_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = 4CE447A818BC6BE90017DF25 /* deblocking_neon.S */; };
4CE447AE18BC6BE90017DF25 /* intra_pred_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = 4CE447A918BC6BE90017DF25 /* intra_pred_neon.S */; }; 4CE447AE18BC6BE90017DF25 /* intra_pred_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = 4CE447A918BC6BE90017DF25 /* intra_pred_neon.S */; };
4CE447AF18BC6BE90017DF25 /* mc_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = 4CE447AA18BC6BE90017DF25 /* mc_neon.S */; }; 4CE447AF18BC6BE90017DF25 /* mc_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = 4CE447AA18BC6BE90017DF25 /* mc_neon.S */; };
/* End PBXBuildFile section */ /* End PBXBuildFile section */
@@ -132,9 +130,7 @@
4CE4468318BC5EAB0017DF25 /* wels_dec_export.def */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text; path = wels_dec_export.def; sourceTree = "<group>"; }; 4CE4468318BC5EAB0017DF25 /* wels_dec_export.def */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text; path = wels_dec_export.def; sourceTree = "<group>"; };
4CE4468418BC5EAB0017DF25 /* welsCodecTrace.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = welsCodecTrace.cpp; sourceTree = "<group>"; }; 4CE4468418BC5EAB0017DF25 /* welsCodecTrace.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = welsCodecTrace.cpp; sourceTree = "<group>"; };
4CE4468518BC5EAB0017DF25 /* welsDecoderExt.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = welsDecoderExt.cpp; sourceTree = "<group>"; }; 4CE4468518BC5EAB0017DF25 /* welsDecoderExt.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = welsDecoderExt.cpp; sourceTree = "<group>"; };
4CE447A618BC6BE90017DF25 /* arm_arch_common_macro.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = arm_arch_common_macro.S; sourceTree = "<group>"; };
4CE447A718BC6BE90017DF25 /* block_add_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = block_add_neon.S; sourceTree = "<group>"; }; 4CE447A718BC6BE90017DF25 /* block_add_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = block_add_neon.S; sourceTree = "<group>"; };
4CE447A818BC6BE90017DF25 /* deblocking_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = deblocking_neon.S; sourceTree = "<group>"; };
4CE447A918BC6BE90017DF25 /* intra_pred_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = intra_pred_neon.S; sourceTree = "<group>"; }; 4CE447A918BC6BE90017DF25 /* intra_pred_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = intra_pred_neon.S; sourceTree = "<group>"; };
4CE447AA18BC6BE90017DF25 /* mc_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = mc_neon.S; sourceTree = "<group>"; }; 4CE447AA18BC6BE90017DF25 /* mc_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = mc_neon.S; sourceTree = "<group>"; };
/* End PBXFileReference section */ /* End PBXFileReference section */
@@ -327,9 +323,7 @@
4CE447A518BC6BE90017DF25 /* arm */ = { 4CE447A518BC6BE90017DF25 /* arm */ = {
isa = PBXGroup; isa = PBXGroup;
children = ( children = (
4CE447A618BC6BE90017DF25 /* arm_arch_common_macro.S */,
4CE447A718BC6BE90017DF25 /* block_add_neon.S */, 4CE447A718BC6BE90017DF25 /* block_add_neon.S */,
4CE447A818BC6BE90017DF25 /* deblocking_neon.S */,
4CE447A918BC6BE90017DF25 /* intra_pred_neon.S */, 4CE447A918BC6BE90017DF25 /* intra_pred_neon.S */,
4CE447AA18BC6BE90017DF25 /* mc_neon.S */, 4CE447AA18BC6BE90017DF25 /* mc_neon.S */,
); );
@@ -424,7 +418,6 @@
4CE4469D18BC5EAB0017DF25 /* utils.cpp in Sources */, 4CE4469D18BC5EAB0017DF25 /* utils.cpp in Sources */,
4CE4469118BC5EAB0017DF25 /* decoder_data_tables.cpp in Sources */, 4CE4469118BC5EAB0017DF25 /* decoder_data_tables.cpp in Sources */,
4CE4469718BC5EAB0017DF25 /* mem_align.cpp in Sources */, 4CE4469718BC5EAB0017DF25 /* mem_align.cpp in Sources */,
4CE447AB18BC6BE90017DF25 /* arm_arch_common_macro.S in Sources */,
4CE4469518BC5EAB0017DF25 /* manage_dec_ref.cpp in Sources */, 4CE4469518BC5EAB0017DF25 /* manage_dec_ref.cpp in Sources */,
4CE4468A18BC5EAB0017DF25 /* au_parser.cpp in Sources */, 4CE4468A18BC5EAB0017DF25 /* au_parser.cpp in Sources */,
4CE4469218BC5EAB0017DF25 /* expand_pic.cpp in Sources */, 4CE4469218BC5EAB0017DF25 /* expand_pic.cpp in Sources */,
@@ -435,7 +428,6 @@
4CE4469E18BC5EAB0017DF25 /* welsCodecTrace.cpp in Sources */, 4CE4469E18BC5EAB0017DF25 /* welsCodecTrace.cpp in Sources */,
4CE447AE18BC6BE90017DF25 /* intra_pred_neon.S in Sources */, 4CE447AE18BC6BE90017DF25 /* intra_pred_neon.S in Sources */,
4CE4469618BC5EAB0017DF25 /* mc.cpp in Sources */, 4CE4469618BC5EAB0017DF25 /* mc.cpp in Sources */,
4CE447AD18BC6BE90017DF25 /* deblocking_neon.S in Sources */,
4CE4469C18BC5EAB0017DF25 /* rec_mb.cpp in Sources */, 4CE4469C18BC5EAB0017DF25 /* rec_mb.cpp in Sources */,
4CE4468B18BC5EAB0017DF25 /* bit_stream.cpp in Sources */, 4CE4468B18BC5EAB0017DF25 /* bit_stream.cpp in Sources */,
4CE4468D18BC5EAB0017DF25 /* decode_mb_aux.cpp in Sources */, 4CE4468D18BC5EAB0017DF25 /* decode_mb_aux.cpp in Sources */,

View File

@@ -38,7 +38,12 @@
************************************************************************************* *************************************************************************************
*/ */
#include <string.h> #include <string.h>
#ifdef ANDROID_NDK
#include <cpu-features.h>
#endif
#ifdef APPLE_IOS
#include <sys/utsname.h>
#endif
#include "cpu.h" #include "cpu.h"
#include "cpu_core.h" #include "cpu_core.h"
@@ -209,4 +214,53 @@ void WelsXmmRegEmptyOp(void * pSrc) {
#endif #endif
#if defined(HAVE_NEON)//For supporting both android platform and iOS platform
#if defined(ANDROID_NDK)
/*
 * Queries the Android NDK cpu-features API and translates the result into
 * the codec's own WELS_CPU_* feature bits.
 *
 * Returns a bitmask built from WELS_CPU_ARMv7, WELS_CPU_VFPv3 and
 * WELS_CPU_NEON; 0 when the CPU family reported by the NDK is not ARM.
 */
uint32_t WelsCPUFeatureDetectAndroid()
{
  // The ANDROID_CPU_ARM_FEATURE_* bits are only inspected for the ARM family;
  // any other family yields an empty feature mask.
  if (android_getCpuFamily() != ANDROID_CPU_FAMILY_ARM) {
    return 0;
  }

  const uint64_t uiNdkFeatures = android_getCpuFeatures();
  uint32_t uiFlags = 0;

  // Map each NDK feature bit onto the corresponding WELS flag.
  if (uiNdkFeatures & ANDROID_CPU_ARM_FEATURE_ARMv7) {
    uiFlags |= WELS_CPU_ARMv7;
  }
  if (uiNdkFeatures & ANDROID_CPU_ARM_FEATURE_VFPv3) {
    uiFlags |= WELS_CPU_VFPv3;
  }
  if (uiNdkFeatures & ANDROID_CPU_ARM_FEATURE_NEON) {
    uiFlags |= WELS_CPU_NEON;
  }

  return uiFlags;
}
#endif
#if defined(APPLE_IOS)
/*
 * Derives the ARM CPU feature mask on iOS from the hardware model string
 * reported by uname().
 *
 * Every model except the ones listed in kpExcludedModels is reported as
 * supporting ARMv7, VFPv3 and NEON. The list needs to be revisited whenever
 * the set of Apple devices to support changes.
 *
 * Returns WELS_CPU_ARMv7 | WELS_CPU_VFPv3 | WELS_CPU_NEON, or 0 when the
 * model is on the exclusion list or the model string cannot be obtained.
 */
uint32_t WelsCPUFeatureDetectIOS()
{
  // Models that must not be given the ARMv7/VFPv3/NEON flags.
  static const char* const kpExcludedModels[] = {
    "iPhone1,1", //iPhone 2G
    "iPhone1,2", //iPhone 3G
    "iPod1,1",   //iPod 1G
    "iPod2,1"    //iPod 2G
  };
  struct utsname sSystemInfo;

  // uname() returns a negative value on failure and leaves the structure
  // unspecified; comparing its contents would read uninitialized memory.
  // Report no optional features so callers keep their non-SIMD code paths.
  if (uname (&sSystemInfo) < 0) {
    return 0;
  }

  for (size_t i = 0; i < sizeof (kpExcludedModels) / sizeof (kpExcludedModels[0]); ++i) {
    if (0 == strcmp (sSystemInfo.machine, kpExcludedModels[i])) {
      return 0;
    }
  }

  return WELS_CPU_ARMv7 | WELS_CPU_VFPv3 | WELS_CPU_NEON;
}
#endif
#endif

View File

@@ -78,6 +78,16 @@ void WelsXmmRegLoad(void * src);
void WelsXmmRegEmptyOp(void * pSrc); void WelsXmmRegEmptyOp(void * pSrc);
#if defined(HAVE_NEON)
#if defined(ANDROID_NDK)
uint32_t WelsCPUFeatureDetectAndroid();
#endif
#if defined(APPLE_IOS)
uint32_t WelsCPUFeatureDetectIOS();
#endif
#endif
#if defined(__cplusplus) #if defined(__cplusplus)
} }
#endif//__cplusplus #endif//__cplusplus

View File

@@ -73,6 +73,11 @@
#define WELS_CPU_CACHELINE_64 0x40000000 /* CacheLine Size 64 */ #define WELS_CPU_CACHELINE_64 0x40000000 /* CacheLine Size 64 */
#define WELS_CPU_CACHELINE_128 0x80000000 /* CacheLine Size 128 */ #define WELS_CPU_CACHELINE_128 0x80000000 /* CacheLine Size 128 */
/* ARM CPU feature flags (set by both the Android and the iOS detection routines) */
#define WELS_CPU_ARMv7 0x000001 /* ARMv7 */
#define WELS_CPU_VFPv3 0x000002 /* VFPv3 */
#define WELS_CPU_NEON 0x000004 /* NEON */
/* /*
* Interfaces for CPU core feature detection as below * Interfaces for CPU core feature detection as below
*/ */

View File

@@ -982,360 +982,20 @@
//eq4_end: //eq4_end:
WELS_ASM_FUNC_END WELS_ASM_FUNC_END
#ifdef APPLE_IOS
//in: $0(const) $1 $2; out:$3 $4
//used register: r6, r7, q0, q1
.macro BS_NZC_CHECK
//vld1.8 {d0,d1}, [$0]
vld1.8 {d0,d1}, [$0, :64]
/* Arrenge the input data --- TOP */
ands r6, $1, #2
beq bs_nzc_check_jump0
sub r6, $0, $2, lsl #4 // r0 int8_t* non_zero_count,
sub r6, $2, lsl #3 WELS_ASM_FUNC_BEGIN enc_avc_non_zero_count_neon
add r6, #12
vld1.32 d3[1], [r6]
bs_nzc_check_jump0: vld1.64 {d0-d2}, [r0]
vext.8 q1, q1, q0, #12
vadd.u8 $3, q0, q1
vceq.s8 q0, q0, #0
vceq.s8 d2, d2, #0
vmvn q0, q0
vmvn d2, d2
vabs.s8 q0, q0
vabs.s8 d2, d2
/* Arrenge the input data --- LEFT */ vst1.64 {d0-d2}, [r0]
ands r6, $1, #1
beq bs_nzc_check_jump1
sub r6, $0, #21
add r7, r6, #4
vld1.8 d3[4], [r6]
add r6, r7, #4
vld1.8 d3[5], [r7]
add r7, r6, #4
vld1.8 d3[6], [r6]
vld1.8 d3[7], [r7]
bs_nzc_check_jump1:
vzip.8 d0, d1
vzip.8 d0, d1
vext.8 q1, q1, q0, #12
vadd.u8 $4, q0, q1
.endm
//in: $0(const) $1 $2; out:$3 $4
//used register: r6, r7, q0, q1
.macro BS_REF_INDEX_CHECK
//vld1.8 {d0,d1}, [$0]
vld1.8 {d0,d1}, [$0, :128]
/* Arrenge the input data --- TOP */
ands r6, $1, #2
beq bs_ref_index_check_jump0
sub r6, $0, $2, lsl #4
add r6, #12
vld1.32 d3[1], [r6]
bs_ref_index_check_jump0:
vext.8 q1, q1, q0, #12
vabd.u8 $3, q0, q1
/* Arrenge the input data --- LEFT */
ands r6, $1, #1
beq bs_ref_index_check_jump1
sub r6, $0, #13
add r7, r6, #4
vld1.8 d3[4], [r6]
add r6, r7, #4
vld1.8 d3[5], [r7]
add r7, r6, #4
vld1.8 d3[6], [r6]
vld1.8 d3[7], [r7]
bs_ref_index_check_jump1:
vzip.8 d0, d1
vzip.8 d0, d1
vext.8 q1, q1, q0, #12
vabd.u8 $4, q0, q1
.endmacro
.macro BS_COMPARE_MV //in: $0,$1(const),$2(const),$3(const),$4(const); out:$5, $6
mov r6, #4
vabd.s16 q5, $0, $1
vabd.s16 q6, $1, $2
vdup.s16 $0, r6
vabd.s16 q7, $2, $3
vabd.s16 q8, $3, $4
vcge.s16 q5, $0
vcge.s16 q6, $0
vcge.s16 q7, $0
vcge.s16 q8, $0
vpadd.i16 d10, d10, d11
vpadd.i16 d11, d12, d13
vpadd.i16 d12, d14, d15
vpadd.i16 d13, d16, d17
vaddhn.i16 $5, q5, q5
vaddhn.i16 $6, q6, q6
.endmacro
//in: $0(const) $1 $2; out:$3 $4 $5 $6
//used register: r6, r7, q0, q1, q2, q3, q4
.macro BS_MV_CHECK
//vldm $0, {q0,q1,q2,q3}
vld1.32 {q0,q1}, [$0, :128]
add r6, $0, #32
vld1.32 {q2,q3}, [r6, :128]
/* Arrenge the input data --- TOP */
ands r6, $1, #2
beq bs_mv_check_jump0
sub r6, $0, $2, lsl #6
add r6, #48
vld1.8 {d8, d9}, [r6]
bs_mv_check_jump0:
BS_COMPARE_MV q4, q0, q1, q2, q3, $3, $4
/* Arrenge the input data --- LEFT */
ands r6, $1, #1
beq bs_mv_check_jump1
sub r6, $0, #52
//mov r7, #16
add r7, r6, #16
vld1.32 d8[0], [r6]
add r6, r7, #16
vld1.32 d8[1], [r7]
add r7, r6, #16
vld1.32 d9[0], [r6]
vld1.32 d9[1], [r7]
bs_mv_check_jump1:
vzip.32 q0, q2
vzip.32 q1, q3
vzip.32 q0, q1
vzip.32 q2, q3
BS_COMPARE_MV q4, q0, q1, q2, q3, $5, $6
.endmacro
#else
//in: $0(const) $1 $2; out:$3 $4
//used register: r6, r7, q0, q1
.macro BS_NZC_CHECK arg0, arg1, arg2, arg3, arg4
//vld1.8 {d0,d1}, [\arg0]
vld1.8 {d0,d1}, [\arg0, :64]
/* Arrenge the input data --- TOP */
ands r6, \arg1, #2
beq bs_nzc_check_jump0
sub r6, \arg0, \arg2, lsl #4
sub r6, \arg2, lsl #3
add r6, #12
vld1.32 d3[1], [r6]
bs_nzc_check_jump0:
vext.8 q1, q1, q0, #12
vadd.u8 \arg3, q0, q1
/* Arrenge the input data --- LEFT */
ands r6, \arg1, #1
beq bs_nzc_check_jump1
sub r6, \arg0, #21
add r7, r6, #4
vld1.8 d3[4], [r6]
add r6, r7, #4
vld1.8 d3[5], [r7]
add r7, r6, #4
vld1.8 d3[6], [r6]
vld1.8 d3[7], [r7]
bs_nzc_check_jump1:
vzip.8 d0, d1
vzip.8 d0, d1
vext.8 q1, q1, q0, #12
vadd.u8 \arg4, q0, q1
.endm
//in: \arg0(const) \arg1 \arg2; out:\arg3 \arg4
//used register: r6, r7, q0, q1
.macro BS_REF_INDEX_CHECK arg0, arg1, arg2, arg3, arg4
//vld1.8 {d0,d1}, [\arg0]
vld1.8 {d0,d1}, [\arg0, :128]
/* Arrenge the input data --- TOP */
ands r6, \arg1, #2
beq bs_ref_index_check_jump0
sub r6, \arg0, \arg2, lsl #4
add r6, #12
vld1.32 d3[1], [r6]
bs_ref_index_check_jump0:
vext.8 q1, q1, q0, #12
vabd.u8 \arg3, q0, q1
/* Arrenge the input data --- LEFT */
ands r6, \arg1, #1
beq bs_ref_index_check_jump1
sub r6, \arg0, #13
add r7, r6, #4
vld1.8 d3[4], [r6]
add r6, r7, #4
vld1.8 d3[5], [r7]
add r7, r6, #4
vld1.8 d3[6], [r6]
vld1.8 d3[7], [r7]
bs_ref_index_check_jump1:
vzip.8 d0, d1
vzip.8 d0, d1
vext.8 q1, q1, q0, #12
vabd.u8 \arg4, q0, q1
.endm
//in: \arg0,\arg1(const),\arg2(const),\arg3(const),\arg4(const); out:\arg5, \arg6
.macro BS_COMPARE_MV arg0, arg1, arg2, arg3, arg4, arg5, arg6
mov r6, #4
vabd.s16 q5, \arg0, \arg1
vabd.s16 q6, \arg1, \arg2
vdup.s16 \arg0, r6
vabd.s16 q7, \arg2, \arg3
vabd.s16 q8, \arg3, \arg4
vcge.s16 q5, \arg0
vcge.s16 q6, \arg0
vcge.s16 q7, \arg0
vcge.s16 q8, \arg0
vpadd.i16 d10, d10, d11
vpadd.i16 d11, d12, d13
vpadd.i16 d12, d14, d15
vpadd.i16 d13, d16, d17
vaddhn.i16 \arg5, q5, q5
vaddhn.i16 \arg6, q6, q6
.endm
//in: \arg0(const) \arg1 \arg2; out:\arg3 \arg4 \arg5 \arg6
//used register: r6, r7, q0, q1, q2, q3, q4
.macro BS_MV_CHECK arg0, arg1, arg2, arg3, arg4, arg5, arg6
//vldm \arg0, {q0,q1,q2,q3}
vld1.32 {q0,q1}, [\arg0, :128]
add r6, \arg0, #32
vld1.32 {q2,q3}, [r6, :128]
/* Arrenge the input data --- TOP */
ands r6, \arg1, #2
beq bs_mv_check_jump0
sub r6, \arg0, \arg2, lsl #6
add r6, #48
vld1.8 {d8, d9}, [r6]
bs_mv_check_jump0:
BS_COMPARE_MV q4, q0, q1, q2, q3, \arg3, \arg4
/* Arrenge the input data --- LEFT */
ands r6, \arg1, #1
beq bs_mv_check_jump1
sub r6, \arg0, #52
//mov r7, #16
add r7, r6, #16
vld1.32 d8[0], [r6]
add r6, r7, #16
vld1.32 d8[1], [r7]
add r7, r6, #16
vld1.32 d9[0], [r6]
vld1.32 d9[1], [r7]
bs_mv_check_jump1:
vzip.32 q0, q2
vzip.32 q1, q3
vzip.32 q0, q1
vzip.32 q2, q3
BS_COMPARE_MV q4, q0, q1, q2, q3, \arg5, \arg6
.endm
#endif
/*
* void deblocking_BS_calc_neon(int8_t *pNzc,
* int8_t *pRef_index,
* int16_t *pMv[],
* int32_t boundry_flag,
* int32_t mb_width,
* uint8_t *bS);
*
* r0 = cur_layer->nzc[cur_mb_xy]
* r1 = cur_layer->ref_index[0][cur_mb_xy]
* r2 = cur_layer->mv[0][cur_mb_xy]
* r3 = boundry_flag (LEFT_FLAG/TOP_FLAG)
* r4 = cur_layer->mb_width
* r5 = BS[8][4] save all of the BS value for whole MB(16*16)
*/
WELS_ASM_FUNC_BEGIN deblocking_BS_calc_neon
stmdb sp!, {r4-r7}
ldr r4, [sp, #16] //Save mb_width to r4
ldr r5, [sp, #20] //Save BS to r5
/* Checking the nzc status */
BS_NZC_CHECK r0, r3, r4, q14, q15 //q14,q15 save the nzc status
/* Checking the nzc_rs status */
//BS_NZC_CHECK r1, r4, q12, q13 //q12,q13 save the mzc_rs status
/* For checking bS[I] = 2 */
mov r6, #2
//vqadd.u8 q14, q12
//vqadd.u8 q15, q13
vcgt.s8 q14, q14, #0
vdup.u8 q0, r6
vcgt.s8 q15, q15, #0
vand.u8 q14, q14, q0 //q14 save the nzc check result all the time --- for dir is top
vand.u8 q15, q15, q0 //q15 save the nzc check result all the time --- for dir is left
/* Checking the ref_index status*/
BS_REF_INDEX_CHECK r1, r3, r4, q12, q13 //q12,q13 save the ref_index status
vcgt.s8 q12, q12, #0
vcgt.s8 q13, q13, #0
/* Checking the mv status*/
BS_MV_CHECK r2, r3, r4, d20, d21, d22, d23//q10, q11 save the mv status
/* For checking bS[I] = 1 */
mov r6, #1
vqadd.u8 q12, q10
vdup.u8 q0, r6
vqadd.u8 q13, q11
vand.u8 q12, q12, q0 //q12 save the nzc check result all the time --- for dir is top
vand.u8 q13, q13, q0 //q13 save the nzc check result all the time --- for dir is left
/* Check bS[I] is '1' or '2' */
vmax.u8 q1, q12, q14
vmax.u8 q0, q13, q15
//vstm r5, {q0, q1}
vst1.32 {q0, q1}, [r5]
ldmia sp!, {r4-r7}
WELS_ASM_FUNC_END WELS_ASM_FUNC_END
/*====== deblocking_BS_calc_neon End ======*/
#endif #endif

View File

@@ -34,29 +34,6 @@
.text .text
#include "arm_arch_common_macro.S" #include "arm_arch_common_macro.S"
#ifdef APPLE_IOS #ifdef APPLE_IOS
.macro ORR_32BYTES_TO_8BYTES
// { // input: smb#a q0 & q1, smb#b q2&q3, q0[0], q0[1], q2[0], q2[1], output: d_0, d_1
vorr.s16 $0, $1
vorr.s16 $2, $3
vorr.s16 $8, $4, $5
vorr.s16 $9, $6, $7
// }
.endm
.macro ADD_PRED_1BYTE_TO_RESID_2BYTES
// { // input: q0~q3, d0~d3, output: d0~d3;
vaddw.u8 $0, $4
vaddw.u8 $1, $5
vaddw.u8 $2, $6
vaddw.u8 $3, $7
vqmovun.s16 $4, $0 //saturation
vqmovun.s16 $6, $2
vqmovun.s16 $5, $1
vqmovun.s16 $7, $3
// }
.endm
.macro ROW_TRANSFORM_1_STEP .macro ROW_TRANSFORM_1_STEP
// { // input: src_d[0]~[3], output: e_q[0]~[3]; working: $8 $9 // { // input: src_d[0]~[3], output: e_q[0]~[3]; working: $8 $9
@@ -89,39 +66,7 @@
// } // }
.endm .endm
.macro ADD_AND_CLIP_RS
// { // input: src_q[0]~[1], matrix_max~min, pred_q, working_d[0]:[1], output: q;
vrshrn.s32 $5, $0, #6
vrshrn.s32 $6, $1, #6
vqadd.s16 $7, $4
vmin.s16 $7, $7, $2
vmax.s16 $7, $7, $3
// }
.endm
#else #else
.macro ORR_32BYTES_TO_8BYTES arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9
// { // input: smb#a q0 & q1, smb#b q2&q3, q0[0], q0[1], q2[0], q2[1], output: d_0, d_1
vorr.s16 \arg0, \arg1
vorr.s16 \arg2, \arg3
vorr.s16 \arg8, \arg4, \arg5
vorr.s16 \arg9, \arg6, \arg7
// }
.endm
.macro ADD_PRED_1BYTE_TO_RESID_2BYTES arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
// { // input: q0~q3, d0~d3, output: d0~d3;
vaddw.u8 \arg0, \arg4
vaddw.u8 \arg1, \arg5
vaddw.u8 \arg2, \arg6
vaddw.u8 \arg3, \arg7
vqmovun.s16 \arg4, \arg0 //saturation
vqmovun.s16 \arg6, \arg2
vqmovun.s16 \arg5, \arg1
vqmovun.s16 \arg7, \arg3
// }
.endm
.macro ROW_TRANSFORM_1_STEP arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9 .macro ROW_TRANSFORM_1_STEP arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9
// { // input: src_d[0]~[3], output: e_q[0]~[3]; working: $8 $9 // { // input: src_d[0]~[3], output: e_q[0]~[3]; working: $8 $9
@@ -153,16 +98,6 @@
vadd.s32 \arg7, \arg1, \arg7 //int32 e[3][j] = f[1][j] + (f[3][j]>>1); vadd.s32 \arg7, \arg1, \arg7 //int32 e[3][j] = f[1][j] + (f[3][j]>>1);
// } // }
.endm .endm
.macro ADD_AND_CLIP_RS arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
// { // input: src_q[0]~[1], matrix_max~min, pred_q, working_d[0]:[1], output: q;
vrshrn.s32 \arg5, \arg0, #6
vrshrn.s32 \arg6, \arg1, #6
vqadd.s16 \arg7, \arg4
vmin.s16 \arg7, \arg7, \arg2
vmax.s16 \arg7, \arg7, \arg3
// }
.endm
#endif #endif
// r0 int16_t* block, // r0 int16_t* block,
// r1 int8_t* non_zero_count, // r1 int8_t* non_zero_count,
@@ -180,157 +115,6 @@
vst1.64 {d0-d2}, [r1] vst1.64 {d0-d2}, [r1]
WELS_ASM_FUNC_END WELS_ASM_FUNC_END
// r0 int16_t* block,
// r1 int8_t* non_zero_count,
WELS_ASM_FUNC_BEGIN svc_non_zero_count_neon
push {r2-r4}
mov r4, #3
mov r3, #64
add r2, r0, #32
pld [r0, #512]
non_zero_count_two_8x8_loop:
vld1.64 {q0, q1}, [r0,:128], r3
vld1.64 {q2, q3}, [r2,:128], r3
vld1.64 {q4, q5}, [r0,:128], r3
vld1.64 {q6, q7}, [r2,:128], r3
vld1.64 {q8, q9}, [r0,:128], r3
vld1.64 {q10, q11}, [r2,:128], r3//load #0 8x8 block resi data,
vld1.64 {q12, q13}, [r0,:128], r3
vld1.64 {q14, q15}, [r2,:128], r3//load #1 8x8 block resi data,
pld [r0, #512]
ORR_32BYTES_TO_8BYTES q0, q1, q2, q3, d0, d1, d4, d5, d2, d3 // output q1
// vceq.i16 q1, q1, #0
ORR_32BYTES_TO_8BYTES q8, q9,q10,q11,d16,d17,d20,d21,d4,d5 // output q2
// vceq.i16 q2, q2, #0
ORR_32BYTES_TO_8BYTES q4, q5, q6, q7, d8, d9, d12, d13, d10, d11 // output q5
// vceq.i16 q5, q5, #0
ORR_32BYTES_TO_8BYTES q12,q13,q14,q15,d24,d25, d28, d29, d12, d13 // output q6
// vceq.i16 q6, q6, #0
vqmovn.u64 d0, q1 // 8bytes-->4bytes
vqmovn.u64 d8, q5
vqmovn.u64 d1, q2
vqmovn.u64 d9, q6
vqmovn.u32 d2, q0 // 4bytes-->2bytes
vqmovn.u32 d3, q4
vceq.i16 q0, q1, #0
vmvn q0, q0
vabs.s16 q2, q0
vmovn.u16 d6, q2 // 2bytes-->1bytes
vst1.u8 {d6}, [r1]!
// pld [r0]
subs r4, r4, #1
bne non_zero_count_two_8x8_loop
pop {r2-r4}
WELS_ASM_FUNC_END
// r0 int16_t* block,
// r1 int8_t* non_zero_count,
WELS_ASM_FUNC_BEGIN svc_rs_non_zero_count_neon
vld1.i16 {q0, q1}, [r0]! // block is unaligned!!!
vld1.i16 {q2, q3}, [r0]!
vld1.i16 {q4, q5}, [r0]!
vld1.i16 {q6, q7}, [r0]!
vld1.i16 {q8, q9}, [r0]!
vld1.i16 {q10, q11}, [r0]!
vld1.i16 {q12, q13}, [r0]!
vld1.i16 {q14, q15}, [r0]!
ORR_32BYTES_TO_8BYTES q0, q2, q1, q3, q4, q6, q5, q7, q4, q5
vorr.s16 q0, q4
vorr.s16 q1, q5 // output d0~d3
ORR_32BYTES_TO_8BYTES q8, q10, q9, q11, q12, q14, q13, q15, q12, q13
vorr.s16 q6, q8, q12
vorr.s16 q7, q9, q13 // output d12~d15
vqmovn.u64 d4, q0 // 8bytes-->4bytes
vqmovn.u64 d6, q6
vqmovn.u64 d5, q1
vqmovn.u64 d7, q7
vqmovn.u32 d8, q2 // 4bytes-->2bytes
vqmovn.u32 d9, q3
vceq.i16 q5, q4, #0
vmvn q5, q5
vabs.s16 q5, q5
vmovn.u16 d10, q5 // 2bytes-->1bytes
vst1.u8 {d10}, [r1]!
vld1.i16 {q0, q1}, [r0]!
vld1.i16 {q2, q3}, [r0]!
vld1.i16 {q4, q5}, [r0]!
vld1.i16 {q6, q7}, [r0]!
vld1.i16 {q8, q9}, [r0]!
vld1.i16 {q10, q11}, [r0]!
vld1.i16 {q12, q13}, [r0]!
vld1.i16 {q14, q15}, [r0]!
ORR_32BYTES_TO_8BYTES q0, q2, q1, q3, q4, q6, q5, q7, q4, q5
vorr.s16 q0, q4
vorr.s16 q1, q5 // output d0~d3
ORR_32BYTES_TO_8BYTES q8, q10, q9, q11, q12, q14, q13, q15, q12, q13
vorr.s16 q6, q8, q12
vorr.s16 q7, q9, q13 // output d12~d15
vqmovn.u64 d4, q0 // 8bytes-->4bytes
vqmovn.u64 d6, q6
vqmovn.u64 d5, q1
vqmovn.u64 d7, q7
vqmovn.u32 d8, q2 // 4bytes-->2bytes
vqmovn.u32 d9, q3
vceq.i16 q5, q4, #0
vmvn q5, q5
vabs.s16 q5, q5
vmovn.u16 d10, q5 // 2bytes-->1bytes
vst1.u8 {d10}, [r1]!
// Chroma
vld1.i16 {q0, q1}, [r0]!
vld1.i16 {q2, q3}, [r0]!
vld1.i16 {q4, q5}, [r0]!
vld1.i16 {q6, q7}, [r0]! //load Cb block,
vld1.i16 {q8, q9}, [r0]!
vld1.i16 {q10, q11}, [r0]!
vld1.i16 {q12, q13}, [r0]!
vld1.i16 {q14, q15}, [r0]! //load Cr block,
ORR_32BYTES_TO_8BYTES q0, q1, q2, q3, q4, q5, q6, q7, q4, q6
vorr.s16 q0, q2
vorr.s16 q1, q4, q6 // output d0~d3
ORR_32BYTES_TO_8BYTES q8, q9, q10, q11, q12, q13, q14, q15, q12, q14
vorr.s16 q2, q8, q10
vorr.s16 q3, q12, q14 // output d4~d7
vqmovn.u64 d8, q0 // 8bytes-->4bytes
vqmovn.u64 d10, q2
vqmovn.u64 d9, q1
vqmovn.u64 d11, q3
vqmovn.u32 d12, q4 // 4bytes-->2bytes
vqmovn.u32 d13, q5
vceq.i16 q7, q6, #0
vmvn q7, q7
vabs.s16 q7, q7
vmovn.u16 d10, q7 // 2bytes-->1bytes
vst1.u8 {d10}, [r1]!
WELS_ASM_FUNC_END
// r0 int16_t * block, // r0 int16_t * block,
// r1 int32_t stride // r1 int32_t stride
@@ -371,207 +155,6 @@ block_zero_8x8_chma_loop:
pop {r2} pop {r2}
WELS_ASM_FUNC_END WELS_ASM_FUNC_END
// r0 int8_t* dst_addr,
// r1 memset_value
// r2 int32_t bytes_nmb,
WELS_ASM_FUNC_BEGIN svc_block_memset_neon// dst should continue
vdup.u8 q0, r1
vdup.u8 q1, r1
block_memset_loop:
vst1.64 {q0, q1}, [r0,:64]!
subs r2, r2, #64
vst1.64 {q0, q1}, [r0,:64]!
bne block_memset_loop
WELS_ASM_FUNC_END
// int16_t* dst,
// int16_t* src,
// int32_t stride
WELS_ASM_FUNC_BEGIN svc_block_copy_16x16_neon
push {r3}
mov r3, #16
// each element is sizeof(int16_t)
lsl r2, r2, #1 // r2 = 2*r2
block_copy_16x16_luma_loop:
vld1.i16 {q0, q1}, [r1], r2
subs r3, r3, #1
vst1.i16 {q0, q1}, [r0]!
bne block_copy_16x16_luma_loop
pop {r3}
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN svc_block_copy_8x8_neon
push {r3}
mov r3, #8
// each element is sizeof(int16_t)
lsl r2, r2, #1 // r2 = 2*r2
block_copy_8x8_chma_loop:
vld1.i16 {q0}, [r1], r2
subs r3, r3, #1
vst1.i16 {q0}, [r0]!
bne block_copy_8x8_chma_loop
pop {r3}
WELS_ASM_FUNC_END
// r0 uint8_t * dest,
// r1 uint8_t * pred,
// r2 int16_t * res,
// r3 int32_t stride,
WELS_ASM_FUNC_BEGIN svc_block_add_16x16_neon
push {r4}
mov r4, #16
pld [r1]
block_recon_16x16_luma_loop:
vld1.64 {d16,d17}, [r1,:64], r3 //load 16 pred data, update addr
vld1.s16 {q0, q1}, [r2]! //load 8+8 resi data, update addr
vld1.64 {d18,d19}, [r1,:64], r3
vld1.s16 {q2, q3}, [r2]!
ADD_PRED_1BYTE_TO_RESID_2BYTES q0, q1, q2, q3, d16, d17, d18, d19
pld [r1]
vst1.64 {q8}, [r0], r3 //store result
vst1.64 {q9}, [r0], r3
//#ifdef DEBUG_NEON
// vst1.u8 {q8}, [r0]!
// vst1.u8 {q9}, [r0]!
//#endif
vld1.64 {d20,d21}, [r1,:64], r3 //load 16 pred data, update addr
vld1.s16 {q4, q5}, [r2]! //load 8+8 resi data, update addr
vld1.64 {d22,d23}, [r1,:64], r3
vld1.s16 {q6, q7}, [r2]!
ADD_PRED_1BYTE_TO_RESID_2BYTES q4, q5, q6, q7, d20, d21, d22, d23
pld [r1]
vst1.64 {q10}, [r0], r3
vst1.64 {q11}, [r0], r3
//#ifdef DEBUG_NEON
// vst1.u8 {q10}, [r0]!
// vst1.u8 {q11}, [r0]!
//#endif
subs r4, r4, #4
bne block_recon_16x16_luma_loop
pop {r4}
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN svc_block_add_8x8_neon
vld1.u8 {d24}, [r1], r3 //load 8 pred data
vld1.i16 {q8, q9}, [r2]! //load 8+8 resi data, update addr
vld1.u8 {d25}, [r1], r3 //load 8 pred data, q12
vld1.i16 {q10, q11}, [r2]! //load 8+8 resi data, update addr
vld1.u8 {d26}, [r1], r3 //load 8 pred data
vld1.u8 {d27}, [r1], r3 //load 8 pred data, q13
ADD_PRED_1BYTE_TO_RESID_2BYTES q8, q9, q10, q11, d24, d25, d26, d27
pld [r1]
vst1.u8 {d24}, [r0], r3 //store result
vst1.u8 {d25}, [r0], r3 //store result
vst1.u8 {d26}, [r0], r3 //store result
vst1.u8 {d27}, [r0], r3 //store result
//#ifdef DEBUG_NEON
// vst1.u8 {d24}, [r0]!
//#endif
vld1.u8 {d24}, [r1], r3 //load 8 pred data
vld1.i16 {q8, q9}, [r2]! //load 8+8 resi data, update addr
vld1.u8 {d25}, [r1], r3 //load 8 pred data, q12
vld1.i16 {q10, q11}, [r2]! //load 8+8 resi data, update addr
vld1.u8 {d26}, [r1], r3 //load 8 pred data
vld1.u8 {d27}, [r1], r3 //load 8 pred data, q13
ADD_PRED_1BYTE_TO_RESID_2BYTES q8, q9, q10, q11, d24, d25, d26, d27
vst1.u8 {d24}, [r0], r3 //store result
vst1.u8 {d25}, [r0], r3 //store result
vst1.u8 {d26}, [r0], r3 //store result
vst1.u8 {d27}, [r0], r3 //store result
//#ifdef DEBUG_NEON
// vst1.u8 {d24}, [r0]!
//#endif
WELS_ASM_FUNC_END
// int16_t* dst,
// int16_t* src,
// int stride
WELS_ASM_FUNC_BEGIN svc_simple_idct4x4_neon
vld4.s16 {d0, d1, d2, d3}, [r1] // cost 3 cycles!
lsl r2, r2, #1
ROW_TRANSFORM_1_STEP d0, d1, d2, d3, q4, q5, q6, q7, d4, d5
TRANSFORM_4BYTES q0, q1, q2, q3, q4, q5, q6, q7
// transform element 32bits
vtrn.s32 q0, q1 //[0 1 2 3]+[4 5 6 7]-->[0 4 2 6]+[1 5 3 7]
vtrn.s32 q2, q3 //[8 9 10 11]+[12 13 14 15]-->[8 12 10 14]+[9 13 11 15]
vswp d1, d4 //[0 4 2 6]+[8 12 10 14]-->[0 4 8 12]+[2 6 10 14]
vswp d3, d6 //[1 5 3 7]+[9 13 11 15]-->[1 5 9 13]+[3 7 11 15]
COL_TRANSFORM_1_STEP q0, q1, q2, q3, q4, q5, q6, q7
TRANSFORM_4BYTES q0, q1, q2, q3, q4, q5, q6, q7
vrshrn.s32 d0, q0, #6
vst1.s16 {d0}, [r0], r2 //store
vrshrn.s32 d1, q1, #6
vst1.s16 {d1}, [r0], r2 //store
vrshrn.s32 d2, q2, #6
vst1.s16 {d2}, [r0], r2 //store
vrshrn.s32 d3, q3, #6
vst1.s16 {d3}, [r0], r2 //store
WELS_ASM_FUNC_END
// svc_idct4x4_add_neon: 4x4 inverse transform of 16 int16 coefficients whose
// result is combined with the int16 values already stored at dst (dst is
// read, updated and written back row by row).
//   r0: int16_t* dst    - four rows of 4 x int16, read and then overwritten
//   r1: int16_t* src    - 16 input coefficients
//   r2: int stride      - dst row pitch; doubled below to get the byte step
//
// ROW_TRANSFORM_1_STEP, COL_TRANSFORM_1_STEP, TRANSFORM_4BYTES and
// ADD_AND_CLIP_RS are macros defined outside this function.
// NOTE(review): ADD_AND_CLIP_RS presumably adds the transform output to the
// loaded dst rows and clips to the [q11, q10] = [-255, 255] range -- confirm
// against the macro definition.
//
// q4-q7 (d8-d15) are callee-saved under the ARM procedure call standard, so
// they are saved and restored here because they are used as scratch/output.
WELS_ASM_FUNC_BEGIN svc_idct4x4_add_neon
    vpush       {q4-q7}                     // preserve callee-saved NEON registers

    vld4.s16    {d0, d1, d2, d3}, [r1]      // de-interleaving load: d0 = src[0,4,8,12], d1 = src[1,5,9,13], ... (cost 3 cycles!)
    lsl         r2, r2, #1                  // stride in elements -> stride in bytes

    // first (row) pass
    ROW_TRANSFORM_1_STEP    d0, d1, d2, d3, q4, q5, q6, q7, d4, d5
    TRANSFORM_4BYTES        q0, q1, q2, q3, q4, q5, q6, q7

    // transpose the 4x4 block of 32-bit elements
    vtrn.s32    q0, q1                      //[0 1 2 3]+[4 5 6 7]-->[0 4 2 6]+[1 5 3 7]
    vtrn.s32    q2, q3                      //[8 9 10 11]+[12 13 14 15]-->[8 12 10 14]+[9 13 11 15]
    vswp        d1, d4                      //[0 4 2 6]+[8 12 10 14]-->[0 4 8 12]+[2 6 10 14]
    vswp        d3, d6                      //[1 5 3 7]+[9 13 11 15]-->[1 5 9 13]+[3 7 11 15]

    // second (column) pass
    COL_TRANSFORM_1_STEP    q0, q1, q2, q3, q4, q5, q6, q7
    TRANSFORM_4BYTES        q0, q1, q2, q3, q4, q5, q6, q7

    // see draft G.8.5.3, after clip_rs() into [-255, 255]
    vmov.i16    q10, #0xFF                  // q10 = +255 in every 16-bit lane
    veor        q11, q11                    // q11 = 0
    vsub.i16    q11, q11, q10               // q11 = -255 (vmvn.i16 #0xFF would give -256, so it is not used)

    mov         r1, r0                      // src is fully loaded: reuse r1 as the store pointer, r0 as the load pointer

    // rows 0 and 1
    vld1.s16    {d16}, [r0], r2
    vld1.s16    {d17}, [r0], r2
    ADD_AND_CLIP_RS         q0, q1, q10, q11, q8, d8, d9, q4
    vst1.s16    {d8}, [r1], r2              // store row 0
    vst1.s16    {d9}, [r1], r2              // store row 1

    // rows 2 and 3
    vld1.s16    {d18}, [r0], r2
    vld1.s16    {d19}, [r0], r2
    ADD_AND_CLIP_RS         q2, q3, q10, q11, q9, d10, d11, q5
    vst1.s16    {d10}, [r1], r2             // store row 2
    vst1.s16    {d11}, [r1], r2             // store row 3

    vpop        {q4-q7}                     // restore callee-saved NEON registers
WELS_ASM_FUNC_END
// uint8_t *pred, const int32_t stride, int16_t *rs // uint8_t *pred, const int32_t stride, int16_t *rs
WELS_ASM_FUNC_BEGIN IdctResAddPred_neon WELS_ASM_FUNC_BEGIN IdctResAddPred_neon

View File

@@ -720,6 +720,7 @@ void DeblockingInit (SDeblockingFunc* pFunc, int32_t iCpu) {
#endif #endif
#if defined(HAVE_NEON) #if defined(HAVE_NEON)
if ( iCpu & WELS_CPU_NEON )
{ {
pFunc->pfLumaDeblockingLT4Ver = DeblockLumaLt4V_neon; pFunc->pfLumaDeblockingLT4Ver = DeblockLumaLt4V_neon;
pFunc->pfLumaDeblockingEQ4Ver = DeblockLumaEq4V_neon; pFunc->pfLumaDeblockingEQ4Ver = DeblockLumaEq4V_neon;

View File

@@ -1150,9 +1150,11 @@ void WelsBlockFuncInit (SBlockFunc* pFunc, int32_t iCpu) {
#endif #endif
#ifdef HAVE_NEON #ifdef HAVE_NEON
if ( iCpu & WELS_CPU_NEON ) {
pFunc->pWelsBlockZero16x16Func = WelsResBlockZero16x16_neon; pFunc->pWelsBlockZero16x16Func = WelsResBlockZero16x16_neon;
pFunc->pWelsBlockZero8x8Func = WelsResBlockZero8x8_neon; pFunc->pWelsBlockZero8x8Func = WelsResBlockZero8x8_neon;
pFunc->pWelsSetNonZeroCountFunc = SetNonZeroCount_neon; pFunc->pWelsSetNonZeroCountFunc = SetNonZeroCount_neon;
}
#endif #endif
} }
void WelsBlockZero16x16_c (int16_t* pBlock, int32_t iStride) { void WelsBlockZero16x16_c (int16_t* pBlock, int32_t iStride) {

View File

@@ -146,7 +146,14 @@ void WelsDecoderDefaults (PWelsDecoderContext pCtx) {
#if defined(X86_ASM) #if defined(X86_ASM)
pCtx->uiCpuFlag = WelsCPUFeatureDetect (&iCpuCores); pCtx->uiCpuFlag = WelsCPUFeatureDetect (&iCpuCores);
#endif//X86_ASM #elif defined(HAVE_NEON)
#if defined(ANDROID_NDK)
pCtx->uiCpuFlag = WelsCPUFeatureDetectAndroid();
#endif
#if defined(APPLE_IOS)
pCtx->uiCpuFlag = WelsCPUFeatureDetectIOS();
#endif
#endif
pCtx->iImgWidthInPixel = 0; pCtx->iImgWidthInPixel = 0;
pCtx->iImgHeightInPixel = 0; // alloc picture data when picture size is available pCtx->iImgHeightInPixel = 0; // alloc picture data when picture size is available
@@ -657,6 +664,7 @@ void AssignFuncPointerForRec (PWelsDecoderContext pCtx) {
pCtx->pIdctResAddPredFunc = IdctResAddPred_c; pCtx->pIdctResAddPredFunc = IdctResAddPred_c;
#if defined(HAVE_NEON) #if defined(HAVE_NEON)
if ( pCtx->uiCpuFlag & WELS_CPU_NEON ) {
pCtx->pIdctResAddPredFunc = IdctResAddPred_neon; pCtx->pIdctResAddPredFunc = IdctResAddPred_neon;
pCtx->pGetI16x16LumaPredFunc[I16_PRED_DC] = WelsDecoderI16x16LumaPredDc_neon; pCtx->pGetI16x16LumaPredFunc[I16_PRED_DC] = WelsDecoderI16x16LumaPredDc_neon;
@@ -677,6 +685,7 @@ void AssignFuncPointerForRec (PWelsDecoderContext pCtx) {
pCtx->pGetIChromaPredFunc[C_PRED_V] = WelsDecoderIChromaPredV_neon; pCtx->pGetIChromaPredFunc[C_PRED_V] = WelsDecoderIChromaPredV_neon;
pCtx->pGetIChromaPredFunc[C_PRED_P ] = WelsDecoderIChromaPredPlane_neon; pCtx->pGetIChromaPredFunc[C_PRED_P ] = WelsDecoderIChromaPredPlane_neon;
pCtx->pGetIChromaPredFunc[C_PRED_DC] = WelsDecoderIChromaPredDC_neon; pCtx->pGetIChromaPredFunc[C_PRED_DC] = WelsDecoderIChromaPredDC_neon;
}
#endif//HAVE_NEON #endif//HAVE_NEON

View File

@@ -971,8 +971,10 @@ void InitMcFunc (SMcFunc* pMcFunc, int32_t iCpu) {
pMcFunc->pMcChromaFunc = McChroma_c; pMcFunc->pMcChromaFunc = McChroma_c;
#ifdef HAVE_NEON #ifdef HAVE_NEON
if ( iCpu & WELS_CPU_NEON ) {
pMcFunc->pMcLumaFunc = McLuma_neon; pMcFunc->pMcLumaFunc = McLuma_neon;
pMcFunc->pMcChromaFunc = McChroma_neon; pMcFunc->pMcChromaFunc = McChroma_neon;
}
#endif #endif
#if defined (X86_ASM) #if defined (X86_ASM)