Add arm asm code for processing.
This commit is contained in:
parent
248f324c62
commit
e7cc8c2780
@ -795,7 +795,7 @@ WELS_ASM_FUNC_BEGIN DeblockChromaEq4H_neon
|
||||
WELS_ASM_FUNC_END
|
||||
|
||||
|
||||
WELS_ASM_FUNC_BEGIN enc_avc_non_zero_count_neon
|
||||
WELS_ASM_FUNC_BEGIN WelsNonZeroCount_neon
|
||||
|
||||
vld1.64 {d0-d2}, [r0]
|
||||
|
||||
@ -810,7 +810,6 @@ WELS_ASM_FUNC_BEGIN enc_avc_non_zero_count_neon
|
||||
WELS_ASM_FUNC_END
|
||||
|
||||
#ifdef APPLE_IOS
|
||||
|
||||
.macro BS_NZC_CHECK
|
||||
vld1.8 {d0,d1}, [$0]
|
||||
/* Arrange the input data --- TOP */
|
||||
@ -904,7 +903,6 @@ bs_mv_check_jump1:
|
||||
BS_COMPARE_MV q4, q0, q1, q2, q3, $5, $6
|
||||
.endm
|
||||
#else
|
||||
|
||||
.macro BS_NZC_CHECK arg0, arg1, arg2, arg3, arg4
|
||||
vld1.8 {d0,d1}, [\arg0]
|
||||
/* Arrange the input data --- TOP */
|
||||
|
0
codec/common/expand_picture.S
Executable file → Normal file
0
codec/common/expand_picture.S
Executable file → Normal file
@ -533,7 +533,7 @@ WELS_ASM_FUNC_BEGIN WelsDecoderIChromaPredH_neon
|
||||
WELS_ASM_FUNC_END
|
||||
|
||||
|
||||
WELS_ASM_FUNC_BEGIN WelsDecoderIChromaPredDC_neon
|
||||
WELS_ASM_FUNC_BEGIN WelsDecoderIChromaPredDc_neon
|
||||
//stmdb sp!, { r2-r5, lr}
|
||||
//Load the left column data (8 bytes)
|
||||
sub r2, r0, #1
|
||||
|
0
codec/encoder/core/arm/intra_pred_neon.S
Executable file → Normal file
0
codec/encoder/core/arm/intra_pred_neon.S
Executable file → Normal file
10
codec/encoder/core/arm/intra_pred_sad_3_opt_neon.S
Executable file → Normal file
10
codec/encoder/core/arm/intra_pred_sad_3_opt_neon.S
Executable file → Normal file
@ -152,7 +152,7 @@
|
||||
.endm
|
||||
#endif
|
||||
|
||||
WELS_ASM_FUNC_BEGIN satd_intra_16x16_x3_opt_neon
|
||||
WELS_ASM_FUNC_BEGIN WelsIntra16x16Combined3Satd_neon
|
||||
stmdb sp!, {r4-r7, lr}
|
||||
|
||||
//Get the top line data to 'q15'(16 bytes)
|
||||
@ -295,7 +295,7 @@ WELS_ASM_FUNC_BEGIN satd_intra_16x16_x3_opt_neon
|
||||
WELS_ASM_FUNC_END
|
||||
|
||||
|
||||
WELS_ASM_FUNC_BEGIN sad_intra_16x16_x3_opt_neon
|
||||
WELS_ASM_FUNC_BEGIN WelsIntra16x16Combined3Sad_neon
|
||||
stmdb sp!, {r4-r7, lr}
|
||||
|
||||
//Get the top line data to 'q15'(16 bytes)
|
||||
@ -384,7 +384,7 @@ sad_intra_16x16_x3_opt_loop0:
|
||||
WELS_ASM_FUNC_END
|
||||
|
||||
|
||||
WELS_ASM_FUNC_BEGIN sad_intra_8x8_x3_opt_neon
|
||||
WELS_ASM_FUNC_BEGIN WelsIntra8x8Combined3Sad_neon
|
||||
stmdb sp!, {r4-r7, lr}
|
||||
|
||||
//Get the data from stack
|
||||
@ -533,7 +533,7 @@ sad_intra_8x8_x3_opt_loop1:
|
||||
WELS_ASM_FUNC_END
|
||||
|
||||
|
||||
WELS_ASM_FUNC_BEGIN satd_intra_8x8_x3_opt_neon
|
||||
WELS_ASM_FUNC_BEGIN WelsIntra8x8Combined3Satd_neon
|
||||
stmdb sp!, {r4-r7, lr}
|
||||
|
||||
//Get the data from stack
|
||||
@ -672,7 +672,7 @@ WELS_ASM_FUNC_BEGIN satd_intra_8x8_x3_opt_neon
|
||||
WELS_ASM_FUNC_END
|
||||
|
||||
|
||||
WELS_ASM_FUNC_BEGIN satd_intra_4x4_x3_opt_neon
|
||||
WELS_ASM_FUNC_BEGIN WelsIntra4x4Combined3Satd_neon
|
||||
stmdb sp!, {r4-r7, lr}
|
||||
|
||||
//Get the top line data to 'd31[0~3]'(4 bytes)
|
||||
|
0
codec/encoder/core/arm/mc_neon.S
Executable file → Normal file
0
codec/encoder/core/arm/mc_neon.S
Executable file → Normal file
0
codec/encoder/core/arm/memory_neon.S
Executable file → Normal file
0
codec/encoder/core/arm/memory_neon.S
Executable file → Normal file
30
codec/encoder/core/arm/pixel_neon.S
Executable file → Normal file
30
codec/encoder/core/arm/pixel_neon.S
Executable file → Normal file
@ -220,7 +220,7 @@
|
||||
.endm
|
||||
|
||||
|
||||
WELS_ASM_FUNC_BEGIN pixel_sad_16x16_neon
|
||||
WELS_ASM_FUNC_BEGIN WelsSampleSad16x16_neon
|
||||
|
||||
vld1.64 {q0}, [r0, :128], r1
|
||||
vld1.64 {q1}, [r2], r3
|
||||
@ -260,7 +260,7 @@ WELS_ASM_FUNC_BEGIN pixel_sad_16x16_neon
|
||||
WELS_ASM_FUNC_END
|
||||
|
||||
|
||||
WELS_ASM_FUNC_BEGIN pixel_sad_16x8_neon
|
||||
WELS_ASM_FUNC_BEGIN WelsSampleSad16x8_neon
|
||||
|
||||
vld1.64 {q0}, [r0, :128], r1
|
||||
vld1.64 {q1}, [r2], r3
|
||||
@ -298,7 +298,7 @@ WELS_ASM_FUNC_BEGIN pixel_sad_16x8_neon
|
||||
WELS_ASM_FUNC_END
|
||||
|
||||
|
||||
WELS_ASM_FUNC_BEGIN pixel_sad_8x16_neon
|
||||
WELS_ASM_FUNC_BEGIN WelsSampleSad8x16_neon
|
||||
|
||||
vld1.64 {d0}, [r0, :64], r1
|
||||
vld1.64 {d1}, [r2], r3
|
||||
@ -332,7 +332,7 @@ WELS_ASM_FUNC_BEGIN pixel_sad_8x16_neon
|
||||
WELS_ASM_FUNC_END
|
||||
|
||||
|
||||
WELS_ASM_FUNC_BEGIN pixel_sad_8x8_neon
|
||||
WELS_ASM_FUNC_BEGIN WelsSampleSad8x8_neon
|
||||
|
||||
vld1.64 {d0}, [r0, :64], r1
|
||||
vld1.64 {d1}, [r2], r3
|
||||
@ -364,7 +364,7 @@ WELS_ASM_FUNC_BEGIN pixel_sad_8x8_neon
|
||||
WELS_ASM_FUNC_END
|
||||
|
||||
|
||||
WELS_ASM_FUNC_BEGIN pixel_sad_4x4_neon
|
||||
WELS_ASM_FUNC_BEGIN WelsSampleSad4x4_neon
|
||||
stmdb sp!, {r4-r5, lr}
|
||||
|
||||
//Loading a horizontal line data (4 bytes)
|
||||
@ -392,7 +392,7 @@ WELS_ASM_FUNC_BEGIN pixel_sad_4x4_neon
|
||||
WELS_ASM_FUNC_END
|
||||
|
||||
|
||||
WELS_ASM_FUNC_BEGIN pixel_sad_4_16x16_neon
|
||||
WELS_ASM_FUNC_BEGIN WelsSampleSadFour16x16_neon
|
||||
|
||||
stmdb sp!, {r4-r5, lr}
|
||||
|
||||
@ -471,7 +471,7 @@ pixel_sad_4_16x16_loop_0:
|
||||
WELS_ASM_FUNC_END
|
||||
|
||||
|
||||
WELS_ASM_FUNC_BEGIN pixel_sad_4_16x8_neon
|
||||
WELS_ASM_FUNC_BEGIN WelsSampleSadFour16x8_neon
|
||||
stmdb sp!, {r4-r5, lr}
|
||||
|
||||
//Generate the pix2 start addr
|
||||
@ -548,7 +548,7 @@ pixel_sad_4_16x8_loop_0:
|
||||
WELS_ASM_FUNC_END
|
||||
|
||||
|
||||
WELS_ASM_FUNC_BEGIN pixel_sad_4_8x16_neon
|
||||
WELS_ASM_FUNC_BEGIN WelsSampleSadFour8x16_neon
|
||||
stmdb sp!, {r4-r5, lr}
|
||||
|
||||
//Generate the pix2 start addr
|
||||
@ -614,7 +614,7 @@ pixel_sad_4_8x16_loop_0:
|
||||
WELS_ASM_FUNC_END
|
||||
|
||||
|
||||
WELS_ASM_FUNC_BEGIN pixel_sad_4_8x8_neon
|
||||
WELS_ASM_FUNC_BEGIN WelsSampleSadFour8x8_neon
|
||||
stmdb sp!, {r4-r5, lr}
|
||||
|
||||
//Generate the pix2 start addr
|
||||
@ -679,7 +679,7 @@ pixel_sad_4_8x8_loop_0:
|
||||
WELS_ASM_FUNC_END
|
||||
|
||||
|
||||
WELS_ASM_FUNC_BEGIN pixel_sad_4_4x4_neon
|
||||
WELS_ASM_FUNC_BEGIN WelsSampleSadFour4x4_neon
|
||||
|
||||
vld1.32 {d0[0]}, [r0], r1
|
||||
vld1.32 {d0[1]}, [r0], r1
|
||||
@ -744,7 +744,7 @@ WELS_ASM_FUNC_BEGIN pixel_sad_4_4x4_neon
|
||||
WELS_ASM_FUNC_END
|
||||
|
||||
|
||||
WELS_ASM_FUNC_BEGIN pixel_satd_16x16_neon
|
||||
WELS_ASM_FUNC_BEGIN WelsSampleSatd16x16_neon
|
||||
|
||||
SATD_16x4
|
||||
vadd.u16 q15, q0, q2
|
||||
@ -769,7 +769,7 @@ WELS_ASM_FUNC_BEGIN pixel_satd_16x16_neon
|
||||
WELS_ASM_FUNC_END
|
||||
|
||||
|
||||
WELS_ASM_FUNC_BEGIN pixel_satd_16x8_neon
|
||||
WELS_ASM_FUNC_BEGIN WelsSampleSatd16x8_neon
|
||||
|
||||
SATD_16x4
|
||||
vadd.u16 q15, q0, q2
|
||||
@ -786,7 +786,7 @@ WELS_ASM_FUNC_BEGIN pixel_satd_16x8_neon
|
||||
WELS_ASM_FUNC_END
|
||||
|
||||
|
||||
WELS_ASM_FUNC_BEGIN pixel_satd_8x16_neon
|
||||
WELS_ASM_FUNC_BEGIN WelsSampleSatd8x16_neon
|
||||
|
||||
SATD_8x4
|
||||
vadd.u16 q15, q0, q1
|
||||
@ -811,7 +811,7 @@ WELS_ASM_FUNC_BEGIN pixel_satd_8x16_neon
|
||||
WELS_ASM_FUNC_END
|
||||
|
||||
|
||||
WELS_ASM_FUNC_BEGIN pixel_satd_8x8_neon
|
||||
WELS_ASM_FUNC_BEGIN WelsSampleSatd8x8_neon
|
||||
|
||||
SATD_8x4
|
||||
vadd.u16 q15, q0, q1
|
||||
@ -828,7 +828,7 @@ WELS_ASM_FUNC_BEGIN pixel_satd_8x8_neon
|
||||
WELS_ASM_FUNC_END
|
||||
|
||||
|
||||
WELS_ASM_FUNC_BEGIN pixel_satd_4x4_neon
|
||||
WELS_ASM_FUNC_BEGIN WelsSampleSatd4x4_neon
|
||||
|
||||
//Load the pix1 data --- 16 bytes
|
||||
vld1.32 {d0[0]}, [r0], r1
|
||||
|
0
codec/encoder/core/arm/reconstruct_neon.S
Executable file → Normal file
0
codec/encoder/core/arm/reconstruct_neon.S
Executable file → Normal file
@ -110,6 +110,33 @@ int32_t WelsIntraChroma8x8Combined3Satd_sse41 (uint8_t*, int32_t, uint8_t*, int3
|
||||
|
||||
#endif//X86_ASM
|
||||
|
||||
#if defined (HAVE_NEON)
|
||||
|
||||
int32_t WelsSampleSad4x4_neon (uint8_t*, int32_t, uint8_t*, int32_t);
|
||||
int32_t WelsSampleSad16x16_neon (uint8_t*, int32_t, uint8_t*, int32_t);
|
||||
int32_t WelsSampleSad16x8_neon (uint8_t*, int32_t, uint8_t*, int32_t);
|
||||
int32_t WelsSampleSad8x16_neon (uint8_t*, int32_t, uint8_t*, int32_t);
|
||||
int32_t WelsSampleSad8x8_neon (uint8_t*, int32_t, uint8_t*, int32_t);
|
||||
|
||||
void WelsSampleSadFour16x16_neon (uint8_t*, int32_t, uint8_t*, int32_t, int32_t*);
|
||||
void WelsSampleSadFour16x8_neon (uint8_t*, int32_t, uint8_t*, int32_t, int32_t*);
|
||||
void WelsSampleSadFour8x16_neon (uint8_t*, int32_t, uint8_t*, int32_t, int32_t*);
|
||||
void WelsSampleSadFour8x8_neon (uint8_t*, int32_t, uint8_t*, int32_t, int32_t*);
|
||||
void WelsSampleSadFour4x4_neon (uint8_t*, int32_t, uint8_t*, int32_t, int32_t*);
|
||||
|
||||
int32_t WelsSampleSatd8x8_neon (uint8_t*, int32_t, uint8_t*, int32_t);
|
||||
int32_t WelsSampleSatd16x8_neon (uint8_t*, int32_t, uint8_t*, int32_t);
|
||||
int32_t WelsSampleSatd8x16_neon (uint8_t*, int32_t, uint8_t*, int32_t);
|
||||
int32_t WelsSampleSatd16x16_neon (uint8_t*, int32_t, uint8_t*, int32_t);
|
||||
int32_t WelsSampleSatd4x4_neon (uint8_t*, int32_t, uint8_t*, int32_t);
|
||||
|
||||
int32_t WelsIntra16x16Combined3Satd_neon (uint8_t*, int32_t, uint8_t*, int32_t, int32_t*, int32_t, uint8_t*);
|
||||
int32_t WelsIntra16x16Combined3Sad_neon (uint8_t*, int32_t, uint8_t*, int32_t, int32_t*, int32_t, uint8_t*);
|
||||
int32_t WelsIntra8x8Combined3Satd_neon (uint8_t*, int32_t, uint8_t*, int32_t, int32_t*, int32_t, uint8_t*, uint8_t*, uint8_t*);
|
||||
int32_t WelsIntra8x8Combined3Sad_neon (uint8_t*, int32_t, uint8_t*, int32_t, int32_t*, int32_t, uint8_t*, uint8_t*, uint8_t*);
|
||||
int32_t WelsIntra4x4Combined3Satd_neon (uint8_t*, int32_t, uint8_t*, int32_t, uint8_t*, int32_t*, int32_t, int32_t, int32_t);
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(__cplusplus)
|
||||
}
|
||||
|
@ -482,6 +482,33 @@ void WelsInitSampleSadFunc (SWelsFuncPtrList* pFuncList, uint32_t uiCpuFlag) {
|
||||
|
||||
#endif //(X86_ASM)
|
||||
|
||||
#if defined (HAVE_NEON)
|
||||
if (uiCpuFlag & WELS_CPU_NEON) {
|
||||
pFuncList->sSampleDealingFuncs.pfSampleSad[BLOCK_4x4 ] = WelsSampleSad4x4_neon;
|
||||
pFuncList->sSampleDealingFuncs.pfSampleSad[BLOCK_16x16] = WelsSampleSad16x16_neon;
|
||||
pFuncList->sSampleDealingFuncs.pfSampleSad[BLOCK_16x8 ] = WelsSampleSad16x8_neon;
|
||||
pFuncList->sSampleDealingFuncs.pfSampleSad[BLOCK_8x16] = WelsSampleSad8x16_neon;
|
||||
pFuncList->sSampleDealingFuncs.pfSampleSad[BLOCK_8x8] = WelsSampleSad8x8_neon;
|
||||
|
||||
pFuncList->sSampleDealingFuncs.pfSample4Sad[BLOCK_16x16] = WelsSampleSadFour16x16_neon;
|
||||
pFuncList->sSampleDealingFuncs.pfSample4Sad[BLOCK_16x8] = WelsSampleSadFour16x8_neon;
|
||||
pFuncList->sSampleDealingFuncs.pfSample4Sad[BLOCK_8x16] = WelsSampleSadFour8x16_neon;
|
||||
pFuncList->sSampleDealingFuncs.pfSample4Sad[BLOCK_8x8] = WelsSampleSadFour8x8_neon;
|
||||
pFuncList->sSampleDealingFuncs.pfSample4Sad[BLOCK_4x4] = WelsSampleSadFour4x4_neon;
|
||||
|
||||
pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_4x4 ] = WelsSampleSatd4x4_neon;
|
||||
pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_8x8 ] = WelsSampleSatd8x8_neon;
|
||||
pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_8x16 ] = WelsSampleSatd8x16_neon;
|
||||
pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_16x8 ] = WelsSampleSatd16x8_neon;
|
||||
pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_16x16] = WelsSampleSatd16x16_neon;
|
||||
|
||||
pFuncList->sSampleDealingFuncs.pfIntra4x4Combined3Satd = WelsIntra4x4Combined3Satd_neon;
|
||||
pFuncList->sSampleDealingFuncs.pfIntra8x8Combined3Satd = WelsIntra8x8Combined3Satd_neon;
|
||||
pFuncList->sSampleDealingFuncs.pfIntra8x8Combined3Sad = WelsIntra8x8Combined3Sad_neon;
|
||||
pFuncList->sSampleDealingFuncs.pfIntra16x16Combined3Satd = WelsIntra16x16Combined3Satd_neon;
|
||||
pFuncList->sSampleDealingFuncs.pfIntra16x16Combined3Sad = WelsIntra16x16Combined3Sad_neon;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
} // namespace WelsSVCEnc
|
||||
|
@ -231,6 +231,11 @@ void CAdaptiveQuantization::WelsInitVarFunc (PVarFunc& pfVar, int32_t iCpuFlag)
|
||||
pfVar = SampleVariance16x16_sse2;
|
||||
}
|
||||
#endif
|
||||
#ifdef HAVE_NEON
|
||||
if (iCpuFlag & WELS_CPU_NEON) {
|
||||
pfVar = SampleVariance16x16_neon;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
void SampleVariance16x16_c (uint8_t* pRefY, int32_t iRefStride, uint8_t* pSrcY, int32_t iSrcStride,
|
||||
|
@ -62,6 +62,11 @@ VarFunc SampleVariance16x16_sse2;
|
||||
WELSVP_EXTERN_C_END
|
||||
#endif
|
||||
|
||||
#ifdef HAVE_NEON
|
||||
WELSVP_EXTERN_C_BEGIN
|
||||
VarFunc SampleVariance16x16_neon;
|
||||
WELSVP_EXTERN_C_END
|
||||
#endif
|
||||
|
||||
class CAdaptiveQuantization : public IStrategy {
|
||||
public:
|
||||
|
2
codec/processing/src/arm/adaptive_quantization.S
Executable file → Normal file
2
codec/processing/src/arm/adaptive_quantization.S
Executable file → Normal file
@ -51,7 +51,7 @@
|
||||
#endif
|
||||
|
||||
|
||||
WELS_ASM_FUNC_BEGIN pixel_var_16x16_neon
|
||||
WELS_ASM_FUNC_BEGIN SampleVariance16x16_neon
|
||||
stmdb sp!, {r4}
|
||||
|
||||
vld1.8 {q15}, [r0], r1 //save the ref data (16bytes)
|
||||
|
6
codec/processing/src/arm/down_sample_neon.S
Executable file → Normal file
6
codec/processing/src/arm/down_sample_neon.S
Executable file → Normal file
@ -35,7 +35,7 @@
|
||||
#include "arm_arch_common_macro.S"
|
||||
|
||||
|
||||
WELS_ASM_FUNC_BEGIN comp_ds_bilinear_neon
|
||||
WELS_ASM_FUNC_BEGIN DyadicBilinearDownsampler_neon
|
||||
stmdb sp!, {r4-r8, lr}
|
||||
|
||||
//Get the width and height
|
||||
@ -174,7 +174,7 @@ comp_ds_bilinear_w_x16_loop1:
|
||||
WELS_ASM_FUNC_END
|
||||
|
||||
|
||||
WELS_ASM_FUNC_BEGIN comp_ds_bilinear_w_x32_neon
|
||||
WELS_ASM_FUNC_BEGIN DyadicBilinearDownsamplerWidthx32_neon
|
||||
stmdb sp!, {r4-r7, lr}
|
||||
|
||||
//Get the width and height
|
||||
@ -223,7 +223,7 @@ comp_ds_bilinear_w_x32_loop1:
|
||||
WELS_ASM_FUNC_END
|
||||
|
||||
|
||||
WELS_ASM_FUNC_BEGIN general_ds_bilinear_accurate_neon
|
||||
WELS_ASM_FUNC_BEGIN GeneralBilinearAccurateDownsampler_neon
|
||||
stmdb sp!, {r4-r12, lr}
|
||||
|
||||
//Get the data from stack
|
||||
|
2
codec/processing/src/arm/pixel_sad_neon.S
Executable file → Normal file
2
codec/processing/src/arm/pixel_sad_neon.S
Executable file → Normal file
@ -35,7 +35,7 @@
|
||||
#include "arm_arch_common_macro.S"
|
||||
|
||||
|
||||
WELS_ASM_FUNC_BEGIN pixel_sad_8x8_neon
|
||||
WELS_ASM_FUNC_BEGIN WelsSampleSad8x8_neon
|
||||
stmdb sp!, {lr}
|
||||
//Loading a horizontal line data (8 bytes)
|
||||
vld1.8 {d0}, [r0], r1
|
||||
|
10
codec/processing/src/arm/vaa_calc_neon.S
Executable file → Normal file
10
codec/processing/src/arm/vaa_calc_neon.S
Executable file → Normal file
@ -96,7 +96,7 @@
|
||||
#endif
|
||||
|
||||
|
||||
WELS_ASM_FUNC_BEGIN vaa_calc_sad_neon
|
||||
WELS_ASM_FUNC_BEGIN VAACalcSad_neon
|
||||
|
||||
stmdb sp!, {r4-r8}
|
||||
|
||||
@ -252,7 +252,7 @@ WELS_ASM_FUNC_END
|
||||
.endm
|
||||
#endif
|
||||
|
||||
WELS_ASM_FUNC_BEGIN vaa_calc_sad_bgd_neon
|
||||
WELS_ASM_FUNC_BEGIN VAACalcSadBgd_neon
|
||||
|
||||
stmdb sp!, {r4-r10}
|
||||
|
||||
@ -633,7 +633,7 @@ WELS_ASM_FUNC_END
|
||||
#endif
|
||||
|
||||
|
||||
WELS_ASM_FUNC_BEGIN vaa_calc_sad_ssd_bgd_neon
|
||||
WELS_ASM_FUNC_BEGIN VAACalcSadSsdBgd_neon
|
||||
stmdb sp!, {r0-r12, r14}
|
||||
|
||||
ldr r4, [sp, #56] //r4 keeps the pic_stride
|
||||
@ -910,7 +910,7 @@ WELS_ASM_FUNC_END
|
||||
#endif
|
||||
|
||||
|
||||
WELS_ASM_FUNC_BEGIN vaa_calc_sad_var_neon
|
||||
WELS_ASM_FUNC_BEGIN VAACalcSadVar_neon
|
||||
stmdb sp!, {r4-r11}
|
||||
|
||||
ldr r4, [sp, #32] //r4 keeps the pic_stride
|
||||
@ -1078,7 +1078,7 @@ WELS_ASM_FUNC_END
|
||||
#endif
|
||||
|
||||
|
||||
WELS_ASM_FUNC_BEGIN vaa_calc_sad_ssd_neon
|
||||
WELS_ASM_FUNC_BEGIN VAACalcSadSsd_neon
|
||||
stmdb sp!, {r4-r12}
|
||||
|
||||
ldr r4, [sp, #36] //r4 keeps the pic_stride
|
||||
|
@ -75,6 +75,16 @@ void CDownsampling::InitDownsampleFuncs (SDownsampleFuncs& sDownsampleFunc, int
|
||||
}
|
||||
#endif//X86_ASM
|
||||
|
||||
#if defined(HAVE_NEON)
|
||||
if (iCpuFlag & WELS_CPU_NEON) {
|
||||
sDownsampleFunc.pfHalfAverage[0] = DyadicBilinearDownsamplerWidthx32_neon;
|
||||
sDownsampleFunc.pfHalfAverage[1] = DyadicBilinearDownsampler_neon;
|
||||
sDownsampleFunc.pfHalfAverage[2] = DyadicBilinearDownsampler_neon;
|
||||
sDownsampleFunc.pfHalfAverage[3] = DyadicBilinearDownsampler_neon;
|
||||
sDownsampleFunc.pfGeneralRatioChroma = GeneralBilinearAccurateDownsamplerWrap_neon;
|
||||
sDownsampleFunc.pfGeneralRatioLuma = GeneralBilinearAccurateDownsamplerWrap_neon;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
EResult CDownsampling::Process (int32_t iType, SPixMap* pSrcPixMap, SPixMap* pDstPixMap) {
|
||||
|
@ -103,7 +103,20 @@ void GeneralBilinearAccurateDownsampler_sse2 (uint8_t* pDst, const int32_t kiDst
|
||||
WELSVP_EXTERN_C_END
|
||||
#endif
|
||||
|
||||
#ifdef HAVE_NEON
|
||||
WELSVP_EXTERN_C_BEGIN
|
||||
// iSrcWidth no limitation
|
||||
HalveDownsampleFunc DyadicBilinearDownsampler_neon;
|
||||
// iSrcWidth = x32 pixels
|
||||
HalveDownsampleFunc DyadicBilinearDownsamplerWidthx32_neon;
|
||||
|
||||
GeneralDownsampleFunc GeneralBilinearAccurateDownsamplerWrap_neon;
|
||||
|
||||
void GeneralBilinearAccurateDownsampler_neon( uint8_t* pDst, const int32_t kiDstStride, const int32_t kiDstWidth, const int32_t kiDstHeight,
|
||||
uint8_t* pSrc, const int32_t kiSrcStride, const uint32_t kuiScaleX, const uint32_t kuiScaleY);
|
||||
|
||||
WELSVP_EXTERN_C_END
|
||||
#endif
|
||||
|
||||
|
||||
class CDownsampling : public IStrategy {
|
||||
|
@ -229,4 +229,14 @@ void GeneralBilinearAccurateDownsampler_c (uint8_t* pDst, const int32_t kiDstStr
|
||||
//}
|
||||
#endif //X86_ASM
|
||||
|
||||
#ifdef HAVE_NEON
|
||||
/**
 * Adapter around GeneralBilinearAccurateDownsampler_neon.
 *
 * This wrapper receives the source width/height, converts the
 * source-to-destination size ratios into fixed-point scale factors with
 * kiScaleBit (15) fractional bits, and forwards them together with the
 * otherwise unchanged buffer arguments to the NEON routine.
 *
 * @param pDst         destination buffer pointer, forwarded unchanged
 * @param kiDstStride  destination stride, forwarded unchanged
 * @param kiDstWidth   destination width; divisor of the horizontal scale
 * @param kiDstHeight  destination height; divisor of the vertical scale
 * @param pSrc         source buffer pointer, forwarded unchanged
 * @param kiSrcStride  source stride, forwarded unchanged
 * @param kiSrcWidth   source width; numerator of the horizontal scale
 * @param kiSrcHeight  source height; numerator of the vertical scale
 *
 * The call is a no-op when a destination dimension is not positive or a
 * source dimension is negative (see the guard below).
 */
void GeneralBilinearAccurateDownsamplerWrap_neon (uint8_t* pDst, const int32_t kiDstStride, const int32_t kiDstWidth, const int32_t kiDstHeight,
    uint8_t* pSrc, const int32_t kiSrcStride, const int32_t kiSrcWidth, const int32_t kiSrcHeight) {
  // A zero destination dimension would divide by zero, and any negative
  // dimension would give a negative ratio; converting inf/NaN/negative floats
  // to uint32_t is undefined behaviour. Bail out before the conversion.
  if (kiDstWidth <= 0 || kiDstHeight <= 0 || kiSrcWidth < 0 || kiSrcHeight < 0) {
    return;
  }

  const int32_t kiScaleBit = 15;
  const uint32_t kuiScale = (1 << kiScaleBit);  // fixed-point representation of 1.0

  // Keep the exact float expression (divide first, then scale) so the factors
  // handed to the NEON routine are bit-identical to the previous behaviour.
  uint32_t uiScalex = (uint32_t) ((float)kiSrcWidth / (float)kiDstWidth * kuiScale);
  uint32_t uiScaley = (uint32_t) ((float)kiSrcHeight / (float)kiDstHeight * kuiScale);

  GeneralBilinearAccurateDownsampler_neon (pDst, kiDstStride, kiDstWidth, kiDstHeight, pSrc, kiSrcStride, uiScalex, uiScaley);
}
|
||||
#endif
|
||||
WELSVP_NAMESPACE_END
|
||||
|
@ -130,6 +130,12 @@ void CSceneChangeDetection::InitSadFuncs (SadFuncPtr& pfSad, int32_t iCpuFlag)
|
||||
pfSad = WelsSampleSad8x8_sse21;
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef HAVE_NEON
|
||||
if (iCpuFlag & WELS_CPU_NEON) {
|
||||
pfSad = WelsSampleSad8x8_neon;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
|
@ -60,6 +60,12 @@ SadFunc WelsSampleSad8x8_sse21;
|
||||
WELSVP_EXTERN_C_END
|
||||
#endif
|
||||
|
||||
#ifdef HAVE_NEON
|
||||
WELSVP_EXTERN_C_BEGIN
|
||||
SadFunc WelsSampleSad8x8_neon;
|
||||
WELSVP_EXTERN_C_END
|
||||
#endif
|
||||
|
||||
WELSVP_NAMESPACE_END
|
||||
|
||||
#endif
|
||||
|
@ -65,6 +65,15 @@ void CVAACalculation::InitVaaFuncs (SVaaFuncs& sVaaFuncs, int32_t iCpuFlag) {
|
||||
sVaaFuncs.pfVAACalcSadVar = VAACalcSadVar_sse2;
|
||||
}
|
||||
#endif//X86_ASM
|
||||
#ifdef HAVE_NEON
|
||||
if ((iCpuFlag & WELS_CPU_NEON) == WELS_CPU_NEON) {
|
||||
sVaaFuncs.pfVAACalcSad = VAACalcSad_neon;
|
||||
sVaaFuncs.pfVAACalcSadBgd = VAACalcSadBgd_neon;
|
||||
sVaaFuncs.pfVAACalcSadSsd = VAACalcSadSsd_neon;
|
||||
sVaaFuncs.pfVAACalcSadSsdBgd = VAACalcSadSsdBgd_neon;
|
||||
sVaaFuncs.pfVAACalcSadVar = VAACalcSadVar_neon;
|
||||
}
|
||||
#endif//HAVE_NEON
|
||||
}
|
||||
|
||||
EResult CVAACalculation::Process (int32_t iType, SPixMap* pSrcPixMap, SPixMap* pRefPixMap) {
|
||||
|
@ -103,6 +103,16 @@ VAACalcSadSsdFunc VAACalcSadSsd_sse2;
|
||||
WELSVP_EXTERN_C_END
|
||||
#endif
|
||||
|
||||
#ifdef HAVE_NEON
|
||||
WELSVP_EXTERN_C_BEGIN
|
||||
VAACalcSadBgdFunc VAACalcSadBgd_neon;
|
||||
VAACalcSadSsdBgdFunc VAACalcSadSsdBgd_neon;
|
||||
VAACalcSadFunc VAACalcSad_neon;
|
||||
VAACalcSadVarFunc VAACalcSadVar_neon;
|
||||
VAACalcSadSsdFunc VAACalcSadSsd_neon;
|
||||
WELSVP_EXTERN_C_END
|
||||
#endif
|
||||
|
||||
class CVAACalculation : public IStrategy {
|
||||
public:
|
||||
CVAACalculation (int32_t iCpuFlag);
|
||||
|
Loading…
Reference in New Issue
Block a user