Add arm asm code for processing.
This commit is contained in:
parent
248f324c62
commit
e7cc8c2780
@ -795,7 +795,7 @@ WELS_ASM_FUNC_BEGIN DeblockChromaEq4H_neon
|
||||
WELS_ASM_FUNC_END
|
||||
|
||||
|
||||
WELS_ASM_FUNC_BEGIN enc_avc_non_zero_count_neon
|
||||
WELS_ASM_FUNC_BEGIN WelsNonZeroCount_neon
|
||||
|
||||
vld1.64 {d0-d2}, [r0]
|
||||
|
||||
@ -810,38 +810,37 @@ WELS_ASM_FUNC_BEGIN enc_avc_non_zero_count_neon
|
||||
WELS_ASM_FUNC_END
|
||||
|
||||
#ifdef APPLE_IOS
|
||||
|
||||
.macro BS_NZC_CHECK
|
||||
.macro BS_NZC_CHECK
|
||||
vld1.8 {d0,d1}, [$0]
|
||||
/* Arrange the input data --- TOP */
|
||||
ands r6, $1, #2
|
||||
beq bs_nzc_check_jump0
|
||||
|
||||
|
||||
sub r6, $0, $2, lsl #4
|
||||
sub r6, $2, lsl #3
|
||||
add r6, #12
|
||||
vld1.32 d3[1], [r6]
|
||||
|
||||
bs_nzc_check_jump0:
|
||||
|
||||
bs_nzc_check_jump0:
|
||||
vext.8 q1, q1, q0, #12
|
||||
vadd.u8 $3, q0, q1
|
||||
|
||||
|
||||
|
||||
/* Arrange the input data --- LEFT */
|
||||
ands r6, $1, #1
|
||||
beq bs_nzc_check_jump1
|
||||
|
||||
|
||||
sub r6, $0, #21
|
||||
add r7, r6, #4
|
||||
add r7, r6, #4
|
||||
vld1.8 d3[4], [r6]
|
||||
add r6, r7, #4
|
||||
vld1.8 d3[5], [r7]
|
||||
add r7, r6, #4
|
||||
vld1.8 d3[6], [r6]
|
||||
vld1.8 d3[7], [r7]
|
||||
|
||||
|
||||
bs_nzc_check_jump1:
|
||||
vzip.8 d0, d1
|
||||
vzip.8 d0, d1
|
||||
vzip.8 d0, d1
|
||||
vext.8 q1, q1, q0, #12
|
||||
vadd.u8 $4, q0, q1
|
||||
@ -852,41 +851,41 @@ bs_nzc_check_jump1:
|
||||
vabd.s16 q5, $0, $1
|
||||
vabd.s16 q6, $1, $2
|
||||
vdup.s16 $0, r6
|
||||
vabd.s16 q7, $2, $3
|
||||
vabd.s16 q8, $3, $4
|
||||
|
||||
vabd.s16 q7, $2, $3
|
||||
vabd.s16 q8, $3, $4
|
||||
|
||||
vcge.s16 q5, $0
|
||||
vcge.s16 q6, $0
|
||||
vcge.s16 q7, $0
|
||||
vcge.s16 q8, $0
|
||||
|
||||
vcge.s16 q8, $0
|
||||
|
||||
vpadd.i16 d10, d10, d11
|
||||
vpadd.i16 d11, d12, d13
|
||||
vpadd.i16 d12, d14, d15
|
||||
vpadd.i16 d13, d16, d17
|
||||
|
||||
vpadd.i16 d13, d16, d17
|
||||
|
||||
vaddhn.i16 $5, q5, q5
|
||||
vaddhn.i16 $6, q6, q6
|
||||
.endm
|
||||
|
||||
.macro BS_MV_CHECK
|
||||
.macro BS_MV_CHECK
|
||||
vldm $0, {q0,q1,q2,q3}
|
||||
|
||||
/* Arrange the input data --- TOP */
|
||||
ands r6, $1, #2
|
||||
beq bs_mv_check_jump0
|
||||
|
||||
|
||||
sub r6, $0, $2, lsl #6
|
||||
add r6, #48
|
||||
vld1.8 {d8, d9}, [r6]
|
||||
|
||||
|
||||
bs_mv_check_jump0:
|
||||
BS_COMPARE_MV q4, q0, q1, q2, q3, $3, $4
|
||||
|
||||
|
||||
/* Arrange the input data --- LEFT */
|
||||
ands r6, $1, #1
|
||||
beq bs_mv_check_jump1
|
||||
|
||||
|
||||
sub r6, $0, #52
|
||||
add r7, r6, #16
|
||||
vld1.32 d8[0], [r6]
|
||||
@ -895,7 +894,7 @@ bs_mv_check_jump0:
|
||||
add r7, r6, #16
|
||||
vld1.32 d9[0], [r6]
|
||||
vld1.32 d9[1], [r7]
|
||||
|
||||
|
||||
bs_mv_check_jump1:
|
||||
vzip.32 q0, q2
|
||||
vzip.32 q1, q3
|
||||
@ -904,7 +903,6 @@ bs_mv_check_jump1:
|
||||
BS_COMPARE_MV q4, q0, q1, q2, q3, $5, $6
|
||||
.endm
|
||||
#else
|
||||
|
||||
.macro BS_NZC_CHECK arg0, arg1, arg2, arg3, arg4
|
||||
vld1.8 {d0,d1}, [\arg0]
|
||||
/* Arrange the input data --- TOP */
|
||||
@ -999,40 +997,40 @@ bs_mv_check_jump1:
|
||||
.endm
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
WELS_ASM_FUNC_BEGIN DeblockingBSCalcEnc_neon
|
||||
|
||||
|
||||
stmdb sp!, {r5-r7}
|
||||
|
||||
|
||||
ldr r5, [sp, #12] //Save BS to r5
|
||||
|
||||
|
||||
/* Checking the nzc status */
|
||||
BS_NZC_CHECK r0, r2, r3, q14, q15 //q14,q15 save the nzc status
|
||||
|
||||
|
||||
/* For checking bS[I] = 2 */
|
||||
mov r6, #2
|
||||
vcgt.s8 q14, q14, #0
|
||||
vdup.u8 q0, r6
|
||||
vcgt.s8 q15, q15, #0
|
||||
|
||||
|
||||
vand.u8 q14, q14, q0 //q14 save the nzc check result all the time --- for dir is top
|
||||
vand.u8 q15, q15, q0 //q15 save the nzc check result all the time --- for dir is left
|
||||
|
||||
|
||||
/* Checking the mv status*/
|
||||
BS_MV_CHECK r1, r2, r3, d24, d25, d26, d27//q12, q13 save the mv status
|
||||
|
||||
|
||||
/* For checking bS[I] = 1 */
|
||||
mov r6, #1
|
||||
vdup.u8 q0, r6
|
||||
|
||||
vand.u8 q12, q12, q0 //q12 holds the mv check result from here on --- for dir is top
|
||||
vand.u8 q13, q13, q0 //q13 holds the mv check result from here on --- for dir is left
|
||||
|
||||
|
||||
|
||||
|
||||
/* Check bS[I] is '1' or '2' */
|
||||
vmax.u8 q1, q12, q14
|
||||
vmax.u8 q0, q13, q15
|
||||
|
||||
|
||||
//vstm r5, {q0, q1}
|
||||
vst1.32 {q0, q1}, [r5]
|
||||
ldmia sp!, {r5-r7}
|
||||
|
46
codec/common/expand_picture.S
Executable file → Normal file
46
codec/common/expand_picture.S
Executable file → Normal file
@ -34,13 +34,13 @@
|
||||
.text
|
||||
#include "arm_arch_common_macro.S"
|
||||
|
||||
|
||||
|
||||
WELS_ASM_FUNC_BEGIN ExpandPictureLuma_neon
|
||||
stmdb sp!, {r4-r8}
|
||||
//Save the dst
|
||||
mov r7, r0
|
||||
mov r8, r3
|
||||
|
||||
|
||||
add r4, r7, r2
|
||||
sub r4, #1
|
||||
//For the left and right expand
|
||||
@ -58,40 +58,40 @@ _expand_picture_luma_loop2:
|
||||
subs r8, #1
|
||||
bne _expand_picture_luma_loop2
|
||||
|
||||
//for the top and bottom expand
|
||||
//for the top and bottom expand
|
||||
add r2, #64
|
||||
sub r0, #32
|
||||
mla r4, r1, r3, r0
|
||||
sub r4, r1
|
||||
_expand_picture_luma_loop0:
|
||||
mov r5, #32
|
||||
mls r5, r5, r1, r0
|
||||
mov r5, #32
|
||||
mls r5, r5, r1, r0
|
||||
add r6, r4, r1
|
||||
vld1.8 {q0}, [r0]!
|
||||
vld1.8 {q1}, [r4]!
|
||||
|
||||
|
||||
mov r8, #32
|
||||
_expand_picture_luma_loop1:
|
||||
vst1.8 {q0}, [r5], r1
|
||||
vst1.8 {q1}, [r6], r1
|
||||
_expand_picture_luma_loop1:
|
||||
vst1.8 {q0}, [r5], r1
|
||||
vst1.8 {q1}, [r6], r1
|
||||
subs r8, #1
|
||||
bne _expand_picture_luma_loop1
|
||||
|
||||
|
||||
subs r2, #16
|
||||
bne _expand_picture_luma_loop0
|
||||
|
||||
//vldreq.32 d0, [r0]
|
||||
|
||||
|
||||
ldmia sp!, {r4-r8}
|
||||
WELS_ASM_FUNC_END
|
||||
|
||||
|
||||
|
||||
WELS_ASM_FUNC_BEGIN ExpandPictureChroma_neon
|
||||
stmdb sp!, {r4-r8}
|
||||
//Save the dst
|
||||
mov r7, r0
|
||||
mov r8, r3
|
||||
|
||||
|
||||
add r4, r7, r2
|
||||
sub r4, #1
|
||||
//For the left and right expand
|
||||
@ -107,31 +107,31 @@ _expand_picture_chroma_loop2:
|
||||
subs r8, #1
|
||||
bne _expand_picture_chroma_loop2
|
||||
|
||||
//for the top and bottom expand
|
||||
//for the top and bottom expand
|
||||
add r2, #32
|
||||
sub r0, #16
|
||||
mla r4, r1, r3, r0
|
||||
sub r4, r1
|
||||
_expand_picture_chroma_loop0:
|
||||
mov r5, #16
|
||||
mls r5, r5, r1, r0
|
||||
mov r5, #16
|
||||
mls r5, r5, r1, r0
|
||||
add r6, r4, r1
|
||||
vld1.8 {q0}, [r0]!
|
||||
vld1.8 {q1}, [r4]!
|
||||
|
||||
|
||||
mov r8, #16
|
||||
_expand_picture_chroma_loop1:
|
||||
vst1.8 {q0}, [r5], r1
|
||||
vst1.8 {q1}, [r6], r1
|
||||
_expand_picture_chroma_loop1:
|
||||
vst1.8 {q0}, [r5], r1
|
||||
vst1.8 {q1}, [r6], r1
|
||||
subs r8, #1
|
||||
bne _expand_picture_chroma_loop1
|
||||
|
||||
|
||||
subs r2, #16
|
||||
bne _expand_picture_chroma_loop0
|
||||
|
||||
//vldreq.32 d0, [r0]
|
||||
|
||||
|
||||
ldmia sp!, {r4-r8}
|
||||
WELS_ASM_FUNC_END
|
||||
|
||||
#endif
|
||||
#endif
|
||||
|
@ -533,7 +533,7 @@ WELS_ASM_FUNC_BEGIN WelsDecoderIChromaPredH_neon
|
||||
WELS_ASM_FUNC_END
|
||||
|
||||
|
||||
WELS_ASM_FUNC_BEGIN WelsDecoderIChromaPredDC_neon
|
||||
WELS_ASM_FUNC_BEGIN WelsDecoderIChromaPredDc_neon
|
||||
//stmdb sp!, { r2-r5, lr}
|
||||
//Load the left column data (8 bytes)
|
||||
sub r2, r0, #1
|
||||
|
276
codec/encoder/core/arm/intra_pred_neon.S
Executable file → Normal file
276
codec/encoder/core/arm/intra_pred_neon.S
Executable file → Normal file
@ -61,25 +61,25 @@
|
||||
.endm
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
// WelsI16x16LumaPredV_neon: 16x16 luma "vertical" intra prediction.
// Loads the 16 bytes located at (r1 - r2) and writes that same 16-byte row
// 16 times to consecutive addresses starting at r0 (4 loop passes x 4 stores).
//   r0 : destination; advanced by 16 bytes per store, so the output is packed
//   r1 : source pointer; the row at r1 - r2 is the one that gets replicated
//   r2 : byte offset between r1 and the row above it
//        (presumably the source stride -- confirm against the C prototype)
// Scratch: r3 (address, then loop counter), d0/d1 (= q0), condition flags.
// NOTE(review): the repeated comment line and the repeated 'bne' below look
// like the before/after sides of a whitespace-only diff, not two real
// instructions -- verify against the actual file.
WELS_ASM_FUNC_BEGIN WelsI16x16LumaPredV_neon
|
||||
//Load the 16-byte top line into d0,d1 (= q0)
|
||||
sub r3, r1, r2
|
||||
vldm r3, {d0, d1}
|
||||
|
||||
|
||||
//mov r2, #16
|
||||
mov r3, #4
|
||||
//Copy the top line into each of the 16 lines of the MB (16*16): 4 passes x 4 rows
|
||||
//Copy the top line into each of the 16 lines of the MB (16*16): 4 passes x 4 rows
|
||||
loop_0_get_i16x16_luma_pred_v:
|
||||
vst1.8 {d0,d1}, [r0]!
|
||||
vst1.8 {d0,d1}, [r0]!
|
||||
vst1.8 {d0,d1}, [r0]!
|
||||
vst1.8 {d0,d1}, [r0]!
|
||||
subs r3, #1
|
||||
bne loop_0_get_i16x16_luma_pred_v
|
||||
bne loop_0_get_i16x16_luma_pred_v
|
||||
WELS_ASM_FUNC_END
|
||||
|
||||
|
||||
|
||||
WELS_ASM_FUNC_BEGIN WelsI16x16LumaPredH_neon
|
||||
//stmdb sp!, {r4, lr}
|
||||
sub r1, r1, #1
|
||||
@ -87,10 +87,10 @@ WELS_ASM_FUNC_BEGIN WelsI16x16LumaPredH_neon
|
||||
loop_0_get_i16x16_luma_pred_h:
|
||||
//Get one byte data from left side
|
||||
vld1.8 {d0[],d1[]}, [r1], r2
|
||||
vld1.8 {d2[],d3[]}, [r1], r2
|
||||
vld1.8 {d4[],d5[]}, [r1], r2
|
||||
vld1.8 {d2[],d3[]}, [r1], r2
|
||||
vld1.8 {d4[],d5[]}, [r1], r2
|
||||
vld1.8 {d6[],d7[]}, [r1], r2
|
||||
|
||||
|
||||
//Set the line of MB using the left side byte data
|
||||
vst1.8 {d0,d1}, [r0]!
|
||||
//add r0, #16
|
||||
@ -100,9 +100,9 @@ loop_0_get_i16x16_luma_pred_h:
|
||||
//add r0, #16
|
||||
vst1.8 {d6,d7}, [r0]!
|
||||
//add r0, #16
|
||||
|
||||
|
||||
subs r3, #1
|
||||
bne loop_0_get_i16x16_luma_pred_h
|
||||
bne loop_0_get_i16x16_luma_pred_h
|
||||
|
||||
WELS_ASM_FUNC_END
|
||||
|
||||
@ -113,11 +113,11 @@ WELS_ASM_FUNC_BEGIN WelsI16x16LumaPredDc_neon
|
||||
sub r3, r1, #1
|
||||
GET_8BYTE_DATA d0, r3, r2
|
||||
GET_8BYTE_DATA d1, r3, r2
|
||||
|
||||
|
||||
//Get the top horizontal line data
|
||||
sub r3, r1, r2
|
||||
sub r3, r1, r2
|
||||
vldm r3, {d2, d3}
|
||||
|
||||
|
||||
//Calculate the sum of top horizontal line data and vertical line data
|
||||
vpaddl.u8 q0, q0
|
||||
vpaddl.u8 q1, q1
|
||||
@ -125,11 +125,11 @@ WELS_ASM_FUNC_BEGIN WelsI16x16LumaPredDc_neon
|
||||
vadd.u16 d0, d0, d1
|
||||
vpaddl.u16 d0, d0
|
||||
vpaddl.u32 d0, d0
|
||||
|
||||
//Calculate the mean value
|
||||
|
||||
//Calculate the mean value
|
||||
vrshr.u16 d0, d0, #5
|
||||
vdup.8 q0, d0[0]
|
||||
|
||||
|
||||
//Set the mean value to the all of member of MB
|
||||
mov r3, #4
|
||||
loop_0_get_i16x16_luma_pred_dc_both:
|
||||
@ -138,21 +138,21 @@ loop_0_get_i16x16_luma_pred_dc_both:
|
||||
vst1.8 {d0,d1}, [r0]!
|
||||
vst1.8 {d0,d1}, [r0]!
|
||||
subs r3, #1
|
||||
bne loop_0_get_i16x16_luma_pred_dc_both
|
||||
|
||||
bne loop_0_get_i16x16_luma_pred_dc_both
|
||||
|
||||
WELS_ASM_FUNC_END
|
||||
|
||||
|
||||
//The table for SIMD instruction {(8,7,6,5,4,3,2,1) * 5}
|
||||
CONST0_GET_I16X16_LUMA_PRED_PLANE: .long 0x191e2328, 0x050a0f14
|
||||
|
||||
//The table for SIMD instruction {-7,-6,-5,-4,-3,-2,-1,0}
|
||||
//The table for SIMD instruction {-7,-6,-5,-4,-3,-2,-1,0}
|
||||
CONST1_GET_I16X16_LUMA_PRED_PLANE: .long 0xfcfbfaf9, 0x00fffefd
|
||||
|
||||
|
||||
|
||||
WELS_ASM_FUNC_BEGIN WelsI16x16LumaPredPlane_neon
|
||||
//stmdb sp!, { r4, lr}
|
||||
|
||||
|
||||
//Load the table {(8,7,6,5,4,3,2,1) * 5}
|
||||
adr r3, CONST0_GET_I16X16_LUMA_PRED_PLANE
|
||||
vldr d0, [r3]
|
||||
@ -161,51 +161,51 @@ WELS_ASM_FUNC_BEGIN WelsI16x16LumaPredPlane_neon
|
||||
sub r3, r1, r2
|
||||
sub r1, r3, #1
|
||||
vld1.8 d1, [r1]
|
||||
|
||||
|
||||
//Pack the top[8] ~ top[15] to d2
|
||||
add r1, #9
|
||||
vld1.8 d2, [r1]
|
||||
|
||||
|
||||
//Save the top[15] to d6 for next step
|
||||
vdup.u8 d6, d2[7]
|
||||
|
||||
|
||||
//Get and pack left[-1] ~ left[6] to d4
|
||||
sub r1, r3, #1
|
||||
GET_8BYTE_DATA d4, r1, r2
|
||||
|
||||
|
||||
//Get and pack left[8] ~ left[15] to d3
|
||||
add r1, r2
|
||||
GET_8BYTE_DATA d3, r1, r2
|
||||
|
||||
|
||||
//Save the left[15] to d7 for next step
|
||||
vdup.u8 d7, d3[7]
|
||||
|
||||
|
||||
//reverse the byte order within d2 and d3
|
||||
vrev64.8 q1, q1
|
||||
|
||||
vsubl.u8 q2, d3, d4 //q2={left[8]-left[6],left[9]-left[5],left[10]-left[4], ...}
|
||||
vsubl.u8 q1, d2, d1 //q1={top[8]-top[6],top[9]-top[5],top[10]-top[4], ...}
|
||||
|
||||
|
||||
|
||||
vmovl.u8 q0, d0
|
||||
vmul.s16 q1, q0, q1 //q1 = q1*{(8,7,6,5,4,3,2,1) * 5}
|
||||
vmul.s16 q2, q0, q2 //q2 = q2*{(8,7,6,5,4,3,2,1) * 5}
|
||||
|
||||
|
||||
//Calculate the sum of items of q1, q2
|
||||
vpadd.s16 d0, d2, d3
|
||||
vpadd.s16 d1, d4, d5
|
||||
vpaddl.s16 q0, q0
|
||||
vpaddl.s32 q0, q0
|
||||
|
||||
|
||||
//Get the value of 'b', 'c' and extend to q1, q2.
|
||||
vrshr.s64 q0, #6
|
||||
vdup.s16 q1, d0[0]
|
||||
vdup.s16 q2, d1[0]
|
||||
|
||||
|
||||
//Load the table {-7,-6,-5,-4,-3,-2,-1,0} to d0
|
||||
adr r3, CONST1_GET_I16X16_LUMA_PRED_PLANE
|
||||
vld1.32 {d0}, [r3]
|
||||
|
||||
|
||||
//Get the value of 'a' and save to q3
|
||||
vaddl.u8 q3, d6, d7
|
||||
vshl.u16 q3, #4
|
||||
@ -214,57 +214,57 @@ WELS_ASM_FUNC_BEGIN WelsI16x16LumaPredPlane_neon
|
||||
vmovl.s8 q0, d0
|
||||
vmla.s16 q3, q0, q1
|
||||
vmla.s16 q3, q2, d0[0]
|
||||
|
||||
|
||||
//Calculate a+'b'*{1,2,3,4,5,6,7,8} + c*{-7}
|
||||
vshl.s16 q5, q1, #3
|
||||
vadd.s16 q5, q3
|
||||
|
||||
|
||||
//right shift 5 bits and rounding
|
||||
vqrshrun.s16 d0, q3, #5
|
||||
vqrshrun.s16 d1, q5, #5
|
||||
|
||||
|
||||
//Set the line of MB
|
||||
vst1.u32 {d0,d1}, [r0]!
|
||||
|
||||
|
||||
|
||||
|
||||
//Do the same processing for setting other lines
|
||||
mov r3, #15
|
||||
loop_0_get_i16x16_luma_pred_plane:
|
||||
loop_0_get_i16x16_luma_pred_plane:
|
||||
vadd.s16 q3, q2
|
||||
vadd.s16 q5, q2
|
||||
vqrshrun.s16 d0, q3, #5
|
||||
vqrshrun.s16 d1, q5, #5
|
||||
vst1.u32 {d0,d1}, [r0]!
|
||||
subs r3, #1
|
||||
bne loop_0_get_i16x16_luma_pred_plane
|
||||
|
||||
bne loop_0_get_i16x16_luma_pred_plane
|
||||
|
||||
WELS_ASM_FUNC_END
|
||||
|
||||
|
||||
|
||||
// WelsI4x4LumaPredV_neon: 4x4 luma "vertical" intra prediction.
// Reads one 32-bit word (4 pixels) from (r1 - r2) and stores it four times
// to consecutive words at r0, producing a packed 16-byte 4x4 block.
//   r0 : destination; advanced by 4 after each of the first three stores
//   r1 : source pointer; the word at r1 - r2 is the one that gets replicated
//   r2 : byte offset between r1 and the row above it
//        (presumably the source stride -- confirm against the C prototype)
// Scratch: r3 only (first the address, then the loaded pixels); no NEON
// registers are used despite the _neon suffix.
WELS_ASM_FUNC_BEGIN WelsI4x4LumaPredV_neon
|
||||
//stmdb sp!, { r2-r5, lr}
|
||||
//Load the top row (4 bytes) as a single word
|
||||
sub r3, r1, r2
|
||||
ldr r3, [r3]
|
||||
|
||||
|
||||
//Write the top row into each of the four rows of the 4x4 block
|
||||
str r3, [r0], #4
|
||||
str r3, [r0], #4
|
||||
str r3, [r0], #4
|
||||
str r3, [r0]
|
||||
|
||||
|
||||
WELS_ASM_FUNC_END
|
||||
|
||||
|
||||
|
||||
WELS_ASM_FUNC_BEGIN WelsI4x4LumaPredH_neon
|
||||
//stmdb sp!, { r2-r5, lr}
|
||||
//Load the left column (4 bytes)
|
||||
sub r3, r1, #1
|
||||
vld1.8 {d0[]}, [r3], r2
|
||||
vld1.8 {d1[]}, [r3], r2
|
||||
vld1.8 {d2[]}, [r3], r2
|
||||
vld1.8 {d1[]}, [r3], r2
|
||||
vld1.8 {d2[]}, [r3], r2
|
||||
vld1.8 {d3[]}, [r3]
|
||||
|
||||
|
||||
//Set the luma MB using the left side byte
|
||||
vst1.32 {d0[0]}, [r0]!
|
||||
vst1.32 {d1[0]}, [r0]!
|
||||
@ -279,36 +279,36 @@ WELS_ASM_FUNC_BEGIN WelsI4x4LumaPredDDL_neon
|
||||
//Load the top row data(8 bytes)
|
||||
sub r3, r1, r2
|
||||
vld1.32 {d0}, [r3]
|
||||
|
||||
|
||||
//For "t7 + (t7<<1)"
|
||||
vdup.8 d1, d0[7]
|
||||
|
||||
|
||||
//calculate "t0+t1,t1+t2,t2+t3...t6+t7,t7+t7"
|
||||
vext.8 d1, d0, d1, #1
|
||||
vaddl.u8 q1, d1, d0
|
||||
|
||||
|
||||
//calculate "x,t0+t1+t1+t2,t1+t2+t2+t3,...t5+t6+t6+t7,t6+t7+t7+t7"
|
||||
vext.8 q2, q1, q1, #14
|
||||
vadd.u16 q0, q1, q2
|
||||
|
||||
|
||||
//right shift 2 bits and rounding
|
||||
vqrshrn.u16 d0, q0, #2
|
||||
|
||||
|
||||
//Save "ddl0, ddl1, ddl2, ddl3"
|
||||
vext.8 d1, d0, d0, #1
|
||||
vst1.32 d1[0], [r0]!
|
||||
|
||||
|
||||
//Save "ddl1, ddl2, ddl3, ddl4"
|
||||
vext.8 d1, d0, d0, #2
|
||||
vst1.32 d1[0], [r0]!
|
||||
|
||||
|
||||
//Save "ddl2, ddl3, ddl4, ddl5"
|
||||
vext.8 d1, d0, d0, #3
|
||||
vst1.32 d1[0], [r0]!
|
||||
|
||||
vst1.32 d1[0], [r0]!
|
||||
|
||||
//Save "ddl3, ddl4, ddl5, ddl6"
|
||||
vst1.32 d0[1], [r0]
|
||||
|
||||
vst1.32 d0[1], [r0]
|
||||
|
||||
WELS_ASM_FUNC_END
|
||||
|
||||
|
||||
@ -317,29 +317,29 @@ WELS_ASM_FUNC_BEGIN WelsI4x4LumaPredDDR_neon
|
||||
//Load the top row (4 bytes)
|
||||
sub r3, r1, r2
|
||||
vld1.32 {d0[1]}, [r3]
|
||||
|
||||
|
||||
//Load the left column (5 bytes)
|
||||
sub r3, #1
|
||||
vld1.8 {d0[3]}, [r3], r2
|
||||
vld1.8 {d0[2]}, [r3], r2
|
||||
vld1.8 {d0[2]}, [r3], r2
|
||||
vld1.8 {d0[1]}, [r3], r2
|
||||
vld1.8 {d0[0]}, [r3], r2
|
||||
vld1.8 {d0[0]}, [r3], r2
|
||||
vld1.8 {d1[7]}, [r3] //For packing the right sequence to do SIMD processing
|
||||
|
||||
|
||||
|
||||
|
||||
vext.8 d2, d1, d0, #7 //d0:{L2,L1,L0,LT,T0,T1,T2,T3}
|
||||
//d2:{L3,L2,L1,L0,LT,T0,T1,T2}
|
||||
|
||||
|
||||
//q2:{L2+L3,L1+L2,L0+L1...T1+T2,T2+T3}
|
||||
vaddl.u8 q2, d2, d0
|
||||
|
||||
|
||||
//q1:{TL0+LT0,LT0+T01,...L12+L23}
|
||||
vext.8 q3, q3, q2, #14
|
||||
vadd.u16 q1, q2, q3
|
||||
|
||||
|
||||
//right shift 2 bits and rounding
|
||||
vqrshrn.u16 d0, q1, #2
|
||||
|
||||
|
||||
//Adjust the data sequence for setting luma MB of 'pred'
|
||||
vst1.32 d0[1], [r0]!
|
||||
vext.8 d0, d0, d0, #7
|
||||
@ -358,19 +358,19 @@ WELS_ASM_FUNC_BEGIN WelsI4x4LumaPredVL_neon
|
||||
sub r3, r1, r2
|
||||
vld1.32 {d0}, [r3]
|
||||
|
||||
|
||||
|
||||
vext.8 d1, d0, d0, #1
|
||||
vaddl.u8 q1, d1, d0 //q1:{t0+t1,t1+t2,t2+t3...t5+t6,x,x}
|
||||
|
||||
|
||||
vext.8 q2, q1, q1, #2
|
||||
vadd.u16 q2, q1, q2 //q2:{t0+t1+t1+t2,t1+t2+t2+t3,...t4+t5+t5+t6,x,x}
|
||||
|
||||
|
||||
//calculate the "vl0,vl1,vl2,vl3,vl4"
|
||||
vqrshrn.u16 d0, q1, #1
|
||||
|
||||
|
||||
//calculate the "vl5,vl6,vl7,vl8,vl9"
|
||||
vqrshrn.u16 d1, q2, #2
|
||||
|
||||
|
||||
//Adjust the data sequence for setting the luma MB
|
||||
vst1.32 d0[0], [r0]!
|
||||
vst1.32 d1[0], [r0]!
|
||||
@ -378,7 +378,7 @@ WELS_ASM_FUNC_BEGIN WelsI4x4LumaPredVL_neon
|
||||
vext.8 d1, d1, d1, #1
|
||||
vst1.32 d0[0], [r0]!
|
||||
vst1.32 d1[0], [r0]
|
||||
|
||||
|
||||
WELS_ASM_FUNC_END
|
||||
|
||||
|
||||
@ -387,34 +387,34 @@ WELS_ASM_FUNC_BEGIN WelsI4x4LumaPredVR_neon
|
||||
//Load the top row (4 bytes)
|
||||
sub r3, r1, r2
|
||||
vld1.32 {d0[1]}, [r3]
|
||||
|
||||
|
||||
//Load the left column (4 bytes)
|
||||
sub r3, #1
|
||||
vld1.8 {d0[3]}, [r3], r2
|
||||
vld1.8 {d0[3]}, [r3], r2
|
||||
vld1.8 {d0[2]}, [r3], r2
|
||||
vld1.8 {d0[1]}, [r3], r2
|
||||
vld1.8 {d0[0]}, [r3]
|
||||
vld1.8 {d0[1]}, [r3], r2
|
||||
vld1.8 {d0[0]}, [r3]
|
||||
|
||||
|
||||
|
||||
vext.8 d1, d0, d0, #7
|
||||
vaddl.u8 q1, d0, d1 //q1:{X,L2+L1,L1+L0,L0+LT,LT+T0,T0+T1,T1+T2,T2+T3}
|
||||
|
||||
|
||||
vext.u8 q2, q1, q1, #14
|
||||
vadd.u16 q2, q2, q1 //q2:{X,L2+L1+L1+L0,L1+L0+L0+LT,...T1+T2+T2+T3}
|
||||
|
||||
|
||||
//Calculate the vr0 ~ vr9
|
||||
vqrshrn.u16 d1, q2, #2
|
||||
vqrshrn.u16 d0, q1, #1
|
||||
|
||||
|
||||
//Adjust the data sequence for setting the luma MB
|
||||
vst1.32 d0[1], [r0]!
|
||||
vst1.32 d1[1], [r0]!
|
||||
//add r2, r0, r1
|
||||
vst1.8 d1[3], [r0]!
|
||||
vst1.16 d0[2], [r0]!
|
||||
vst1.16 d0[2], [r0]!
|
||||
vst1.8 d0[6], [r0]!
|
||||
vst1.8 d1[2], [r0]!
|
||||
vst1.16 d1[2], [r0]!
|
||||
vst1.16 d1[2], [r0]!
|
||||
vst1.8 d1[6], [r0]
|
||||
WELS_ASM_FUNC_END
|
||||
|
||||
@ -426,29 +426,29 @@ WELS_ASM_FUNC_BEGIN WelsI4x4LumaPredHU_neon
|
||||
mov r1, #3
|
||||
mul r1, r2
|
||||
add r1, r3
|
||||
vld1.8 {d0[]}, [r1]
|
||||
vld1.8 {d0[4]}, [r3], r2
|
||||
vld1.8 {d0[]}, [r1]
|
||||
vld1.8 {d0[4]}, [r3], r2
|
||||
vld1.8 {d0[5]}, [r3], r2
|
||||
vld1.8 {d0[6]}, [r3], r2 //d0:{L3,L3,L3,L3,L0,L1,L2,L3}
|
||||
vld1.8 {d0[6]}, [r3], r2 //d0:{L3,L3,L3,L3,L0,L1,L2,L3}
|
||||
|
||||
vext.8 d1, d0, d0, #1
|
||||
vaddl.u8 q2, d0, d1 //q2:{L3+L3,L3+L3,L3+L3,L3+L0,L0+L1,L1+L2,L2+L3,L3+L3}
|
||||
|
||||
vaddl.u8 q2, d0, d1 //q2:{L3+L3,L3+L3,L3+L3,L3+L0,L0+L1,L1+L2,L2+L3,L3+L3}
|
||||
|
||||
vext.u8 d2, d5, d4, #2
|
||||
vadd.u16 d3, d2, d5 //d3:{L0+L1+L1+L2,L1+L2+L2+L3,L2+L3+L3+L3,L3+L3+L3+L3}
|
||||
|
||||
vadd.u16 d3, d2, d5 //d3:{L0+L1+L1+L2,L1+L2+L2+L3,L2+L3+L3+L3,L3+L3+L3+L3}
|
||||
|
||||
//Calculate the hu0 ~ hu5
|
||||
vqrshrn.u16 d2, q2, #1
|
||||
vqrshrn.u16 d1, q1, #2
|
||||
|
||||
|
||||
//Adjust the data sequence for setting the luma MB
|
||||
vzip.8 d2, d1
|
||||
vst1.32 d1[0], [r0]!
|
||||
vext.8 d2, d1, d1, #2
|
||||
vext.8 d2, d1, d1, #2
|
||||
vst1.32 d2[0], [r0]!
|
||||
vst1.32 d1[1], [r0]!
|
||||
vst1.32 d0[0], [r0]
|
||||
|
||||
|
||||
WELS_ASM_FUNC_END
|
||||
|
||||
|
||||
@ -458,22 +458,22 @@ WELS_ASM_FUNC_BEGIN WelsI4x4LumaPredHD_neon
|
||||
sub r3, r1, r2
|
||||
sub r3, #1
|
||||
vld1.32 {d0[1]}, [r3], r2
|
||||
vld1.8 {d0[3]}, [r3], r2
|
||||
vld1.8 {d0[3]}, [r3], r2
|
||||
vld1.8 {d0[2]}, [r3], r2
|
||||
vld1.8 {d0[1]}, [r3], r2
|
||||
vld1.8 {d0[1]}, [r3], r2
|
||||
vld1.8 {d0[0]}, [r3] //d0:{L3,L2,L1,L0,LT,T0,T1,T2}
|
||||
|
||||
|
||||
vext.8 d1, d0, d0, #7
|
||||
vaddl.u8 q1, d0, d1 //q1:{x,L3+L2,L2+L1,L1+L0,L0+LT,LT+T0,T0+T1,T1+T2}
|
||||
|
||||
|
||||
vext.u8 q2, q1, q1, #14 //q2:{x,x, L3+L2,L2+L1,L1+L0,L0+LT,LT+T0,T0+T1}
|
||||
vadd.u16 q3, q2, q1 //q3:{x,x,L3+L2+L2+L1,L2+L1+L1+L0,L1+L0+L0+LT,L0+LT+LT+T0,LT+T0+T0+T1,T0+T1+T1+T2}
|
||||
|
||||
|
||||
//Calculate the hd0~hd9
|
||||
vqrshrn.u16 d1, q3, #2
|
||||
vqrshrn.u16 d0, q2, #1
|
||||
|
||||
|
||||
//Adjust the data sequence for setting the luma MB
|
||||
vmov d3, d1
|
||||
vtrn.8 d0, d1
|
||||
@ -501,25 +501,25 @@ WELS_ASM_FUNC_BEGIN WelsIChromaPredV_neon
|
||||
vst1.8 {d0}, [r0]!
|
||||
vst1.8 {d0}, [r0]!
|
||||
vst1.8 {d0}, [r0]!
|
||||
vst1.8 {d0}, [r0]
|
||||
|
||||
vst1.8 {d0}, [r0]
|
||||
|
||||
WELS_ASM_FUNC_END
|
||||
|
||||
|
||||
|
||||
WELS_ASM_FUNC_BEGIN WelsIChromaPredH_neon
|
||||
//stmdb sp!, { r2-r5, lr}
|
||||
////Get the left column (8 byte)
|
||||
sub r3, r1, #1
|
||||
vld1.8 {d0[]}, [r3], r2
|
||||
vld1.8 {d1[]}, [r3], r2
|
||||
vld1.8 {d2[]}, [r3], r2
|
||||
vld1.8 {d1[]}, [r3], r2
|
||||
vld1.8 {d2[]}, [r3], r2
|
||||
vld1.8 {d3[]}, [r3], r2
|
||||
vld1.8 {d4[]}, [r3], r2
|
||||
vld1.8 {d5[]}, [r3], r2
|
||||
vld1.8 {d6[]}, [r3], r2
|
||||
vld1.8 {d5[]}, [r3], r2
|
||||
vld1.8 {d6[]}, [r3], r2
|
||||
vld1.8 {d7[]}, [r3]
|
||||
|
||||
//Set the chroma MB using left column data
|
||||
|
||||
//Set the chroma MB using left column data
|
||||
vst1.8 {d0}, [r0]!
|
||||
vst1.8 {d1}, [r0]!
|
||||
vst1.8 {d2}, [r0]!
|
||||
@ -527,8 +527,8 @@ WELS_ASM_FUNC_BEGIN WelsIChromaPredH_neon
|
||||
vst1.8 {d4}, [r0]!
|
||||
vst1.8 {d5}, [r0]!
|
||||
vst1.8 {d6}, [r0]!
|
||||
vst1.8 {d7}, [r0]
|
||||
|
||||
vst1.8 {d7}, [r0]
|
||||
|
||||
WELS_ASM_FUNC_END
|
||||
|
||||
|
||||
@ -536,36 +536,36 @@ WELS_ASM_FUNC_BEGIN WelsIChromaPredDc_neon
|
||||
//stmdb sp!, { r2-r5, lr}
|
||||
//Load the left column data (8 bytes)
|
||||
sub r3, r1, #1
|
||||
GET_8BYTE_DATA d0, r3, r2
|
||||
|
||||
GET_8BYTE_DATA d0, r3, r2
|
||||
|
||||
//Load the top row data (8 bytes)
|
||||
sub r3, r1, r2
|
||||
sub r3, r1, r2
|
||||
vldr d1, [r3]
|
||||
|
||||
|
||||
//Calculate the sum of left column and top row
|
||||
vpaddl.u8 q0, q0
|
||||
vpaddl.u16 q0, q0
|
||||
vadd.u32 d2, d0, d1 //'m1' save to d2
|
||||
|
||||
vrshr.u32 q0, q0, #2 //calculate 'm2','m3'
|
||||
vrshr.u32 d2, d2, #3 //calculate 'm4'
|
||||
|
||||
|
||||
vrshr.u32 q0, q0, #2 //calculate 'm2','m3'
|
||||
vrshr.u32 d2, d2, #3 //calculate 'm4'
|
||||
|
||||
//duplicate the 'mx' to a vector line
|
||||
vdup.8 d4, d2[0]
|
||||
vdup.8 d5, d1[4]
|
||||
vdup.8 d6, d0[4]
|
||||
vdup.8 d7, d2[4]
|
||||
|
||||
//Set the chroma MB
|
||||
|
||||
//Set the chroma MB
|
||||
vst2.32 {d4[0],d5[0]}, [r0]!
|
||||
vst2.32 {d4[0],d5[0]}, [r0]!
|
||||
vst2.32 {d4[0],d5[0]}, [r0]!
|
||||
vst2.32 {d4[0],d5[0]}, [r0]!
|
||||
vst2.32 {d4[0],d5[0]}, [r0]!
|
||||
vst2.32 {d6[0],d7[0]}, [r0]!
|
||||
vst2.32 {d6[0],d7[0]}, [r0]!
|
||||
vst2.32 {d6[0],d7[0]}, [r0]!
|
||||
vst2.32 {d6[0],d7[0]}, [r0]
|
||||
|
||||
|
||||
WELS_ASM_FUNC_END
|
||||
|
||||
|
||||
@ -579,36 +579,36 @@ WELS_ASM_FUNC_BEGIN WelsIChromaPredPlane_neon
|
||||
//Load the top row data
|
||||
sub r3, r1, #1
|
||||
sub r3, r2
|
||||
vld1.32 {d1[0]}, [r3]
|
||||
vld1.32 {d1[0]}, [r3]
|
||||
add r3, #5
|
||||
vld1.32 {d0[0]}, [r3]
|
||||
|
||||
|
||||
//Load the left column data
|
||||
sub r3, #5
|
||||
vld1.8 {d1[4]}, [r3], r2
|
||||
vld1.8 {d1[5]}, [r3], r2
|
||||
vld1.8 {d1[5]}, [r3], r2
|
||||
vld1.8 {d1[6]}, [r3], r2
|
||||
vld1.8 {d1[7]}, [r3], r2 //d1:{LT,T0,T1,T2,LT,L0,L1,L2}
|
||||
vld1.8 {d1[7]}, [r3], r2 //d1:{LT,T0,T1,T2,LT,L0,L1,L2}
|
||||
add r3, r2
|
||||
vld1.8 {d0[4]}, [r3], r2
|
||||
vld1.8 {d0[5]}, [r3], r2
|
||||
vld1.8 {d0[6]}, [r3], r2
|
||||
vld1.8 {d0[7]}, [r3] //d0:{T4,T5,T6,T7,L4,L5,L6.L7}
|
||||
|
||||
|
||||
|
||||
|
||||
//Save T7 to d3 for next step
|
||||
vdup.u8 d3, d0[3]
|
||||
//Save L7 to d4 for next step
|
||||
vdup.u8 d4, d0[7]
|
||||
|
||||
|
||||
//Calculate the value of 'a' and save to q2
|
||||
vaddl.u8 q2, d3, d4
|
||||
vshl.u16 q2, #4
|
||||
|
||||
|
||||
//Load the table {{1,2,3,4,1,2,3,4}*17}
|
||||
adr r3, CONST0_GET_I_CHROMA_PRED_PLANE
|
||||
vld1.32 {d2}, [r3]
|
||||
|
||||
|
||||
//Calculate the 'b','c', and save to q0
|
||||
vrev32.8 d1, d1
|
||||
vsubl.u8 q0, d0, d1
|
||||
@ -617,32 +617,32 @@ WELS_ASM_FUNC_BEGIN WelsIChromaPredPlane_neon
|
||||
vpaddl.s16 q0, q0
|
||||
vpaddl.s32 q0, q0
|
||||
vrshr.s64 q0, #5
|
||||
|
||||
|
||||
//Load the table {-3,-2,-1,0,1,2,3,4} to q3
|
||||
adr r3, CONST1_GET_I_CHROMA_PRED_PLANE
|
||||
vld1.32 {d6, d7}, [r3]
|
||||
|
||||
|
||||
//Duplicate the 'b','c' to q0, q1 for SIMD instruction
|
||||
vdup.s16 q1, d1[0]
|
||||
vdup.s16 q0, d0[0]
|
||||
|
||||
|
||||
//Calculate the "(a + b * (j - 3) + c * (- 3) + 16) >> 5;"
|
||||
vmla.s16 q2, q0, q3
|
||||
vmla.s16 q2, q1, d6[0]
|
||||
vqrshrun.s16 d0, q2, #5
|
||||
|
||||
|
||||
//Set a line of chroma MB
|
||||
vst1.u32 {d0}, [r0]!
|
||||
|
||||
|
||||
//Do the same processing for each line.
|
||||
mov r3, #7
|
||||
loop_0_get_i_chroma_pred_plane:
|
||||
loop_0_get_i_chroma_pred_plane:
|
||||
vadd.s16 q2, q1
|
||||
vqrshrun.s16 d0, q2, #5
|
||||
vst1.u32 {d0}, [r0]!
|
||||
subs r3, #1
|
||||
bne loop_0_get_i_chroma_pred_plane
|
||||
|
||||
bne loop_0_get_i_chroma_pred_plane
|
||||
|
||||
WELS_ASM_FUNC_END
|
||||
|
||||
#endif
|
||||
|
388
codec/encoder/core/arm/intra_pred_sad_3_opt_neon.S
Executable file → Normal file
388
codec/encoder/core/arm/intra_pred_sad_3_opt_neon.S
Executable file → Normal file
@ -29,14 +29,14 @@
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
*/
|
||||
|
||||
|
||||
#ifdef HAVE_NEON
|
||||
.text
|
||||
#include "arm_arch_common_macro.S"
|
||||
|
||||
|
||||
|
||||
#ifdef APPLE_IOS
|
||||
//The data sequence will be used
|
||||
//The data sequence will be used
|
||||
.macro GET_8BYTE_DATA_L0
|
||||
vld1.8 {$0[0]}, [$1], $2
|
||||
vld1.8 {$0[1]}, [$1], $2
|
||||
@ -49,7 +49,7 @@
|
||||
.endm
|
||||
|
||||
|
||||
.macro HDM_TRANSFORM_4X4_L0
|
||||
.macro HDM_TRANSFORM_4X4_L0
|
||||
|
||||
//Do the vertical transform
|
||||
vaddl.u8 q0, $0, $1 //{0,4,8,12,1,5,9,13}
|
||||
@ -57,15 +57,15 @@
|
||||
vswp d1, d2
|
||||
vadd.s16 q2, q0, q1 //{0,1,2,3,4,5,6,7}
|
||||
vsub.s16 q1, q0, q1 //{12,13,14,15,8,9,10,11}
|
||||
|
||||
|
||||
//Do the horizontal transform
|
||||
vtrn.32 q2, q1
|
||||
vadd.s16 q0, q2, q1
|
||||
vsub.s16 q1, q2, q1
|
||||
|
||||
|
||||
vtrn.16 q0, q1
|
||||
vadd.s16 q2, q0, q1
|
||||
vsub.s16 q1, q0, q1
|
||||
vsub.s16 q1, q0, q1
|
||||
|
||||
vmov.s16 d0, d4
|
||||
vmov.s16 d1, d2
|
||||
@ -76,9 +76,9 @@
|
||||
vtrn.32 d0, d1 //{0,1,3,2}
|
||||
vaba.s16 $5, d0, $2 //16x16_v
|
||||
vaba.s16 $5, d1, $8
|
||||
vaba.s16 $5, d5, $8
|
||||
vaba.s16 $5, d5, $8
|
||||
vadd.u16 $5, d3
|
||||
|
||||
|
||||
//16x16_h
|
||||
vtrn.16 d4, d5 //{0,4,12,8}
|
||||
vaba.s16 $6, d4, $3 //16x16_h
|
||||
@ -87,7 +87,7 @@
|
||||
vadd.u16 d2, d3
|
||||
vadd.u16 d2, d5
|
||||
vadd.u16 $6, d2
|
||||
|
||||
|
||||
//16x16_dc_both
|
||||
vaba.s16 $7, d4, $4 //16x16_dc_both
|
||||
vadd.u16 $7, d2
|
||||
@ -95,7 +95,7 @@
|
||||
.endm
|
||||
|
||||
#else
|
||||
//The data sequence will be used
|
||||
//The data sequence will be used
|
||||
.macro GET_8BYTE_DATA_L0 arg0, arg1, arg2
|
||||
vld1.8 {\arg0[0]}, [\arg1], \arg2
|
||||
vld1.8 {\arg0[1]}, [\arg1], \arg2
|
||||
@ -115,15 +115,15 @@
|
||||
vswp d1, d2
|
||||
vadd.s16 q2, q0, q1 //{0,1,2,3,4,5,6,7}
|
||||
vsub.s16 q1, q0, q1 //{12,13,14,15,8,9,10,11}
|
||||
|
||||
|
||||
//Do the horizontal transform
|
||||
vtrn.32 q2, q1
|
||||
vadd.s16 q0, q2, q1
|
||||
vsub.s16 q1, q2, q1
|
||||
|
||||
|
||||
vtrn.16 q0, q1
|
||||
vadd.s16 q2, q0, q1
|
||||
vsub.s16 q1, q0, q1
|
||||
vsub.s16 q1, q0, q1
|
||||
|
||||
vmov.s16 d0, d4
|
||||
vmov.s16 d1, d2
|
||||
@ -134,9 +134,9 @@
|
||||
vtrn.32 d0, d1 //{0,1,3,2}
|
||||
vaba.s16 \arg5, d0, \arg2 //16x16_v
|
||||
vaba.s16 \arg5, d1, \arg8
|
||||
vaba.s16 \arg5, d5, \arg8
|
||||
vaba.s16 \arg5, d5, \arg8
|
||||
vadd.u16 \arg5, d3
|
||||
|
||||
|
||||
//16x16_h
|
||||
vtrn.16 d4, d5 //{0,4,12,8}
|
||||
vaba.s16 \arg6, d4, \arg3 //16x16_h
|
||||
@ -145,42 +145,42 @@
|
||||
vadd.u16 d2, d3
|
||||
vadd.u16 d2, d5
|
||||
vadd.u16 \arg6, d2
|
||||
|
||||
|
||||
//16x16_dc_both
|
||||
vaba.s16 \arg7, d4, \arg4 //16x16_dc_both
|
||||
vadd.u16 \arg7, d2
|
||||
.endm
|
||||
#endif
|
||||
|
||||
WELS_ASM_FUNC_BEGIN satd_intra_16x16_x3_opt_neon
|
||||
WELS_ASM_FUNC_BEGIN WelsIntra16x16Combined3Satd_neon
|
||||
stmdb sp!, {r4-r7, lr}
|
||||
|
||||
//Get the top line data to 'q15'(16 bytes)
|
||||
sub r7, r0, r1
|
||||
vld1.8 {q15}, [r7]
|
||||
|
||||
|
||||
//Get the left column data to 'q14' (16 bytes)
|
||||
sub r7, r0, #1
|
||||
GET_8BYTE_DATA_L0 d28, r7, r1
|
||||
GET_8BYTE_DATA_L0 d29, r7, r1
|
||||
|
||||
GET_8BYTE_DATA_L0 d29, r7, r1
|
||||
|
||||
//Calculate the mean value and save to 'q13->d27(reserve the d26)' (2 bytes)
|
||||
//Calculate the 16x16_dc_both mode SATD
|
||||
//Calculate the 16x16_dc_both mode SATD
|
||||
vaddl.u8 q0, d30, d31
|
||||
vaddl.u8 q1, d28, d29
|
||||
vadd.u16 q0, q1
|
||||
vadd.u16 d0, d1
|
||||
vpaddl.u16 d0, d0
|
||||
vpaddl.u32 d0, d0
|
||||
|
||||
//Calculate the mean value
|
||||
|
||||
//Calculate the mean value
|
||||
vrshr.u16 d0, #5
|
||||
vshl.u16 d27, d0, #4
|
||||
|
||||
|
||||
vshl.u16 d27, d0, #4
|
||||
|
||||
|
||||
//Calculate the 16x16_v mode SATD and save to "q11, 12"
|
||||
vshll.u8 q0, d30, #2
|
||||
vshll.u8 q1, d31, #2
|
||||
vshll.u8 q1, d31, #2
|
||||
vtrn.32 q0, q1
|
||||
vadd.s16 q2, q0, q1
|
||||
vsub.s16 q1, q0, q1
|
||||
@ -191,7 +191,7 @@ WELS_ASM_FUNC_BEGIN satd_intra_16x16_x3_opt_neon
|
||||
//{8,9,11,10, 12,13,15,14} q11
|
||||
//Calculate the 16x16_h mode SATD and save to "q9, q10"
|
||||
vshll.u8 q0, d28, #2
|
||||
vshll.u8 q1, d29, #2
|
||||
vshll.u8 q1, d29, #2
|
||||
vtrn.32 q0, q1
|
||||
vadd.s16 q2, q0, q1
|
||||
vsub.s16 q1, q0, q1
|
||||
@ -199,64 +199,64 @@ WELS_ASM_FUNC_BEGIN satd_intra_16x16_x3_opt_neon
|
||||
vadd.s16 q10, q2, q1
|
||||
vsub.s16 q9, q2, q1
|
||||
vtrn.32 q10, q9 //{0,1,3,2, 4,5,7,6} q10
|
||||
//{8,9,11,10, 12,13,15,14} q9
|
||||
|
||||
//{8,9,11,10, 12,13,15,14} q9
|
||||
|
||||
vmov.i32 d17, #0//Save the SATD of DC_BOTH
|
||||
vmov.i32 d16, #0//Save the SATD of H
|
||||
vmov.i32 d15, #0//Save the SATD of V
|
||||
vmov.i32 d14, #0//For zero D register
|
||||
//Load the p_enc data and save to "q3 ~ q6"--- 16X4 bytes
|
||||
//Load the p_enc data and save to "q3 ~ q6"--- 16X4 bytes
|
||||
vld1.32 {q3}, [r2], r3
|
||||
vld1.32 {q4}, [r2], r3
|
||||
vld1.32 {q5}, [r2], r3
|
||||
vld1.32 {q6}, [r2], r3
|
||||
vld1.32 {q6}, [r2], r3
|
||||
vtrn.32 q3, q4
|
||||
vtrn.32 q5, q6
|
||||
|
||||
HDM_TRANSFORM_4X4_L0 d6, d10, d24, d20, d27, d15, d16, d17, d14
|
||||
vtrn.32 q5, q6
|
||||
|
||||
HDM_TRANSFORM_4X4_L0 d6, d10, d24, d20, d27, d15, d16, d17, d14
|
||||
HDM_TRANSFORM_4X4_L0 d7, d11, d22, d20, d27, d15, d16, d17, d14
|
||||
HDM_TRANSFORM_4X4_L0 d8, d12, d25, d20, d27, d15, d16, d17, d14
|
||||
HDM_TRANSFORM_4X4_L0 d9, d13, d23, d20, d27, d15, d16, d17, d14
|
||||
HDM_TRANSFORM_4X4_L0 d9, d13, d23, d20, d27, d15, d16, d17, d14
|
||||
|
||||
//Load the p_enc data and save to "q3 ~ q6"--- 16X4 bytes
|
||||
//Load the p_enc data and save to "q3 ~ q6"--- 16X4 bytes
|
||||
vld1.32 {q3}, [r2], r3
|
||||
vld1.32 {q4}, [r2], r3
|
||||
vld1.32 {q5}, [r2], r3
|
||||
vld1.32 {q6}, [r2], r3
|
||||
vld1.32 {q6}, [r2], r3
|
||||
vtrn.32 q3, q4
|
||||
vtrn.32 q5, q6
|
||||
|
||||
HDM_TRANSFORM_4X4_L0 d6, d10, d24, d21, d27, d15, d16, d17, d14
|
||||
vtrn.32 q5, q6
|
||||
|
||||
HDM_TRANSFORM_4X4_L0 d6, d10, d24, d21, d27, d15, d16, d17, d14
|
||||
HDM_TRANSFORM_4X4_L0 d7, d11, d22, d21, d27, d15, d16, d17, d14
|
||||
HDM_TRANSFORM_4X4_L0 d8, d12, d25, d21, d27, d15, d16, d17, d14
|
||||
HDM_TRANSFORM_4X4_L0 d9, d13, d23, d21, d27, d15, d16, d17, d14
|
||||
|
||||
//Load the p_enc data and save to "q3 ~ q6"--- 16X4 bytes
|
||||
HDM_TRANSFORM_4X4_L0 d9, d13, d23, d21, d27, d15, d16, d17, d14
|
||||
|
||||
//Load the p_enc data and save to "q3 ~ q6"--- 16X4 bytes
|
||||
vld1.32 {q3}, [r2], r3
|
||||
vld1.32 {q4}, [r2], r3
|
||||
vld1.32 {q5}, [r2], r3
|
||||
vld1.32 {q6}, [r2], r3
|
||||
vld1.32 {q6}, [r2], r3
|
||||
vtrn.32 q3, q4
|
||||
vtrn.32 q5, q6
|
||||
|
||||
HDM_TRANSFORM_4X4_L0 d6, d10, d24, d18, d27, d15, d16, d17, d14
|
||||
vtrn.32 q5, q6
|
||||
|
||||
HDM_TRANSFORM_4X4_L0 d6, d10, d24, d18, d27, d15, d16, d17, d14
|
||||
HDM_TRANSFORM_4X4_L0 d7, d11, d22, d18, d27, d15, d16, d17, d14
|
||||
HDM_TRANSFORM_4X4_L0 d8, d12, d25, d18, d27, d15, d16, d17, d14
|
||||
HDM_TRANSFORM_4X4_L0 d9, d13, d23, d18, d27, d15, d16, d17, d14
|
||||
|
||||
//Load the p_enc data and save to "q3 ~ q6"--- 16X4 bytes
|
||||
HDM_TRANSFORM_4X4_L0 d9, d13, d23, d18, d27, d15, d16, d17, d14
|
||||
|
||||
//Load the p_enc data and save to "q3 ~ q6"--- 16X4 bytes
|
||||
vld1.32 {q3}, [r2], r3
|
||||
vld1.32 {q4}, [r2], r3
|
||||
vld1.32 {q5}, [r2], r3
|
||||
vld1.32 {q6}, [r2], r3
|
||||
vld1.32 {q6}, [r2], r3
|
||||
vtrn.32 q3, q4
|
||||
vtrn.32 q5, q6
|
||||
|
||||
HDM_TRANSFORM_4X4_L0 d6, d10, d24, d19, d27, d15, d16, d17, d14
|
||||
vtrn.32 q5, q6
|
||||
|
||||
HDM_TRANSFORM_4X4_L0 d6, d10, d24, d19, d27, d15, d16, d17, d14
|
||||
HDM_TRANSFORM_4X4_L0 d7, d11, d22, d19, d27, d15, d16, d17, d14
|
||||
HDM_TRANSFORM_4X4_L0 d8, d12, d25, d19, d27, d15, d16, d17, d14
|
||||
HDM_TRANSFORM_4X4_L0 d9, d13, d23, d19, d27, d15, d16, d17, d14
|
||||
|
||||
HDM_TRANSFORM_4X4_L0 d9, d13, d23, d19, d27, d15, d16, d17, d14
|
||||
|
||||
//Get the data from stack
|
||||
ldr r5, [sp, #20] //the addr of Best_mode
|
||||
ldr r6, [sp, #24] //the value of i_lambda
|
||||
@ -266,19 +266,19 @@ WELS_ASM_FUNC_BEGIN satd_intra_16x16_x3_opt_neon
|
||||
vpaddl.u16 d15, d15
|
||||
vpaddl.u32 d15, d15
|
||||
vmov.u32 r0, d15[0]
|
||||
|
||||
|
||||
//vadd.u16 d22, d23
|
||||
vrshr.u16 d16, #1
|
||||
vpaddl.u16 d16, d16
|
||||
vpaddl.u32 d16, d16
|
||||
vmov.u32 r1, d16[0]
|
||||
vmov.u32 r1, d16[0]
|
||||
add r1, r6, lsl #1
|
||||
|
||||
|
||||
//vadd.u16 d20, d21
|
||||
vrshr.u16 d17, #1
|
||||
vpaddl.u16 d17, d17
|
||||
vpaddl.u32 d17, d17
|
||||
vmov.u32 r2, d17[0]
|
||||
vmov.u32 r2, d17[0]
|
||||
add r2, r6, lsl #1
|
||||
|
||||
mov r4, #0
|
||||
@ -295,60 +295,60 @@ WELS_ASM_FUNC_BEGIN satd_intra_16x16_x3_opt_neon
|
||||
WELS_ASM_FUNC_END
|
||||
|
||||
|
||||
WELS_ASM_FUNC_BEGIN sad_intra_16x16_x3_opt_neon
|
||||
WELS_ASM_FUNC_BEGIN WelsIntra16x16Combined3Sad_neon
|
||||
stmdb sp!, {r4-r7, lr}
|
||||
|
||||
|
||||
//Get the top line data to 'q15'(16 bytes)
|
||||
sub r4, r0, r1
|
||||
vld1.8 {q15}, [r4]
|
||||
|
||||
|
||||
//Get the left colume data to 'q14' (16 bytes)
|
||||
sub r4, r0, #1
|
||||
GET_8BYTE_DATA_L0 d28, r4, r1
|
||||
GET_8BYTE_DATA_L0 d29, r4, r1
|
||||
|
||||
GET_8BYTE_DATA_L0 d29, r4, r1
|
||||
|
||||
//Calculate the mean value and save to 'q13' (8 bytes)
|
||||
//Calculate the 16x16_dc_both mode SATD
|
||||
//Calculate the 16x16_dc_both mode SATD
|
||||
vaddl.u8 q0, d30, d31
|
||||
vaddl.u8 q1, d28, d29
|
||||
vadd.u16 q0, q1
|
||||
vadd.u16 d0, d1
|
||||
vpaddl.u16 d0, d0
|
||||
vpaddl.u32 d0, d0
|
||||
|
||||
//Calculate the mean value
|
||||
|
||||
//Calculate the mean value
|
||||
vrshr.u16 d0, d0, #5
|
||||
vdup.8 q13, d0[0]
|
||||
|
||||
|
||||
sub r4, r0, #1
|
||||
|
||||
|
||||
vmov.i32 q12, #0//Save the SATD of DC_BOTH
|
||||
vmov.i32 q11, #0//Save the SATD of H
|
||||
vmov.i32 q10, #0//Save the SATD of V
|
||||
|
||||
|
||||
mov lr, #16
|
||||
sad_intra_16x16_x3_opt_loop0:
|
||||
//Get the left colume data to 'd0' (16 bytes)
|
||||
vld1.8 {d0[]}, [r4], r1
|
||||
vld1.8 {d0[]}, [r4], r1
|
||||
|
||||
//Load the p_enc data and save to "q1 ~ q2"--- 16X4 bytes
|
||||
//Load the p_enc data and save to "q1 ~ q2"--- 16X4 bytes
|
||||
vld1.8 {q1}, [r2], r3
|
||||
|
||||
|
||||
subs lr, #1
|
||||
//Do the SAD for top colume
|
||||
vabal.u8 q12, d30, d2
|
||||
vabal.u8 q12, d31, d3
|
||||
vabal.u8 q12, d31, d3
|
||||
|
||||
//Do the SAD for left colume
|
||||
vabal.u8 q11, d0, d2
|
||||
vabal.u8 q11, d0, d3
|
||||
vabal.u8 q11, d0, d3
|
||||
|
||||
//Do the SAD for mean value
|
||||
vabal.u8 q10, d26, d2
|
||||
vabal.u8 q10, d26, d3
|
||||
|
||||
vabal.u8 q10, d26, d3
|
||||
|
||||
bne sad_intra_16x16_x3_opt_loop0
|
||||
|
||||
|
||||
//Get the data from stack
|
||||
ldr r5, [sp, #20] //the addr of Best_mode
|
||||
ldr r6, [sp, #24] //the value of i_lambda
|
||||
@ -357,19 +357,19 @@ sad_intra_16x16_x3_opt_loop0:
|
||||
vpaddl.u16 d24, d24
|
||||
vpaddl.u32 d24, d24
|
||||
vmov.u32 r0, d24[0]
|
||||
|
||||
|
||||
vadd.u16 d22, d23
|
||||
vpaddl.u16 d22, d22
|
||||
vpaddl.u32 d22, d22
|
||||
vmov.u32 r1, d22[0]
|
||||
vmov.u32 r1, d22[0]
|
||||
add r1, r6, lsl #1
|
||||
|
||||
|
||||
vadd.u16 d20, d21
|
||||
vpaddl.u16 d20, d20
|
||||
vpaddl.u32 d20, d20
|
||||
vmov.u32 r2, d20[0]
|
||||
vmov.u32 r2, d20[0]
|
||||
add r2, r6, lsl #1
|
||||
|
||||
|
||||
mov r4, #0
|
||||
cmp r1, r0
|
||||
movcc r0, r1
|
||||
@ -384,120 +384,120 @@ sad_intra_16x16_x3_opt_loop0:
|
||||
WELS_ASM_FUNC_END
|
||||
|
||||
|
||||
WELS_ASM_FUNC_BEGIN sad_intra_8x8_x3_opt_neon
|
||||
WELS_ASM_FUNC_BEGIN WelsIntra8x8Combined3Sad_neon
|
||||
stmdb sp!, {r4-r7, lr}
|
||||
|
||||
|
||||
//Get the data from stack
|
||||
ldr r4, [sp, #32] //p_dec_cr
|
||||
ldr r5, [sp, #36] //p_enc_cr
|
||||
|
||||
|
||||
//Get the left colume data to 'd28(cb), d30(cr)' (16 bytes)
|
||||
sub r6, r0, #1
|
||||
GET_8BYTE_DATA_L0 d28, r6, r1
|
||||
sub r6, r4, #1
|
||||
GET_8BYTE_DATA_L0 d30, r6, r1
|
||||
|
||||
sub r6, r4, #1
|
||||
GET_8BYTE_DATA_L0 d30, r6, r1
|
||||
|
||||
//Get the top line data to 'd29(cb), d31(cr)'(16 bytes)
|
||||
sub r6, r0, r1
|
||||
vld1.8 {d29}, [r6]
|
||||
sub r6, r4, r1
|
||||
vld1.8 {d31}, [r6]
|
||||
|
||||
|
||||
//Calculate the sum of left column and top row
|
||||
vmov.i32 q0, q14
|
||||
vpaddl.u8 q0, q0
|
||||
vpaddl.u16 q0, q0
|
||||
vadd.u32 d2, d0, d1 //'m1' save to d2
|
||||
vrshr.u32 q0, q0, #2 //calculate 'm2','m3'
|
||||
vrshr.u32 d2, d2, #3 //calculate 'm4'
|
||||
|
||||
//duplicate the 'mx' to a vector line
|
||||
vrshr.u32 q0, q0, #2 //calculate 'm2','m3'
|
||||
vrshr.u32 d2, d2, #3 //calculate 'm4'
|
||||
|
||||
//duplicate the 'mx' to a vector line
|
||||
vdup.8 d27, d2[0]
|
||||
vdup.8 d26, d1[4]
|
||||
vtrn.32 d27, d26
|
||||
|
||||
|
||||
vdup.8 d26, d0[4]
|
||||
vdup.8 d25, d2[4]
|
||||
vtrn.32 d26, d25 //Save to "d27, d26"
|
||||
|
||||
|
||||
vmov.i32 q0, q15
|
||||
vpaddl.u8 q0, q0
|
||||
vpaddl.u16 q0, q0
|
||||
vadd.u32 d2, d0, d1 //'m1' save to d2
|
||||
vrshr.u32 q0, q0, #2 //calculate 'm2','m3'
|
||||
vrshr.u32 d2, d2, #3 //calculate 'm4'
|
||||
|
||||
vrshr.u32 q0, q0, #2 //calculate 'm2','m3'
|
||||
vrshr.u32 d2, d2, #3 //calculate 'm4'
|
||||
|
||||
//duplicate the 'mx' to a vector line
|
||||
vdup.8 d25, d2[0]
|
||||
vdup.8 d24, d1[4]
|
||||
vtrn.32 d25, d24
|
||||
|
||||
|
||||
vdup.8 d24, d0[4]
|
||||
vdup.8 d23, d2[4]
|
||||
vtrn.32 d24, d23 //Save to "d25, d24"
|
||||
|
||||
|
||||
vmov.i32 q11, #0//Save the SATD of DC_BOTH
|
||||
vmov.i32 q10, #0//Save the SATD of H
|
||||
vmov.i32 q9 , #0//Save the SATD of V
|
||||
sub r6, r0, #1
|
||||
sub r7, r4, #1
|
||||
sub r7, r4, #1
|
||||
mov lr, #4
|
||||
sad_intra_8x8_x3_opt_loop0:
|
||||
|
||||
//Load the p_enc data and save to "q1 ~ q2"--- 16X4 bytes
|
||||
//Load the p_enc data and save to "q1 ~ q2"--- 16X4 bytes
|
||||
vld1.8 {d0}, [r2], r3
|
||||
vld1.8 {d1}, [r5], r3
|
||||
|
||||
|
||||
//Get the left colume data to 'd0' (16 bytes)
|
||||
vld1.8 {d2[]}, [r6], r1
|
||||
vld1.8 {d3[]}, [r7], r1
|
||||
|
||||
vld1.8 {d2[]}, [r6], r1
|
||||
vld1.8 {d3[]}, [r7], r1
|
||||
|
||||
subs lr, #1
|
||||
|
||||
|
||||
|
||||
//Do the SAD for top colume
|
||||
vabal.u8 q11, d29, d0
|
||||
vabal.u8 q11, d31, d1
|
||||
vabal.u8 q11, d29, d0
|
||||
vabal.u8 q11, d31, d1
|
||||
|
||||
//Do the SAD for left colume
|
||||
vabal.u8 q10, d2, d0
|
||||
vabal.u8 q10, d3, d1
|
||||
vabal.u8 q10, d3, d1
|
||||
|
||||
//Do the SAD for mean value
|
||||
vabal.u8 q9, d27, d0
|
||||
vabal.u8 q9, d25, d1
|
||||
|
||||
|
||||
vabal.u8 q9, d25, d1
|
||||
|
||||
|
||||
bne sad_intra_8x8_x3_opt_loop0
|
||||
|
||||
mov lr, #4
|
||||
sad_intra_8x8_x3_opt_loop1:
|
||||
|
||||
//Load the p_enc data and save to "q1 ~ q2"--- 16X4 bytes
|
||||
//Load the p_enc data and save to "q1 ~ q2"--- 16X4 bytes
|
||||
vld1.8 {d0}, [r2], r3
|
||||
vld1.8 {d1}, [r5], r3
|
||||
|
||||
|
||||
//Get the left colume data to 'd0' (16 bytes)
|
||||
vld1.8 {d2[]}, [r6], r1
|
||||
vld1.8 {d3[]}, [r7], r1
|
||||
|
||||
vld1.8 {d2[]}, [r6], r1
|
||||
vld1.8 {d3[]}, [r7], r1
|
||||
|
||||
subs lr, #1
|
||||
|
||||
|
||||
|
||||
//Do the SAD for top colume
|
||||
vabal.u8 q11, d29, d0
|
||||
vabal.u8 q11, d31, d1
|
||||
vabal.u8 q11, d29, d0
|
||||
vabal.u8 q11, d31, d1
|
||||
|
||||
//Do the SAD for left colume
|
||||
vabal.u8 q10, d2, d0
|
||||
vabal.u8 q10, d3, d1
|
||||
vabal.u8 q10, d3, d1
|
||||
|
||||
//Do the SAD for mean value
|
||||
vabal.u8 q9, d26, d0
|
||||
vabal.u8 q9, d24, d1
|
||||
|
||||
|
||||
bne sad_intra_8x8_x3_opt_loop1
|
||||
vabal.u8 q9, d24, d1
|
||||
|
||||
|
||||
bne sad_intra_8x8_x3_opt_loop1
|
||||
//Get the data from stack
|
||||
ldr r5, [sp, #20] //the addr of Best_mode
|
||||
ldr r6, [sp, #24] //the value of i_lambda
|
||||
@ -505,13 +505,13 @@ sad_intra_8x8_x3_opt_loop1:
|
||||
vadd.u16 d22, d23
|
||||
vpaddl.u16 d22, d22
|
||||
vpaddl.u32 d22, d22
|
||||
vmov.u32 r0, d22[0]
|
||||
vmov.u32 r0, d22[0]
|
||||
add r0, r6, lsl #1
|
||||
|
||||
|
||||
vadd.u16 d20, d21
|
||||
vpaddl.u16 d20, d20
|
||||
vpaddl.u32 d20, d20
|
||||
vmov.u32 r1, d20[0]
|
||||
vmov.u32 r1, d20[0]
|
||||
add r1, r6, lsl #1
|
||||
|
||||
vadd.u16 d18, d19
|
||||
@ -533,28 +533,28 @@ sad_intra_8x8_x3_opt_loop1:
|
||||
WELS_ASM_FUNC_END
|
||||
|
||||
|
||||
WELS_ASM_FUNC_BEGIN satd_intra_8x8_x3_opt_neon
|
||||
WELS_ASM_FUNC_BEGIN WelsIntra8x8Combined3Satd_neon
|
||||
stmdb sp!, {r4-r7, lr}
|
||||
|
||||
|
||||
//Get the data from stack
|
||||
ldr r4, [sp, #32] //p_dec_cr
|
||||
ldr r5, [sp, #36] //p_enc_cr
|
||||
|
||||
|
||||
//Get the top line data to 'd29(cb), d31(cr)'(16 bytes)
|
||||
sub r6, r0, r1
|
||||
vld1.8 {d29}, [r6]
|
||||
sub r6, r4, r1
|
||||
vld1.8 {d31}, [r6]
|
||||
|
||||
|
||||
//Get the left colume data to 'd28(cb), d30(cr)' (16 bytes)
|
||||
sub r6, r0, #1
|
||||
GET_8BYTE_DATA_L0 d28, r6, r1
|
||||
sub r6, r4, #1
|
||||
GET_8BYTE_DATA_L0 d30, r6, r1
|
||||
|
||||
sub r6, r4, #1
|
||||
GET_8BYTE_DATA_L0 d30, r6, r1
|
||||
|
||||
//Calculate the 16x16_v mode SATD and save to "q12, 13"
|
||||
vshll.u8 q0, d29, #2
|
||||
vshll.u8 q1, d31, #2
|
||||
vshll.u8 q1, d31, #2
|
||||
vtrn.32 q0, q1
|
||||
vadd.s16 q2, q0, q1
|
||||
vsub.s16 q1, q0, q1
|
||||
@ -565,7 +565,7 @@ WELS_ASM_FUNC_BEGIN satd_intra_8x8_x3_opt_neon
|
||||
//{8,9,11,10, 12,13,15,14} q12
|
||||
//Calculate the 16x16_h mode SATD and save to "q10, q11"
|
||||
vshll.u8 q0, d28, #2
|
||||
vshll.u8 q1, d30, #2
|
||||
vshll.u8 q1, d30, #2
|
||||
vtrn.32 q0, q1
|
||||
vadd.s16 q2, q0, q1
|
||||
vsub.s16 q1, q0, q1
|
||||
@ -573,69 +573,69 @@ WELS_ASM_FUNC_BEGIN satd_intra_8x8_x3_opt_neon
|
||||
vadd.s16 q11, q2, q1
|
||||
vsub.s16 q10, q2, q1
|
||||
vtrn.32 q11, q10 //{0,1,3,2, 4,5,7,6} q11
|
||||
//{8,9,11,10, 12,13,15,14} q10
|
||||
|
||||
//{8,9,11,10, 12,13,15,14} q10
|
||||
|
||||
//Calculate the sum of left column and top row
|
||||
//vmov.i32 q0, q14
|
||||
vpaddl.u8 q0, q14
|
||||
vpaddl.u16 q0, q0
|
||||
vadd.u32 d2, d0, d1
|
||||
vadd.u32 d2, d0, d1
|
||||
|
||||
vpaddl.u8 q2, q15
|
||||
vpaddl.u16 q2, q2
|
||||
vadd.u32 d3, d4, d5
|
||||
|
||||
vadd.u32 d3, d4, d5
|
||||
|
||||
vtrn.32 q0, q2
|
||||
vrshr.u32 q1, #3
|
||||
vrshr.u32 q2, #2
|
||||
vrshr.u32 q2, #2
|
||||
vshll.u32 q9, d4, #4 // {2cb, 2cr} q9
|
||||
vshll.u32 q8, d5, #4 // {1cb, 1cr} q8
|
||||
vshll.u32 q7, d2, #4 // {0cb, 3cb} q7
|
||||
vshll.u32 q6, d3, #4 // {0cr, 3cr} q6
|
||||
|
||||
|
||||
|
||||
|
||||
vmov.i32 d28, #0//Save the SATD of DC_BOTH
|
||||
vmov.i32 d10, #0//Save the SATD of H
|
||||
vmov.i32 d11, #0//Save the SATD of V
|
||||
vmov.i32 d30, #0//For zero D register
|
||||
//Load the p_enc data and save to "q3 ~ q6"--- 16X4 bytes
|
||||
//Load the p_enc data and save to "q3 ~ q6"--- 16X4 bytes
|
||||
vld1.32 {d6}, [r2], r3
|
||||
vld1.32 {d7}, [r2], r3
|
||||
vld1.32 {d8}, [r2], r3
|
||||
vld1.32 {d9}, [r2], r3
|
||||
vld1.32 {d9}, [r2], r3
|
||||
vtrn.32 d6, d7
|
||||
vtrn.32 d8, d9
|
||||
vtrn.32 d8, d9
|
||||
HDM_TRANSFORM_4X4_L0 d6, d8, d26, d22, d14, d11, d10, d28, d30
|
||||
HDM_TRANSFORM_4X4_L0 d7, d9, d27, d22, d16, d11, d10, d28, d30
|
||||
|
||||
|
||||
vld1.32 {d6}, [r5], r3
|
||||
vld1.32 {d7}, [r5], r3
|
||||
vld1.32 {d8}, [r5], r3
|
||||
vld1.32 {d9}, [r5], r3
|
||||
vld1.32 {d9}, [r5], r3
|
||||
vtrn.32 d6, d7
|
||||
vtrn.32 d8, d9
|
||||
vtrn.32 d8, d9
|
||||
HDM_TRANSFORM_4X4_L0 d6, d8, d24, d20, d12, d11, d10, d28, d30
|
||||
HDM_TRANSFORM_4X4_L0 d7, d9, d25, d20, d17, d11, d10, d28, d30
|
||||
HDM_TRANSFORM_4X4_L0 d7, d9, d25, d20, d17, d11, d10, d28, d30
|
||||
|
||||
//Load the p_enc data and save to "q3 ~ q6"--- 16X4 bytes
|
||||
//Load the p_enc data and save to "q3 ~ q6"--- 16X4 bytes
|
||||
vld1.32 {d6}, [r2], r3
|
||||
vld1.32 {d7}, [r2], r3
|
||||
vld1.32 {d8}, [r2], r3
|
||||
vld1.32 {d9}, [r2], r3
|
||||
vld1.32 {d9}, [r2], r3
|
||||
vtrn.32 d6, d7
|
||||
vtrn.32 d8, d9
|
||||
HDM_TRANSFORM_4X4_L0 d6, d8, d26, d23, d18, d11, d10, d28, d30
|
||||
vtrn.32 d8, d9
|
||||
HDM_TRANSFORM_4X4_L0 d6, d8, d26, d23, d18, d11, d10, d28, d30
|
||||
HDM_TRANSFORM_4X4_L0 d7, d9, d27, d23, d15, d11, d10, d28, d30
|
||||
|
||||
|
||||
vld1.32 {d6}, [r5], r3
|
||||
vld1.32 {d7}, [r5], r3
|
||||
vld1.32 {d8}, [r5], r3
|
||||
vld1.32 {d9}, [r5], r3
|
||||
vld1.32 {d9}, [r5], r3
|
||||
vtrn.32 d6, d7
|
||||
vtrn.32 d8, d9
|
||||
vtrn.32 d8, d9
|
||||
HDM_TRANSFORM_4X4_L0 d6, d8, d24, d21, d19, d11, d10, d28, d30
|
||||
HDM_TRANSFORM_4X4_L0 d7, d9, d25, d21, d13, d11, d10, d28, d30
|
||||
|
||||
HDM_TRANSFORM_4X4_L0 d7, d9, d25, d21, d13, d11, d10, d28, d30
|
||||
|
||||
//Get the data from stack
|
||||
ldr r5, [sp, #20] //the addr of Best_mode
|
||||
ldr r6, [sp, #24] //the value of i_lambda
|
||||
@ -643,13 +643,13 @@ WELS_ASM_FUNC_BEGIN satd_intra_8x8_x3_opt_neon
|
||||
vrshr.u16 d11, #1
|
||||
vpaddl.u16 d11, d11
|
||||
vpaddl.u32 d11, d11
|
||||
vmov.u32 lr, d11[0]
|
||||
vmov.u32 lr, d11[0]
|
||||
add lr, r6, lsl #1
|
||||
|
||||
|
||||
vrshr.u16 d10, #1
|
||||
vpaddl.u16 d10, d10
|
||||
vpaddl.u32 d10, d10
|
||||
vmov.u32 r3, d10[0]
|
||||
vmov.u32 r3, d10[0]
|
||||
add r3, r6, lsl #1
|
||||
|
||||
vrshr.u16 d28, #1
|
||||
@ -672,31 +672,31 @@ WELS_ASM_FUNC_BEGIN satd_intra_8x8_x3_opt_neon
|
||||
WELS_ASM_FUNC_END
|
||||
|
||||
|
||||
WELS_ASM_FUNC_BEGIN satd_intra_4x4_x3_opt_neon
|
||||
WELS_ASM_FUNC_BEGIN WelsIntra4x4Combined3Satd_neon
|
||||
stmdb sp!, {r4-r7, lr}
|
||||
|
||||
//Get the top line data to 'd31[0~3]'(4 bytes)
|
||||
sub r7, r0, r1
|
||||
vld1.32 {d31[0]}, [r7]
|
||||
|
||||
|
||||
//Get the left colume data to 'd31[4~7]' (4 bytes)
|
||||
sub r7, r0, #1
|
||||
vld1.8 {d31[4]}, [r7], r1
|
||||
vld1.8 {d31[5]}, [r7], r1
|
||||
vld1.8 {d31[6]}, [r7], r1
|
||||
vld1.8 {d31[7]}, [r7], r1
|
||||
|
||||
|
||||
//Calculate the mean value and save to 'd30' (2 bytes)
|
||||
vpaddl.u8 d0, d31
|
||||
vpaddl.u16 d0, d0
|
||||
vpaddl.u32 d0, d0
|
||||
//Calculate the mean value
|
||||
vpaddl.u32 d0, d0
|
||||
//Calculate the mean value
|
||||
vrshr.u16 d0, #3
|
||||
vshl.u16 d30, d0, #4
|
||||
|
||||
vshl.u16 d30, d0, #4
|
||||
|
||||
//Calculate the 16x16_v mode SATD and save to "d29"
|
||||
//Calculate the 16x16_h mode SATD and save to "d28"
|
||||
vshll.u8 q0, d31, #2
|
||||
//Calculate the 16x16_h mode SATD and save to "d28"
|
||||
vshll.u8 q0, d31, #2
|
||||
vtrn.32 d0, d1
|
||||
vadd.s16 d2, d0, d1
|
||||
vsub.s16 d1, d0, d1
|
||||
@ -710,12 +710,12 @@ WELS_ASM_FUNC_BEGIN satd_intra_4x4_x3_opt_neon
|
||||
vmov.i32 d26, #0//Save the SATD of H
|
||||
vmov.i32 d25, #0//Save the SATD of V
|
||||
vmov.i32 d24, #0//For zero D register
|
||||
|
||||
//Load the p_enc data and save to "d22,d23"--- 4X4 bytes
|
||||
|
||||
//Load the p_enc data and save to "d22,d23"--- 4X4 bytes
|
||||
vld1.32 {d23[0]}, [r2], r3
|
||||
vld1.32 {d23[1]}, [r2], r3
|
||||
vld1.32 {d22[0]}, [r2], r3
|
||||
vld1.32 {d22[1]}, [r2], r3
|
||||
vld1.32 {d22[1]}, [r2], r3
|
||||
|
||||
HDM_TRANSFORM_4X4_L0 d23, d22, d29, d28, d30, d25, d26, d27, d24
|
||||
|
||||
@ -723,17 +723,17 @@ WELS_ASM_FUNC_BEGIN satd_intra_4x4_x3_opt_neon
|
||||
ldr r5, [sp, #28] //the value of lambda2
|
||||
ldr r6, [sp, #32] //the value of lambda1
|
||||
ldr r7, [sp, #36] //the value of lambda0
|
||||
|
||||
|
||||
vrshr.u16 d25, #1
|
||||
vpaddl.u16 d25, d25
|
||||
vpaddl.u32 d25, d25
|
||||
vmov.u32 r0, d25[0]
|
||||
vmov.u32 r0, d25[0]
|
||||
add r0, r7
|
||||
|
||||
|
||||
vrshr.u16 d26, #1
|
||||
vpaddl.u16 d26, d26
|
||||
vpaddl.u32 d26, d26
|
||||
vmov.u32 r1, d26[0]
|
||||
vmov.u32 r1, d26[0]
|
||||
add r1, r6
|
||||
|
||||
vrshr.u16 d27, #1
|
||||
@ -741,10 +741,10 @@ WELS_ASM_FUNC_BEGIN satd_intra_4x4_x3_opt_neon
|
||||
vpaddl.u32 d27, d27
|
||||
vmov.u32 r2, d27[0]
|
||||
add r2, r5
|
||||
|
||||
|
||||
ldr r5, [sp, #20] //p_dst
|
||||
ldr r6, [sp, #24] //the addr of Best_mode
|
||||
|
||||
ldr r6, [sp, #24] //the addr of Best_mode
|
||||
|
||||
mov r4, r0
|
||||
cmp r1, r4
|
||||
movcc r4, r1
|
||||
@ -770,8 +770,8 @@ satd_intra_4x4_x3_opt_jump0:
|
||||
vdup.8 d0, d31[4]
|
||||
vdup.8 d1, d31[5]
|
||||
vdup.8 d2, d31[6]
|
||||
vdup.8 d3, d31[7]
|
||||
vst4.32 {d0[0],d1[0],d2[0],d3[0]}, [r5]
|
||||
vdup.8 d3, d31[7]
|
||||
vst4.32 {d0[0],d1[0],d2[0],d3[0]}, [r5]
|
||||
|
||||
bl satd_intra_4x4_x3_opt_end
|
||||
satd_intra_4x4_x3_opt_jump1:
|
||||
@ -783,11 +783,11 @@ satd_intra_4x4_x3_opt_jump1:
|
||||
vst1.32 {d31[0]}, [r5]!
|
||||
vst1.32 {d31[0]}, [r5]!
|
||||
|
||||
|
||||
|
||||
satd_intra_4x4_x3_opt_end:
|
||||
mov r0, r4
|
||||
|
||||
mov r0, r4
|
||||
|
||||
ldmia sp!, {r4-r7, lr}
|
||||
WELS_ASM_FUNC_END
|
||||
|
||||
#endif
|
||||
#endif
|
||||
|
3926
codec/encoder/core/arm/mc_neon.S
Executable file → Normal file
3926
codec/encoder/core/arm/mc_neon.S
Executable file → Normal file
File diff suppressed because it is too large
Load Diff
2
codec/encoder/core/arm/memory_neon.S
Executable file → Normal file
2
codec/encoder/core/arm/memory_neon.S
Executable file → Normal file
@ -60,4 +60,4 @@ mem_zero_24_neon_start:
|
||||
vst1.64 {d0}, [r0]!
|
||||
WELS_ASM_FUNC_END
|
||||
|
||||
#endif
|
||||
#endif
|
||||
|
318
codec/encoder/core/arm/pixel_neon.S
Executable file → Normal file
318
codec/encoder/core/arm/pixel_neon.S
Executable file → Normal file
@ -35,73 +35,73 @@
|
||||
#include "arm_arch_common_macro.S"
|
||||
|
||||
.macro SATD_16x4
|
||||
vld1.64 {q0}, [r0,:128], r1
|
||||
vld1.64 {q1}, [r2], r3
|
||||
vld1.64 {q0}, [r0,:128], r1
|
||||
vld1.64 {q1}, [r2], r3
|
||||
|
||||
vsubl.u8 q4, d0, d2
|
||||
vld1.64 {q2}, [r0,:128], r1
|
||||
vsubl.u8 q4, d0, d2
|
||||
vld1.64 {q2}, [r0,:128], r1
|
||||
|
||||
vsubl.u8 q6, d1, d3
|
||||
vld1.64 {q3}, [r2], r3
|
||||
vsubl.u8 q6, d1, d3
|
||||
vld1.64 {q3}, [r2], r3
|
||||
|
||||
vsubl.u8 q5, d4, d6
|
||||
vld1.64 {q0}, [r0,:128], r1
|
||||
vsubl.u8 q5, d4, d6
|
||||
vld1.64 {q0}, [r0,:128], r1
|
||||
|
||||
vsubl.u8 q7, d5, d7
|
||||
vsubl.u8 q7, d5, d7
|
||||
vld1.64 {q1}, [r2], r3
|
||||
|
||||
vsubl.u8 q8, d0, d2
|
||||
vld1.64 {q2}, [r0,:128], r1
|
||||
vld1.64 {q2}, [r0,:128], r1
|
||||
|
||||
vsubl.u8 q10, d1, d3
|
||||
vadd.s16 q0, q4, q5
|
||||
vadd.s16 q0, q4, q5
|
||||
|
||||
vld1.64 {q3}, [r2], r3
|
||||
vsub.s16 q1, q4, q5
|
||||
vld1.64 {q3}, [r2], r3
|
||||
vsub.s16 q1, q4, q5
|
||||
|
||||
vsubl.u8 q9, d4, d6
|
||||
vsubl.u8 q11, d5, d7
|
||||
vsubl.u8 q9, d4, d6
|
||||
vsubl.u8 q11, d5, d7
|
||||
|
||||
vadd.s16 q2, q8, q9
|
||||
vsub.s16 q3, q8, q9
|
||||
vadd.s16 q2, q8, q9
|
||||
vsub.s16 q3, q8, q9
|
||||
|
||||
vadd.s16 q4, q6, q7
|
||||
vadd.s16 q4, q6, q7
|
||||
vsub.s16 q5, q6, q7
|
||||
|
||||
vadd.s16 q6, q10, q11
|
||||
vsub.s16 q7, q10, q11
|
||||
vadd.s16 q6, q10, q11
|
||||
vsub.s16 q7, q10, q11
|
||||
|
||||
vadd.s16 q8, q0, q2
|
||||
vsub.s16 q10, q0, q2
|
||||
vadd.s16 q8, q0, q2
|
||||
vsub.s16 q10, q0, q2
|
||||
|
||||
vadd.s16 q9, q4, q6
|
||||
vsub.s16 q11, q4, q6
|
||||
vadd.s16 q9, q4, q6
|
||||
vsub.s16 q11, q4, q6
|
||||
|
||||
vsub.s16 q0, q1, q3
|
||||
vadd.s16 q2, q1, q3
|
||||
vsub.s16 q0, q1, q3
|
||||
vadd.s16 q2, q1, q3
|
||||
|
||||
vsub.s16 q1, q5, q7
|
||||
vadd.s16 q3, q5, q7
|
||||
vsub.s16 q1, q5, q7
|
||||
vadd.s16 q3, q5, q7
|
||||
|
||||
vtrn.16 q8, q10
|
||||
vtrn.16 q9, q11
|
||||
vtrn.16 q8, q10
|
||||
vtrn.16 q9, q11
|
||||
|
||||
vadd.s16 q4, q8, q10
|
||||
vabd.s16 q6, q8, q10
|
||||
vadd.s16 q4, q8, q10
|
||||
vabd.s16 q6, q8, q10
|
||||
|
||||
vadd.s16 q5, q9, q11
|
||||
vabd.s16 q7, q9, q11
|
||||
vadd.s16 q5, q9, q11
|
||||
vabd.s16 q7, q9, q11
|
||||
|
||||
vabs.s16 q4, q4
|
||||
vabs.s16 q5, q5
|
||||
|
||||
vtrn.16 q0, q2
|
||||
vtrn.16 q1, q3
|
||||
vtrn.16 q0, q2
|
||||
vtrn.16 q1, q3
|
||||
|
||||
vadd.s16 q8, q0, q2
|
||||
vabd.s16 q10, q0, q2
|
||||
vadd.s16 q8, q0, q2
|
||||
vabd.s16 q10, q0, q2
|
||||
|
||||
vadd.s16 q9, q1, q3
|
||||
vadd.s16 q9, q1, q3
|
||||
vabd.s16 q11, q1, q3
|
||||
|
||||
vabs.s16 q8, q8
|
||||
@ -128,31 +128,31 @@
|
||||
vld1.64 {d1}, [r2], r3
|
||||
|
||||
vld1.64 {d2}, [r0,:64], r1
|
||||
vsubl.u8 q4, d0, d1
|
||||
vsubl.u8 q4, d0, d1
|
||||
|
||||
vld1.64 {d3}, [r2], r3
|
||||
vsubl.u8 q5, d2, d3
|
||||
vsubl.u8 q5, d2, d3
|
||||
|
||||
vld1.64 {d4}, [r0,:64], r1
|
||||
vld1.64 {d5}, [r2], r3
|
||||
|
||||
vadd.s16 q8, q4, q5
|
||||
vsubl.u8 q6, d4, d5
|
||||
vadd.s16 q8, q4, q5
|
||||
vsubl.u8 q6, d4, d5
|
||||
|
||||
vld1.64 {d6}, [r0,:64], r1
|
||||
vld1.64 {d7}, [r2], r3
|
||||
|
||||
vsubl.u8 q7, d6, d7
|
||||
vsub.s16 q9, q4, q5
|
||||
vsubl.u8 q7, d6, d7
|
||||
vsub.s16 q9, q4, q5
|
||||
|
||||
vadd.s16 q10, q6, q7
|
||||
vsub.s16 q11, q6, q7
|
||||
vadd.s16 q10, q6, q7
|
||||
vsub.s16 q11, q6, q7
|
||||
|
||||
vadd.s16 q0, q8, q10
|
||||
vsub.s16 q1, q8, q10
|
||||
vadd.s16 q0, q8, q10
|
||||
vsub.s16 q1, q8, q10
|
||||
|
||||
vsub.s16 q2, q9, q11
|
||||
vadd.s16 q3, q9, q11
|
||||
vsub.s16 q2, q9, q11
|
||||
vadd.s16 q3, q9, q11
|
||||
|
||||
vtrn.16 q0, q1
|
||||
vtrn.16 q2, q3
|
||||
@ -220,7 +220,7 @@
|
||||
.endm
|
||||
|
||||
|
||||
WELS_ASM_FUNC_BEGIN pixel_sad_16x16_neon
|
||||
WELS_ASM_FUNC_BEGIN WelsSampleSad16x16_neon
|
||||
|
||||
vld1.64 {q0}, [r0, :128], r1
|
||||
vld1.64 {q1}, [r2], r3
|
||||
@ -260,7 +260,7 @@ WELS_ASM_FUNC_BEGIN pixel_sad_16x16_neon
|
||||
WELS_ASM_FUNC_END
|
||||
|
||||
|
||||
WELS_ASM_FUNC_BEGIN pixel_sad_16x8_neon
|
||||
WELS_ASM_FUNC_BEGIN WelsSampleSad16x8_neon
|
||||
|
||||
vld1.64 {q0}, [r0, :128], r1
|
||||
vld1.64 {q1}, [r2], r3
|
||||
@ -298,7 +298,7 @@ WELS_ASM_FUNC_BEGIN pixel_sad_16x8_neon
|
||||
WELS_ASM_FUNC_END
|
||||
|
||||
|
||||
WELS_ASM_FUNC_BEGIN pixel_sad_8x16_neon
|
||||
WELS_ASM_FUNC_BEGIN WelsSampleSad8x16_neon
|
||||
|
||||
vld1.64 {d0}, [r0, :64], r1
|
||||
vld1.64 {d1}, [r2], r3
|
||||
@ -332,7 +332,7 @@ WELS_ASM_FUNC_BEGIN pixel_sad_8x16_neon
|
||||
WELS_ASM_FUNC_END
|
||||
|
||||
|
||||
WELS_ASM_FUNC_BEGIN pixel_sad_8x8_neon
|
||||
WELS_ASM_FUNC_BEGIN WelsSampleSad8x8_neon
|
||||
|
||||
vld1.64 {d0}, [r0, :64], r1
|
||||
vld1.64 {d1}, [r2], r3
|
||||
@ -364,7 +364,7 @@ WELS_ASM_FUNC_BEGIN pixel_sad_8x8_neon
|
||||
WELS_ASM_FUNC_END
|
||||
|
||||
|
||||
WELS_ASM_FUNC_BEGIN pixel_sad_4x4_neon
|
||||
WELS_ASM_FUNC_BEGIN WelsSampleSad4x4_neon
|
||||
stmdb sp!, {r4-r5, lr}
|
||||
|
||||
//Loading a horizontal line data (4 bytes)
|
||||
@ -376,23 +376,23 @@ WELS_ASM_FUNC_BEGIN pixel_sad_4x4_neon
|
||||
//line 1
|
||||
ldr r4, [r0], r1
|
||||
ldr r5, [r2], r3
|
||||
usada8 lr, r4, r5, lr
|
||||
usada8 lr, r4, r5, lr
|
||||
|
||||
//line 2
|
||||
//line 2
|
||||
ldr r4, [r0], r1
|
||||
ldr r5, [r2], r3
|
||||
usada8 lr, r4, r5, lr
|
||||
|
||||
usada8 lr, r4, r5, lr
|
||||
|
||||
//line 3
|
||||
ldr r4, [r0]
|
||||
ldr r5, [r2]
|
||||
usada8 r0, r4, r5, lr
|
||||
usada8 r0, r4, r5, lr
|
||||
|
||||
ldmia sp!, {r4-r5, lr}
|
||||
WELS_ASM_FUNC_END
|
||||
|
||||
|
||||
WELS_ASM_FUNC_BEGIN pixel_sad_4_16x16_neon
|
||||
WELS_ASM_FUNC_BEGIN WelsSampleSadFour16x16_neon
|
||||
|
||||
stmdb sp!, {r4-r5, lr}
|
||||
|
||||
@ -400,30 +400,30 @@ WELS_ASM_FUNC_BEGIN pixel_sad_4_16x16_neon
|
||||
sub r4, r2, #1
|
||||
add r5, r2, #1
|
||||
sub r2, r3
|
||||
|
||||
|
||||
//Loading a horizontal line data (16 bytes)
|
||||
vld1.8 {q0}, [r0], r1 //save pix1
|
||||
|
||||
|
||||
vld1.8 {q1}, [r2], r3 //save pix2 - stride
|
||||
vld1.8 {q6}, [r2], r3 //save pix2
|
||||
vld1.8 {q2}, [r2], r3 //save pix2 + stride
|
||||
|
||||
|
||||
vld1.8 {q3}, [r4], r3 //save pix2 - 1
|
||||
vld1.8 {q4}, [r5], r3 //save pix2 + 1
|
||||
|
||||
vld1.8 {q4}, [r5], r3 //save pix2 + 1
|
||||
|
||||
//Do the SAD for 16 bytes
|
||||
vabdl.u8 q15, d0, d2
|
||||
vabal.u8 q15, d1, d3
|
||||
|
||||
|
||||
vabdl.u8 q13, d0, d4
|
||||
vabal.u8 q13, d1, d5
|
||||
|
||||
|
||||
vabdl.u8 q11, d0, d6
|
||||
vabal.u8 q11, d1, d7
|
||||
|
||||
|
||||
vabdl.u8 q9, d0, d8
|
||||
vabal.u8 q9, d1, d9
|
||||
|
||||
vabal.u8 q9, d1, d9
|
||||
|
||||
mov lr, #15
|
||||
pixel_sad_4_16x16_loop_0:
|
||||
|
||||
@ -436,13 +436,13 @@ pixel_sad_4_16x16_loop_0:
|
||||
vabal.u8 q15, d1, d3
|
||||
vld1.8 {q3}, [r4], r3 //save pix2 - 1
|
||||
vabal.u8 q13, d0, d4
|
||||
vld1.8 {q4}, [r5], r3 //save pix2 + 1
|
||||
vld1.8 {q4}, [r5], r3 //save pix2 + 1
|
||||
vabal.u8 q13, d1, d5
|
||||
subs lr, #1
|
||||
|
||||
vabal.u8 q11, d0, d6
|
||||
vabal.u8 q11, d1, d7
|
||||
|
||||
|
||||
vabal.u8 q9, d0, d8
|
||||
vabal.u8 q9, d1, d9
|
||||
|
||||
@ -451,18 +451,18 @@ pixel_sad_4_16x16_loop_0:
|
||||
|
||||
//Save SAD to 'r0'
|
||||
ldr r0, [sp, #12]
|
||||
|
||||
|
||||
vadd.u16 d0, d30, d31
|
||||
vadd.u16 d1, d26, d27
|
||||
vadd.u16 d2, d22, d23
|
||||
vadd.u16 d3, d18, d19
|
||||
|
||||
|
||||
vpaddl.u16 q0, q0
|
||||
vpaddl.u16 q1, q1
|
||||
|
||||
|
||||
vpaddl.u32 q0, q0
|
||||
vpaddl.u32 q1, q1
|
||||
|
||||
|
||||
vshl.u32 q0, #4
|
||||
vshl.u32 q1, #4
|
||||
vst4.32 {d0[0],d1[0],d2[0],d3[0]}, [r0]
|
||||
@ -471,37 +471,37 @@ pixel_sad_4_16x16_loop_0:
|
||||
WELS_ASM_FUNC_END
|
||||
|
||||
|
||||
WELS_ASM_FUNC_BEGIN pixel_sad_4_16x8_neon
|
||||
WELS_ASM_FUNC_BEGIN WelsSampleSadFour16x8_neon
|
||||
stmdb sp!, {r4-r5, lr}
|
||||
|
||||
|
||||
//Generate the pix2 start addr
|
||||
sub r4, r2, #1
|
||||
add r5, r2, #1
|
||||
sub r2, r3
|
||||
|
||||
|
||||
//Loading a horizontal line data (16 bytes)
|
||||
vld1.8 {q0}, [r0], r1 //save pix1
|
||||
|
||||
|
||||
vld1.8 {q1}, [r2], r3 //save pix2 - stride
|
||||
vld1.8 {q6}, [r2], r3 //save pix2
|
||||
vld1.8 {q2}, [r2], r3 //save pix2 + stride
|
||||
|
||||
|
||||
vld1.8 {q3}, [r4], r3 //save pix2 - 1
|
||||
vld1.8 {q4}, [r5], r3 //save pix2 + 1
|
||||
|
||||
vld1.8 {q4}, [r5], r3 //save pix2 + 1
|
||||
|
||||
//Do the SAD for 16 bytes
|
||||
vabdl.u8 q15, d0, d2
|
||||
vabal.u8 q15, d1, d3
|
||||
|
||||
|
||||
vabdl.u8 q13, d0, d4
|
||||
vabal.u8 q13, d1, d5
|
||||
|
||||
|
||||
vabdl.u8 q11, d0, d6
|
||||
vabal.u8 q11, d1, d7
|
||||
|
||||
|
||||
vabdl.u8 q9, d0, d8
|
||||
vabal.u8 q9, d1, d9
|
||||
|
||||
vabal.u8 q9, d1, d9
|
||||
|
||||
mov lr, #7
|
||||
pixel_sad_4_16x8_loop_0:
|
||||
|
||||
@ -514,67 +514,67 @@ pixel_sad_4_16x8_loop_0:
|
||||
vabal.u8 q15, d1, d3
|
||||
vld1.8 {q3}, [r4], r3 //save pix2 - 1
|
||||
vabal.u8 q13, d0, d4
|
||||
vld1.8 {q4}, [r5], r3 //save pix2 + 1
|
||||
vld1.8 {q4}, [r5], r3 //save pix2 + 1
|
||||
vabal.u8 q13, d1, d5
|
||||
subs lr, #1
|
||||
|
||||
vabal.u8 q11, d0, d6
|
||||
vabal.u8 q11, d1, d7
|
||||
|
||||
|
||||
vabal.u8 q9, d0, d8
|
||||
vabal.u8 q9, d1, d9
|
||||
|
||||
|
||||
bne pixel_sad_4_16x8_loop_0
|
||||
|
||||
//Save SAD to 'r0'
|
||||
ldr r0, [sp, #12]
|
||||
|
||||
|
||||
vadd.u16 d0, d30, d31
|
||||
vadd.u16 d1, d26, d27
|
||||
vadd.u16 d2, d22, d23
|
||||
vadd.u16 d3, d18, d19
|
||||
|
||||
|
||||
vpaddl.u16 q0, q0
|
||||
vpaddl.u16 q1, q1
|
||||
|
||||
|
||||
vpaddl.u32 q0, q0
|
||||
vpaddl.u32 q1, q1
|
||||
|
||||
|
||||
vshl.u32 q0, #4
|
||||
vshl.u32 q1, #4
|
||||
vst4.32 {d0[0],d1[0],d2[0],d3[0]}, [r0]
|
||||
|
||||
|
||||
ldmia sp!, {r4-r5, lr}
|
||||
WELS_ASM_FUNC_END
|
||||
|
||||
|
||||
WELS_ASM_FUNC_BEGIN pixel_sad_4_8x16_neon
|
||||
|
||||
WELS_ASM_FUNC_BEGIN WelsSampleSadFour8x16_neon
|
||||
stmdb sp!, {r4-r5, lr}
|
||||
|
||||
|
||||
//Generate the pix2 start addr
|
||||
sub r4, r2, #1
|
||||
add r5, r2, #1
|
||||
sub r2, r3
|
||||
|
||||
|
||||
//Loading a horizontal line data (8 bytes)
|
||||
vld1.8 {d0}, [r0], r1 //save pix1
|
||||
|
||||
|
||||
vld1.8 {d1}, [r2], r3 //save pix2 - stride
|
||||
vld1.8 {d6}, [r2], r3 //save pix2
|
||||
vld1.8 {d2}, [r2], r3 //save pix2 + stride
|
||||
|
||||
|
||||
vld1.8 {d3}, [r4], r3 //save pix2 - 1
|
||||
vld1.8 {d4}, [r5], r3 //save pix2 + 1
|
||||
|
||||
vld1.8 {d4}, [r5], r3 //save pix2 + 1
|
||||
|
||||
//Do the SAD for 8 bytes
|
||||
vabdl.u8 q15, d0, d1
|
||||
vabdl.u8 q14, d0, d2
|
||||
vabdl.u8 q13, d0, d3
|
||||
vabdl.u8 q12, d0, d4
|
||||
|
||||
vabdl.u8 q12, d0, d4
|
||||
|
||||
mov lr, #15
|
||||
pixel_sad_4_8x16_loop_0:
|
||||
|
||||
|
||||
//Loading a horizontal line data (8 bytes)
|
||||
vld1.8 {d0}, [r0], r1 //save pix1
|
||||
vmov.8 d1, d6 //save pix2 - stride
|
||||
@ -582,7 +582,7 @@ pixel_sad_4_8x16_loop_0:
|
||||
vld1.8 {d2}, [r2], r3 //save pix2 + stride
|
||||
vld1.8 {d3}, [r4], r3 //save pix2 - 1
|
||||
vabal.u8 q15, d0, d1
|
||||
|
||||
|
||||
vld1.8 {d4}, [r5], r3 //save pix2 + 1
|
||||
//Do the SAD for 8 bytes
|
||||
vabal.u8 q14, d0, d2
|
||||
@ -594,50 +594,50 @@ pixel_sad_4_8x16_loop_0:
|
||||
|
||||
//Save SAD to 'r0'
|
||||
ldr r0, [sp, #12]
|
||||
|
||||
|
||||
vadd.u16 d0, d30, d31
|
||||
vadd.u16 d1, d28, d29
|
||||
vadd.u16 d2, d26, d27
|
||||
vadd.u16 d3, d24, d25
|
||||
|
||||
|
||||
vpaddl.u16 q0, q0
|
||||
vpaddl.u16 q1, q1
|
||||
|
||||
|
||||
vpaddl.u32 q0, q0
|
||||
vpaddl.u32 q1, q1
|
||||
|
||||
|
||||
vshl.u32 q0, #4
|
||||
vshl.u32 q1, #4
|
||||
vst4.32 {d0[0],d1[0],d2[0],d3[0]}, [r0]
|
||||
|
||||
|
||||
ldmia sp!, {r4-r5, lr}
|
||||
WELS_ASM_FUNC_END
|
||||
|
||||
|
||||
WELS_ASM_FUNC_BEGIN pixel_sad_4_8x8_neon
|
||||
WELS_ASM_FUNC_BEGIN WelsSampleSadFour8x8_neon
|
||||
stmdb sp!, {r4-r5, lr}
|
||||
|
||||
|
||||
//Generate the pix2 start addr
|
||||
sub r4, r2, #1
|
||||
add r5, r2, #1
|
||||
sub r2, r3
|
||||
|
||||
|
||||
//Loading a horizontal line data (8 bytes)
|
||||
vld1.8 {d0}, [r0], r1 //save pix1
|
||||
|
||||
|
||||
vld1.8 {d1}, [r2], r3 //save pix2 - stride
|
||||
vld1.8 {d6}, [r2], r3 //save pix2
|
||||
vld1.8 {d2}, [r2], r3 //save pix2 + stride
|
||||
|
||||
|
||||
vld1.8 {d3}, [r4], r3 //save pix2 - 1
|
||||
vld1.8 {d4}, [r5], r3 //save pix2 + 1
|
||||
|
||||
vld1.8 {d4}, [r5], r3 //save pix2 + 1
|
||||
|
||||
//Do the SAD for 8 bytes
|
||||
vabdl.u8 q15, d0, d1
|
||||
vabdl.u8 q14, d0, d2
|
||||
vabdl.u8 q13, d0, d3
|
||||
vabdl.u8 q12, d0, d4
|
||||
|
||||
vabdl.u8 q12, d0, d4
|
||||
|
||||
mov lr, #7
|
||||
pixel_sad_4_8x8_loop_0:
|
||||
|
||||
@ -648,7 +648,7 @@ pixel_sad_4_8x8_loop_0:
|
||||
vld1.8 {d2}, [r2], r3 //save pix2 + stride
|
||||
vld1.8 {d3}, [r4], r3 //save pix2 - 1
|
||||
vabal.u8 q15, d0, d1
|
||||
|
||||
|
||||
vld1.8 {d4}, [r5], r3 //save pix2 + 1
|
||||
//Do the SAD for 8 bytes
|
||||
vabal.u8 q14, d0, d2
|
||||
@ -659,84 +659,84 @@ pixel_sad_4_8x8_loop_0:
|
||||
|
||||
//Save SAD to 'r0'
|
||||
ldr r0, [sp, #12]
|
||||
|
||||
|
||||
vadd.u16 d0, d30, d31
|
||||
vadd.u16 d1, d28, d29
|
||||
vadd.u16 d2, d26, d27
|
||||
vadd.u16 d3, d24, d25
|
||||
|
||||
|
||||
vpaddl.u16 q0, q0
|
||||
vpaddl.u16 q1, q1
|
||||
|
||||
|
||||
vpaddl.u32 q0, q0
|
||||
vpaddl.u32 q1, q1
|
||||
|
||||
|
||||
vshl.u32 q0, #4
|
||||
vshl.u32 q1, #4
|
||||
vst4.32 {d0[0],d1[0],d2[0],d3[0]}, [r0]
|
||||
|
||||
|
||||
ldmia sp!, {r4-r5, lr}
|
||||
WELS_ASM_FUNC_END
|
||||
|
||||
|
||||
WELS_ASM_FUNC_BEGIN pixel_sad_4_4x4_neon
|
||||
WELS_ASM_FUNC_BEGIN WelsSampleSadFour4x4_neon
|
||||
|
||||
vld1.32 {d0[0]}, [r0], r1
|
||||
vld1.32 {d0[1]}, [r0], r1
|
||||
vld1.32 {d1[0]}, [r0], r1
|
||||
vld1.32 {d1[1]}, [r0]
|
||||
|
||||
|
||||
|
||||
|
||||
sub r0, r2, r3
|
||||
vld1.32 {d2[0]}, [r0], r3
|
||||
vld1.32 {d2[1]}, [r0], r3
|
||||
vld1.32 {d3[0]}, [r0], r3
|
||||
vld1.32 {d3[1]}, [r0], r3
|
||||
vld1.32 {d4[0]}, [r0], r3
|
||||
vld1.32 {d4[1]}, [r0]
|
||||
|
||||
sub r0, r2, #1
|
||||
vld1.32 {d4[1]}, [r0]
|
||||
|
||||
sub r0, r2, #1
|
||||
vld1.32 {d5[0]}, [r0], r3
|
||||
vld1.32 {d5[1]}, [r0], r3
|
||||
vld1.32 {d6[0]}, [r0], r3
|
||||
vld1.32 {d6[1]}, [r0]
|
||||
|
||||
add r0, r2, #1
|
||||
vld1.32 {d6[1]}, [r0]
|
||||
|
||||
add r0, r2, #1
|
||||
vld1.32 {d7[0]}, [r0], r3
|
||||
vld1.32 {d7[1]}, [r0], r3
|
||||
vld1.32 {d8[0]}, [r0], r3
|
||||
vld1.32 {d8[1]}, [r0]
|
||||
|
||||
|
||||
vabdl.u8 q15, d0, d2
|
||||
vabdl.u8 q14, d1, d3
|
||||
|
||||
|
||||
vabdl.u8 q13, d0, d3
|
||||
vabdl.u8 q12, d1, d4
|
||||
|
||||
|
||||
vabdl.u8 q11, d0, d5
|
||||
vabdl.u8 q10, d1, d6
|
||||
|
||||
|
||||
vabdl.u8 q9, d0, d7
|
||||
vabdl.u8 q8, d1, d8
|
||||
|
||||
|
||||
//Save SAD to 'r0'
|
||||
ldr r0, [sp]
|
||||
vadd.u16 q0, q14, q15
|
||||
vadd.u16 q1, q12, q13
|
||||
vadd.u16 q2, q10, q11
|
||||
vadd.u16 q3, q8 , q9
|
||||
|
||||
|
||||
vadd.u16 d0, d1
|
||||
vadd.u16 d1, d2, d3
|
||||
vadd.u16 d2, d4, d5
|
||||
vadd.u16 d3, d6, d7
|
||||
|
||||
|
||||
vpaddl.u16 q0, q0
|
||||
vpaddl.u16 q1, q1
|
||||
|
||||
|
||||
vpaddl.u32 q0, q0
|
||||
vpaddl.u32 q1, q1
|
||||
|
||||
|
||||
vshl.u32 q0, #4
|
||||
vshl.u32 q1, #4
|
||||
vst4.32 {d0[0],d1[0],d2[0],d3[0]}, [r0]
|
||||
@ -744,7 +744,7 @@ WELS_ASM_FUNC_BEGIN pixel_sad_4_4x4_neon
|
||||
WELS_ASM_FUNC_END
|
||||
|
||||
|
||||
WELS_ASM_FUNC_BEGIN pixel_satd_16x16_neon
|
||||
WELS_ASM_FUNC_BEGIN WelsSampleSatd16x16_neon
|
||||
|
||||
SATD_16x4
|
||||
vadd.u16 q15, q0, q2
|
||||
@ -769,7 +769,7 @@ WELS_ASM_FUNC_BEGIN pixel_satd_16x16_neon
|
||||
WELS_ASM_FUNC_END
|
||||
|
||||
|
||||
WELS_ASM_FUNC_BEGIN pixel_satd_16x8_neon
|
||||
WELS_ASM_FUNC_BEGIN WelsSampleSatd16x8_neon
|
||||
|
||||
SATD_16x4
|
||||
vadd.u16 q15, q0, q2
|
||||
@ -786,7 +786,7 @@ WELS_ASM_FUNC_BEGIN pixel_satd_16x8_neon
|
||||
WELS_ASM_FUNC_END
|
||||
|
||||
|
||||
WELS_ASM_FUNC_BEGIN pixel_satd_8x16_neon
|
||||
WELS_ASM_FUNC_BEGIN WelsSampleSatd8x16_neon
|
||||
|
||||
SATD_8x4
|
||||
vadd.u16 q15, q0, q1
|
||||
@ -811,7 +811,7 @@ WELS_ASM_FUNC_BEGIN pixel_satd_8x16_neon
|
||||
WELS_ASM_FUNC_END
|
||||
|
||||
|
||||
WELS_ASM_FUNC_BEGIN pixel_satd_8x8_neon
|
||||
WELS_ASM_FUNC_BEGIN WelsSampleSatd8x8_neon
|
||||
|
||||
SATD_8x4
|
||||
vadd.u16 q15, q0, q1
|
||||
@ -828,7 +828,7 @@ WELS_ASM_FUNC_BEGIN pixel_satd_8x8_neon
|
||||
WELS_ASM_FUNC_END
|
||||
|
||||
|
||||
WELS_ASM_FUNC_BEGIN pixel_satd_4x4_neon
|
||||
WELS_ASM_FUNC_BEGIN WelsSampleSatd4x4_neon
|
||||
|
||||
//Load the pix1 data --- 16 bytes
|
||||
vld1.32 {d0[0]}, [r0], r1
|
||||
@ -836,11 +836,11 @@ WELS_ASM_FUNC_BEGIN pixel_satd_4x4_neon
|
||||
vld1.32 {d1[0]}, [r0], r1
|
||||
vld1.32 {d1[1]}, [r0]
|
||||
|
||||
//Load the pix2 data --- 16 bytes
|
||||
//Load the pix2 data --- 16 bytes
|
||||
vld1.32 {d2[0]}, [r2], r3
|
||||
vld1.32 {d2[1]}, [r2], r3
|
||||
vld1.32 {d3[0]}, [r2], r3
|
||||
vld1.32 {d3[1]}, [r2]
|
||||
vld1.32 {d3[1]}, [r2]
|
||||
|
||||
//Get the difference
|
||||
vsubl.u8 q15, d0, d2 //{0,1,2,3,4,5,6,7}
|
||||
@ -861,15 +861,15 @@ WELS_ASM_FUNC_BEGIN pixel_satd_4x4_neon
|
||||
vtrn.16 q13, q12
|
||||
vadd.s16 q15, q13, q12
|
||||
|
||||
//Do the SAD
|
||||
vabs.s16 q15, q15
|
||||
//Do the SAD
|
||||
vabs.s16 q15, q15
|
||||
vabd.s16 q14, q13, q12
|
||||
|
||||
vadd.u16 q0, q15, q14
|
||||
|
||||
vrhadd.u16 d0, d1
|
||||
vpaddl.u16 d0, d0
|
||||
vpaddl.u32 d0, d0
|
||||
vpaddl.u16 d0, d0
|
||||
vpaddl.u32 d0, d0
|
||||
|
||||
vmov.u32 r0, d0[0]
|
||||
|
||||
|
2624
codec/encoder/core/arm/reconstruct_neon.S
Executable file → Normal file
2624
codec/encoder/core/arm/reconstruct_neon.S
Executable file → Normal file
File diff suppressed because it is too large
Load Diff
@ -110,6 +110,33 @@ int32_t WelsIntraChroma8x8Combined3Satd_sse41 (uint8_t*, int32_t, uint8_t*, int3
|
||||
|
||||
#endif//X86_ASM
|
||||
|
||||
#if defined (HAVE_NEON)
|
||||
|
||||
int32_t WelsSampleSad4x4_neon (uint8_t*, int32_t, uint8_t*, int32_t);
|
||||
int32_t WelsSampleSad16x16_neon (uint8_t*, int32_t, uint8_t*, int32_t);
|
||||
int32_t WelsSampleSad16x8_neon (uint8_t*, int32_t, uint8_t*, int32_t);
|
||||
int32_t WelsSampleSad8x16_neon (uint8_t*, int32_t, uint8_t*, int32_t);
|
||||
int32_t WelsSampleSad8x8_neon (uint8_t*, int32_t, uint8_t*, int32_t);
|
||||
|
||||
void WelsSampleSadFour16x16_neon (uint8_t*, int32_t, uint8_t*, int32_t, int32_t*);
|
||||
void WelsSampleSadFour16x8_neon (uint8_t*, int32_t, uint8_t*, int32_t, int32_t*);
|
||||
void WelsSampleSadFour8x16_neon (uint8_t*, int32_t, uint8_t*, int32_t, int32_t*);
|
||||
void WelsSampleSadFour8x8_neon (uint8_t*, int32_t, uint8_t*, int32_t, int32_t*);
|
||||
void WelsSampleSadFour4x4_neon (uint8_t*, int32_t, uint8_t*, int32_t, int32_t*);
|
||||
|
||||
int32_t WelsSampleSatd8x8_neon (uint8_t*, int32_t, uint8_t*, int32_t);
|
||||
int32_t WelsSampleSatd16x8_neon (uint8_t*, int32_t, uint8_t*, int32_t);
|
||||
int32_t WelsSampleSatd8x16_neon (uint8_t*, int32_t, uint8_t*, int32_t);
|
||||
int32_t WelsSampleSatd16x16_neon (uint8_t*, int32_t, uint8_t*, int32_t);
|
||||
int32_t WelsSampleSatd4x4_neon (uint8_t*, int32_t, uint8_t*, int32_t);
|
||||
|
||||
int32_t WelsIntra16x16Combined3Satd_neon (uint8_t*, int32_t, uint8_t*, int32_t, int32_t*, int32_t, uint8_t*);
|
||||
int32_t WelsIntra16x16Combined3Sad_neon (uint8_t*, int32_t, uint8_t*, int32_t, int32_t*, int32_t, uint8_t*);
|
||||
int32_t WelsIntra8x8Combined3Satd_neon (uint8_t*, int32_t, uint8_t*, int32_t, int32_t*, int32_t, uint8_t*, uint8_t*, uint8_t*);
|
||||
int32_t WelsIntra8x8Combined3Sad_neon (uint8_t*, int32_t, uint8_t*, int32_t, int32_t*, int32_t, uint8_t*, uint8_t*, uint8_t*);
|
||||
int32_t WelsIntra4x4Combined3Satd_neon (uint8_t*, int32_t, uint8_t*, int32_t, uint8_t*, int32_t*, int32_t, int32_t, int32_t);
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(__cplusplus)
|
||||
}
|
||||
|
@ -482,6 +482,33 @@ void WelsInitSampleSadFunc (SWelsFuncPtrList* pFuncList, uint32_t uiCpuFlag) {
|
||||
|
||||
#endif //(X86_ASM)
|
||||
|
||||
#if defined (HAVE_NEON)
|
||||
if (uiCpuFlag & WELS_CPU_NEON) {
|
||||
pFuncList->sSampleDealingFuncs.pfSampleSad[BLOCK_4x4 ] = WelsSampleSad4x4_neon;
|
||||
pFuncList->sSampleDealingFuncs.pfSampleSad[BLOCK_16x16] = WelsSampleSad16x16_neon;
|
||||
pFuncList->sSampleDealingFuncs.pfSampleSad[BLOCK_16x8 ] = WelsSampleSad16x8_neon;
|
||||
pFuncList->sSampleDealingFuncs.pfSampleSad[BLOCK_8x16] = WelsSampleSad8x16_neon;
|
||||
pFuncList->sSampleDealingFuncs.pfSampleSad[BLOCK_8x8] = WelsSampleSad8x8_neon;
|
||||
|
||||
pFuncList->sSampleDealingFuncs.pfSample4Sad[BLOCK_16x16] = WelsSampleSadFour16x16_neon;
|
||||
pFuncList->sSampleDealingFuncs.pfSample4Sad[BLOCK_16x8] = WelsSampleSadFour16x8_neon;
|
||||
pFuncList->sSampleDealingFuncs.pfSample4Sad[BLOCK_8x16] = WelsSampleSadFour8x16_neon;
|
||||
pFuncList->sSampleDealingFuncs.pfSample4Sad[BLOCK_8x8] = WelsSampleSadFour8x8_neon;
|
||||
pFuncList->sSampleDealingFuncs.pfSample4Sad[BLOCK_4x4] = WelsSampleSadFour4x4_neon;
|
||||
|
||||
pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_4x4 ] = WelsSampleSatd4x4_neon;
|
||||
pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_8x8 ] = WelsSampleSatd8x8_neon;
|
||||
pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_8x16 ] = WelsSampleSatd8x16_neon;
|
||||
pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_16x8 ] = WelsSampleSatd16x8_neon;
|
||||
pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_16x16] = WelsSampleSatd16x16_neon;
|
||||
|
||||
pFuncList->sSampleDealingFuncs.pfIntra4x4Combined3Satd = WelsIntra4x4Combined3Satd_neon;
|
||||
pFuncList->sSampleDealingFuncs.pfIntra8x8Combined3Satd = WelsIntra8x8Combined3Satd_neon;
|
||||
pFuncList->sSampleDealingFuncs.pfIntra8x8Combined3Sad = WelsIntra8x8Combined3Sad_neon;
|
||||
pFuncList->sSampleDealingFuncs.pfIntra16x16Combined3Satd = WelsIntra16x16Combined3Satd_neon;
|
||||
pFuncList->sSampleDealingFuncs.pfIntra16x16Combined3Sad = WelsIntra16x16Combined3Sad_neon;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
} // namespace WelsSVCEnc
|
||||
|
@ -231,6 +231,11 @@ void CAdaptiveQuantization::WelsInitVarFunc (PVarFunc& pfVar, int32_t iCpuFlag)
|
||||
pfVar = SampleVariance16x16_sse2;
|
||||
}
|
||||
#endif
|
||||
#ifdef HAVE_NEON
|
||||
if (iCpuFlag & WELS_CPU_NEON) {
|
||||
pfVar = SampleVariance16x16_neon;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
void SampleVariance16x16_c (uint8_t* pRefY, int32_t iRefStride, uint8_t* pSrcY, int32_t iSrcStride,
|
||||
|
@ -62,6 +62,11 @@ VarFunc SampleVariance16x16_sse2;
|
||||
WELSVP_EXTERN_C_END
|
||||
#endif
|
||||
|
||||
#ifdef HAVE_NEON
|
||||
WELSVP_EXTERN_C_BEGIN
|
||||
VarFunc SampleVariance16x16_neon;
|
||||
WELSVP_EXTERN_C_END
|
||||
#endif
|
||||
|
||||
class CAdaptiveQuantization : public IStrategy {
|
||||
public:
|
||||
|
42
codec/processing/src/arm/adaptive_quantization.S
Executable file → Normal file
42
codec/processing/src/arm/adaptive_quantization.S
Executable file → Normal file
@ -35,7 +35,7 @@
|
||||
#include "arm_arch_common_macro.S"
|
||||
|
||||
#ifdef APPLE_IOS
|
||||
.macro SQR_ADD_16BYTES
|
||||
.macro SQR_ADD_16BYTES
|
||||
vmull.u8 q3, $0, $0
|
||||
vmull.u8 q8, $1, $1
|
||||
vpadal.u16 $2, q3
|
||||
@ -51,23 +51,23 @@
|
||||
#endif
|
||||
|
||||
|
||||
WELS_ASM_FUNC_BEGIN pixel_var_16x16_neon
|
||||
WELS_ASM_FUNC_BEGIN SampleVariance16x16_neon
|
||||
stmdb sp!, {r4}
|
||||
|
||||
vld1.8 {q15}, [r0], r1 //save the ref data (16bytes)
|
||||
vld1.8 {q14}, [r2], r3 //save the src data (16bytes)
|
||||
|
||||
|
||||
vabd.u8 q13, q14, q15
|
||||
|
||||
|
||||
vabd.u8 q13, q14, q15
|
||||
vmull.u8 q12, d27, d27
|
||||
vmull.u8 q11, d26, d26
|
||||
vaddl.u16 q12, d24, d25
|
||||
vpadal.u16 q12, q11 //sqr
|
||||
|
||||
vaddl.u8 q13, d26, d27 //sum
|
||||
|
||||
vaddl.u8 q13, d26, d27 //sum
|
||||
|
||||
vaddl.u8 q10, d28, d29 //sum_cur
|
||||
|
||||
|
||||
vmull.u8 q9, d29, d29
|
||||
vmull.u8 q8, d28, d28
|
||||
vaddl.u16 q9, d18, d19 //sqr_cur
|
||||
@ -78,35 +78,35 @@ pixel_var_16x16_loop0:
|
||||
|
||||
vld1.8 {q0}, [r0], r1 //save the ref data (16bytes)
|
||||
vld1.8 {q1}, [r2], r3 //save the src data (16bytes)
|
||||
|
||||
|
||||
vabd.u8 q2, q0, q1
|
||||
|
||||
|
||||
//q10 save sum_cur
|
||||
vpadal.u8 q10, q1
|
||||
|
||||
//q12 save sqr
|
||||
SQR_ADD_16BYTES d4, d5, q12
|
||||
|
||||
|
||||
//q13 save sum
|
||||
vpadal.u8 q13, q2
|
||||
|
||||
subs r4, #1
|
||||
|
||||
//q9 save sqr_cur
|
||||
SQR_ADD_16BYTES d2, d3, q9
|
||||
|
||||
bne pixel_var_16x16_loop0
|
||||
|
||||
|
||||
//q9 save sqr_cur
|
||||
SQR_ADD_16BYTES d2, d3, q9
|
||||
|
||||
bne pixel_var_16x16_loop0
|
||||
|
||||
vadd.u16 d0, d26, d27 //sum
|
||||
vadd.u16 d1, d20, d21 //sum_cur
|
||||
vadd.u16 d1, d20, d21 //sum_cur
|
||||
vpaddl.u16 q0, q0
|
||||
vadd.u32 d2, d24, d25 //sqr
|
||||
vadd.u32 d3, d18, d19 //sqr_cur
|
||||
vpadd.u32 d0, d0, d1
|
||||
vpadd.u32 d1, d2, d3
|
||||
|
||||
|
||||
ldr r4, [sp, #4]
|
||||
|
||||
|
||||
vshr.u32 q0, q0, #8
|
||||
vmul.u32 d0, d0
|
||||
vsub.u32 d0, d1, d0
|
||||
@ -117,4 +117,4 @@ pixel_var_16x16_loop0:
|
||||
|
||||
WELS_ASM_FUNC_END
|
||||
|
||||
#endif
|
||||
#endif
|
||||
|
138
codec/processing/src/arm/down_sample_neon.S
Executable file → Normal file
138
codec/processing/src/arm/down_sample_neon.S
Executable file → Normal file
@ -35,29 +35,29 @@
|
||||
#include "arm_arch_common_macro.S"
|
||||
|
||||
|
||||
WELS_ASM_FUNC_BEGIN comp_ds_bilinear_neon
|
||||
WELS_ASM_FUNC_BEGIN DyadicBilinearDownsampler_neon
|
||||
stmdb sp!, {r4-r8, lr}
|
||||
|
||||
|
||||
//Get the width and height
|
||||
ldr r4, [sp, #24] //src_width
|
||||
ldr r5, [sp, #28] //src_height
|
||||
|
||||
|
||||
//Initialize the register
|
||||
mov r6, r2
|
||||
mov r8, r0
|
||||
mov lr, #0
|
||||
lsr r5, #1
|
||||
|
||||
lsr r5, #1
|
||||
|
||||
//Save the tail for the unaligned size
|
||||
mla r7, r1, r5, r0
|
||||
vld1.32 {q15}, [r7]
|
||||
|
||||
|
||||
add r7, r2, r3
|
||||
//processing a column of data
|
||||
comp_ds_bilinear_loop0:
|
||||
comp_ds_bilinear_loop0:
|
||||
|
||||
vld1.8 {q0,q1}, [r2]!
|
||||
vld1.8 {q2,q3}, [r7]!
|
||||
vld1.8 {q2,q3}, [r7]!
|
||||
vpaddl.u8 q0, q0
|
||||
vpaddl.u8 q1, q1
|
||||
vpaddl.u8 q2, q2
|
||||
@ -70,9 +70,9 @@ comp_ds_bilinear_loop0:
|
||||
vrhadd.u16 q1, q3
|
||||
vmovn.u16 d0, q0
|
||||
vmovn.u16 d1, q1
|
||||
vst1.32 {q0}, [r0]!
|
||||
vst1.32 {q0}, [r0]!
|
||||
add lr, #32
|
||||
|
||||
|
||||
cmp lr, r4
|
||||
movcs lr, #0
|
||||
addcs r6, r3, lsl #1
|
||||
@ -82,10 +82,10 @@ comp_ds_bilinear_loop0:
|
||||
movcs r0, r8
|
||||
subscs r5, #1
|
||||
bne comp_ds_bilinear_loop0
|
||||
|
||||
|
||||
//restore the tail for the unaligned size
|
||||
vst1.32 {q15}, [r0]
|
||||
|
||||
|
||||
ldmia sp!, {r4-r8,lr}
|
||||
WELS_ASM_FUNC_END
|
||||
|
||||
@ -96,29 +96,29 @@ WELS_ASM_FUNC_BEGIN comp_ds_bilinear_w_x8_neon
|
||||
//Get the width and height
|
||||
ldr r4, [sp, #20] //src_width
|
||||
ldr r5, [sp, #24] //src_height
|
||||
|
||||
|
||||
//Get the difference
|
||||
sub lr, r3, r4
|
||||
sub lr, r3, r4
|
||||
sub r1, r1, r4, lsr #1
|
||||
|
||||
|
||||
lsr r5, #1
|
||||
|
||||
|
||||
//processing a column of data
|
||||
comp_ds_bilinear_w_x8_loop0:
|
||||
|
||||
comp_ds_bilinear_w_x8_loop0:
|
||||
|
||||
lsr r6, r4, #3
|
||||
add r7, r2, r3
|
||||
//processing a line data
|
||||
comp_ds_bilinear_w_x8_loop1:
|
||||
|
||||
|
||||
vld1.8 {d0}, [r2]!
|
||||
vld1.8 {d1}, [r7]!
|
||||
vld1.8 {d1}, [r7]!
|
||||
vpaddl.u8 q0, q0
|
||||
vrshr.u16 q0, #1
|
||||
vrhadd.u16 d0, d1
|
||||
|
||||
|
||||
vmovn.u16 d0, q0
|
||||
vst1.32 {d0[0]}, [r0]!
|
||||
vst1.32 {d0[0]}, [r0]!
|
||||
subs r6, #1
|
||||
bne comp_ds_bilinear_w_x8_loop1
|
||||
|
||||
@ -126,7 +126,7 @@ comp_ds_bilinear_w_x8_loop1:
|
||||
add r0, r1
|
||||
subs r5, #1
|
||||
bne comp_ds_bilinear_w_x8_loop0
|
||||
|
||||
|
||||
ldmia sp!, {r4-r7,lr}
|
||||
WELS_ASM_FUNC_END
|
||||
|
||||
@ -137,31 +137,31 @@ WELS_ASM_FUNC_BEGIN comp_ds_bilinear_w_x16_neon
|
||||
//Get the width and height
|
||||
ldr r4, [sp, #20] //src_width
|
||||
ldr r5, [sp, #24] //src_height
|
||||
|
||||
|
||||
//Get the difference
|
||||
sub lr, r3, r4
|
||||
sub lr, r3, r4
|
||||
sub r1, r1, r4, lsr #1
|
||||
|
||||
|
||||
lsr r5, #1
|
||||
|
||||
|
||||
//processing a column of data
|
||||
comp_ds_bilinear_w_x16_loop0:
|
||||
|
||||
comp_ds_bilinear_w_x16_loop0:
|
||||
|
||||
lsr r6, r4, #4
|
||||
add r7, r2, r3
|
||||
//processing a line data
|
||||
comp_ds_bilinear_w_x16_loop1:
|
||||
|
||||
|
||||
vld1.8 {q0}, [r2]!
|
||||
vld1.8 {q1}, [r7]!
|
||||
vld1.8 {q1}, [r7]!
|
||||
vpaddl.u8 q0, q0
|
||||
vpaddl.u8 q1, q1
|
||||
vrshr.u16 q0, #1
|
||||
vrshr.u16 q1, #1
|
||||
vrhadd.u16 q0, q1
|
||||
|
||||
|
||||
vmovn.u16 d0, q0
|
||||
vst1.32 {d0}, [r0]!
|
||||
vst1.32 {d0}, [r0]!
|
||||
subs r6, #1
|
||||
bne comp_ds_bilinear_w_x16_loop1
|
||||
|
||||
@ -169,34 +169,34 @@ comp_ds_bilinear_w_x16_loop1:
|
||||
add r0, r1
|
||||
subs r5, #1
|
||||
bne comp_ds_bilinear_w_x16_loop0
|
||||
|
||||
|
||||
ldmia sp!, {r4-r7,lr}
|
||||
WELS_ASM_FUNC_END
|
||||
|
||||
|
||||
WELS_ASM_FUNC_BEGIN comp_ds_bilinear_w_x32_neon
|
||||
WELS_ASM_FUNC_BEGIN DyadicBilinearDownsamplerWidthx32_neon
|
||||
stmdb sp!, {r4-r7, lr}
|
||||
|
||||
//Get the width and height
|
||||
ldr r4, [sp, #20] //src_width
|
||||
ldr r5, [sp, #24] //src_height
|
||||
|
||||
|
||||
//Get the difference
|
||||
sub lr, r3, r4
|
||||
sub lr, r3, r4
|
||||
sub r1, r1, r4, lsr #1
|
||||
|
||||
|
||||
lsr r5, #1
|
||||
|
||||
|
||||
//processing a column of data
|
||||
comp_ds_bilinear_w_x32_loop0:
|
||||
|
||||
comp_ds_bilinear_w_x32_loop0:
|
||||
|
||||
lsr r6, r4, #5
|
||||
add r7, r2, r3
|
||||
//processing a line data
|
||||
comp_ds_bilinear_w_x32_loop1:
|
||||
|
||||
|
||||
vld1.8 {q0,q1}, [r2]!
|
||||
vld1.8 {q2,q3}, [r7]!
|
||||
vld1.8 {q2,q3}, [r7]!
|
||||
vpaddl.u8 q0, q0
|
||||
vpaddl.u8 q1, q1
|
||||
vpaddl.u8 q2, q2
|
||||
@ -207,10 +207,10 @@ comp_ds_bilinear_w_x32_loop1:
|
||||
vrshr.u16 q3, #1
|
||||
vrhadd.u16 q0, q2
|
||||
vrhadd.u16 q1, q3
|
||||
|
||||
|
||||
vmovn.u16 d0, q0
|
||||
vmovn.u16 d1, q1
|
||||
vst1.32 {q0}, [r0]!
|
||||
vst1.32 {q0}, [r0]!
|
||||
subs r6, #1
|
||||
bne comp_ds_bilinear_w_x32_loop1
|
||||
|
||||
@ -218,14 +218,14 @@ comp_ds_bilinear_w_x32_loop1:
|
||||
add r0, r1
|
||||
subs r5, #1
|
||||
bne comp_ds_bilinear_w_x32_loop0
|
||||
|
||||
|
||||
ldmia sp!, {r4-r7,lr}
|
||||
WELS_ASM_FUNC_END
|
||||
|
||||
|
||||
WELS_ASM_FUNC_BEGIN general_ds_bilinear_accurate_neon
|
||||
WELS_ASM_FUNC_BEGIN GeneralBilinearAccurateDownsampler_neon
|
||||
stmdb sp!, {r4-r12, lr}
|
||||
|
||||
|
||||
//Get the data from stack
|
||||
ldr r4, [sp, #40] //the addr of src
|
||||
ldr r5, [sp, #44] //the value of src_stride
|
||||
@ -245,11 +245,11 @@ WELS_ASM_FUNC_BEGIN general_ds_bilinear_accurate_neon
|
||||
and r9, r7, r10 // r9 vinc(scaleY mod 32767)
|
||||
mov r11, #-1
|
||||
mul r11, r9 // r11 -vinc
|
||||
|
||||
|
||||
vdup.s16 d2, r9
|
||||
vdup.s16 d3, r11
|
||||
vext.8 d5, d3, d2, #4 // vinc vinc -vinc -vinc
|
||||
|
||||
|
||||
mov r11, #0x40000000
|
||||
mov r12, #0x4000
|
||||
sub r12, #1
|
||||
@ -261,13 +261,13 @@ WELS_ASM_FUNC_BEGIN general_ds_bilinear_accurate_neon
|
||||
sub r11, #1
|
||||
vdup.s16 d9, r11
|
||||
vext.8 d7, d9, d8, #4 //init v 16384 16384 16383 16383
|
||||
|
||||
veor q14, q14
|
||||
sub r1, r2 // stride - width
|
||||
|
||||
veor q14, q14
|
||||
sub r1, r2 // stride - width
|
||||
mov r8, #16384 // yInverse
|
||||
sub r3, #1
|
||||
|
||||
_HEIGHT:
|
||||
|
||||
_HEIGHT:
|
||||
ldr r4, [sp, #40] //the addr of src
|
||||
mov r11, r8
|
||||
lsr r11, #15
|
||||
@ -275,8 +275,8 @@ _HEIGHT:
|
||||
add r11, r4 // get current row address
|
||||
mov r12, r11
|
||||
add r12, r5
|
||||
|
||||
mov r9, #16384 // xInverse
|
||||
|
||||
mov r9, #16384 // xInverse
|
||||
sub r10, r2, #1
|
||||
vmov.s16 d6, d1
|
||||
|
||||
@ -288,21 +288,21 @@ _WIDTH:
|
||||
add r4, r12,lr
|
||||
vld2.8 {d28[4],d29[4]}, [r4] //q14: 000d000b000c000a;
|
||||
vzip.32 d28, d29 //q14: 000d000c000b000a;
|
||||
|
||||
vmull.u16 q13, d6, d7 //q13: init u * init v
|
||||
|
||||
vmull.u16 q13, d6, d7 //q13: init u * init v
|
||||
vmull.u32 q12, d26,d28
|
||||
vmlal.u32 q12, d27,d29
|
||||
vqadd.u64 d24, d24,d25
|
||||
vrshr.u64 d24, #30
|
||||
|
||||
vst1.8 {d24[0]}, [r0]!
|
||||
add r9, r6
|
||||
add r9, r6
|
||||
vadd.u16 d6, d0 // inc u
|
||||
vshl.u16 d6, #1
|
||||
vshr.u16 d6, #1
|
||||
subs r10, #1
|
||||
bne _WIDTH
|
||||
|
||||
|
||||
WIDTH_END:
|
||||
lsr r9, #15
|
||||
add r4,r11,r9
|
||||
@ -317,26 +317,26 @@ WIDTH_END:
|
||||
subs r3, #1
|
||||
bne _HEIGHT
|
||||
|
||||
LAST_ROW:
|
||||
LAST_ROW:
|
||||
ldr r4, [sp, #40] //the addr of src
|
||||
lsr r8, #15
|
||||
mul r8, r5
|
||||
add r4, r8 // get current row address
|
||||
add r4, r8 // get current row address
|
||||
mov r9, #16384
|
||||
|
||||
_LAST_ROW_WIDTH:
|
||||
mov r11, r9
|
||||
lsr r11, #15
|
||||
|
||||
|
||||
add r3, r4,r11
|
||||
vld1.8 {d0[0]}, [r3]
|
||||
vst1.8 {d0[0]}, [r0]
|
||||
add r0, #1
|
||||
add r9, r6
|
||||
vst1.8 {d0[0]}, [r0]
|
||||
add r0, #1
|
||||
add r9, r6
|
||||
subs r2, #1
|
||||
bne _LAST_ROW_WIDTH
|
||||
|
||||
|
||||
ldmia sp!, {r4-r12, lr}
|
||||
WELS_ASM_FUNC_END
|
||||
|
||||
#endif
|
||||
#endif
|
||||
|
14
codec/processing/src/arm/pixel_sad_neon.S
Executable file → Normal file
14
codec/processing/src/arm/pixel_sad_neon.S
Executable file → Normal file
@ -35,24 +35,24 @@
|
||||
#include "arm_arch_common_macro.S"
|
||||
|
||||
|
||||
WELS_ASM_FUNC_BEGIN pixel_sad_8x8_neon
|
||||
WELS_ASM_FUNC_BEGIN WelsSampleSad8x8_neon
|
||||
stmdb sp!, {lr}
|
||||
//Loading a horizontal line data (8 bytes)
|
||||
vld1.8 {d0}, [r0], r1
|
||||
vld1.8 {d0}, [r0], r1
|
||||
vld1.8 {d1}, [r2], r3
|
||||
|
||||
|
||||
//Do the SAD for 8 bytes
|
||||
vabdl.u8 q1, d0, d1
|
||||
|
||||
|
||||
mov lr, #7
|
||||
pixel_sad_8x8_loop0:
|
||||
|
||||
//Loading a horizontal line data (8 bytes)
|
||||
vld1.8 {d0}, [r0], r1
|
||||
vld1.8 {d0}, [r0], r1
|
||||
vld1.8 {d1}, [r2], r3
|
||||
|
||||
subs lr, #1
|
||||
|
||||
|
||||
//Do the SAD for 8 bytes
|
||||
vabal.u8 q1, d0, d1
|
||||
bne pixel_sad_8x8_loop0
|
||||
@ -65,4 +65,4 @@ pixel_sad_8x8_loop0:
|
||||
ldmia sp!, {lr}
|
||||
WELS_ASM_FUNC_END
|
||||
|
||||
#endif
|
||||
#endif
|
||||
|
560
codec/processing/src/arm/vaa_calc_neon.S
Executable file → Normal file
560
codec/processing/src/arm/vaa_calc_neon.S
Executable file → Normal file
File diff suppressed because it is too large
Load Diff
@ -75,6 +75,16 @@ void CDownsampling::InitDownsampleFuncs (SDownsampleFuncs& sDownsampleFunc, int
|
||||
}
|
||||
#endif//X86_ASM
|
||||
|
||||
#if defined(HAVE_NEON)
|
||||
if (iCpuFlag & WELS_CPU_NEON) {
|
||||
sDownsampleFunc.pfHalfAverage[0] = DyadicBilinearDownsamplerWidthx32_neon;
|
||||
sDownsampleFunc.pfHalfAverage[1] = DyadicBilinearDownsampler_neon;
|
||||
sDownsampleFunc.pfHalfAverage[2] = DyadicBilinearDownsampler_neon;
|
||||
sDownsampleFunc.pfHalfAverage[3] = DyadicBilinearDownsampler_neon;
|
||||
sDownsampleFunc.pfGeneralRatioChroma = GeneralBilinearAccurateDownsamplerWrap_neon;
|
||||
sDownsampleFunc.pfGeneralRatioLuma = GeneralBilinearAccurateDownsamplerWrap_neon;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
EResult CDownsampling::Process (int32_t iType, SPixMap* pSrcPixMap, SPixMap* pDstPixMap) {
|
||||
|
@ -103,7 +103,20 @@ void GeneralBilinearAccurateDownsampler_sse2 (uint8_t* pDst, const int32_t kiDst
|
||||
WELSVP_EXTERN_C_END
|
||||
#endif
|
||||
|
||||
#ifdef HAVE_NEON
|
||||
WELSVP_EXTERN_C_BEGIN
|
||||
// iSrcWidth no limitation
|
||||
HalveDownsampleFunc DyadicBilinearDownsampler_neon;
|
||||
// iSrcWidth = x32 pixels
|
||||
HalveDownsampleFunc DyadicBilinearDownsamplerWidthx32_neon;
|
||||
|
||||
GeneralDownsampleFunc GeneralBilinearAccurateDownsamplerWrap_neon;
|
||||
|
||||
void GeneralBilinearAccurateDownsampler_neon( uint8_t* pDst, const int32_t kiDstStride, const int32_t kiDstWidth, const int32_t kiDstHeight,
|
||||
uint8_t* pSrc, const int32_t kiSrcStride, const uint32_t kuiScaleX, const uint32_t kuiScaleY);
|
||||
|
||||
WELSVP_EXTERN_C_END
|
||||
#endif
|
||||
|
||||
|
||||
class CDownsampling : public IStrategy {
|
||||
|
@ -229,4 +229,14 @@ void GeneralBilinearAccurateDownsampler_c (uint8_t* pDst, const int32_t kiDstStr
|
||||
//}
|
||||
#endif //X86_ASM
|
||||
|
||||
#ifdef HAVE_NEON
|
||||
// Adapter between a width/height based downsample signature and
// GeneralBilinearAccurateDownsampler_neon, which takes per-axis scale factors.
//
// pDst, kiDstStride, kiDstWidth, kiDstHeight and pSrc, kiSrcStride are forwarded
// unchanged. kiSrcWidth / kiSrcHeight are only used to derive the scale factors:
// each factor is the source/destination size ratio multiplied by (1 << kiScaleBit)
// and truncated to uint32_t.
//
// NOTE(review): kiDstWidth and kiDstHeight are used as divisors without a check;
// presumably callers guarantee they are non-zero -- confirm.
void GeneralBilinearAccurateDownsamplerWrap_neon(uint8_t* pDst, const int32_t kiDstStride, const int32_t kiDstWidth, const int32_t kiDstHeight,
|
||||
uint8_t* pSrc, const int32_t kiSrcStride, const int32_t kiSrcWidth, const int32_t kiSrcHeight) {
|
||||
const int32_t kiScaleBit = 15;  // number of fractional bits in the fixed-point scale factors
|
||||
const uint32_t kuiScale = (1 << kiScaleBit);  // fixed-point representation of 1.0
|
||||
uint32_t uiScalex = (uint32_t)((float)kiSrcWidth / (float)kiDstWidth * kuiScale);  // horizontal src/dst ratio, fixed point
|
||||
uint32_t uiScaley = (uint32_t)((float)kiSrcHeight / (float)kiDstHeight * kuiScale);  // vertical src/dst ratio, fixed point
|
||||
GeneralBilinearAccurateDownsampler_neon(pDst, kiDstStride, kiDstWidth, kiDstHeight, pSrc, kiSrcStride, uiScalex, uiScaley);
|
||||
}
|
||||
#endif
|
||||
WELSVP_NAMESPACE_END
|
||||
|
@ -130,6 +130,12 @@ void CSceneChangeDetection::InitSadFuncs (SadFuncPtr& pfSad, int32_t iCpuFlag)
|
||||
pfSad = WelsSampleSad8x8_sse21;
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef HAVE_NEON
|
||||
if (iCpuFlag & WELS_CPU_NEON) {
|
||||
pfSad = WelsSampleSad8x8_neon;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
|
@ -60,6 +60,12 @@ SadFunc WelsSampleSad8x8_sse21;
|
||||
WELSVP_EXTERN_C_END
|
||||
#endif
|
||||
|
||||
#ifdef HAVE_NEON
|
||||
WELSVP_EXTERN_C_BEGIN
|
||||
SadFunc WelsSampleSad8x8_neon;
|
||||
WELSVP_EXTERN_C_END
|
||||
#endif
|
||||
|
||||
WELSVP_NAMESPACE_END
|
||||
|
||||
#endif
|
||||
|
@ -65,6 +65,15 @@ void CVAACalculation::InitVaaFuncs (SVaaFuncs& sVaaFuncs, int32_t iCpuFlag) {
|
||||
sVaaFuncs.pfVAACalcSadVar = VAACalcSadVar_sse2;
|
||||
}
|
||||
#endif//X86_ASM
|
||||
#ifdef HAVE_NEON
|
||||
if ((iCpuFlag & WELS_CPU_NEON) == WELS_CPU_NEON) {
|
||||
sVaaFuncs.pfVAACalcSad = VAACalcSad_neon;
|
||||
sVaaFuncs.pfVAACalcSadBgd = VAACalcSadBgd_neon;
|
||||
sVaaFuncs.pfVAACalcSadSsd = VAACalcSadSsd_neon;
|
||||
sVaaFuncs.pfVAACalcSadSsdBgd = VAACalcSadSsdBgd_neon;
|
||||
sVaaFuncs.pfVAACalcSadVar = VAACalcSadVar_neon;
|
||||
}
|
||||
#endif//HAVE_NEON
|
||||
}
|
||||
|
||||
EResult CVAACalculation::Process (int32_t iType, SPixMap* pSrcPixMap, SPixMap* pRefPixMap) {
|
||||
|
@ -103,6 +103,16 @@ VAACalcSadSsdFunc VAACalcSadSsd_sse2;
|
||||
WELSVP_EXTERN_C_END
|
||||
#endif
|
||||
|
||||
#ifdef HAVE_NEON
|
||||
WELSVP_EXTERN_C_BEGIN
|
||||
VAACalcSadBgdFunc VAACalcSadBgd_neon;
|
||||
VAACalcSadSsdBgdFunc VAACalcSadSsdBgd_neon;
|
||||
VAACalcSadFunc VAACalcSad_neon;
|
||||
VAACalcSadVarFunc VAACalcSadVar_neon;
|
||||
VAACalcSadSsdFunc VAACalcSadSsd_neon;
|
||||
WELSVP_EXTERN_C_END
|
||||
#endif
|
||||
|
||||
class CVAACalculation : public IStrategy {
|
||||
public:
|
||||
CVAACalculation (int32_t iCpuFlag);
|
||||
|
Loading…
Reference in New Issue
Block a user