Add arm asm code for processing.

This commit is contained in:
Licai Guo 2014-03-05 16:54:05 +08:00
parent 248f324c62
commit e7cc8c2780
24 changed files with 4330 additions and 4204 deletions

View File

@ -795,7 +795,7 @@ WELS_ASM_FUNC_BEGIN DeblockChromaEq4H_neon
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN enc_avc_non_zero_count_neon
WELS_ASM_FUNC_BEGIN WelsNonZeroCount_neon
vld1.64 {d0-d2}, [r0]
@ -810,38 +810,37 @@ WELS_ASM_FUNC_BEGIN enc_avc_non_zero_count_neon
WELS_ASM_FUNC_END
#ifdef APPLE_IOS
.macro BS_NZC_CHECK
.macro BS_NZC_CHECK
vld1.8 {d0,d1}, [$0]
/* Arrange the input data --- TOP */
ands r6, $1, #2
beq bs_nzc_check_jump0
sub r6, $0, $2, lsl #4
sub r6, $2, lsl #3
add r6, #12
vld1.32 d3[1], [r6]
bs_nzc_check_jump0:
bs_nzc_check_jump0:
vext.8 q1, q1, q0, #12
vadd.u8 $3, q0, q1
/* Arrange the input data --- LEFT */
ands r6, $1, #1
beq bs_nzc_check_jump1
sub r6, $0, #21
add r7, r6, #4
add r7, r6, #4
vld1.8 d3[4], [r6]
add r6, r7, #4
vld1.8 d3[5], [r7]
add r7, r6, #4
vld1.8 d3[6], [r6]
vld1.8 d3[7], [r7]
bs_nzc_check_jump1:
vzip.8 d0, d1
vzip.8 d0, d1
vzip.8 d0, d1
vext.8 q1, q1, q0, #12
vadd.u8 $4, q0, q1
@ -852,41 +851,41 @@ bs_nzc_check_jump1:
vabd.s16 q5, $0, $1
vabd.s16 q6, $1, $2
vdup.s16 $0, r6
vabd.s16 q7, $2, $3
vabd.s16 q8, $3, $4
vabd.s16 q7, $2, $3
vabd.s16 q8, $3, $4
vcge.s16 q5, $0
vcge.s16 q6, $0
vcge.s16 q7, $0
vcge.s16 q8, $0
vcge.s16 q8, $0
vpadd.i16 d10, d10, d11
vpadd.i16 d11, d12, d13
vpadd.i16 d12, d14, d15
vpadd.i16 d13, d16, d17
vpadd.i16 d13, d16, d17
vaddhn.i16 $5, q5, q5
vaddhn.i16 $6, q6, q6
.endm
.macro BS_MV_CHECK
.macro BS_MV_CHECK
vldm $0, {q0,q1,q2,q3}
/* Arrange the input data --- TOP */
ands r6, $1, #2
beq bs_mv_check_jump0
sub r6, $0, $2, lsl #6
add r6, #48
vld1.8 {d8, d9}, [r6]
bs_mv_check_jump0:
BS_COMPARE_MV q4, q0, q1, q2, q3, $3, $4
/* Arrange the input data --- LEFT */
ands r6, $1, #1
beq bs_mv_check_jump1
sub r6, $0, #52
add r7, r6, #16
vld1.32 d8[0], [r6]
@ -895,7 +894,7 @@ bs_mv_check_jump0:
add r7, r6, #16
vld1.32 d9[0], [r6]
vld1.32 d9[1], [r7]
bs_mv_check_jump1:
vzip.32 q0, q2
vzip.32 q1, q3
@ -904,7 +903,6 @@ bs_mv_check_jump1:
BS_COMPARE_MV q4, q0, q1, q2, q3, $5, $6
.endm
#else
.macro BS_NZC_CHECK arg0, arg1, arg2, arg3, arg4
vld1.8 {d0,d1}, [\arg0]
/* Arrange the input data --- TOP */
@ -999,40 +997,40 @@ bs_mv_check_jump1:
.endm
#endif
WELS_ASM_FUNC_BEGIN DeblockingBSCalcEnc_neon
stmdb sp!, {r5-r7}
ldr r5, [sp, #12] //Save BS to r5
/* Checking the nzc status */
BS_NZC_CHECK r0, r2, r3, q14, q15 //q14,q15 save the nzc status
/* For checking bS[I] = 2 */
mov r6, #2
vcgt.s8 q14, q14, #0
vdup.u8 q0, r6
vcgt.s8 q15, q15, #0
vand.u8 q14, q14, q0 //q14 save the nzc check result all the time --- for dir is top
vand.u8 q15, q15, q0 //q15 save the nzc check result all the time --- for dir is left
/* Checking the mv status*/
BS_MV_CHECK r1, r2, r3, d24, d25, d26, d27//q12, q13 save the mv status
/* For checking bS[I] = 1 */
mov r6, #1
vdup.u8 q0, r6
vand.u8 q12, q12, q0 //q12 save the nzc check result all the time --- for dir is top
vand.u8 q13, q13, q0 //q13 save the nzc check result all the time --- for dir is left
/* Check bS[I] is '1' or '2' */
vmax.u8 q1, q12, q14
vmax.u8 q0, q13, q15
//vstm r5, {q0, q1}
vst1.32 {q0, q1}, [r5]
ldmia sp!, {r5-r7}

46
codec/common/expand_picture.S Executable file → Normal file
View File

@ -34,13 +34,13 @@
.text
#include "arm_arch_common_macro.S"
WELS_ASM_FUNC_BEGIN ExpandPictureLuma_neon
stmdb sp!, {r4-r8}
//Save the dst
mov r7, r0
mov r8, r3
add r4, r7, r2
sub r4, #1
//For the left and right expand
@ -58,40 +58,40 @@ _expand_picture_luma_loop2:
subs r8, #1
bne _expand_picture_luma_loop2
//for the top and bottom expand
//for the top and bottom expand
add r2, #64
sub r0, #32
mla r4, r1, r3, r0
sub r4, r1
_expand_picture_luma_loop0:
mov r5, #32
mls r5, r5, r1, r0
mov r5, #32
mls r5, r5, r1, r0
add r6, r4, r1
vld1.8 {q0}, [r0]!
vld1.8 {q1}, [r4]!
mov r8, #32
_expand_picture_luma_loop1:
vst1.8 {q0}, [r5], r1
vst1.8 {q1}, [r6], r1
_expand_picture_luma_loop1:
vst1.8 {q0}, [r5], r1
vst1.8 {q1}, [r6], r1
subs r8, #1
bne _expand_picture_luma_loop1
subs r2, #16
bne _expand_picture_luma_loop0
//vldreq.32 d0, [r0]
ldmia sp!, {r4-r8}
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN ExpandPictureChroma_neon
stmdb sp!, {r4-r8}
//Save the dst
mov r7, r0
mov r8, r3
add r4, r7, r2
sub r4, #1
//For the left and right expand
@ -107,31 +107,31 @@ _expand_picture_chroma_loop2:
subs r8, #1
bne _expand_picture_chroma_loop2
//for the top and bottom expand
//for the top and bottom expand
add r2, #32
sub r0, #16
mla r4, r1, r3, r0
sub r4, r1
_expand_picture_chroma_loop0:
mov r5, #16
mls r5, r5, r1, r0
mov r5, #16
mls r5, r5, r1, r0
add r6, r4, r1
vld1.8 {q0}, [r0]!
vld1.8 {q1}, [r4]!
mov r8, #16
_expand_picture_chroma_loop1:
vst1.8 {q0}, [r5], r1
vst1.8 {q1}, [r6], r1
_expand_picture_chroma_loop1:
vst1.8 {q0}, [r5], r1
vst1.8 {q1}, [r6], r1
subs r8, #1
bne _expand_picture_chroma_loop1
subs r2, #16
bne _expand_picture_chroma_loop0
//vldreq.32 d0, [r0]
ldmia sp!, {r4-r8}
WELS_ASM_FUNC_END
#endif
#endif

View File

@ -533,7 +533,7 @@ WELS_ASM_FUNC_BEGIN WelsDecoderIChromaPredH_neon
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN WelsDecoderIChromaPredDC_neon
WELS_ASM_FUNC_BEGIN WelsDecoderIChromaPredDc_neon
//stmdb sp!, { r2-r5, lr}
//Load the left column data (8 bytes)
sub r2, r0, #1

276
codec/encoder/core/arm/intra_pred_neon.S Executable file → Normal file
View File

@ -61,25 +61,25 @@
.endm
#endif
// WelsI16x16LumaPredV_neon
// Vertical prediction for a 16x16 block: one 16-byte row is loaded once and
// stored 16 times to the destination.
//   r0     : destination pointer; post-incremented by every store, so the
//            16 rows are written back to back (16 bytes apart)
//   r1, r2 : the row is loaded from address r1 - r2
//            NOTE(review): presumably r1 is the reference pointer and r2 its
//            stride, making r1 - r2 the row directly above the block --
//            confirm against the C prototype.
//   r3     : scratch (load address first, then the loop counter)
WELS_ASM_FUNC_BEGIN WelsI16x16LumaPredV_neon
//Get the top line data to 'q0' (d0:d1), read from r1 - r2
sub r3, r1, r2
vldm r3, {d0, d1}
//mov r2, #16
//Loop counter: 4 iterations of 4 stores each = 16 rows
mov r3, #4
//Copy the top line into every line of the 16x16 MB
loop_0_get_i16x16_luma_pred_v:
vst1.8 {d0,d1}, [r0]!
vst1.8 {d0,d1}, [r0]!
vst1.8 {d0,d1}, [r0]!
vst1.8 {d0,d1}, [r0]!
subs r3, #1
bne loop_0_get_i16x16_luma_pred_v
//NOTE(review): this second, identical branch can never be taken (the flags are
//unchanged since the branch above fell through). It looks like the old/new
//sides of a whitespace-only diff line -- confirm and keep a single branch.
bne loop_0_get_i16x16_luma_pred_v
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN WelsI16x16LumaPredH_neon
//stmdb sp!, {r4, lr}
sub r1, r1, #1
@ -87,10 +87,10 @@ WELS_ASM_FUNC_BEGIN WelsI16x16LumaPredH_neon
loop_0_get_i16x16_luma_pred_h:
//Get one byte data from left side
vld1.8 {d0[],d1[]}, [r1], r2
vld1.8 {d2[],d3[]}, [r1], r2
vld1.8 {d4[],d5[]}, [r1], r2
vld1.8 {d2[],d3[]}, [r1], r2
vld1.8 {d4[],d5[]}, [r1], r2
vld1.8 {d6[],d7[]}, [r1], r2
//Set the line of MB using the left side byte data
vst1.8 {d0,d1}, [r0]!
//add r0, #16
@ -100,9 +100,9 @@ loop_0_get_i16x16_luma_pred_h:
//add r0, #16
vst1.8 {d6,d7}, [r0]!
//add r0, #16
subs r3, #1
bne loop_0_get_i16x16_luma_pred_h
bne loop_0_get_i16x16_luma_pred_h
WELS_ASM_FUNC_END
@ -113,11 +113,11 @@ WELS_ASM_FUNC_BEGIN WelsI16x16LumaPredDc_neon
sub r3, r1, #1
GET_8BYTE_DATA d0, r3, r2
GET_8BYTE_DATA d1, r3, r2
//Get the top horizontal line data
sub r3, r1, r2
sub r3, r1, r2
vldm r3, {d2, d3}
//Calculate the sum of top horizontal line data and vertical line data
vpaddl.u8 q0, q0
vpaddl.u8 q1, q1
@ -125,11 +125,11 @@ WELS_ASM_FUNC_BEGIN WelsI16x16LumaPredDc_neon
vadd.u16 d0, d0, d1
vpaddl.u16 d0, d0
vpaddl.u32 d0, d0
//Calculate the mean value
//Calculate the mean value
vrshr.u16 d0, d0, #5
vdup.8 q0, d0[0]
//Set the mean value to the all of member of MB
mov r3, #4
loop_0_get_i16x16_luma_pred_dc_both:
@ -138,21 +138,21 @@ loop_0_get_i16x16_luma_pred_dc_both:
vst1.8 {d0,d1}, [r0]!
vst1.8 {d0,d1}, [r0]!
subs r3, #1
bne loop_0_get_i16x16_luma_pred_dc_both
bne loop_0_get_i16x16_luma_pred_dc_both
WELS_ASM_FUNC_END
//Weight table for the 16x16 luma plane predictor: {(8,7,6,5,4,3,2,1) * 5}
//stored as eight bytes. Read byte by byte from the lowest address the two
//.long words give 40,35,30,25 and 20,15,10,5
//(assumes a little-endian target -- TODO confirm).
CONST0_GET_I16X16_LUMA_PRED_PLANE: .long 0x191e2328, 0x050a0f14
//Offset table for the 16x16 luma plane predictor: {-7,-6,-5,-4,-3,-2,-1,0}
//stored as eight signed bytes (0xf9 .. 0xff, 0x00), under the same
//byte-order assumption as the table above.
CONST1_GET_I16X16_LUMA_PRED_PLANE: .long 0xfcfbfaf9, 0x00fffefd
WELS_ASM_FUNC_BEGIN WelsI16x16LumaPredPlane_neon
//stmdb sp!, { r4, lr}
//Load the table {(8,7,6,5,4,3,2,1) * 5}
adr r3, CONST0_GET_I16X16_LUMA_PRED_PLANE
vldr d0, [r3]
@ -161,51 +161,51 @@ WELS_ASM_FUNC_BEGIN WelsI16x16LumaPredPlane_neon
sub r3, r1, r2
sub r1, r3, #1
vld1.8 d1, [r1]
//Pack the top[8] ~ top[15] to d2
add r1, #9
vld1.8 d2, [r1]
//Save the top[15] to d6 for next step
vdup.u8 d6, d2[7]
//Get and pack left[-1] ~ left[6] to d4
sub r1, r3, #1
GET_8BYTE_DATA d4, r1, r2
//Get and pack left[8] ~ left[15] to d3
add r1, r2
GET_8BYTE_DATA d3, r1, r2
//Save the left[15] to d7 for next step
vdup.u8 d7, d3[7]
//revert the sequence of d2,d3
vrev64.8 q1, q1
vsubl.u8 q2, d3, d4 //q2={left[8]-left[6],left[9]-left[5],left[10]-left[4], ...}
vsubl.u8 q1, d2, d1 //q1={top[8]-top[6],top[9]-top[5],top[10]-top[4], ...}
vmovl.u8 q0, d0
vmul.s16 q1, q0, q1 //q1 = q1*{(8,7,6,5,4,3,2,1) * 5}
vmul.s16 q2, q0, q2 //q2 = q2*{(8,7,6,5,4,3,2,1) * 5}
//Calculate the sum of items of q1, q2
vpadd.s16 d0, d2, d3
vpadd.s16 d1, d4, d5
vpaddl.s16 q0, q0
vpaddl.s32 q0, q0
//Get the value of 'b', 'c' and extend to q1, q2.
vrshr.s64 q0, #6
vdup.s16 q1, d0[0]
vdup.s16 q2, d1[0]
//Load the table {-7,-6,-5,-4,-3,-2,-1,0} to d0
adr r3, CONST1_GET_I16X16_LUMA_PRED_PLANE
vld1.32 {d0}, [r3]
//Get the value of 'a' and save to q3
vaddl.u8 q3, d6, d7
vshl.u16 q3, #4
@ -214,57 +214,57 @@ WELS_ASM_FUNC_BEGIN WelsI16x16LumaPredPlane_neon
vmovl.s8 q0, d0
vmla.s16 q3, q0, q1
vmla.s16 q3, q2, d0[0]
//Calculate a+'b'*{1,2,3,4,5,6,7,8} + c*{-7}
vshl.s16 q5, q1, #3
vadd.s16 q5, q3
//right shift 5 bits and rounding
vqrshrun.s16 d0, q3, #5
vqrshrun.s16 d1, q5, #5
//Set the line of MB
vst1.u32 {d0,d1}, [r0]!
//Do the same processing for setting other lines
mov r3, #15
loop_0_get_i16x16_luma_pred_plane:
loop_0_get_i16x16_luma_pred_plane:
vadd.s16 q3, q2
vadd.s16 q5, q2
vqrshrun.s16 d0, q3, #5
vqrshrun.s16 d1, q5, #5
vst1.u32 {d0,d1}, [r0]!
subs r3, #1
bne loop_0_get_i16x16_luma_pred_plane
bne loop_0_get_i16x16_luma_pred_plane
WELS_ASM_FUNC_END
// WelsI4x4LumaPredV_neon
// Vertical prediction for a 4x4 block: one 32-bit word (4 bytes) is loaded and
// stored four times to consecutive words at the destination.
//   r0     : destination pointer; 16 bytes are written and r0 is advanced
//            by 12 on exit (the last store has no write-back)
//   r1, r2 : the word is loaded from address r1 - r2
//            NOTE(review): presumably reference pointer and stride, so
//            r1 - r2 is the row above the block -- confirm against the
//            C prototype.
//   r3     : scratch (load address first, then the loaded word)
WELS_ASM_FUNC_BEGIN WelsI4x4LumaPredV_neon
//stmdb sp!, { r2-r5, lr}
//Load the top row (4 bytes) with a plain word load
//NOTE(review): nothing here aligns r1 - r2 to 4 bytes; assumes unaligned
//word loads are permitted on the target -- confirm.
sub r3, r1, r2
ldr r3, [r3]
//Set the luma MB using top line: the same word is written to all 4 rows
//(the first three stores post-increment r0 by 4)
str r3, [r0], #4
str r3, [r0], #4
str r3, [r0], #4
str r3, [r0]
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN WelsI4x4LumaPredH_neon
//stmdb sp!, { r2-r5, lr}
//Load the left column (4 bytes)
sub r3, r1, #1
vld1.8 {d0[]}, [r3], r2
vld1.8 {d1[]}, [r3], r2
vld1.8 {d2[]}, [r3], r2
vld1.8 {d1[]}, [r3], r2
vld1.8 {d2[]}, [r3], r2
vld1.8 {d3[]}, [r3]
//Set the luma MB using the left side byte
vst1.32 {d0[0]}, [r0]!
vst1.32 {d1[0]}, [r0]!
@ -279,36 +279,36 @@ WELS_ASM_FUNC_BEGIN WelsI4x4LumaPredDDL_neon
//Load the top row data(8 bytes)
sub r3, r1, r2
vld1.32 {d0}, [r3]
//For "t7 + (t7<<1)"
vdup.8 d1, d0[7]
//calculate "t0+t1,t1+t2,t2+t3...t6+t7,t7+t7"
vext.8 d1, d0, d1, #1
vaddl.u8 q1, d1, d0
//calculate "x,t0+t1+t1+t2,t1+t2+t2+t3,...t5+t6+t6+t7,t6+t7+t7+t7"
vext.8 q2, q1, q1, #14
vadd.u16 q0, q1, q2
//right shift 2 bits and rounding
vqrshrn.u16 d0, q0, #2
//Save "ddl0, ddl1, ddl2, ddl3"
vext.8 d1, d0, d0, #1
vst1.32 d1[0], [r0]!
//Save "ddl1, ddl2, ddl3, ddl4"
vext.8 d1, d0, d0, #2
vst1.32 d1[0], [r0]!
//Save "ddl2, ddl3, ddl4, ddl5"
vext.8 d1, d0, d0, #3
vst1.32 d1[0], [r0]!
vst1.32 d1[0], [r0]!
//Save "ddl3, ddl4, ddl5, ddl6"
vst1.32 d0[1], [r0]
vst1.32 d0[1], [r0]
WELS_ASM_FUNC_END
@ -317,29 +317,29 @@ WELS_ASM_FUNC_BEGIN WelsI4x4LumaPredDDR_neon
//Load the top row (4 bytes)
sub r3, r1, r2
vld1.32 {d0[1]}, [r3]
//Load the left column (5 bytes)
sub r3, #1
vld1.8 {d0[3]}, [r3], r2
vld1.8 {d0[2]}, [r3], r2
vld1.8 {d0[2]}, [r3], r2
vld1.8 {d0[1]}, [r3], r2
vld1.8 {d0[0]}, [r3], r2
vld1.8 {d0[0]}, [r3], r2
vld1.8 {d1[7]}, [r3] //For packing the right sequence to do SIMD processing
vext.8 d2, d1, d0, #7 //d0:{L2,L1,L0,LT,T0,T1,T2,T3}
//d2:{L3,L2,L1,L0,LT,T0,T1,T2}
//q2:{L2+L3,L1+L2,L0+L1...T1+T2,T2+T3}
vaddl.u8 q2, d2, d0
//q1:{TL0+LT0,LT0+T01,...L12+L23}
vext.8 q3, q3, q2, #14
vadd.u16 q1, q2, q3
//right shift 2 bits and rounding
vqrshrn.u16 d0, q1, #2
//Adjust the data sequence for setting luma MB of 'pred'
vst1.32 d0[1], [r0]!
vext.8 d0, d0, d0, #7
@ -358,19 +358,19 @@ WELS_ASM_FUNC_BEGIN WelsI4x4LumaPredVL_neon
sub r3, r1, r2
vld1.32 {d0}, [r3]
vext.8 d1, d0, d0, #1
vaddl.u8 q1, d1, d0 //q1:{t0+t1,t1+t2,t2+t3...t5+t6,x,x}
vext.8 q2, q1, q1, #2
vadd.u16 q2, q1, q2 //q2:{t0+t1+t1+t2,t1+t2+t2+t3,...t4+t5+t5+t6,x,x}
//calculate the "vl0,vl1,vl2,vl3,vl4"
vqrshrn.u16 d0, q1, #1
//calculate the "vl5,vl6,vl7,vl8,vl9"
vqrshrn.u16 d1, q2, #2
//Adjust the data sequence for setting the luma MB
vst1.32 d0[0], [r0]!
vst1.32 d1[0], [r0]!
@ -378,7 +378,7 @@ WELS_ASM_FUNC_BEGIN WelsI4x4LumaPredVL_neon
vext.8 d1, d1, d1, #1
vst1.32 d0[0], [r0]!
vst1.32 d1[0], [r0]
WELS_ASM_FUNC_END
@ -387,34 +387,34 @@ WELS_ASM_FUNC_BEGIN WelsI4x4LumaPredVR_neon
//Load the top row (4 bytes)
sub r3, r1, r2
vld1.32 {d0[1]}, [r3]
//Load the left column (4 bytes)
sub r3, #1
vld1.8 {d0[3]}, [r3], r2
vld1.8 {d0[3]}, [r3], r2
vld1.8 {d0[2]}, [r3], r2
vld1.8 {d0[1]}, [r3], r2
vld1.8 {d0[0]}, [r3]
vld1.8 {d0[1]}, [r3], r2
vld1.8 {d0[0]}, [r3]
vext.8 d1, d0, d0, #7
vaddl.u8 q1, d0, d1 //q1:{X,L2+L1,L1+L0,L0+LT,LT+T0,T0+T1,T1+T2,T2+T3}
vext.u8 q2, q1, q1, #14
vadd.u16 q2, q2, q1 //q2:{X,L2+L1+L1+L0,L1+L0+L0+LT,...T1+T2+T2+T3}
//Calculate the vr0 ~ vr9
vqrshrn.u16 d1, q2, #2
vqrshrn.u16 d0, q1, #1
//Adjust the data sequence for setting the luma MB
vst1.32 d0[1], [r0]!
vst1.32 d1[1], [r0]!
//add r2, r0, r1
vst1.8 d1[3], [r0]!
vst1.16 d0[2], [r0]!
vst1.16 d0[2], [r0]!
vst1.8 d0[6], [r0]!
vst1.8 d1[2], [r0]!
vst1.16 d1[2], [r0]!
vst1.16 d1[2], [r0]!
vst1.8 d1[6], [r0]
WELS_ASM_FUNC_END
@ -426,29 +426,29 @@ WELS_ASM_FUNC_BEGIN WelsI4x4LumaPredHU_neon
mov r1, #3
mul r1, r2
add r1, r3
vld1.8 {d0[]}, [r1]
vld1.8 {d0[4]}, [r3], r2
vld1.8 {d0[]}, [r1]
vld1.8 {d0[4]}, [r3], r2
vld1.8 {d0[5]}, [r3], r2
vld1.8 {d0[6]}, [r3], r2 //d0:{L3,L3,L3,L3,L0,L1,L2,L3}
vld1.8 {d0[6]}, [r3], r2 //d0:{L3,L3,L3,L3,L0,L1,L2,L3}
vext.8 d1, d0, d0, #1
vaddl.u8 q2, d0, d1 //q2:{L3+L3,L3+L3,L3+L3,L3+L0,L0+L1,L1+L2,L2+L3,L3+L3}
vaddl.u8 q2, d0, d1 //q2:{L3+L3,L3+L3,L3+L3,L3+L0,L0+L1,L1+L2,L2+L3,L3+L3}
vext.u8 d2, d5, d4, #2
vadd.u16 d3, d2, d5 //d3:{L0+L1+L1+L2,L1+L2+L2+L3,L2+L3+L3+L3,L3+L3+L3+L3}
vadd.u16 d3, d2, d5 //d3:{L0+L1+L1+L2,L1+L2+L2+L3,L2+L3+L3+L3,L3+L3+L3+L3}
//Calculate the hu0 ~ hu5
vqrshrn.u16 d2, q2, #1
vqrshrn.u16 d1, q1, #2
//Adjust the data sequence for setting the luma MB
vzip.8 d2, d1
vst1.32 d1[0], [r0]!
vext.8 d2, d1, d1, #2
vext.8 d2, d1, d1, #2
vst1.32 d2[0], [r0]!
vst1.32 d1[1], [r0]!
vst1.32 d0[0], [r0]
WELS_ASM_FUNC_END
@ -458,22 +458,22 @@ WELS_ASM_FUNC_BEGIN WelsI4x4LumaPredHD_neon
sub r3, r1, r2
sub r3, #1
vld1.32 {d0[1]}, [r3], r2
vld1.8 {d0[3]}, [r3], r2
vld1.8 {d0[3]}, [r3], r2
vld1.8 {d0[2]}, [r3], r2
vld1.8 {d0[1]}, [r3], r2
vld1.8 {d0[1]}, [r3], r2
vld1.8 {d0[0]}, [r3] //d0:{L3,L2,L1,L0,LT,T0,T1,T2}
vext.8 d1, d0, d0, #7
vaddl.u8 q1, d0, d1 //q1:{x,L3+L2,L2+L1,L1+L0,L0+LT,LT+T0,T0+T1,T1+T2}
vext.u8 q2, q1, q1, #14 //q2:{x,x, L3+L2,L2+L1,L1+L0,L0+LT,LT+T0,T0+T1}
vadd.u16 q3, q2, q1 //q3:{x,x,L3+L2+L2+L1,L2+L1+L1+L0,L1+L0+L0+LT,L0+LT+LT+T0,LT+T0+T0+T1,T0+T1+T1+T2}
//Calculate the hd0~hd9
vqrshrn.u16 d1, q3, #2
vqrshrn.u16 d0, q2, #1
//Adjust the data sequence for setting the luma MB
vmov d3, d1
vtrn.8 d0, d1
@ -501,25 +501,25 @@ WELS_ASM_FUNC_BEGIN WelsIChromaPredV_neon
vst1.8 {d0}, [r0]!
vst1.8 {d0}, [r0]!
vst1.8 {d0}, [r0]!
vst1.8 {d0}, [r0]
vst1.8 {d0}, [r0]
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN WelsIChromaPredH_neon
//stmdb sp!, { r2-r5, lr}
////Get the left column (8 byte)
sub r3, r1, #1
vld1.8 {d0[]}, [r3], r2
vld1.8 {d1[]}, [r3], r2
vld1.8 {d2[]}, [r3], r2
vld1.8 {d1[]}, [r3], r2
vld1.8 {d2[]}, [r3], r2
vld1.8 {d3[]}, [r3], r2
vld1.8 {d4[]}, [r3], r2
vld1.8 {d5[]}, [r3], r2
vld1.8 {d6[]}, [r3], r2
vld1.8 {d5[]}, [r3], r2
vld1.8 {d6[]}, [r3], r2
vld1.8 {d7[]}, [r3]
//Set the chroma MB using left column data
//Set the chroma MB using left column data
vst1.8 {d0}, [r0]!
vst1.8 {d1}, [r0]!
vst1.8 {d2}, [r0]!
@ -527,8 +527,8 @@ WELS_ASM_FUNC_BEGIN WelsIChromaPredH_neon
vst1.8 {d4}, [r0]!
vst1.8 {d5}, [r0]!
vst1.8 {d6}, [r0]!
vst1.8 {d7}, [r0]
vst1.8 {d7}, [r0]
WELS_ASM_FUNC_END
@ -536,36 +536,36 @@ WELS_ASM_FUNC_BEGIN WelsIChromaPredDc_neon
//stmdb sp!, { r2-r5, lr}
//Load the left column data (8 bytes)
sub r3, r1, #1
GET_8BYTE_DATA d0, r3, r2
GET_8BYTE_DATA d0, r3, r2
//Load the top row data (8 bytes)
sub r3, r1, r2
sub r3, r1, r2
vldr d1, [r3]
//Calculate the sum of left column and top row
vpaddl.u8 q0, q0
vpaddl.u16 q0, q0
vadd.u32 d2, d0, d1 //'m1' save to d2
vrshr.u32 q0, q0, #2 //calculate 'm2','m3'
vrshr.u32 d2, d2, #3 //calculate 'm4'
vrshr.u32 q0, q0, #2 //calculate 'm2','m3'
vrshr.u32 d2, d2, #3 //calculate 'm4'
//duplicate the 'mx' to a vector line
vdup.8 d4, d2[0]
vdup.8 d5, d1[4]
vdup.8 d6, d0[4]
vdup.8 d7, d2[4]
//Set the chroma MB
//Set the chroma MB
vst2.32 {d4[0],d5[0]}, [r0]!
vst2.32 {d4[0],d5[0]}, [r0]!
vst2.32 {d4[0],d5[0]}, [r0]!
vst2.32 {d4[0],d5[0]}, [r0]!
vst2.32 {d4[0],d5[0]}, [r0]!
vst2.32 {d6[0],d7[0]}, [r0]!
vst2.32 {d6[0],d7[0]}, [r0]!
vst2.32 {d6[0],d7[0]}, [r0]!
vst2.32 {d6[0],d7[0]}, [r0]
WELS_ASM_FUNC_END
@ -579,36 +579,36 @@ WELS_ASM_FUNC_BEGIN WelsIChromaPredPlane_neon
//Load the top row data
sub r3, r1, #1
sub r3, r2
vld1.32 {d1[0]}, [r3]
vld1.32 {d1[0]}, [r3]
add r3, #5
vld1.32 {d0[0]}, [r3]
//Load the left column data
sub r3, #5
vld1.8 {d1[4]}, [r3], r2
vld1.8 {d1[5]}, [r3], r2
vld1.8 {d1[5]}, [r3], r2
vld1.8 {d1[6]}, [r3], r2
vld1.8 {d1[7]}, [r3], r2 //d1:{LT,T0,T1,T2,LT,L0,L1,L2}
vld1.8 {d1[7]}, [r3], r2 //d1:{LT,T0,T1,T2,LT,L0,L1,L2}
add r3, r2
vld1.8 {d0[4]}, [r3], r2
vld1.8 {d0[5]}, [r3], r2
vld1.8 {d0[6]}, [r3], r2
vld1.8 {d0[7]}, [r3] //d0:{T4,T5,T6,T7,L4,L5,L6.L7}
//Save T7 to d3 for next step
vdup.u8 d3, d0[3]
//Save L7 to d4 for next step
vdup.u8 d4, d0[7]
//Calculate the value of 'a' and save to q2
vaddl.u8 q2, d3, d4
vshl.u16 q2, #4
//Load the table {{1,2,3,4,1,2,3,4}*17}
adr r3, CONST0_GET_I_CHROMA_PRED_PLANE
vld1.32 {d2}, [r3]
//Calculate the 'b','c', and save to q0
vrev32.8 d1, d1
vsubl.u8 q0, d0, d1
@ -617,32 +617,32 @@ WELS_ASM_FUNC_BEGIN WelsIChromaPredPlane_neon
vpaddl.s16 q0, q0
vpaddl.s32 q0, q0
vrshr.s64 q0, #5
//Load the table {-3,-2,-1,0,1,2,3,4} to q3
adr r3, CONST1_GET_I_CHROMA_PRED_PLANE
vld1.32 {d6, d7}, [r3]
//Duplicate the 'b','c' to q0, q1 for SIMD instruction
vdup.s16 q1, d1[0]
vdup.s16 q0, d0[0]
//Calculate the "(a + b * (j - 3) + c * (- 3) + 16) >> 5;"
vmla.s16 q2, q0, q3
vmla.s16 q2, q1, d6[0]
vqrshrun.s16 d0, q2, #5
//Set a line of chroma MB
vst1.u32 {d0}, [r0]!
//Do the same processing for each line.
mov r3, #7
loop_0_get_i_chroma_pred_plane:
loop_0_get_i_chroma_pred_plane:
vadd.s16 q2, q1
vqrshrun.s16 d0, q2, #5
vst1.u32 {d0}, [r0]!
subs r3, #1
bne loop_0_get_i_chroma_pred_plane
bne loop_0_get_i_chroma_pred_plane
WELS_ASM_FUNC_END
#endif

388
codec/encoder/core/arm/intra_pred_sad_3_opt_neon.S Executable file → Normal file
View File

@ -29,14 +29,14 @@
* POSSIBILITY OF SUCH DAMAGE.
*
*/
#ifdef HAVE_NEON
.text
#include "arm_arch_common_macro.S"
#ifdef APPLE_IOS
//The data sequence will be used
//The data sequence will be used
.macro GET_8BYTE_DATA_L0
vld1.8 {$0[0]}, [$1], $2
vld1.8 {$0[1]}, [$1], $2
@ -49,7 +49,7 @@
.endm
.macro HDM_TRANSFORM_4X4_L0
.macro HDM_TRANSFORM_4X4_L0
//Do the vertical transform
vaddl.u8 q0, $0, $1 //{0,4,8,12,1,5,9,13}
@ -57,15 +57,15 @@
vswp d1, d2
vadd.s16 q2, q0, q1 //{0,1,2,3,4,5,6,7}
vsub.s16 q1, q0, q1 //{12,13,14,15,8,9,10,11}
//Do the horizontal transform
vtrn.32 q2, q1
vadd.s16 q0, q2, q1
vsub.s16 q1, q2, q1
vtrn.16 q0, q1
vadd.s16 q2, q0, q1
vsub.s16 q1, q0, q1
vsub.s16 q1, q0, q1
vmov.s16 d0, d4
vmov.s16 d1, d2
@ -76,9 +76,9 @@
vtrn.32 d0, d1 //{0,1,3,2}
vaba.s16 $5, d0, $2 //16x16_v
vaba.s16 $5, d1, $8
vaba.s16 $5, d5, $8
vaba.s16 $5, d5, $8
vadd.u16 $5, d3
//16x16_h
vtrn.16 d4, d5 //{0,4,12,8}
vaba.s16 $6, d4, $3 //16x16_h
@ -87,7 +87,7 @@
vadd.u16 d2, d3
vadd.u16 d2, d5
vadd.u16 $6, d2
//16x16_dc_both
vaba.s16 $7, d4, $4 //16x16_dc_both
vadd.u16 $7, d2
@ -95,7 +95,7 @@
.endm
#else
//The data sequence will be used
//The data sequence will be used
.macro GET_8BYTE_DATA_L0 arg0, arg1, arg2
vld1.8 {\arg0[0]}, [\arg1], \arg2
vld1.8 {\arg0[1]}, [\arg1], \arg2
@ -115,15 +115,15 @@
vswp d1, d2
vadd.s16 q2, q0, q1 //{0,1,2,3,4,5,6,7}
vsub.s16 q1, q0, q1 //{12,13,14,15,8,9,10,11}
//Do the horizontal transform
vtrn.32 q2, q1
vadd.s16 q0, q2, q1
vsub.s16 q1, q2, q1
vtrn.16 q0, q1
vadd.s16 q2, q0, q1
vsub.s16 q1, q0, q1
vsub.s16 q1, q0, q1
vmov.s16 d0, d4
vmov.s16 d1, d2
@ -134,9 +134,9 @@
vtrn.32 d0, d1 //{0,1,3,2}
vaba.s16 \arg5, d0, \arg2 //16x16_v
vaba.s16 \arg5, d1, \arg8
vaba.s16 \arg5, d5, \arg8
vaba.s16 \arg5, d5, \arg8
vadd.u16 \arg5, d3
//16x16_h
vtrn.16 d4, d5 //{0,4,12,8}
vaba.s16 \arg6, d4, \arg3 //16x16_h
@ -145,42 +145,42 @@
vadd.u16 d2, d3
vadd.u16 d2, d5
vadd.u16 \arg6, d2
//16x16_dc_both
vaba.s16 \arg7, d4, \arg4 //16x16_dc_both
vadd.u16 \arg7, d2
.endm
#endif
WELS_ASM_FUNC_BEGIN satd_intra_16x16_x3_opt_neon
WELS_ASM_FUNC_BEGIN WelsIntra16x16Combined3Satd_neon
stmdb sp!, {r4-r7, lr}
//Get the top line data to 'q15'(16 bytes)
sub r7, r0, r1
vld1.8 {q15}, [r7]
//Get the left column data to 'q14' (16 bytes)
sub r7, r0, #1
GET_8BYTE_DATA_L0 d28, r7, r1
GET_8BYTE_DATA_L0 d29, r7, r1
GET_8BYTE_DATA_L0 d29, r7, r1
//Calculate the mean value and save to 'q13->d27(reserve the d26)' (2 bytes)
//Calculate the 16x16_dc_both mode SATD
//Calculate the 16x16_dc_both mode SATD
vaddl.u8 q0, d30, d31
vaddl.u8 q1, d28, d29
vadd.u16 q0, q1
vadd.u16 d0, d1
vpaddl.u16 d0, d0
vpaddl.u32 d0, d0
//Calculate the mean value
//Calculate the mean value
vrshr.u16 d0, #5
vshl.u16 d27, d0, #4
vshl.u16 d27, d0, #4
//Calculate the 16x16_v mode SATD and save to "q11, 12"
vshll.u8 q0, d30, #2
vshll.u8 q1, d31, #2
vshll.u8 q1, d31, #2
vtrn.32 q0, q1
vadd.s16 q2, q0, q1
vsub.s16 q1, q0, q1
@ -191,7 +191,7 @@ WELS_ASM_FUNC_BEGIN satd_intra_16x16_x3_opt_neon
//{8,9,11,10, 12,13,15,14} q11
//Calculate the 16x16_h mode SATD and save to "q9, q10"
vshll.u8 q0, d28, #2
vshll.u8 q1, d29, #2
vshll.u8 q1, d29, #2
vtrn.32 q0, q1
vadd.s16 q2, q0, q1
vsub.s16 q1, q0, q1
@ -199,64 +199,64 @@ WELS_ASM_FUNC_BEGIN satd_intra_16x16_x3_opt_neon
vadd.s16 q10, q2, q1
vsub.s16 q9, q2, q1
vtrn.32 q10, q9 //{0,1,3,2, 4,5,7,6} q10
//{8,9,11,10, 12,13,15,14} q9
//{8,9,11,10, 12,13,15,14} q9
vmov.i32 d17, #0//Save the SATD of DC_BOTH
vmov.i32 d16, #0//Save the SATD of H
vmov.i32 d15, #0//Save the SATD of V
vmov.i32 d14, #0//For zero D register
//Load the p_enc data and save to "q3 ~ q6"--- 16X4 bytes
//Load the p_enc data and save to "q3 ~ q6"--- 16X4 bytes
vld1.32 {q3}, [r2], r3
vld1.32 {q4}, [r2], r3
vld1.32 {q5}, [r2], r3
vld1.32 {q6}, [r2], r3
vld1.32 {q6}, [r2], r3
vtrn.32 q3, q4
vtrn.32 q5, q6
HDM_TRANSFORM_4X4_L0 d6, d10, d24, d20, d27, d15, d16, d17, d14
vtrn.32 q5, q6
HDM_TRANSFORM_4X4_L0 d6, d10, d24, d20, d27, d15, d16, d17, d14
HDM_TRANSFORM_4X4_L0 d7, d11, d22, d20, d27, d15, d16, d17, d14
HDM_TRANSFORM_4X4_L0 d8, d12, d25, d20, d27, d15, d16, d17, d14
HDM_TRANSFORM_4X4_L0 d9, d13, d23, d20, d27, d15, d16, d17, d14
HDM_TRANSFORM_4X4_L0 d9, d13, d23, d20, d27, d15, d16, d17, d14
//Load the p_enc data and save to "q3 ~ q6"--- 16X4 bytes
//Load the p_enc data and save to "q3 ~ q6"--- 16X4 bytes
vld1.32 {q3}, [r2], r3
vld1.32 {q4}, [r2], r3
vld1.32 {q5}, [r2], r3
vld1.32 {q6}, [r2], r3
vld1.32 {q6}, [r2], r3
vtrn.32 q3, q4
vtrn.32 q5, q6
HDM_TRANSFORM_4X4_L0 d6, d10, d24, d21, d27, d15, d16, d17, d14
vtrn.32 q5, q6
HDM_TRANSFORM_4X4_L0 d6, d10, d24, d21, d27, d15, d16, d17, d14
HDM_TRANSFORM_4X4_L0 d7, d11, d22, d21, d27, d15, d16, d17, d14
HDM_TRANSFORM_4X4_L0 d8, d12, d25, d21, d27, d15, d16, d17, d14
HDM_TRANSFORM_4X4_L0 d9, d13, d23, d21, d27, d15, d16, d17, d14
//Load the p_enc data and save to "q3 ~ q6"--- 16X4 bytes
HDM_TRANSFORM_4X4_L0 d9, d13, d23, d21, d27, d15, d16, d17, d14
//Load the p_enc data and save to "q3 ~ q6"--- 16X4 bytes
vld1.32 {q3}, [r2], r3
vld1.32 {q4}, [r2], r3
vld1.32 {q5}, [r2], r3
vld1.32 {q6}, [r2], r3
vld1.32 {q6}, [r2], r3
vtrn.32 q3, q4
vtrn.32 q5, q6
HDM_TRANSFORM_4X4_L0 d6, d10, d24, d18, d27, d15, d16, d17, d14
vtrn.32 q5, q6
HDM_TRANSFORM_4X4_L0 d6, d10, d24, d18, d27, d15, d16, d17, d14
HDM_TRANSFORM_4X4_L0 d7, d11, d22, d18, d27, d15, d16, d17, d14
HDM_TRANSFORM_4X4_L0 d8, d12, d25, d18, d27, d15, d16, d17, d14
HDM_TRANSFORM_4X4_L0 d9, d13, d23, d18, d27, d15, d16, d17, d14
//Load the p_enc data and save to "q3 ~ q6"--- 16X4 bytes
HDM_TRANSFORM_4X4_L0 d9, d13, d23, d18, d27, d15, d16, d17, d14
//Load the p_enc data and save to "q3 ~ q6"--- 16X4 bytes
vld1.32 {q3}, [r2], r3
vld1.32 {q4}, [r2], r3
vld1.32 {q5}, [r2], r3
vld1.32 {q6}, [r2], r3
vld1.32 {q6}, [r2], r3
vtrn.32 q3, q4
vtrn.32 q5, q6
HDM_TRANSFORM_4X4_L0 d6, d10, d24, d19, d27, d15, d16, d17, d14
vtrn.32 q5, q6
HDM_TRANSFORM_4X4_L0 d6, d10, d24, d19, d27, d15, d16, d17, d14
HDM_TRANSFORM_4X4_L0 d7, d11, d22, d19, d27, d15, d16, d17, d14
HDM_TRANSFORM_4X4_L0 d8, d12, d25, d19, d27, d15, d16, d17, d14
HDM_TRANSFORM_4X4_L0 d9, d13, d23, d19, d27, d15, d16, d17, d14
HDM_TRANSFORM_4X4_L0 d9, d13, d23, d19, d27, d15, d16, d17, d14
//Get the data from stack
ldr r5, [sp, #20] //the addr of Best_mode
ldr r6, [sp, #24] //the value of i_lambda
@ -266,19 +266,19 @@ WELS_ASM_FUNC_BEGIN satd_intra_16x16_x3_opt_neon
vpaddl.u16 d15, d15
vpaddl.u32 d15, d15
vmov.u32 r0, d15[0]
//vadd.u16 d22, d23
vrshr.u16 d16, #1
vpaddl.u16 d16, d16
vpaddl.u32 d16, d16
vmov.u32 r1, d16[0]
vmov.u32 r1, d16[0]
add r1, r6, lsl #1
//vadd.u16 d20, d21
vrshr.u16 d17, #1
vpaddl.u16 d17, d17
vpaddl.u32 d17, d17
vmov.u32 r2, d17[0]
vmov.u32 r2, d17[0]
add r2, r6, lsl #1
mov r4, #0
@ -295,60 +295,60 @@ WELS_ASM_FUNC_BEGIN satd_intra_16x16_x3_opt_neon
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN sad_intra_16x16_x3_opt_neon
WELS_ASM_FUNC_BEGIN WelsIntra16x16Combined3Sad_neon
stmdb sp!, {r4-r7, lr}
//Get the top line data to 'q15'(16 bytes)
sub r4, r0, r1
vld1.8 {q15}, [r4]
//Get the left column data to 'q14' (16 bytes)
sub r4, r0, #1
GET_8BYTE_DATA_L0 d28, r4, r1
GET_8BYTE_DATA_L0 d29, r4, r1
GET_8BYTE_DATA_L0 d29, r4, r1
//Calculate the mean value and save to 'q13' (8 bytes)
//Calculate the 16x16_dc_both mode SATD
//Calculate the 16x16_dc_both mode SATD
vaddl.u8 q0, d30, d31
vaddl.u8 q1, d28, d29
vadd.u16 q0, q1
vadd.u16 d0, d1
vpaddl.u16 d0, d0
vpaddl.u32 d0, d0
//Calculate the mean value
//Calculate the mean value
vrshr.u16 d0, d0, #5
vdup.8 q13, d0[0]
sub r4, r0, #1
vmov.i32 q12, #0//Save the SATD of DC_BOTH
vmov.i32 q11, #0//Save the SATD of H
vmov.i32 q10, #0//Save the SATD of V
mov lr, #16
sad_intra_16x16_x3_opt_loop0:
//Get the left colume data to 'd0' (16 bytes)
vld1.8 {d0[]}, [r4], r1
vld1.8 {d0[]}, [r4], r1
//Load the p_enc data and save to "q1 ~ q2"--- 16X4 bytes
//Load the p_enc data and save to "q1 ~ q2"--- 16X4 bytes
vld1.8 {q1}, [r2], r3
subs lr, #1
//Do the SAD for top colume
vabal.u8 q12, d30, d2
vabal.u8 q12, d31, d3
vabal.u8 q12, d31, d3
//Do the SAD for left colume
vabal.u8 q11, d0, d2
vabal.u8 q11, d0, d3
vabal.u8 q11, d0, d3
//Do the SAD for mean value
vabal.u8 q10, d26, d2
vabal.u8 q10, d26, d3
vabal.u8 q10, d26, d3
bne sad_intra_16x16_x3_opt_loop0
//Get the data from stack
ldr r5, [sp, #20] //the addr of Best_mode
ldr r6, [sp, #24] //the value of i_lambda
@ -357,19 +357,19 @@ sad_intra_16x16_x3_opt_loop0:
vpaddl.u16 d24, d24
vpaddl.u32 d24, d24
vmov.u32 r0, d24[0]
vadd.u16 d22, d23
vpaddl.u16 d22, d22
vpaddl.u32 d22, d22
vmov.u32 r1, d22[0]
vmov.u32 r1, d22[0]
add r1, r6, lsl #1
vadd.u16 d20, d21
vpaddl.u16 d20, d20
vpaddl.u32 d20, d20
vmov.u32 r2, d20[0]
vmov.u32 r2, d20[0]
add r2, r6, lsl #1
mov r4, #0
cmp r1, r0
movcc r0, r1
@ -384,120 +384,120 @@ sad_intra_16x16_x3_opt_loop0:
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN sad_intra_8x8_x3_opt_neon
WELS_ASM_FUNC_BEGIN WelsIntra8x8Combined3Sad_neon
stmdb sp!, {r4-r7, lr}
//Get the data from stack
ldr r4, [sp, #32] //p_dec_cr
ldr r5, [sp, #36] //p_enc_cr
//Get the left column data to 'd28(cb), d30(cr)' (16 bytes)
sub r6, r0, #1
GET_8BYTE_DATA_L0 d28, r6, r1
sub r6, r4, #1
GET_8BYTE_DATA_L0 d30, r6, r1
sub r6, r4, #1
GET_8BYTE_DATA_L0 d30, r6, r1
//Get the top line data to 'd29(cb), d31(cr)'(16 bytes)
sub r6, r0, r1
vld1.8 {d29}, [r6]
sub r6, r4, r1
vld1.8 {d31}, [r6]
//Calculate the sum of left column and top row
vmov.i32 q0, q14
vpaddl.u8 q0, q0
vpaddl.u16 q0, q0
vadd.u32 d2, d0, d1 //'m1' save to d2
vrshr.u32 q0, q0, #2 //calculate 'm2','m3'
vrshr.u32 d2, d2, #3 //calculate 'm4'
//duplicate the 'mx' to a vector line
vrshr.u32 q0, q0, #2 //calculate 'm2','m3'
vrshr.u32 d2, d2, #3 //calculate 'm4'
//duplicate the 'mx' to a vector line
vdup.8 d27, d2[0]
vdup.8 d26, d1[4]
vtrn.32 d27, d26
vdup.8 d26, d0[4]
vdup.8 d25, d2[4]
vtrn.32 d26, d25 //Save to "d27, d26"
vmov.i32 q0, q15
vpaddl.u8 q0, q0
vpaddl.u16 q0, q0
vadd.u32 d2, d0, d1 //'m1' save to d2
vrshr.u32 q0, q0, #2 //calculate 'm2','m3'
vrshr.u32 d2, d2, #3 //calculate 'm4'
vrshr.u32 q0, q0, #2 //calculate 'm2','m3'
vrshr.u32 d2, d2, #3 //calculate 'm4'
//duplicate the 'mx' to a vector line
vdup.8 d25, d2[0]
vdup.8 d24, d1[4]
vtrn.32 d25, d24
vdup.8 d24, d0[4]
vdup.8 d23, d2[4]
vtrn.32 d24, d23 //Save to "d25, d24"
vmov.i32 q11, #0//Save the SATD of DC_BOTH
vmov.i32 q10, #0//Save the SATD of H
vmov.i32 q9 , #0//Save the SATD of V
sub r6, r0, #1
sub r7, r4, #1
sub r7, r4, #1
mov lr, #4
sad_intra_8x8_x3_opt_loop0:
//Load the p_enc data and save to "q1 ~ q2"--- 16X4 bytes
//Load the p_enc data and save to "q1 ~ q2"--- 16X4 bytes
vld1.8 {d0}, [r2], r3
vld1.8 {d1}, [r5], r3
//Get the left colume data to 'd0' (16 bytes)
vld1.8 {d2[]}, [r6], r1
vld1.8 {d3[]}, [r7], r1
vld1.8 {d2[]}, [r6], r1
vld1.8 {d3[]}, [r7], r1
subs lr, #1
//Do the SAD for top colume
vabal.u8 q11, d29, d0
vabal.u8 q11, d31, d1
vabal.u8 q11, d29, d0
vabal.u8 q11, d31, d1
//Do the SAD for left colume
vabal.u8 q10, d2, d0
vabal.u8 q10, d3, d1
vabal.u8 q10, d3, d1
//Do the SAD for mean value
vabal.u8 q9, d27, d0
vabal.u8 q9, d25, d1
vabal.u8 q9, d25, d1
bne sad_intra_8x8_x3_opt_loop0
mov lr, #4
sad_intra_8x8_x3_opt_loop1:
//Load the p_enc data and save to "q1 ~ q2"--- 16X4 bytes
//Load the p_enc data and save to "q1 ~ q2"--- 16X4 bytes
vld1.8 {d0}, [r2], r3
vld1.8 {d1}, [r5], r3
//Get the left colume data to 'd0' (16 bytes)
vld1.8 {d2[]}, [r6], r1
vld1.8 {d3[]}, [r7], r1
vld1.8 {d2[]}, [r6], r1
vld1.8 {d3[]}, [r7], r1
subs lr, #1
//Do the SAD for top colume
vabal.u8 q11, d29, d0
vabal.u8 q11, d31, d1
vabal.u8 q11, d29, d0
vabal.u8 q11, d31, d1
//Do the SAD for left colume
vabal.u8 q10, d2, d0
vabal.u8 q10, d3, d1
vabal.u8 q10, d3, d1
//Do the SAD for mean value
vabal.u8 q9, d26, d0
vabal.u8 q9, d24, d1
bne sad_intra_8x8_x3_opt_loop1
vabal.u8 q9, d24, d1
bne sad_intra_8x8_x3_opt_loop1
//Get the data from stack
ldr r5, [sp, #20] //the addr of Best_mode
ldr r6, [sp, #24] //the value of i_lambda
@ -505,13 +505,13 @@ sad_intra_8x8_x3_opt_loop1:
vadd.u16 d22, d23
vpaddl.u16 d22, d22
vpaddl.u32 d22, d22
vmov.u32 r0, d22[0]
vmov.u32 r0, d22[0]
add r0, r6, lsl #1
vadd.u16 d20, d21
vpaddl.u16 d20, d20
vpaddl.u32 d20, d20
vmov.u32 r1, d20[0]
vmov.u32 r1, d20[0]
add r1, r6, lsl #1
vadd.u16 d18, d19
@ -533,28 +533,28 @@ sad_intra_8x8_x3_opt_loop1:
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN satd_intra_8x8_x3_opt_neon
WELS_ASM_FUNC_BEGIN WelsIntra8x8Combined3Satd_neon
stmdb sp!, {r4-r7, lr}
//Get the data from stack
ldr r4, [sp, #32] //p_dec_cr
ldr r5, [sp, #36] //p_enc_cr
//Get the top line data to 'd29(cb), d31(cr)'(16 bytes)
sub r6, r0, r1
vld1.8 {d29}, [r6]
sub r6, r4, r1
vld1.8 {d31}, [r6]
//Get the left column data to 'd28(cb), d30(cr)' (16 bytes)
sub r6, r0, #1
GET_8BYTE_DATA_L0 d28, r6, r1
sub r6, r4, #1
GET_8BYTE_DATA_L0 d30, r6, r1
sub r6, r4, #1
GET_8BYTE_DATA_L0 d30, r6, r1
//Calculate the 16x16_v mode SATD and save to "q12, 13"
vshll.u8 q0, d29, #2
vshll.u8 q1, d31, #2
vshll.u8 q1, d31, #2
vtrn.32 q0, q1
vadd.s16 q2, q0, q1
vsub.s16 q1, q0, q1
@ -565,7 +565,7 @@ WELS_ASM_FUNC_BEGIN satd_intra_8x8_x3_opt_neon
//{8,9,11,10, 12,13,15,14} q12
//Calculate the 16x16_h mode SATD and save to "q10, q11"
vshll.u8 q0, d28, #2
vshll.u8 q1, d30, #2
vshll.u8 q1, d30, #2
vtrn.32 q0, q1
vadd.s16 q2, q0, q1
vsub.s16 q1, q0, q1
@ -573,69 +573,69 @@ WELS_ASM_FUNC_BEGIN satd_intra_8x8_x3_opt_neon
vadd.s16 q11, q2, q1
vsub.s16 q10, q2, q1
vtrn.32 q11, q10 //{0,1,3,2, 4,5,7,6} q11
//{8,9,11,10, 12,13,15,14} q10
//{8,9,11,10, 12,13,15,14} q10
//Calculate the sum of left column and top row
//vmov.i32 q0, q14
vpaddl.u8 q0, q14
vpaddl.u16 q0, q0
vadd.u32 d2, d0, d1
vadd.u32 d2, d0, d1
vpaddl.u8 q2, q15
vpaddl.u16 q2, q2
vadd.u32 d3, d4, d5
vadd.u32 d3, d4, d5
vtrn.32 q0, q2
vrshr.u32 q1, #3
vrshr.u32 q2, #2
vrshr.u32 q2, #2
vshll.u32 q9, d4, #4 // {2cb, 2cr} q9
vshll.u32 q8, d5, #4 // {1cb, 1cr} q8
vshll.u32 q7, d2, #4 // {0cb, 3cb} q7
vshll.u32 q6, d3, #4 // {0cr, 3cr} q6
vmov.i32 d28, #0//Save the SATD of DC_BOTH
vmov.i32 d10, #0//Save the SATD of H
vmov.i32 d11, #0//Save the SATD of V
vmov.i32 d30, #0//For zero D register
//Load the p_enc data and save to "q3 ~ q4"--- 8X4 bytes
//Load the p_enc data and save to "q3 ~ q4"--- 8X4 bytes
vld1.32 {d6}, [r2], r3
vld1.32 {d7}, [r2], r3
vld1.32 {d8}, [r2], r3
vld1.32 {d9}, [r2], r3
vld1.32 {d9}, [r2], r3
vtrn.32 d6, d7
vtrn.32 d8, d9
vtrn.32 d8, d9
HDM_TRANSFORM_4X4_L0 d6, d8, d26, d22, d14, d11, d10, d28, d30
HDM_TRANSFORM_4X4_L0 d7, d9, d27, d22, d16, d11, d10, d28, d30
vld1.32 {d6}, [r5], r3
vld1.32 {d7}, [r5], r3
vld1.32 {d8}, [r5], r3
vld1.32 {d9}, [r5], r3
vld1.32 {d9}, [r5], r3
vtrn.32 d6, d7
vtrn.32 d8, d9
vtrn.32 d8, d9
HDM_TRANSFORM_4X4_L0 d6, d8, d24, d20, d12, d11, d10, d28, d30
HDM_TRANSFORM_4X4_L0 d7, d9, d25, d20, d17, d11, d10, d28, d30
HDM_TRANSFORM_4X4_L0 d7, d9, d25, d20, d17, d11, d10, d28, d30
//Load the p_enc data and save to "q3 ~ q4"--- 8X4 bytes
//Load the p_enc data and save to "q3 ~ q4"--- 8X4 bytes
vld1.32 {d6}, [r2], r3
vld1.32 {d7}, [r2], r3
vld1.32 {d8}, [r2], r3
vld1.32 {d9}, [r2], r3
vld1.32 {d9}, [r2], r3
vtrn.32 d6, d7
vtrn.32 d8, d9
HDM_TRANSFORM_4X4_L0 d6, d8, d26, d23, d18, d11, d10, d28, d30
vtrn.32 d8, d9
HDM_TRANSFORM_4X4_L0 d6, d8, d26, d23, d18, d11, d10, d28, d30
HDM_TRANSFORM_4X4_L0 d7, d9, d27, d23, d15, d11, d10, d28, d30
vld1.32 {d6}, [r5], r3
vld1.32 {d7}, [r5], r3
vld1.32 {d8}, [r5], r3
vld1.32 {d9}, [r5], r3
vld1.32 {d9}, [r5], r3
vtrn.32 d6, d7
vtrn.32 d8, d9
vtrn.32 d8, d9
HDM_TRANSFORM_4X4_L0 d6, d8, d24, d21, d19, d11, d10, d28, d30
HDM_TRANSFORM_4X4_L0 d7, d9, d25, d21, d13, d11, d10, d28, d30
HDM_TRANSFORM_4X4_L0 d7, d9, d25, d21, d13, d11, d10, d28, d30
//Get the data from stack
ldr r5, [sp, #20] //the addr of Best_mode
ldr r6, [sp, #24] //the value of i_lambda
@ -643,13 +643,13 @@ WELS_ASM_FUNC_BEGIN satd_intra_8x8_x3_opt_neon
vrshr.u16 d11, #1
vpaddl.u16 d11, d11
vpaddl.u32 d11, d11
vmov.u32 lr, d11[0]
vmov.u32 lr, d11[0]
add lr, r6, lsl #1
vrshr.u16 d10, #1
vpaddl.u16 d10, d10
vpaddl.u32 d10, d10
vmov.u32 r3, d10[0]
vmov.u32 r3, d10[0]
add r3, r6, lsl #1
vrshr.u16 d28, #1
@ -672,31 +672,31 @@ WELS_ASM_FUNC_BEGIN satd_intra_8x8_x3_opt_neon
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN satd_intra_4x4_x3_opt_neon
WELS_ASM_FUNC_BEGIN WelsIntra4x4Combined3Satd_neon
stmdb sp!, {r4-r7, lr}
//Get the top line data to 'd31[0~3]'(4 bytes)
sub r7, r0, r1
vld1.32 {d31[0]}, [r7]
//Get the left column data to 'd31[4~7]' (4 bytes)
sub r7, r0, #1
vld1.8 {d31[4]}, [r7], r1
vld1.8 {d31[5]}, [r7], r1
vld1.8 {d31[6]}, [r7], r1
vld1.8 {d31[7]}, [r7], r1
//Calculate the mean value and save to 'd30' (2 bytes)
vpaddl.u8 d0, d31
vpaddl.u16 d0, d0
vpaddl.u32 d0, d0
//Calculate the mean value
vpaddl.u32 d0, d0
//Calculate the mean value
vrshr.u16 d0, #3
vshl.u16 d30, d0, #4
vshl.u16 d30, d0, #4
//Calculate the 16x16_v mode SATD and save to "d29"
//Calculate the 16x16_h mode SATD and save to "d28"
vshll.u8 q0, d31, #2
//Calculate the 16x16_h mode SATD and save to "d28"
vshll.u8 q0, d31, #2
vtrn.32 d0, d1
vadd.s16 d2, d0, d1
vsub.s16 d1, d0, d1
@ -710,12 +710,12 @@ WELS_ASM_FUNC_BEGIN satd_intra_4x4_x3_opt_neon
vmov.i32 d26, #0//Save the SATD of H
vmov.i32 d25, #0//Save the SATD of V
vmov.i32 d24, #0//For zero D register
//Load the p_enc data and save to "d22,d23"--- 4X4 bytes
//Load the p_enc data and save to "d22,d23"--- 4X4 bytes
vld1.32 {d23[0]}, [r2], r3
vld1.32 {d23[1]}, [r2], r3
vld1.32 {d22[0]}, [r2], r3
vld1.32 {d22[1]}, [r2], r3
vld1.32 {d22[1]}, [r2], r3
HDM_TRANSFORM_4X4_L0 d23, d22, d29, d28, d30, d25, d26, d27, d24
@ -723,17 +723,17 @@ WELS_ASM_FUNC_BEGIN satd_intra_4x4_x3_opt_neon
ldr r5, [sp, #28] //the value of lambda2
ldr r6, [sp, #32] //the value of lambda1
ldr r7, [sp, #36] //the value of lambda0
vrshr.u16 d25, #1
vpaddl.u16 d25, d25
vpaddl.u32 d25, d25
vmov.u32 r0, d25[0]
vmov.u32 r0, d25[0]
add r0, r7
vrshr.u16 d26, #1
vpaddl.u16 d26, d26
vpaddl.u32 d26, d26
vmov.u32 r1, d26[0]
vmov.u32 r1, d26[0]
add r1, r6
vrshr.u16 d27, #1
@ -741,10 +741,10 @@ WELS_ASM_FUNC_BEGIN satd_intra_4x4_x3_opt_neon
vpaddl.u32 d27, d27
vmov.u32 r2, d27[0]
add r2, r5
ldr r5, [sp, #20] //p_dst
ldr r6, [sp, #24] //the addr of Best_mode
ldr r6, [sp, #24] //the addr of Best_mode
mov r4, r0
cmp r1, r4
movcc r4, r1
@ -770,8 +770,8 @@ satd_intra_4x4_x3_opt_jump0:
vdup.8 d0, d31[4]
vdup.8 d1, d31[5]
vdup.8 d2, d31[6]
vdup.8 d3, d31[7]
vst4.32 {d0[0],d1[0],d2[0],d3[0]}, [r5]
vdup.8 d3, d31[7]
vst4.32 {d0[0],d1[0],d2[0],d3[0]}, [r5]
bl satd_intra_4x4_x3_opt_end
satd_intra_4x4_x3_opt_jump1:
@ -783,11 +783,11 @@ satd_intra_4x4_x3_opt_jump1:
vst1.32 {d31[0]}, [r5]!
vst1.32 {d31[0]}, [r5]!
satd_intra_4x4_x3_opt_end:
mov r0, r4
mov r0, r4
ldmia sp!, {r4-r7, lr}
WELS_ASM_FUNC_END
#endif
#endif

3926
codec/encoder/core/arm/mc_neon.S Executable file → Normal file

File diff suppressed because it is too large Load Diff

2
codec/encoder/core/arm/memory_neon.S Executable file → Normal file
View File

@ -60,4 +60,4 @@ mem_zero_24_neon_start:
vst1.64 {d0}, [r0]!
WELS_ASM_FUNC_END
#endif
#endif

318
codec/encoder/core/arm/pixel_neon.S Executable file → Normal file
View File

@ -35,73 +35,73 @@
#include "arm_arch_common_macro.S"
.macro SATD_16x4
vld1.64 {q0}, [r0,:128], r1
vld1.64 {q1}, [r2], r3
vld1.64 {q0}, [r0,:128], r1
vld1.64 {q1}, [r2], r3
vsubl.u8 q4, d0, d2
vld1.64 {q2}, [r0,:128], r1
vsubl.u8 q4, d0, d2
vld1.64 {q2}, [r0,:128], r1
vsubl.u8 q6, d1, d3
vld1.64 {q3}, [r2], r3
vsubl.u8 q6, d1, d3
vld1.64 {q3}, [r2], r3
vsubl.u8 q5, d4, d6
vld1.64 {q0}, [r0,:128], r1
vsubl.u8 q5, d4, d6
vld1.64 {q0}, [r0,:128], r1
vsubl.u8 q7, d5, d7
vsubl.u8 q7, d5, d7
vld1.64 {q1}, [r2], r3
vsubl.u8 q8, d0, d2
vld1.64 {q2}, [r0,:128], r1
vld1.64 {q2}, [r0,:128], r1
vsubl.u8 q10, d1, d3
vadd.s16 q0, q4, q5
vadd.s16 q0, q4, q5
vld1.64 {q3}, [r2], r3
vsub.s16 q1, q4, q5
vld1.64 {q3}, [r2], r3
vsub.s16 q1, q4, q5
vsubl.u8 q9, d4, d6
vsubl.u8 q11, d5, d7
vsubl.u8 q9, d4, d6
vsubl.u8 q11, d5, d7
vadd.s16 q2, q8, q9
vsub.s16 q3, q8, q9
vadd.s16 q2, q8, q9
vsub.s16 q3, q8, q9
vadd.s16 q4, q6, q7
vadd.s16 q4, q6, q7
vsub.s16 q5, q6, q7
vadd.s16 q6, q10, q11
vsub.s16 q7, q10, q11
vadd.s16 q6, q10, q11
vsub.s16 q7, q10, q11
vadd.s16 q8, q0, q2
vsub.s16 q10, q0, q2
vadd.s16 q8, q0, q2
vsub.s16 q10, q0, q2
vadd.s16 q9, q4, q6
vsub.s16 q11, q4, q6
vadd.s16 q9, q4, q6
vsub.s16 q11, q4, q6
vsub.s16 q0, q1, q3
vadd.s16 q2, q1, q3
vsub.s16 q0, q1, q3
vadd.s16 q2, q1, q3
vsub.s16 q1, q5, q7
vadd.s16 q3, q5, q7
vsub.s16 q1, q5, q7
vadd.s16 q3, q5, q7
vtrn.16 q8, q10
vtrn.16 q9, q11
vtrn.16 q8, q10
vtrn.16 q9, q11
vadd.s16 q4, q8, q10
vabd.s16 q6, q8, q10
vadd.s16 q4, q8, q10
vabd.s16 q6, q8, q10
vadd.s16 q5, q9, q11
vabd.s16 q7, q9, q11
vadd.s16 q5, q9, q11
vabd.s16 q7, q9, q11
vabs.s16 q4, q4
vabs.s16 q5, q5
vtrn.16 q0, q2
vtrn.16 q1, q3
vtrn.16 q0, q2
vtrn.16 q1, q3
vadd.s16 q8, q0, q2
vabd.s16 q10, q0, q2
vadd.s16 q8, q0, q2
vabd.s16 q10, q0, q2
vadd.s16 q9, q1, q3
vadd.s16 q9, q1, q3
vabd.s16 q11, q1, q3
vabs.s16 q8, q8
@ -128,31 +128,31 @@
vld1.64 {d1}, [r2], r3
vld1.64 {d2}, [r0,:64], r1
vsubl.u8 q4, d0, d1
vsubl.u8 q4, d0, d1
vld1.64 {d3}, [r2], r3
vsubl.u8 q5, d2, d3
vsubl.u8 q5, d2, d3
vld1.64 {d4}, [r0,:64], r1
vld1.64 {d5}, [r2], r3
vadd.s16 q8, q4, q5
vsubl.u8 q6, d4, d5
vadd.s16 q8, q4, q5
vsubl.u8 q6, d4, d5
vld1.64 {d6}, [r0,:64], r1
vld1.64 {d7}, [r2], r3
vsubl.u8 q7, d6, d7
vsub.s16 q9, q4, q5
vsubl.u8 q7, d6, d7
vsub.s16 q9, q4, q5
vadd.s16 q10, q6, q7
vsub.s16 q11, q6, q7
vadd.s16 q10, q6, q7
vsub.s16 q11, q6, q7
vadd.s16 q0, q8, q10
vsub.s16 q1, q8, q10
vadd.s16 q0, q8, q10
vsub.s16 q1, q8, q10
vsub.s16 q2, q9, q11
vadd.s16 q3, q9, q11
vsub.s16 q2, q9, q11
vadd.s16 q3, q9, q11
vtrn.16 q0, q1
vtrn.16 q2, q3
@ -220,7 +220,7 @@
.endm
WELS_ASM_FUNC_BEGIN pixel_sad_16x16_neon
WELS_ASM_FUNC_BEGIN WelsSampleSad16x16_neon
vld1.64 {q0}, [r0, :128], r1
vld1.64 {q1}, [r2], r3
@ -260,7 +260,7 @@ WELS_ASM_FUNC_BEGIN pixel_sad_16x16_neon
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN pixel_sad_16x8_neon
WELS_ASM_FUNC_BEGIN WelsSampleSad16x8_neon
vld1.64 {q0}, [r0, :128], r1
vld1.64 {q1}, [r2], r3
@ -298,7 +298,7 @@ WELS_ASM_FUNC_BEGIN pixel_sad_16x8_neon
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN pixel_sad_8x16_neon
WELS_ASM_FUNC_BEGIN WelsSampleSad8x16_neon
vld1.64 {d0}, [r0, :64], r1
vld1.64 {d1}, [r2], r3
@ -332,7 +332,7 @@ WELS_ASM_FUNC_BEGIN pixel_sad_8x16_neon
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN pixel_sad_8x8_neon
WELS_ASM_FUNC_BEGIN WelsSampleSad8x8_neon
vld1.64 {d0}, [r0, :64], r1
vld1.64 {d1}, [r2], r3
@ -364,7 +364,7 @@ WELS_ASM_FUNC_BEGIN pixel_sad_8x8_neon
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN pixel_sad_4x4_neon
WELS_ASM_FUNC_BEGIN WelsSampleSad4x4_neon
stmdb sp!, {r4-r5, lr}
//Loading a horizontal line data (4 bytes)
@ -376,23 +376,23 @@ WELS_ASM_FUNC_BEGIN pixel_sad_4x4_neon
//line 1
ldr r4, [r0], r1
ldr r5, [r2], r3
usada8 lr, r4, r5, lr
usada8 lr, r4, r5, lr
//line 2
//line 2
ldr r4, [r0], r1
ldr r5, [r2], r3
usada8 lr, r4, r5, lr
usada8 lr, r4, r5, lr
//line 3
ldr r4, [r0]
ldr r5, [r2]
usada8 r0, r4, r5, lr
usada8 r0, r4, r5, lr
ldmia sp!, {r4-r5, lr}
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN pixel_sad_4_16x16_neon
WELS_ASM_FUNC_BEGIN WelsSampleSadFour16x16_neon
stmdb sp!, {r4-r5, lr}
@ -400,30 +400,30 @@ WELS_ASM_FUNC_BEGIN pixel_sad_4_16x16_neon
sub r4, r2, #1
add r5, r2, #1
sub r2, r3
//Loading a horizontal line data (16 bytes)
vld1.8 {q0}, [r0], r1 //save pix1
vld1.8 {q1}, [r2], r3 //save pix2 - stride
vld1.8 {q6}, [r2], r3 //save pix2
vld1.8 {q2}, [r2], r3 //save pix2 + stride
vld1.8 {q3}, [r4], r3 //save pix2 - 1
vld1.8 {q4}, [r5], r3 //save pix2 + 1
vld1.8 {q4}, [r5], r3 //save pix2 + 1
//Do the SAD for 16 bytes
vabdl.u8 q15, d0, d2
vabal.u8 q15, d1, d3
vabdl.u8 q13, d0, d4
vabal.u8 q13, d1, d5
vabdl.u8 q11, d0, d6
vabal.u8 q11, d1, d7
vabdl.u8 q9, d0, d8
vabal.u8 q9, d1, d9
vabal.u8 q9, d1, d9
mov lr, #15
pixel_sad_4_16x16_loop_0:
@ -436,13 +436,13 @@ pixel_sad_4_16x16_loop_0:
vabal.u8 q15, d1, d3
vld1.8 {q3}, [r4], r3 //save pix2 - 1
vabal.u8 q13, d0, d4
vld1.8 {q4}, [r5], r3 //save pix2 + 1
vld1.8 {q4}, [r5], r3 //save pix2 + 1
vabal.u8 q13, d1, d5
subs lr, #1
vabal.u8 q11, d0, d6
vabal.u8 q11, d1, d7
vabal.u8 q9, d0, d8
vabal.u8 q9, d1, d9
@ -451,18 +451,18 @@ pixel_sad_4_16x16_loop_0:
//Save SAD to 'r0'
ldr r0, [sp, #12]
vadd.u16 d0, d30, d31
vadd.u16 d1, d26, d27
vadd.u16 d2, d22, d23
vadd.u16 d3, d18, d19
vpaddl.u16 q0, q0
vpaddl.u16 q1, q1
vpaddl.u32 q0, q0
vpaddl.u32 q1, q1
vshl.u32 q0, #4
vshl.u32 q1, #4
vst4.32 {d0[0],d1[0],d2[0],d3[0]}, [r0]
@ -471,37 +471,37 @@ pixel_sad_4_16x16_loop_0:
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN pixel_sad_4_16x8_neon
WELS_ASM_FUNC_BEGIN WelsSampleSadFour16x8_neon
stmdb sp!, {r4-r5, lr}
//Generate the pix2 start addr
sub r4, r2, #1
add r5, r2, #1
sub r2, r3
//Loading a horizontal line data (16 bytes)
vld1.8 {q0}, [r0], r1 //save pix1
vld1.8 {q1}, [r2], r3 //save pix2 - stride
vld1.8 {q6}, [r2], r3 //save pix2
vld1.8 {q2}, [r2], r3 //save pix2 + stride
vld1.8 {q3}, [r4], r3 //save pix2 - 1
vld1.8 {q4}, [r5], r3 //save pix2 + 1
vld1.8 {q4}, [r5], r3 //save pix2 + 1
//Do the SAD for 16 bytes
vabdl.u8 q15, d0, d2
vabal.u8 q15, d1, d3
vabdl.u8 q13, d0, d4
vabal.u8 q13, d1, d5
vabdl.u8 q11, d0, d6
vabal.u8 q11, d1, d7
vabdl.u8 q9, d0, d8
vabal.u8 q9, d1, d9
vabal.u8 q9, d1, d9
mov lr, #7
pixel_sad_4_16x8_loop_0:
@ -514,67 +514,67 @@ pixel_sad_4_16x8_loop_0:
vabal.u8 q15, d1, d3
vld1.8 {q3}, [r4], r3 //save pix2 - 1
vabal.u8 q13, d0, d4
vld1.8 {q4}, [r5], r3 //save pix2 + 1
vld1.8 {q4}, [r5], r3 //save pix2 + 1
vabal.u8 q13, d1, d5
subs lr, #1
vabal.u8 q11, d0, d6
vabal.u8 q11, d1, d7
vabal.u8 q9, d0, d8
vabal.u8 q9, d1, d9
bne pixel_sad_4_16x8_loop_0
//Save SAD to 'r0'
ldr r0, [sp, #12]
vadd.u16 d0, d30, d31
vadd.u16 d1, d26, d27
vadd.u16 d2, d22, d23
vadd.u16 d3, d18, d19
vpaddl.u16 q0, q0
vpaddl.u16 q1, q1
vpaddl.u32 q0, q0
vpaddl.u32 q1, q1
vshl.u32 q0, #4
vshl.u32 q1, #4
vst4.32 {d0[0],d1[0],d2[0],d3[0]}, [r0]
ldmia sp!, {r4-r5, lr}
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN pixel_sad_4_8x16_neon
WELS_ASM_FUNC_BEGIN WelsSampleSadFour8x16_neon
stmdb sp!, {r4-r5, lr}
//Generate the pix2 start addr
sub r4, r2, #1
add r5, r2, #1
sub r2, r3
//Loading a horizontal line data (8 bytes)
vld1.8 {d0}, [r0], r1 //save pix1
vld1.8 {d1}, [r2], r3 //save pix2 - stride
vld1.8 {d6}, [r2], r3 //save pix2
vld1.8 {d2}, [r2], r3 //save pix2 + stride
vld1.8 {d3}, [r4], r3 //save pix2 - 1
vld1.8 {d4}, [r5], r3 //save pix2 + 1
vld1.8 {d4}, [r5], r3 //save pix2 + 1
//Do the SAD for 8 bytes
vabdl.u8 q15, d0, d1
vabdl.u8 q14, d0, d2
vabdl.u8 q13, d0, d3
vabdl.u8 q12, d0, d4
vabdl.u8 q12, d0, d4
mov lr, #15
pixel_sad_4_8x16_loop_0:
//Loading a horizontal line data (8 bytes)
vld1.8 {d0}, [r0], r1 //save pix1
vmov.8 d1, d6 //save pix2 - stride
@ -582,7 +582,7 @@ pixel_sad_4_8x16_loop_0:
vld1.8 {d2}, [r2], r3 //save pix2 + stride
vld1.8 {d3}, [r4], r3 //save pix2 - 1
vabal.u8 q15, d0, d1
vld1.8 {d4}, [r5], r3 //save pix2 + 1
//Do the SAD for 8 bytes
vabal.u8 q14, d0, d2
@ -594,50 +594,50 @@ pixel_sad_4_8x16_loop_0:
//Save SAD to 'r0'
ldr r0, [sp, #12]
vadd.u16 d0, d30, d31
vadd.u16 d1, d28, d29
vadd.u16 d2, d26, d27
vadd.u16 d3, d24, d25
vpaddl.u16 q0, q0
vpaddl.u16 q1, q1
vpaddl.u32 q0, q0
vpaddl.u32 q1, q1
vshl.u32 q0, #4
vshl.u32 q1, #4
vst4.32 {d0[0],d1[0],d2[0],d3[0]}, [r0]
ldmia sp!, {r4-r5, lr}
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN pixel_sad_4_8x8_neon
WELS_ASM_FUNC_BEGIN WelsSampleSadFour8x8_neon
stmdb sp!, {r4-r5, lr}
//Generate the pix2 start addr
sub r4, r2, #1
add r5, r2, #1
sub r2, r3
//Loading a horizontal line data (8 bytes)
vld1.8 {d0}, [r0], r1 //save pix1
vld1.8 {d1}, [r2], r3 //save pix2 - stride
vld1.8 {d6}, [r2], r3 //save pix2
vld1.8 {d2}, [r2], r3 //save pix2 + stride
vld1.8 {d3}, [r4], r3 //save pix2 - 1
vld1.8 {d4}, [r5], r3 //save pix2 + 1
vld1.8 {d4}, [r5], r3 //save pix2 + 1
//Do the SAD for 8 bytes
vabdl.u8 q15, d0, d1
vabdl.u8 q14, d0, d2
vabdl.u8 q13, d0, d3
vabdl.u8 q12, d0, d4
vabdl.u8 q12, d0, d4
mov lr, #7
pixel_sad_4_8x8_loop_0:
@ -648,7 +648,7 @@ pixel_sad_4_8x8_loop_0:
vld1.8 {d2}, [r2], r3 //save pix2 + stride
vld1.8 {d3}, [r4], r3 //save pix2 - 1
vabal.u8 q15, d0, d1
vld1.8 {d4}, [r5], r3 //save pix2 + 1
//Do the SAD for 8 bytes
vabal.u8 q14, d0, d2
@ -659,84 +659,84 @@ pixel_sad_4_8x8_loop_0:
//Save SAD to 'r0'
ldr r0, [sp, #12]
vadd.u16 d0, d30, d31
vadd.u16 d1, d28, d29
vadd.u16 d2, d26, d27
vadd.u16 d3, d24, d25
vpaddl.u16 q0, q0
vpaddl.u16 q1, q1
vpaddl.u32 q0, q0
vpaddl.u32 q1, q1
vshl.u32 q0, #4
vshl.u32 q1, #4
vst4.32 {d0[0],d1[0],d2[0],d3[0]}, [r0]
ldmia sp!, {r4-r5, lr}
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN pixel_sad_4_4x4_neon
WELS_ASM_FUNC_BEGIN WelsSampleSadFour4x4_neon
vld1.32 {d0[0]}, [r0], r1
vld1.32 {d0[1]}, [r0], r1
vld1.32 {d1[0]}, [r0], r1
vld1.32 {d1[1]}, [r0]
sub r0, r2, r3
vld1.32 {d2[0]}, [r0], r3
vld1.32 {d2[1]}, [r0], r3
vld1.32 {d3[0]}, [r0], r3
vld1.32 {d3[1]}, [r0], r3
vld1.32 {d4[0]}, [r0], r3
vld1.32 {d4[1]}, [r0]
sub r0, r2, #1
vld1.32 {d4[1]}, [r0]
sub r0, r2, #1
vld1.32 {d5[0]}, [r0], r3
vld1.32 {d5[1]}, [r0], r3
vld1.32 {d6[0]}, [r0], r3
vld1.32 {d6[1]}, [r0]
add r0, r2, #1
vld1.32 {d6[1]}, [r0]
add r0, r2, #1
vld1.32 {d7[0]}, [r0], r3
vld1.32 {d7[1]}, [r0], r3
vld1.32 {d8[0]}, [r0], r3
vld1.32 {d8[1]}, [r0]
vabdl.u8 q15, d0, d2
vabdl.u8 q14, d1, d3
vabdl.u8 q13, d0, d3
vabdl.u8 q12, d1, d4
vabdl.u8 q11, d0, d5
vabdl.u8 q10, d1, d6
vabdl.u8 q9, d0, d7
vabdl.u8 q8, d1, d8
//Save SAD to 'r0'
ldr r0, [sp]
vadd.u16 q0, q14, q15
vadd.u16 q1, q12, q13
vadd.u16 q2, q10, q11
vadd.u16 q3, q8 , q9
vadd.u16 d0, d1
vadd.u16 d1, d2, d3
vadd.u16 d2, d4, d5
vadd.u16 d3, d6, d7
vpaddl.u16 q0, q0
vpaddl.u16 q1, q1
vpaddl.u32 q0, q0
vpaddl.u32 q1, q1
vshl.u32 q0, #4
vshl.u32 q1, #4
vst4.32 {d0[0],d1[0],d2[0],d3[0]}, [r0]
@ -744,7 +744,7 @@ WELS_ASM_FUNC_BEGIN pixel_sad_4_4x4_neon
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN pixel_satd_16x16_neon
WELS_ASM_FUNC_BEGIN WelsSampleSatd16x16_neon
SATD_16x4
vadd.u16 q15, q0, q2
@ -769,7 +769,7 @@ WELS_ASM_FUNC_BEGIN pixel_satd_16x16_neon
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN pixel_satd_16x8_neon
WELS_ASM_FUNC_BEGIN WelsSampleSatd16x8_neon
SATD_16x4
vadd.u16 q15, q0, q2
@ -786,7 +786,7 @@ WELS_ASM_FUNC_BEGIN pixel_satd_16x8_neon
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN pixel_satd_8x16_neon
WELS_ASM_FUNC_BEGIN WelsSampleSatd8x16_neon
SATD_8x4
vadd.u16 q15, q0, q1
@ -811,7 +811,7 @@ WELS_ASM_FUNC_BEGIN pixel_satd_8x16_neon
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN pixel_satd_8x8_neon
WELS_ASM_FUNC_BEGIN WelsSampleSatd8x8_neon
SATD_8x4
vadd.u16 q15, q0, q1
@ -828,7 +828,7 @@ WELS_ASM_FUNC_BEGIN pixel_satd_8x8_neon
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN pixel_satd_4x4_neon
WELS_ASM_FUNC_BEGIN WelsSampleSatd4x4_neon
//Load the pix1 data --- 16 bytes
vld1.32 {d0[0]}, [r0], r1
@ -836,11 +836,11 @@ WELS_ASM_FUNC_BEGIN pixel_satd_4x4_neon
vld1.32 {d1[0]}, [r0], r1
vld1.32 {d1[1]}, [r0]
//Load the pix2 data --- 16 bytes
//Load the pix2 data --- 16 bytes
vld1.32 {d2[0]}, [r2], r3
vld1.32 {d2[1]}, [r2], r3
vld1.32 {d3[0]}, [r2], r3
vld1.32 {d3[1]}, [r2]
vld1.32 {d3[1]}, [r2]
//Get the difference
vsubl.u8 q15, d0, d2 //{0,1,2,3,4,5,6,7}
@ -861,15 +861,15 @@ WELS_ASM_FUNC_BEGIN pixel_satd_4x4_neon
vtrn.16 q13, q12
vadd.s16 q15, q13, q12
//Do the SAD
vabs.s16 q15, q15
//Do the SAD
vabs.s16 q15, q15
vabd.s16 q14, q13, q12
vadd.u16 q0, q15, q14
vrhadd.u16 d0, d1
vpaddl.u16 d0, d0
vpaddl.u32 d0, d0
vpaddl.u16 d0, d0
vpaddl.u32 d0, d0
vmov.u32 r0, d0[0]

2624
codec/encoder/core/arm/reconstruct_neon.S Executable file → Normal file

File diff suppressed because it is too large Load Diff

View File

@ -110,6 +110,33 @@ int32_t WelsIntraChroma8x8Combined3Satd_sse41 (uint8_t*, int32_t, uint8_t*, int3
#endif//X86_ASM
#if defined (HAVE_NEON)
int32_t WelsSampleSad4x4_neon (uint8_t*, int32_t, uint8_t*, int32_t);
int32_t WelsSampleSad16x16_neon (uint8_t*, int32_t, uint8_t*, int32_t);
int32_t WelsSampleSad16x8_neon (uint8_t*, int32_t, uint8_t*, int32_t);
int32_t WelsSampleSad8x16_neon (uint8_t*, int32_t, uint8_t*, int32_t);
int32_t WelsSampleSad8x8_neon (uint8_t*, int32_t, uint8_t*, int32_t);
void WelsSampleSadFour16x16_neon (uint8_t*, int32_t, uint8_t*, int32_t, int32_t*);
void WelsSampleSadFour16x8_neon (uint8_t*, int32_t, uint8_t*, int32_t, int32_t*);
void WelsSampleSadFour8x16_neon (uint8_t*, int32_t, uint8_t*, int32_t, int32_t*);
void WelsSampleSadFour8x8_neon (uint8_t*, int32_t, uint8_t*, int32_t, int32_t*);
void WelsSampleSadFour4x4_neon (uint8_t*, int32_t, uint8_t*, int32_t, int32_t*);
int32_t WelsSampleSatd8x8_neon (uint8_t*, int32_t, uint8_t*, int32_t);
int32_t WelsSampleSatd16x8_neon (uint8_t*, int32_t, uint8_t*, int32_t);
int32_t WelsSampleSatd8x16_neon (uint8_t*, int32_t, uint8_t*, int32_t);
int32_t WelsSampleSatd16x16_neon (uint8_t*, int32_t, uint8_t*, int32_t);
int32_t WelsSampleSatd4x4_neon (uint8_t*, int32_t, uint8_t*, int32_t);
int32_t WelsIntra16x16Combined3Satd_neon (uint8_t*, int32_t, uint8_t*, int32_t, int32_t*, int32_t, uint8_t*);
int32_t WelsIntra16x16Combined3Sad_neon (uint8_t*, int32_t, uint8_t*, int32_t, int32_t*, int32_t, uint8_t*);
int32_t WelsIntra8x8Combined3Satd_neon (uint8_t*, int32_t, uint8_t*, int32_t, int32_t*, int32_t, uint8_t*, uint8_t*, uint8_t*);
int32_t WelsIntra8x8Combined3Sad_neon (uint8_t*, int32_t, uint8_t*, int32_t, int32_t*, int32_t, uint8_t*, uint8_t*, uint8_t*);
int32_t WelsIntra4x4Combined3Satd_neon (uint8_t*, int32_t, uint8_t*, int32_t, uint8_t*, int32_t*, int32_t, int32_t, int32_t);
#endif
#if defined(__cplusplus)
}

View File

@ -482,6 +482,33 @@ void WelsInitSampleSadFunc (SWelsFuncPtrList* pFuncList, uint32_t uiCpuFlag) {
#endif //(X86_ASM)
#if defined (HAVE_NEON)
// When the runtime CPU flags report NEON, point the sample-dealing function
// table at the NEON assembly routines.
if (uiCpuFlag & WELS_CPU_NEON) {
// SAD, one entry per block size
pFuncList->sSampleDealingFuncs.pfSampleSad[BLOCK_4x4 ] = WelsSampleSad4x4_neon;
pFuncList->sSampleDealingFuncs.pfSampleSad[BLOCK_16x16] = WelsSampleSad16x16_neon;
pFuncList->sSampleDealingFuncs.pfSampleSad[BLOCK_16x8 ] = WelsSampleSad16x8_neon;
pFuncList->sSampleDealingFuncs.pfSampleSad[BLOCK_8x16] = WelsSampleSad8x16_neon;
pFuncList->sSampleDealingFuncs.pfSampleSad[BLOCK_8x8] = WelsSampleSad8x8_neon;
// SAD against the four neighbouring reference positions (-stride, +stride, -1, +1),
// one entry per block size
pFuncList->sSampleDealingFuncs.pfSample4Sad[BLOCK_16x16] = WelsSampleSadFour16x16_neon;
pFuncList->sSampleDealingFuncs.pfSample4Sad[BLOCK_16x8] = WelsSampleSadFour16x8_neon;
pFuncList->sSampleDealingFuncs.pfSample4Sad[BLOCK_8x16] = WelsSampleSadFour8x16_neon;
pFuncList->sSampleDealingFuncs.pfSample4Sad[BLOCK_8x8] = WelsSampleSadFour8x8_neon;
pFuncList->sSampleDealingFuncs.pfSample4Sad[BLOCK_4x4] = WelsSampleSadFour4x4_neon;
// SATD, one entry per block size
pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_4x4 ] = WelsSampleSatd4x4_neon;
pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_8x8 ] = WelsSampleSatd8x8_neon;
pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_8x16 ] = WelsSampleSatd8x16_neon;
pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_16x8 ] = WelsSampleSatd16x8_neon;
pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_16x16] = WelsSampleSatd16x16_neon;
// combined three-mode intra cost routines
pFuncList->sSampleDealingFuncs.pfIntra4x4Combined3Satd = WelsIntra4x4Combined3Satd_neon;
pFuncList->sSampleDealingFuncs.pfIntra8x8Combined3Satd = WelsIntra8x8Combined3Satd_neon;
pFuncList->sSampleDealingFuncs.pfIntra8x8Combined3Sad = WelsIntra8x8Combined3Sad_neon;
pFuncList->sSampleDealingFuncs.pfIntra16x16Combined3Satd = WelsIntra16x16Combined3Satd_neon;
pFuncList->sSampleDealingFuncs.pfIntra16x16Combined3Sad = WelsIntra16x16Combined3Sad_neon;
}
#endif
}
} // namespace WelsSVCEnc

View File

@ -231,6 +231,11 @@ void CAdaptiveQuantization::WelsInitVarFunc (PVarFunc& pfVar, int32_t iCpuFlag)
pfVar = SampleVariance16x16_sse2;
}
#endif
#ifdef HAVE_NEON
// Replace whatever pfVar was set to above with the NEON 16x16 variance routine
// when the CPU flags report NEON.
if (iCpuFlag & WELS_CPU_NEON) {
pfVar = SampleVariance16x16_neon;
}
#endif
}
void SampleVariance16x16_c (uint8_t* pRefY, int32_t iRefStride, uint8_t* pSrcY, int32_t iSrcStride,

View File

@ -62,6 +62,11 @@ VarFunc SampleVariance16x16_sse2;
WELSVP_EXTERN_C_END
#endif
#ifdef HAVE_NEON
WELSVP_EXTERN_C_BEGIN
VarFunc SampleVariance16x16_neon;
WELSVP_EXTERN_C_END
#endif
class CAdaptiveQuantization : public IStrategy {
public:

42
codec/processing/src/arm/adaptive_quantization.S Executable file → Normal file
View File

@ -35,7 +35,7 @@
#include "arm_arch_common_macro.S"
#ifdef APPLE_IOS
.macro SQR_ADD_16BYTES
.macro SQR_ADD_16BYTES
vmull.u8 q3, $0, $0
vmull.u8 q8, $1, $1
vpadal.u16 $2, q3
@ -51,23 +51,23 @@
#endif
WELS_ASM_FUNC_BEGIN pixel_var_16x16_neon
WELS_ASM_FUNC_BEGIN SampleVariance16x16_neon
stmdb sp!, {r4}
vld1.8 {q15}, [r0], r1 //save the ref data (16bytes)
vld1.8 {q14}, [r2], r3 //save the src data (16bytes)
vabd.u8 q13, q14, q15
vabd.u8 q13, q14, q15
vmull.u8 q12, d27, d27
vmull.u8 q11, d26, d26
vaddl.u16 q12, d24, d25
vpadal.u16 q12, q11 //sqr
vaddl.u8 q13, d26, d27 //sum
vaddl.u8 q13, d26, d27 //sum
vaddl.u8 q10, d28, d29 //sum_cur
vmull.u8 q9, d29, d29
vmull.u8 q8, d28, d28
vaddl.u16 q9, d18, d19 //sqr_cur
@ -78,35 +78,35 @@ pixel_var_16x16_loop0:
vld1.8 {q0}, [r0], r1 //save the ref data (16bytes)
vld1.8 {q1}, [r2], r3 //save the src data (16bytes)
vabd.u8 q2, q0, q1
//q10 save sum_cur
vpadal.u8 q10, q1
//q12 save sqr
SQR_ADD_16BYTES d4, d5, q12
//q13 save sum
vpadal.u8 q13, q2
subs r4, #1
//q9 save sqr_cur
SQR_ADD_16BYTES d2, d3, q9
bne pixel_var_16x16_loop0
//q9 save sqr_cur
SQR_ADD_16BYTES d2, d3, q9
bne pixel_var_16x16_loop0
vadd.u16 d0, d26, d27 //sum
vadd.u16 d1, d20, d21 //sum_cur
vadd.u16 d1, d20, d21 //sum_cur
vpaddl.u16 q0, q0
vadd.u32 d2, d24, d25 //sqr
vadd.u32 d3, d18, d19 //sqr_cur
vpadd.u32 d0, d0, d1
vpadd.u32 d1, d2, d3
ldr r4, [sp, #4]
vshr.u32 q0, q0, #8
vmul.u32 d0, d0
vsub.u32 d0, d1, d0
@ -117,4 +117,4 @@ pixel_var_16x16_loop0:
WELS_ASM_FUNC_END
#endif
#endif

138
codec/processing/src/arm/down_sample_neon.S Executable file → Normal file
View File

@ -35,29 +35,29 @@
#include "arm_arch_common_macro.S"
WELS_ASM_FUNC_BEGIN comp_ds_bilinear_neon
WELS_ASM_FUNC_BEGIN DyadicBilinearDownsampler_neon
stmdb sp!, {r4-r8, lr}
//Get the width and height
ldr r4, [sp, #24] //src_width
ldr r5, [sp, #28] //src_height
//Initialize the register
mov r6, r2
mov r8, r0
mov lr, #0
lsr r5, #1
lsr r5, #1
//Save the tailer for the unasigned size
mla r7, r1, r5, r0
vld1.32 {q15}, [r7]
add r7, r2, r3
//processing a column of data
comp_ds_bilinear_loop0:
comp_ds_bilinear_loop0:
vld1.8 {q0,q1}, [r2]!
vld1.8 {q2,q3}, [r7]!
vld1.8 {q2,q3}, [r7]!
vpaddl.u8 q0, q0
vpaddl.u8 q1, q1
vpaddl.u8 q2, q2
@ -70,9 +70,9 @@ comp_ds_bilinear_loop0:
vrhadd.u16 q1, q3
vmovn.u16 d0, q0
vmovn.u16 d1, q1
vst1.32 {q0}, [r0]!
vst1.32 {q0}, [r0]!
add lr, #32
cmp lr, r4
movcs lr, #0
addcs r6, r3, lsl #1
@ -82,10 +82,10 @@ comp_ds_bilinear_loop0:
movcs r0, r8
subscs r5, #1
bne comp_ds_bilinear_loop0
//restore the tailer for the unasigned size
vst1.32 {q15}, [r0]
ldmia sp!, {r4-r8,lr}
WELS_ASM_FUNC_END
@ -96,29 +96,29 @@ WELS_ASM_FUNC_BEGIN comp_ds_bilinear_w_x8_neon
//Get the width and height
ldr r4, [sp, #20] //src_width
ldr r5, [sp, #24] //src_height
//Get the difference
sub lr, r3, r4
sub lr, r3, r4
sub r1, r1, r4, lsr #1
lsr r5, #1
//processing a column of data
comp_ds_bilinear_w_x8_loop0:
comp_ds_bilinear_w_x8_loop0:
lsr r6, r4, #3
add r7, r2, r3
//processing a line data
comp_ds_bilinear_w_x8_loop1:
vld1.8 {d0}, [r2]!
vld1.8 {d1}, [r7]!
vld1.8 {d1}, [r7]!
vpaddl.u8 q0, q0
vrshr.u16 q0, #1
vrhadd.u16 d0, d1
vmovn.u16 d0, q0
vst1.32 {d0[0]}, [r0]!
vst1.32 {d0[0]}, [r0]!
subs r6, #1
bne comp_ds_bilinear_w_x8_loop1
@ -126,7 +126,7 @@ comp_ds_bilinear_w_x8_loop1:
add r0, r1
subs r5, #1
bne comp_ds_bilinear_w_x8_loop0
ldmia sp!, {r4-r7,lr}
WELS_ASM_FUNC_END
@ -137,31 +137,31 @@ WELS_ASM_FUNC_BEGIN comp_ds_bilinear_w_x16_neon
//Get the width and height
ldr r4, [sp, #20] //src_width
ldr r5, [sp, #24] //src_height
//Get the difference
sub lr, r3, r4
sub lr, r3, r4
sub r1, r1, r4, lsr #1
lsr r5, #1
//processing a column of data
comp_ds_bilinear_w_x16_loop0:
comp_ds_bilinear_w_x16_loop0:
lsr r6, r4, #4
add r7, r2, r3
//processing a line data
comp_ds_bilinear_w_x16_loop1:
vld1.8 {q0}, [r2]!
vld1.8 {q1}, [r7]!
vld1.8 {q1}, [r7]!
vpaddl.u8 q0, q0
vpaddl.u8 q1, q1
vrshr.u16 q0, #1
vrshr.u16 q1, #1
vrhadd.u16 q0, q1
vmovn.u16 d0, q0
vst1.32 {d0}, [r0]!
vst1.32 {d0}, [r0]!
subs r6, #1
bne comp_ds_bilinear_w_x16_loop1
@ -169,34 +169,34 @@ comp_ds_bilinear_w_x16_loop1:
add r0, r1
subs r5, #1
bne comp_ds_bilinear_w_x16_loop0
ldmia sp!, {r4-r7,lr}
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN comp_ds_bilinear_w_x32_neon
WELS_ASM_FUNC_BEGIN DyadicBilinearDownsamplerWidthx32_neon
stmdb sp!, {r4-r7, lr}
//Get the width and height
ldr r4, [sp, #20] //src_width
ldr r5, [sp, #24] //src_height
//Get the difference
sub lr, r3, r4
sub lr, r3, r4
sub r1, r1, r4, lsr #1
lsr r5, #1
//processing a column of data
comp_ds_bilinear_w_x32_loop0:
comp_ds_bilinear_w_x32_loop0:
lsr r6, r4, #5
add r7, r2, r3
//processing a line data
comp_ds_bilinear_w_x32_loop1:
vld1.8 {q0,q1}, [r2]!
vld1.8 {q2,q3}, [r7]!
vld1.8 {q2,q3}, [r7]!
vpaddl.u8 q0, q0
vpaddl.u8 q1, q1
vpaddl.u8 q2, q2
@ -207,10 +207,10 @@ comp_ds_bilinear_w_x32_loop1:
vrshr.u16 q3, #1
vrhadd.u16 q0, q2
vrhadd.u16 q1, q3
vmovn.u16 d0, q0
vmovn.u16 d1, q1
vst1.32 {q0}, [r0]!
vst1.32 {q0}, [r0]!
subs r6, #1
bne comp_ds_bilinear_w_x32_loop1
@ -218,14 +218,14 @@ comp_ds_bilinear_w_x32_loop1:
add r0, r1
subs r5, #1
bne comp_ds_bilinear_w_x32_loop0
ldmia sp!, {r4-r7,lr}
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN general_ds_bilinear_accurate_neon
WELS_ASM_FUNC_BEGIN GeneralBilinearAccurateDownsampler_neon
stmdb sp!, {r4-r12, lr}
//Get the data from stack
ldr r4, [sp, #40] //the addr of src
ldr r5, [sp, #44] //the value of src_stride
@ -245,11 +245,11 @@ WELS_ASM_FUNC_BEGIN general_ds_bilinear_accurate_neon
and r9, r7, r10 // r9 vinc(scaleY mod 32767)
mov r11, #-1
mul r11, r9 // r11 -vinc
vdup.s16 d2, r9
vdup.s16 d3, r11
vext.8 d5, d3, d2, #4 // vinc vinc -vinc -vinc
mov r11, #0x40000000
mov r12, #0x4000
sub r12, #1
@ -261,13 +261,13 @@ WELS_ASM_FUNC_BEGIN general_ds_bilinear_accurate_neon
sub r11, #1
vdup.s16 d9, r11
vext.8 d7, d9, d8, #4 //init v 16384 16384 16383 16383
veor q14, q14
sub r1, r2 // stride - width
veor q14, q14
sub r1, r2 // stride - width
mov r8, #16384 // yInverse
sub r3, #1
_HEIGHT:
_HEIGHT:
ldr r4, [sp, #40] //the addr of src
mov r11, r8
lsr r11, #15
@ -275,8 +275,8 @@ _HEIGHT:
add r11, r4 // get current row address
mov r12, r11
add r12, r5
mov r9, #16384 // xInverse
mov r9, #16384 // xInverse
sub r10, r2, #1
vmov.s16 d6, d1
@ -288,21 +288,21 @@ _WIDTH:
add r4, r12,lr
vld2.8 {d28[4],d29[4]}, [r4] //q14: 000d000b000c000a;
vzip.32 d28, d29 //q14: 000d000c000b000a;
vmull.u16 q13, d6, d7 //q13: init u * init v
vmull.u16 q13, d6, d7 //q13: init u * init v
vmull.u32 q12, d26,d28
vmlal.u32 q12, d27,d29
vqadd.u64 d24, d24,d25
vrshr.u64 d24, #30
vst1.8 {d24[0]}, [r0]!
add r9, r6
add r9, r6
vadd.u16 d6, d0 // inc u
vshl.u16 d6, #1
vshr.u16 d6, #1
subs r10, #1
bne _WIDTH
WIDTH_END:
lsr r9, #15
add r4,r11,r9
@ -317,26 +317,26 @@ WIDTH_END:
subs r3, #1
bne _HEIGHT
LAST_ROW:
LAST_ROW:
ldr r4, [sp, #40] //the addr of src
lsr r8, #15
mul r8, r5
add r4, r8 // get current row address
add r4, r8 // get current row address
mov r9, #16384
_LAST_ROW_WIDTH:
mov r11, r9
lsr r11, #15
add r3, r4,r11
vld1.8 {d0[0]}, [r3]
vst1.8 {d0[0]}, [r0]
add r0, #1
add r9, r6
vst1.8 {d0[0]}, [r0]
add r0, #1
add r9, r6
subs r2, #1
bne _LAST_ROW_WIDTH
ldmia sp!, {r4-r12, lr}
WELS_ASM_FUNC_END
#endif
#endif

14
codec/processing/src/arm/pixel_sad_neon.S Executable file → Normal file
View File

@ -35,24 +35,24 @@
#include "arm_arch_common_macro.S"
WELS_ASM_FUNC_BEGIN pixel_sad_8x8_neon
WELS_ASM_FUNC_BEGIN WelsSampleSad8x8_neon
stmdb sp!, {lr}
//Loading a horizontal line data (8 bytes)
vld1.8 {d0}, [r0], r1
vld1.8 {d0}, [r0], r1
vld1.8 {d1}, [r2], r3
//Do the SAD for 8 bytes
vabdl.u8 q1, d0, d1
mov lr, #7
pixel_sad_8x8_loop0:
//Loading a horizontal line data (8 bytes)
vld1.8 {d0}, [r0], r1
vld1.8 {d0}, [r0], r1
vld1.8 {d1}, [r2], r3
subs lr, #1
//Do the SAD for 8 bytes
vabal.u8 q1, d0, d1
bne pixel_sad_8x8_loop0
@ -65,4 +65,4 @@ pixel_sad_8x8_loop0:
ldmia sp!, {lr}
WELS_ASM_FUNC_END
#endif
#endif

560
codec/processing/src/arm/vaa_calc_neon.S Executable file → Normal file

File diff suppressed because it is too large Load Diff

View File

@ -75,6 +75,16 @@ void CDownsampling::InitDownsampleFuncs (SDownsampleFuncs& sDownsampleFunc, int
}
#endif//X86_ASM
#if defined(HAVE_NEON)
// When the CPU flags report NEON, install the NEON downsamplers.
// NOTE(review): how callers pick a pfHalfAverage slot is not visible here — confirm
// that slot 0 is only used for widths that are a multiple of 32.
if (iCpuFlag & WELS_CPU_NEON) {
// slot 0: variant that consumes 32 source pixels per inner-loop step
sDownsampleFunc.pfHalfAverage[0] = DyadicBilinearDownsamplerWidthx32_neon;
// slots 1-3: variant declared as having no source-width limitation
sDownsampleFunc.pfHalfAverage[1] = DyadicBilinearDownsampler_neon;
sDownsampleFunc.pfHalfAverage[2] = DyadicBilinearDownsampler_neon;
sDownsampleFunc.pfHalfAverage[3] = DyadicBilinearDownsampler_neon;
// arbitrary-ratio path: chroma and luma share the same wrapper
sDownsampleFunc.pfGeneralRatioChroma = GeneralBilinearAccurateDownsamplerWrap_neon;
sDownsampleFunc.pfGeneralRatioLuma = GeneralBilinearAccurateDownsamplerWrap_neon;
}
#endif
}
EResult CDownsampling::Process (int32_t iType, SPixMap* pSrcPixMap, SPixMap* pDstPixMap) {

View File

@ -103,7 +103,20 @@ void GeneralBilinearAccurateDownsampler_sse2 (uint8_t* pDst, const int32_t kiDst
WELSVP_EXTERN_C_END
#endif
#ifdef HAVE_NEON
// NEON downsampling entry points, declared inside the WELSVP_EXTERN_C block.
WELSVP_EXTERN_C_BEGIN
// Halving downsampler with no restriction on iSrcWidth.
HalveDownsampleFunc DyadicBilinearDownsampler_neon;
// Halving downsampler for an iSrcWidth that is a multiple of 32 pixels.
HalveDownsampleFunc DyadicBilinearDownsamplerWidthx32_neon;
// General-ratio entry point with the GeneralDownsampleFunc signature
// (takes source width/height rather than scale factors).
GeneralDownsampleFunc GeneralBilinearAccurateDownsamplerWrap_neon;
// General-ratio routine that takes precomputed horizontal/vertical scale
// factors (kuiScaleX, kuiScaleY) instead of the source dimensions.
void GeneralBilinearAccurateDownsampler_neon( uint8_t* pDst, const int32_t kiDstStride, const int32_t kiDstWidth, const int32_t kiDstHeight,
uint8_t* pSrc, const int32_t kiSrcStride, const uint32_t kuiScaleX, const uint32_t kuiScaleY);
WELSVP_EXTERN_C_END
#endif//HAVE_NEON
class CDownsampling : public IStrategy {

View File

@ -229,4 +229,14 @@ void GeneralBilinearAccurateDownsampler_c (uint8_t* pDst, const int32_t kiDstStr
//}
#endif //X86_ASM
#ifdef HAVE_NEON
// Adapter from the GeneralDownsampleFunc-style argument list (source width and
// height) to GeneralBilinearAccurateDownsampler_neon, which expects precomputed
// scale factors.
//
// Each scale factor is the source/destination size ratio expressed in fixed
// point with 15 fractional bits: (src / dst) * 2^15, evaluated in float and
// truncated to uint32_t. kiSrcWidth and kiSrcHeight are used only to form
// these ratios; every other argument is forwarded unchanged.
//
// NOTE(review): there is no guard here against kiDstWidth or kiDstHeight
// being 0 — callers are presumably expected to pass positive sizes; confirm.
void GeneralBilinearAccurateDownsamplerWrap_neon(uint8_t* pDst, const int32_t kiDstStride, const int32_t kiDstWidth, const int32_t kiDstHeight,
    uint8_t* pSrc, const int32_t kiSrcStride, const int32_t kiSrcWidth, const int32_t kiSrcHeight) {
  // Number of fractional bits in the fixed-point scale, and the value 1.0 in
  // that representation.
  const int32_t kiFracBits = 15;
  const uint32_t kuiFixedOne = (1 << kiFracBits);

  // Vertical and horizontal step sizes (independent of each other).
  const uint32_t kuiStepY = (uint32_t)((float)kiSrcHeight / (float)kiDstHeight * kuiFixedOne);
  const uint32_t kuiStepX = (uint32_t)((float)kiSrcWidth / (float)kiDstWidth * kuiFixedOne);

  GeneralBilinearAccurateDownsampler_neon(pDst, kiDstStride, kiDstWidth, kiDstHeight,
                                          pSrc, kiSrcStride, kuiStepX, kuiStepY);
}
#endif
WELSVP_NAMESPACE_END

View File

@ -130,6 +130,12 @@ void CSceneChangeDetection::InitSadFuncs (SadFuncPtr& pfSad, int32_t iCpuFlag)
pfSad = WelsSampleSad8x8_sse21;
}
#endif
#ifdef HAVE_NEON
// ARM NEON build: when the CPU flags report NEON, use the NEON 8x8 SAD routine.
if (iCpuFlag & WELS_CPU_NEON) {
pfSad = WelsSampleSad8x8_neon;
}
#endif//HAVE_NEON
}

View File

@ -60,6 +60,12 @@ SadFunc WelsSampleSad8x8_sse21;
WELSVP_EXTERN_C_END
#endif
#ifdef HAVE_NEON
// NEON SAD routine with the SadFunc signature, declared inside the
// WELSVP_EXTERN_C block.
WELSVP_EXTERN_C_BEGIN
SadFunc WelsSampleSad8x8_neon;
WELSVP_EXTERN_C_END
#endif//HAVE_NEON
WELSVP_NAMESPACE_END
#endif

View File

@ -65,6 +65,15 @@ void CVAACalculation::InitVaaFuncs (SVaaFuncs& sVaaFuncs, int32_t iCpuFlag) {
sVaaFuncs.pfVAACalcSadVar = VAACalcSadVar_sse2;
}
#endif//X86_ASM
#ifdef HAVE_NEON
// ARM NEON build: when every bit of WELS_CPU_NEON is set in the CPU flags,
// point all five VAA calculation entries at their NEON implementations.
if ((iCpuFlag & WELS_CPU_NEON) == WELS_CPU_NEON) {
sVaaFuncs.pfVAACalcSad = VAACalcSad_neon;
sVaaFuncs.pfVAACalcSadBgd = VAACalcSadBgd_neon;
sVaaFuncs.pfVAACalcSadSsd = VAACalcSadSsd_neon;
sVaaFuncs.pfVAACalcSadSsdBgd = VAACalcSadSsdBgd_neon;
sVaaFuncs.pfVAACalcSadVar = VAACalcSadVar_neon;
}
#endif//HAVE_NEON
}
EResult CVAACalculation::Process (int32_t iType, SPixMap* pSrcPixMap, SPixMap* pRefPixMap) {

View File

@ -103,6 +103,16 @@ VAACalcSadSsdFunc VAACalcSadSsd_sse2;
WELSVP_EXTERN_C_END
#endif
#ifdef HAVE_NEON
// NEON implementations of the VAA calculation routines, one per function
// type, declared inside the WELSVP_EXTERN_C block.
WELSVP_EXTERN_C_BEGIN
VAACalcSadBgdFunc VAACalcSadBgd_neon;
VAACalcSadSsdBgdFunc VAACalcSadSsdBgd_neon;
VAACalcSadFunc VAACalcSad_neon;
VAACalcSadVarFunc VAACalcSadVar_neon;
VAACalcSadSsdFunc VAACalcSadSsd_neon;
WELSVP_EXTERN_C_END
#endif//HAVE_NEON
class CVAACalculation : public IStrategy {
public:
CVAACalculation (int32_t iCpuFlag);