diff --git a/codec/common/deblocking_neon.S b/codec/common/deblocking_neon.S
index 176c641e..23d9b183 100644
--- a/codec/common/deblocking_neon.S
+++ b/codec/common/deblocking_neon.S
@@ -795,7 +795,7 @@ WELS_ASM_FUNC_BEGIN DeblockChromaEq4H_neon
 WELS_ASM_FUNC_END
 
-WELS_ASM_FUNC_BEGIN enc_avc_non_zero_count_neon
+WELS_ASM_FUNC_BEGIN WelsNonZeroCount_neon
 
     vld1.64 {d0-d2}, [r0]
 
@@ -810,38 +810,37 @@ WELS_ASM_FUNC_BEGIN enc_avc_non_zero_count_neon
 WELS_ASM_FUNC_END
 
 #ifdef APPLE_IOS
-
-.macro BS_NZC_CHECK
+.macro BS_NZC_CHECK
 vld1.8 {d0,d1}, [$0]
 /* Arrenge the input data --- TOP */
 ands r6, $1, #2
 beq bs_nzc_check_jump0
-
+
 sub r6, $0, $2, lsl #4
 sub r6, $2, lsl #3
 add r6, #12
 vld1.32 d3[1], [r6]
-
-bs_nzc_check_jump0:
+
+bs_nzc_check_jump0:
 vext.8 q1, q1, q0, #12
 vadd.u8 $3, q0, q1
-
+
 /* Arrenge the input data --- LEFT */
 ands r6, $1, #1
 beq bs_nzc_check_jump1
-
+
 sub r6, $0, #21
-add r7, r6, #4
+add r7, r6, #4
 vld1.8 d3[4], [r6]
 add r6, r7, #4
 vld1.8 d3[5], [r7]
 add r7, r6, #4
 vld1.8 d3[6], [r6]
 vld1.8 d3[7], [r7]
-
+
 bs_nzc_check_jump1:
-vzip.8 d0, d1
+vzip.8 d0, d1
 vzip.8 d0, d1
 vext.8 q1, q1, q0, #12
 vadd.u8 $4, q0, q1
@@ -852,41 +851,41 @@ bs_nzc_check_jump1:
 vabd.s16 q5, $0, $1
 vabd.s16 q6, $1, $2
 vdup.s16 $0, r6
-vabd.s16 q7, $2, $3
-vabd.s16 q8, $3, $4
-
+vabd.s16 q7, $2, $3
+vabd.s16 q8, $3, $4
+
 vcge.s16 q5, $0
 vcge.s16 q6, $0
 vcge.s16 q7, $0
-vcge.s16 q8, $0
-
+vcge.s16 q8, $0
+
 vpadd.i16 d10, d10, d11
 vpadd.i16 d11, d12, d13
 vpadd.i16 d12, d14, d15
-vpadd.i16 d13, d16, d17
-
+vpadd.i16 d13, d16, d17
+
 vaddhn.i16 $5, q5, q5
 vaddhn.i16 $6, q6, q6
 .endm
 
-.macro BS_MV_CHECK
+.macro BS_MV_CHECK
 vldm $0, {q0,q1,q2,q3}
 /* Arrenge the input data --- TOP */
 ands r6, $1, #2
 beq bs_mv_check_jump0
-
+
 sub r6, $0, $2, lsl #6
 add r6, #48
 vld1.8 {d8, d9}, [r6]
-
+
 bs_mv_check_jump0:
 BS_COMPARE_MV q4, q0, q1, q2, q3, $3, $4
-
+
 /* Arrenge the input data --- LEFT */
 ands r6, $1, #1
 beq bs_mv_check_jump1
-
+
 sub r6, $0, #52
 add r7, r6, #16
 vld1.32 d8[0], [r6]
@@ -895,7 +894,7 @@ bs_mv_check_jump0:
 add r7, r6, #16
 vld1.32 d9[0], [r6]
 vld1.32 d9[1], [r7]
-
+
 bs_mv_check_jump1:
 vzip.32 q0, q2
 vzip.32 q1, q3
@@ -904,7 +903,6 @@ bs_mv_check_jump1:
 BS_COMPARE_MV q4, q0, q1, q2, q3, $5, $6
 .endm
 #else
-
 .macro BS_NZC_CHECK arg0, arg1, arg2, arg3, arg4
 vld1.8 {d0,d1}, [\arg0]
 /* Arrenge the input data --- TOP */
@@ -999,40 +997,40 @@ bs_mv_check_jump1:
 .endm
 #endif
-
+
 WELS_ASM_FUNC_BEGIN DeblockingBSCalcEnc_neon
-
+
 stmdb sp!, {r5-r7}
-
+
 ldr r5, [sp, #12] //Save BS to r5
-
+
 /* Checking the nzc status */
 BS_NZC_CHECK r0, r2, r3, q14, q15 //q14,q15 save the nzc status
-
+
 /* For checking bS[I] = 2 */
 mov r6, #2
 vcgt.s8 q14, q14, #0
 vdup.u8 q0, r6
 vcgt.s8 q15, q15, #0
-
+
 vand.u8 q14, q14, q0 //q14 save the nzc check result all the time --- for dir is top
 vand.u8 q15, q15, q0 //q15 save the nzc check result all the time --- for dir is left
-
+
 /* Checking the mv status*/
 BS_MV_CHECK r1, r2, r3, d24, d25, d26, d27//q12, q13 save the mv status
-
+
 /* For checking bS[I] = 1 */
 mov r6, #1
 vdup.u8 q0, r6
 vand.u8 q12, q12, q0 //q12 save the nzc check result all the time --- for dir is top
 vand.u8 q13, q13, q0 //q13 save the nzc check result all the time --- for dir is left
-
-
+
+
 /* Check bS[I] is '1' or '2' */
 vmax.u8 q1, q12, q14
 vmax.u8 q0, q13, q15
-
+
 //vstm r5, {q0, q1}
 vst1.32 {q0, q1}, [r5]
 ldmia sp!, {r5-r7}
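
The deblocking hunks above are a rename (`enc_avc_non_zero_count_neon` becomes `WelsNonZeroCount_neon`, matching the Wels naming convention used elsewhere in the tree) plus trailing-whitespace cleanup; the logic of `DeblockingBSCalcEnc_neon` is unchanged. What its q12-q15 masks encode is the standard H.264 boundary-strength rule for non-intra edges, which the final `vmax.u8` merges. A scalar sketch for one edge (function and parameter names are illustrative, not from the source):

/* Boundary strength for one non-intra 4x4 edge, as selected by
 * DeblockingBSCalcEnc_neon: nonzero coefficients on either side of the
 * edge give bS = 2; otherwise a significant MV/ref difference gives
 * bS = 1; otherwise the edge is not filtered. */
static int BsForEdge(int nzc_neighbor, int nzc_cur, int mv_check) {
    if (nzc_neighbor || nzc_cur)
        return 2;
    return mv_check ? 1 : 0;
}
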
diff --git a/codec/common/expand_picture.S b/codec/common/expand_picture.S
old mode 100755
new mode 100644
index a0425dfd..2a3736ab
--- a/codec/common/expand_picture.S
+++ b/codec/common/expand_picture.S
@@ -34,13 +34,13 @@
 .text
 
 #include "arm_arch_common_macro.S"
-
+
 WELS_ASM_FUNC_BEGIN ExpandPictureLuma_neon
     stmdb sp!, {r4-r8}
     //Save the dst
     mov r7, r0
     mov r8, r3
-
+
     add r4, r7, r2
     sub r4, #1
     //For the left and right expand
@@ -58,40 +58,40 @@ _expand_picture_luma_loop2:
     subs r8, #1
     bne _expand_picture_luma_loop2
 
-    //for the top and bottom expand
+    //for the top and bottom expand
     add r2, #64
     sub r0, #32
     mla r4, r1, r3, r0
     sub r4, r1
_expand_picture_luma_loop0:
-    mov r5, #32
-    mls r5, r5, r1, r0
+    mov r5, #32
+    mls r5, r5, r1, r0
     add r6, r4, r1
     vld1.8 {q0}, [r0]!
     vld1.8 {q1}, [r4]!
-
+
     mov r8, #32
-_expand_picture_luma_loop1:
-    vst1.8 {q0}, [r5], r1
-    vst1.8 {q1}, [r6], r1
+_expand_picture_luma_loop1:
+    vst1.8 {q0}, [r5], r1
+    vst1.8 {q1}, [r6], r1
     subs r8, #1
     bne _expand_picture_luma_loop1
-
+
     subs r2, #16
     bne _expand_picture_luma_loop0
 
     //vldreq.32 d0, [r0]
-
+
     ldmia sp!, {r4-r8}
 WELS_ASM_FUNC_END
-
+
 WELS_ASM_FUNC_BEGIN ExpandPictureChroma_neon
     stmdb sp!, {r4-r8}
     //Save the dst
     mov r7, r0
     mov r8, r3
-
+
     add r4, r7, r2
     sub r4, #1
     //For the left and right expand
@@ -107,31 +107,31 @@ _expand_picture_chroma_loop2:
     subs r8, #1
     bne _expand_picture_chroma_loop2
 
-    //for the top and bottom expand
+    //for the top and bottom expand
     add r2, #32
     sub r0, #16
     mla r4, r1, r3, r0
     sub r4, r1
_expand_picture_chroma_loop0:
-    mov r5, #16
-    mls r5, r5, r1, r0
+    mov r5, #16
+    mls r5, r5, r1, r0
     add r6, r4, r1
     vld1.8 {q0}, [r0]!
     vld1.8 {q1}, [r4]!
-
+
     mov r8, #16
-_expand_picture_chroma_loop1:
-    vst1.8 {q0}, [r5], r1
-    vst1.8 {q1}, [r6], r1
+_expand_picture_chroma_loop1:
+    vst1.8 {q0}, [r5], r1
+    vst1.8 {q1}, [r6], r1
     subs r8, #1
     bne _expand_picture_chroma_loop1
-
+
     subs r2, #16
     bne _expand_picture_chroma_loop0
 
     //vldreq.32 d0, [r0]
-
+
     ldmia sp!, {r4-r8}
 WELS_ASM_FUNC_END
 
-#endif
\ No newline at end of file
+#endif
diff --git a/codec/decoder/core/arm/intra_pred_neon.S b/codec/decoder/core/arm/intra_pred_neon.S
index cbac802d..3b4874e1 100644
--- a/codec/decoder/core/arm/intra_pred_neon.S
+++ b/codec/decoder/core/arm/intra_pred_neon.S
@@ -533,7 +533,7 @@ WELS_ASM_FUNC_BEGIN WelsDecoderIChromaPredH_neon
 WELS_ASM_FUNC_END
 
-WELS_ASM_FUNC_BEGIN WelsDecoderIChromaPredDC_neon
+WELS_ASM_FUNC_BEGIN WelsDecoderIChromaPredDc_neon
     //stmdb sp!, { r2-r5, lr}
     //Load the left column data (8 bytes)
     sub r2, r0, #1
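
For context on the `ExpandPicture*_neon` pair touched above: they pad the reconstructed picture with replicated border pixels (32 on each side for luma, 16 for chroma) so that later motion compensation may read beyond the frame edge. A plain C sketch of the same operation, under an illustrative signature:

#include <stdint.h>
#include <string.h>

/* Replicate-pad one plane by `pad` pixels on every side (32 luma, 16 chroma). */
static void ExpandPlane(uint8_t* plane, int stride, int width, int height, int pad) {
    for (int y = 0; y < height; y++) {          /* left/right columns */
        uint8_t* row = plane + y * stride;
        memset(row - pad, row[0], (size_t)pad);
        memset(row + width, row[width - 1], (size_t)pad);
    }
    uint8_t* top = plane - pad;                              /* widened first row */
    uint8_t* bot = plane - pad + (height - 1) * stride;      /* widened last row */
    for (int y = 1; y <= pad; y++) {            /* top/bottom rows */
        memcpy(top - y * stride, top, (size_t)(width + 2 * pad));
        memcpy(bot + y * stride, bot, (size_t)(width + 2 * pad));
    }
}
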
diff --git a/codec/encoder/core/arm/intra_pred_neon.S b/codec/encoder/core/arm/intra_pred_neon.S
old mode 100755
new mode 100644
index bda52a22..dab3d09f
--- a/codec/encoder/core/arm/intra_pred_neon.S
+++ b/codec/encoder/core/arm/intra_pred_neon.S
@@ -61,25 +61,25 @@
 .endm
 #endif
-
+
 WELS_ASM_FUNC_BEGIN WelsI16x16LumaPredV_neon
     //Get the top line data to 'q0'
     sub r3, r1, r2
     vldm r3, {d0, d1}
-
+
     //mov r2, #16
     mov r3, #4
-    //Set the top line to the each line of MB(16*16)
+    //Set the top line to the each line of MB(16*16)
loop_0_get_i16x16_luma_pred_v:
     vst1.8 {d0,d1}, [r0]!
     vst1.8 {d0,d1}, [r0]!
     vst1.8 {d0,d1}, [r0]!
     vst1.8 {d0,d1}, [r0]!
     subs r3, #1
-    bne loop_0_get_i16x16_luma_pred_v
+    bne loop_0_get_i16x16_luma_pred_v
 WELS_ASM_FUNC_END
-
+
 WELS_ASM_FUNC_BEGIN WelsI16x16LumaPredH_neon
     //stmdb sp!, {r4, lr}
     sub r1, r1, #1
@@ -87,10 +87,10 @@ WELS_ASM_FUNC_BEGIN WelsI16x16LumaPredH_neon
loop_0_get_i16x16_luma_pred_h:
     //Get one byte data from left side
     vld1.8 {d0[],d1[]}, [r1], r2
-    vld1.8 {d2[],d3[]}, [r1], r2
-    vld1.8 {d4[],d5[]}, [r1], r2
+    vld1.8 {d2[],d3[]}, [r1], r2
+    vld1.8 {d4[],d5[]}, [r1], r2
     vld1.8 {d6[],d7[]}, [r1], r2
-
+
     //Set the line of MB using the left side byte data
     vst1.8 {d0,d1}, [r0]!
     //add r0, #16
@@ -100,9 +100,9 @@ loop_0_get_i16x16_luma_pred_h:
     //add r0, #16
     vst1.8 {d6,d7}, [r0]!
     //add r0, #16
-
+
     subs r3, #1
-    bne loop_0_get_i16x16_luma_pred_h
+    bne loop_0_get_i16x16_luma_pred_h
 WELS_ASM_FUNC_END
 
 
@@ -113,11 +113,11 @@ WELS_ASM_FUNC_BEGIN WelsI16x16LumaPredDc_neon
     sub r3, r1, #1
     GET_8BYTE_DATA d0, r3, r2
     GET_8BYTE_DATA d1, r3, r2
-
+
     //Get the top horizontal line data
-    sub r3, r1, r2
+    sub r3, r1, r2
     vldm r3, {d2, d3}
-
+
     //Calculate the sum of top horizontal line data and vertical line data
     vpaddl.u8 q0, q0
     vpaddl.u8 q1, q1
@@ -125,11 +125,11 @@ WELS_ASM_FUNC_BEGIN WelsI16x16LumaPredDc_neon
     vadd.u16 d0, d0, d1
     vpaddl.u16 d0, d0
     vpaddl.u32 d0, d0
-
-    //Calculate the mean value
+
+    //Calculate the mean value
     vrshr.u16 d0, d0, #5
     vdup.8 q0, d0[0]
-
+
     //Set the mean value to the all of member of MB
     mov r3, #4
loop_0_get_i16x16_luma_pred_dc_both:
@@ -138,21 +138,21 @@ loop_0_get_i16x16_luma_pred_dc_both:
     vst1.8 {d0,d1}, [r0]!
     vst1.8 {d0,d1}, [r0]!
     vst1.8 {d0,d1}, [r0]!
     subs r3, #1
-    bne loop_0_get_i16x16_luma_pred_dc_both
-
+    bne loop_0_get_i16x16_luma_pred_dc_both
+
 WELS_ASM_FUNC_END
 
 
 //The table for SIMD instruction {(8,7,6,5,4,3,2,1) * 5}
CONST0_GET_I16X16_LUMA_PRED_PLANE:
 .long 0x191e2328, 0x050a0f14
 
-//The table for SIMD instruction {-7,-6,-5,-4,-3,-2,-1,0}
+//The table for SIMD instruction {-7,-6,-5,-4,-3,-2,-1,0}
CONST1_GET_I16X16_LUMA_PRED_PLANE:
 .long 0xfcfbfaf9, 0x00fffefd
-
+
 WELS_ASM_FUNC_BEGIN WelsI16x16LumaPredPlane_neon
     //stmdb sp!, { r4, lr}
-
+
     //Load the table {(8,7,6,5,4,3,2,1) * 5}
     adr r3, CONST0_GET_I16X16_LUMA_PRED_PLANE
     vldr d0, [r3]
@@ -161,51 +161,51 @@ WELS_ASM_FUNC_BEGIN WelsI16x16LumaPredPlane_neon
     sub r3, r1, r2
     sub r1, r3, #1
     vld1.8 d1, [r1]
-
+
     //Pack the top[8] ~ top[15] to d2
     add r1, #9
     vld1.8 d2, [r1]
-
+
     //Save the top[15] to d6 for next step
     vdup.u8 d6, d2[7]
-
+
     //Get and pack left[-1] ~ left[6] to d4
     sub r1, r3, #1
     GET_8BYTE_DATA d4, r1, r2
-
+
     //Get and pack left[8] ~ left[15] to d3
     add r1, r2
     GET_8BYTE_DATA d3, r1, r2
-
+
     //Save the left[15] to d7 for next step
     vdup.u8 d7, d3[7]
-
+
     //revert the sequence of d2,d3
     vrev64.8 q1, q1
     vsubl.u8 q2, d3, d4 //q2={left[8]-left[6],left[9]-left[5],left[10]-left[4], ...}
     vsubl.u8 q1, d2, d1 //q1={top[8]-top[6],top[9]-top[5],top[10]-top[4], ...}
-
+
     vmovl.u8 q0, d0
     vmul.s16 q1, q0, q1 //q1 = q1*{(8,7,6,5,4,3,2,1) * 5}
     vmul.s16 q2, q0, q2 //q2 = q2*{(8,7,6,5,4,3,2,1) * 5}
-
+
     //Calculate the sum of items of q1, q2
     vpadd.s16 d0, d2, d3
     vpadd.s16 d1, d4, d5
     vpaddl.s16 q0, q0
     vpaddl.s32 q0, q0
-
+
     //Get the value of 'b', 'c' and extend to q1, q2.
     vrshr.s64 q0, #6
     vdup.s16 q1, d0[0]
     vdup.s16 q2, d1[0]
-
+
     //Load the table {-7,-6,-5,-4,-3,-2,-1,0} to d0
     adr r3, CONST1_GET_I16X16_LUMA_PRED_PLANE
     vld1.32 {d0}, [r3]
-
+
     //Get the value of 'a' and save to q3
     vaddl.u8 q3, d6, d7
     vshl.u16 q3, #4
@@ -214,57 +214,57 @@ WELS_ASM_FUNC_BEGIN WelsI16x16LumaPredPlane_neon
     vmovl.s8 q0, d0
     vmla.s16 q3, q0, q1
     vmla.s16 q3, q2, d0[0]
-
+
     //Calculate a+'b'*{1,2,3,4,5,6,7,8} + c*{-7}
     vshl.s16 q5, q1, #3
     vadd.s16 q5, q3
-
+
     //right shift 5 bits and rounding
     vqrshrun.s16 d0, q3, #5
     vqrshrun.s16 d1, q5, #5
-
+
     //Set the line of MB
     vst1.u32 {d0,d1}, [r0]!
-
-
+
+
     //Do the same processing for setting other lines
     mov r3, #15
-loop_0_get_i16x16_luma_pred_plane:
+loop_0_get_i16x16_luma_pred_plane:
     vadd.s16 q3, q2
     vadd.s16 q5, q2
     vqrshrun.s16 d0, q3, #5
     vqrshrun.s16 d1, q5, #5
     vst1.u32 {d0,d1}, [r0]!
     subs r3, #1
-    bne loop_0_get_i16x16_luma_pred_plane
-
+    bne loop_0_get_i16x16_luma_pred_plane
+
 WELS_ASM_FUNC_END
-
+
 WELS_ASM_FUNC_BEGIN WelsI4x4LumaPredV_neon
     //stmdb sp!, { r2-r5, lr}
     //Load the top row (4 bytes)
     sub r3, r1, r2
     ldr r3, [r3]
-
+
     //Set the luma MB using top line
     str r3, [r0], #4
     str r3, [r0], #4
     str r3, [r0], #4
     str r3, [r0]
-
+
 WELS_ASM_FUNC_END
-
+
 WELS_ASM_FUNC_BEGIN WelsI4x4LumaPredH_neon
     //stmdb sp!, { r2-r5, lr}
     //Load the left column (4 bytes)
     sub r3, r1, #1
     vld1.8 {d0[]}, [r3], r2
-    vld1.8 {d1[]}, [r3], r2
-    vld1.8 {d2[]}, [r3], r2
+    vld1.8 {d1[]}, [r3], r2
+    vld1.8 {d2[]}, [r3], r2
     vld1.8 {d3[]}, [r3]
-
+
     //Set the luma MB using the left side byte
     vst1.32 {d0[0]}, [r0]!
     vst1.32 {d1[0]}, [r0]!
@@ -279,36 +279,36 @@ WELS_ASM_FUNC_BEGIN WelsI4x4LumaPredDDL_neon
     //Load the top row data(8 bytes)
     sub r3, r1, r2
     vld1.32 {d0}, [r3]
-
+
     //For "t7 + (t7<<1)"
     vdup.8 d1, d0[7]
-
+
     //calculate "t0+t1,t1+t2,t2+t3...t6+t7,t7+t7"
     vext.8 d1, d0, d1, #1
     vaddl.u8 q1, d1, d0
-
+
     //calculate "x,t0+t1+t1+t2,t1+t2+t2+t3,...t5+t6+t6+t7,t6+t7+t7+t7"
     vext.8 q2, q1, q1, #14
     vadd.u16 q0, q1, q2
-
+
     //right shift 2 bits and rounding
     vqrshrn.u16 d0, q0, #2
-
+
     //Save "ddl0, ddl1, ddl2, ddl3"
     vext.8 d1, d0, d0, #1
     vst1.32 d1[0], [r0]!
-
+
     //Save "ddl1, ddl2, ddl3, ddl4"
     vext.8 d1, d0, d0, #2
     vst1.32 d1[0], [r0]!
-
+
     //Save "ddl2, ddl3, ddl4, ddl5"
     vext.8 d1, d0, d0, #3
-    vst1.32 d1[0], [r0]!
-
+    vst1.32 d1[0], [r0]!
+
     //Save "ddl3, ddl4, ddl5, ddl6"
-    vst1.32 d0[1], [r0]
-
+    vst1.32 d0[1], [r0]
+
 WELS_ASM_FUNC_END
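
The I16x16 routines above all have closed forms in the H.264 spec; plane mode is the least obvious, so for reference, `WelsI16x16LumaPredPlane_neon` evaluates the standard Intra_16x16 plane predictor (the constant tables hold the {1..8}*5 weights, and the `vadd.s16 q3, q2` in the loop adds `c` once per row). A direct C rendering of that formula, as a sketch:

#include <stdint.h>

/* Intra_16x16 plane prediction (H.264 clause 8.3.3.4). top and left point
 * at the 16 reconstructed neighbors; index -1 (the corner) must also be
 * addressable, since H and V reach it at i == 8. */
static void I16x16PredPlane(uint8_t pred[16][16], const uint8_t* top, const uint8_t* left) {
    int H = 0, V = 0;
    for (int i = 1; i <= 8; i++) {
        H += i * (top[7 + i] - top[7 - i]);
        V += i * (left[7 + i] - left[7 - i]);
    }
    int a = 16 * (top[15] + left[15]);
    int b = (5 * H + 32) >> 6;
    int c = (5 * V + 32) >> 6;
    for (int y = 0; y < 16; y++) {
        for (int x = 0; x < 16; x++) {
            int v = (a + b * (x - 7) + c * (y - 7) + 16) >> 5;
            pred[y][x] = (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v);
        }
    }
}
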
@@ -317,29 +317,29 @@ WELS_ASM_FUNC_BEGIN WelsI4x4LumaPredDDR_neon
     //Load the top row (4 bytes)
     sub r3, r1, r2
     vld1.32 {d0[1]}, [r3]
-
+
     //Load the left column (5 bytes)
     sub r3, #1
     vld1.8 {d0[3]}, [r3], r2
-    vld1.8 {d0[2]}, [r3], r2
+    vld1.8 {d0[2]}, [r3], r2
     vld1.8 {d0[1]}, [r3], r2
-    vld1.8 {d0[0]}, [r3], r2
+    vld1.8 {d0[0]}, [r3], r2
     vld1.8 {d1[7]}, [r3] //For packing the right sequence to do SIMD processing
-
-
+
+
     vext.8 d2, d1, d0, #7 //d0:{L2,L1,L0,LT,T0,T1,T2,T3}
                           //d2:{L3,L2,L1,L0,LT,T0,T1,T2}
-
+
     //q2:{L2+L3,L1+L2,L0+L1...T1+T2,T2+T3}
     vaddl.u8 q2, d2, d0
-
+
     //q1:{TL0+LT0,LT0+T01,...L12+L23}
     vext.8 q3, q3, q2, #14
     vadd.u16 q1, q2, q3
-
+
     //right shift 2 bits and rounding
     vqrshrn.u16 d0, q1, #2
-
+
     //Adjust the data sequence for setting luma MB of 'pred'
     vst1.32 d0[1], [r0]!
     vext.8 d0, d0, d0, #7
@@ -358,19 +358,19 @@ WELS_ASM_FUNC_BEGIN WelsI4x4LumaPredVL_neon
     sub r3, r1, r2
     vld1.32 {d0}, [r3]
-
+
     vext.8 d1, d0, d0, #1
     vaddl.u8 q1, d1, d0 //q1:{t0+t1,t1+t2,t2+t3...t5+t6,x,x}
-
+
     vext.8 q2, q1, q1, #2
     vadd.u16 q2, q1, q2 //q2:{t0+t1+t1+t2,t1+t2+t2+t3,...t4+t5+t5+t6,x,x}
-
+
     //calculate the "vl0,vl1,vl2,vl3,vl4"
     vqrshrn.u16 d0, q1, #1
-
+
     //calculate the "vl5,vl6,vl7,vl8,vl9"
     vqrshrn.u16 d1, q2, #2
-
+
     //Adjust the data sequence for setting the luma MB
     vst1.32 d0[0], [r0]!
     vst1.32 d1[0], [r0]!
@@ -378,7 +378,7 @@ WELS_ASM_FUNC_BEGIN WelsI4x4LumaPredVL_neon
     vext.8 d1, d1, d1, #1
     vst1.32 d0[0], [r0]!
     vst1.32 d1[0], [r0]
-
+
 WELS_ASM_FUNC_END
 
 
@@ -387,34 +387,34 @@ WELS_ASM_FUNC_BEGIN WelsI4x4LumaPredVR_neon
     //Load the top row (4 bytes)
     sub r3, r1, r2
     vld1.32 {d0[1]}, [r3]
-
+
     //Load the left column (4 bytes)
     sub r3, #1
-    vld1.8 {d0[3]}, [r3], r2
+    vld1.8 {d0[3]}, [r3], r2
     vld1.8 {d0[2]}, [r3], r2
-    vld1.8 {d0[1]}, [r3], r2
-    vld1.8 {d0[0]}, [r3]
+    vld1.8 {d0[1]}, [r3], r2
+    vld1.8 {d0[0]}, [r3]
-
+
     vext.8 d1, d0, d0, #7
     vaddl.u8 q1, d0, d1 //q1:{X,L2+L1,L1+L0,L0+LT,LT+T0,T0+T1,T1+T2,T2+T3}
-
+
     vext.u8 q2, q1, q1, #14
     vadd.u16 q2, q2, q1 //q2:{X,L2+L1+L1+L0,L1+L0+L0+LT,...T1+T2+T2+T3}
-
+
     //Calculate the vr0 ~ vr9
     vqrshrn.u16 d1, q2, #2
     vqrshrn.u16 d0, q1, #1
-
+
     //Adjust the data sequence for setting the luma MB
     vst1.32 d0[1], [r0]!
     vst1.32 d1[1], [r0]!
     //add r2, r0, r1
     vst1.8 d1[3], [r0]!
-    vst1.16 d0[2], [r0]!
+    vst1.16 d0[2], [r0]!
     vst1.8 d0[6], [r0]!
     vst1.8 d1[2], [r0]!
-    vst1.16 d1[2], [r0]!
+    vst1.16 d1[2], [r0]!
     vst1.8 d1[6], [r0]
 
 WELS_ASM_FUNC_END
@@ -426,29 +426,29 @@ WELS_ASM_FUNC_BEGIN WelsI4x4LumaPredHU_neon
     mov r1, #3
     mul r1, r2
     add r1, r3
-    vld1.8 {d0[]}, [r1]
-    vld1.8 {d0[4]}, [r3], r2
+    vld1.8 {d0[]}, [r1]
+    vld1.8 {d0[4]}, [r3], r2
     vld1.8 {d0[5]}, [r3], r2
-    vld1.8 {d0[6]}, [r3], r2 //d0:{L3,L3,L3,L3,L0,L1,L2,L3}
+    vld1.8 {d0[6]}, [r3], r2 //d0:{L3,L3,L3,L3,L0,L1,L2,L3}
 
     vext.8 d1, d0, d0, #1
-    vaddl.u8 q2, d0, d1 //q2:{L3+L3,L3+L3,L3+L3,L3+L0,L0+L1,L1+L2,L2+L3,L3+L3}
-
+    vaddl.u8 q2, d0, d1 //q2:{L3+L3,L3+L3,L3+L3,L3+L0,L0+L1,L1+L2,L2+L3,L3+L3}
+
     vext.u8 d2, d5, d4, #2
-    vadd.u16 d3, d2, d5 //d3:{L0+L1+L1+L2,L1+L2+L2+L3,L2+L3+L3+L3,L3+L3+L3+L3}
-
+    vadd.u16 d3, d2, d5 //d3:{L0+L1+L1+L2,L1+L2+L2+L3,L2+L3+L3+L3,L3+L3+L3+L3}
+
     //Calculate the hu0 ~ hu5
     vqrshrn.u16 d2, q2, #1
     vqrshrn.u16 d1, q1, #2
-
+
     //Adjust the data sequence for setting the luma MB
     vzip.8 d2, d1
     vst1.32 d1[0], [r0]!
-    vext.8 d2, d1, d1, #2
+    vext.8 d2, d1, d1, #2
     vst1.32 d2[0], [r0]!
     vst1.32 d1[1], [r0]!
     vst1.32 d0[0], [r0]
-
+
 WELS_ASM_FUNC_END
 
 
@@ -458,22 +458,22 @@ WELS_ASM_FUNC_BEGIN WelsI4x4LumaPredHD_neon
     sub r3, r1, r2
     sub r3, #1
     vld1.32 {d0[1]}, [r3], r2
-    vld1.8 {d0[3]}, [r3], r2
+    vld1.8 {d0[3]}, [r3], r2
     vld1.8 {d0[2]}, [r3], r2
-    vld1.8 {d0[1]}, [r3], r2
+    vld1.8 {d0[1]}, [r3], r2
     vld1.8 {d0[0]}, [r3] //d0:{L3,L2,L1,L0,LT,T0,T1,T2}
 
     vext.8 d1, d0, d0, #7
     vaddl.u8 q1, d0, d1 //q1:{x,L3+L2,L2+L1,L1+L0,L0+LT,LT+T0,T0+T1,T1+T2}
-
+
     vext.u8 q2, q1, q1, #14 //q2:{x,x, L3+L2,L2+L1,L1+L0,L0+LT,LT+T0,T0+T1}
     vadd.u16 q3, q2, q1 //q3:{x,x,L3+L2+L2+L1,L2+L1+L1+L0,L1+L0+L0+LT,L0+LT+LT+T0,LT+T0+T0+T1,T0+T1+T1+T2}
-
+
     //Calculate the hd0~hd9
     vqrshrn.u16 d1, q3, #2
     vqrshrn.u16 d0, q2, #1
-
+
     //Adjust the data sequence for setting the luma MB
     vmov d3, d1
     vtrn.8 d0, d1
@@ -501,25 +501,25 @@ WELS_ASM_FUNC_BEGIN WelsIChromaPredV_neon
     vst1.8 {d0}, [r0]!
     vst1.8 {d0}, [r0]!
     vst1.8 {d0}, [r0]!
-    vst1.8 {d0}, [r0]
-
+    vst1.8 {d0}, [r0]
+
 WELS_ASM_FUNC_END
-
+
 WELS_ASM_FUNC_BEGIN WelsIChromaPredH_neon
     //stmdb sp!, { r2-r5, lr}
     ////Get the left column (8 byte)
     sub r3, r1, #1
     vld1.8 {d0[]}, [r3], r2
-    vld1.8 {d1[]}, [r3], r2
-    vld1.8 {d2[]}, [r3], r2
+    vld1.8 {d1[]}, [r3], r2
+    vld1.8 {d2[]}, [r3], r2
     vld1.8 {d3[]}, [r3], r2
     vld1.8 {d4[]}, [r3], r2
-    vld1.8 {d5[]}, [r3], r2
-    vld1.8 {d6[]}, [r3], r2
+    vld1.8 {d5[]}, [r3], r2
+    vld1.8 {d6[]}, [r3], r2
     vld1.8 {d7[]}, [r3]
-
-    //Set the chroma MB using left column data
+
+    //Set the chroma MB using left column data
     vst1.8 {d0}, [r0]!
     vst1.8 {d1}, [r0]!
     vst1.8 {d2}, [r0]!
@@ -527,8 +527,8 @@ WELS_ASM_FUNC_BEGIN WelsIChromaPredH_neon
     vst1.8 {d4}, [r0]!
     vst1.8 {d5}, [r0]!
     vst1.8 {d6}, [r0]!
-    vst1.8 {d7}, [r0]
-
+    vst1.8 {d7}, [r0]
+
 WELS_ASM_FUNC_END
 
 
@@ -536,36 +536,36 @@ WELS_ASM_FUNC_BEGIN WelsIChromaPredDc_neon
     //stmdb sp!, { r2-r5, lr}
     //Load the left column data (8 bytes)
     sub r3, r1, #1
-    GET_8BYTE_DATA d0, r3, r2
-
+    GET_8BYTE_DATA d0, r3, r2
+
     //Load the top row data (8 bytes)
-    sub r3, r1, r2
+    sub r3, r1, r2
     vldr d1, [r3]
-
+
     //Calculate the sum of left column and top row
     vpaddl.u8 q0, q0
     vpaddl.u16 q0, q0
     vadd.u32 d2, d0, d1 //'m1' save to d2
-
-    vrshr.u32 q0, q0, #2 //calculate 'm2','m3'
-    vrshr.u32 d2, d2, #3 //calculate 'm4'
-
+
+    vrshr.u32 q0, q0, #2 //calculate 'm2','m3'
+    vrshr.u32 d2, d2, #3 //calculate 'm4'
+
     //duplicate the 'mx' to a vector line
     vdup.8 d4, d2[0]
     vdup.8 d5, d1[4]
     vdup.8 d6, d0[4]
     vdup.8 d7, d2[4]
-
-    //Set the chroma MB
+
+    //Set the chroma MB
+    vst2.32 {d4[0],d5[0]}, [r0]!
     vst2.32 {d4[0],d5[0]}, [r0]!
     vst2.32 {d4[0],d5[0]}, [r0]!
-    vst2.32 {d4[0],d5[0]}, [r0]!
     vst2.32 {d4[0],d5[0]}, [r0]!
     vst2.32 {d6[0],d7[0]}, [r0]!
     vst2.32 {d6[0],d7[0]}, [r0]!
     vst2.32 {d6[0],d7[0]}, [r0]!
     vst2.32 {d6[0],d7[0]}, [r0]
-
+
 WELS_ASM_FUNC_END
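
The 'm1'..'m4' comments in `WelsIChromaPredDc_neon` above are the four 4x4 quadrant means of H.264 chroma DC prediction: the two corner quadrants average both their top and left neighbors (rounded shift by 3), the off-diagonal quadrants average only the four neighbors they have (shift by 2). Equivalent scalar code, assuming both neighbor edges are available:

#include <stdint.h>

/* 8x8 chroma DC prediction, one mean per 4x4 quadrant. */
static void IChromaPredDc(uint8_t pred[8][8], const uint8_t* top, const uint8_t* left) {
    int t0 = top[0] + top[1] + top[2] + top[3];
    int t1 = top[4] + top[5] + top[6] + top[7];
    int l0 = left[0] + left[1] + left[2] + left[3];
    int l1 = left[4] + left[5] + left[6] + left[7];
    int dc[2][2] = {
        { (t0 + l0 + 4) >> 3, (t1 + 2) >> 2 },   /* top-left, top-right */
        { (l1 + 2) >> 2, (t1 + l1 + 4) >> 3 },   /* bottom-left, bottom-right */
    };
    for (int y = 0; y < 8; y++)
        for (int x = 0; x < 8; x++)
            pred[y][x] = (uint8_t)dc[y >> 2][x >> 2];
}
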
@@ -579,36 +579,36 @@ WELS_ASM_FUNC_BEGIN WelsIChromaPredPlane_neon
     //Load the top row data
     sub r3, r1, #1
     sub r3, r2
-    vld1.32 {d1[0]}, [r3]
+    vld1.32 {d1[0]}, [r3]
     add r3, #5
     vld1.32 {d0[0]}, [r3]
-
+
     //Load the left column data
     sub r3, #5
     vld1.8 {d1[4]}, [r3], r2
-    vld1.8 {d1[5]}, [r3], r2
+    vld1.8 {d1[5]}, [r3], r2
     vld1.8 {d1[6]}, [r3], r2
-    vld1.8 {d1[7]}, [r3], r2 //d1:{LT,T0,T1,T2,LT,L0,L1,L2}
+    vld1.8 {d1[7]}, [r3], r2 //d1:{LT,T0,T1,T2,LT,L0,L1,L2}
     add r3, r2
     vld1.8 {d0[4]}, [r3], r2
     vld1.8 {d0[5]}, [r3], r2
     vld1.8 {d0[6]}, [r3], r2
     vld1.8 {d0[7]}, [r3] //d0:{T4,T5,T6,T7,L4,L5,L6.L7}
-
-
+
+
     //Save T7 to d3 for next step
     vdup.u8 d3, d0[3]
     //Save L7 to d4 for next step
     vdup.u8 d4, d0[7]
-
+
     //Calculate the value of 'a' and save to q2
     vaddl.u8 q2, d3, d4
     vshl.u16 q2, #4
-
+
     //Load the table {{1,2,3,4,1,2,3,4}*17}
     adr r3, CONST0_GET_I_CHROMA_PRED_PLANE
     vld1.32 {d2}, [r3]
-
+
     //Calculate the 'b','c', and save to q0
     vrev32.8 d1, d1
     vsubl.u8 q0, d0, d1
@@ -617,32 +617,32 @@ WELS_ASM_FUNC_BEGIN WelsIChromaPredPlane_neon
     vpaddl.s16 q0, q0
     vpaddl.s32 q0, q0
     vrshr.s64 q0, #5
-
+
     //Load the table {-3,-2,-1,0,1,2,3,4} to q3
     adr r3, CONST1_GET_I_CHROMA_PRED_PLANE
     vld1.32 {d6, d7}, [r3]
-
+
     //Duplicate the 'b','c' to q0, q1 for SIMD instruction
     vdup.s16 q1, d1[0]
     vdup.s16 q0, d0[0]
-
+
     //Calculate the "(a + b * (j - 3) + c * (- 3) + 16) >> 5;"
     vmla.s16 q2, q0, q3
     vmla.s16 q2, q1, d6[0]
     vqrshrun.s16 d0, q2, #5
-
+
     //Set a line of chroma MB
     vst1.u32 {d0}, [r0]!
-
+
     //Do the same processing for each line.
     mov r3, #7
-loop_0_get_i_chroma_pred_plane:
+loop_0_get_i_chroma_pred_plane:
     vadd.s16 q2, q1
     vqrshrun.s16 d0, q2, #5
     vst1.u32 {d0}, [r0]!
     subs r3, #1
-    bne loop_0_get_i_chroma_pred_plane
-
+    bne loop_0_get_i_chroma_pred_plane
+
 WELS_ASM_FUNC_END
 
 #endif
diff --git a/codec/encoder/core/arm/intra_pred_sad_3_opt_neon.S b/codec/encoder/core/arm/intra_pred_sad_3_opt_neon.S
old mode 100755
new mode 100644
index 8cc9e7ef..6c6b6b8b
--- a/codec/encoder/core/arm/intra_pred_sad_3_opt_neon.S
+++ b/codec/encoder/core/arm/intra_pred_sad_3_opt_neon.S
@@ -29,14 +29,14 @@
 * POSSIBILITY OF SUCH DAMAGE.
 *
 */
-
+
 #ifdef HAVE_NEON
 .text
 #include "arm_arch_common_macro.S"
-
+
 #ifdef APPLE_IOS
-//The data sequence will be used
+//The data sequence will be used
 .macro GET_8BYTE_DATA_L0
     vld1.8 {$0[0]}, [$1], $2
     vld1.8 {$0[1]}, [$1], $2
@@ -49,7 +49,7 @@
 .endm
 
 
-.macro HDM_TRANSFORM_4X4_L0
+.macro HDM_TRANSFORM_4X4_L0
     //Do the vertical transform
     vaddl.u8 q0, $0, $1 //{0,4,8,12,1,5,9,13}
@@ -57,15 +57,15 @@
     vswp d1, d2
     vadd.s16 q2, q0, q1 //{0,1,2,3,4,5,6,7}
     vsub.s16 q1, q0, q1 //{12,13,14,15,8,9,10,11}
-
+
     //Do the horizontal transform
     vtrn.32 q2, q1
     vadd.s16 q0, q2, q1
     vsub.s16 q1, q2, q1
-
+
     vtrn.16 q0, q1
     vadd.s16 q2, q0, q1
-    vsub.s16 q1, q0, q1
+    vsub.s16 q1, q0, q1
 
     vmov.s16 d0, d4
     vmov.s16 d1, d2
@@ -76,9 +76,9 @@
     vtrn.32 d0, d1 //{0,1,3,2}
     vaba.s16 $5, d0, $2 //16x16_v
     vaba.s16 $5, d1, $8
-    vaba.s16 $5, d5, $8
+    vaba.s16 $5, d5, $8
     vadd.u16 $5, d3
-
+
     //16x16_h
     vtrn.16 d4, d5 //{0,4,12,8}
     vaba.s16 $6, d4, $3 //16x16_h
@@ -87,7 +87,7 @@
     vadd.u16 d2, d3
     vadd.u16 d2, d5
     vadd.u16 $6, d2
-
+
     //16x16_dc_both
     vaba.s16 $7, d4, $4 //16x16_dc_both
     vadd.u16 $7, d2
@@ -95,7 +95,7 @@
 .endm
 #else
-//The data sequence will be used
+//The data sequence will be used
 .macro GET_8BYTE_DATA_L0 arg0, arg1, arg2
     vld1.8 {\arg0[0]}, [\arg1], \arg2
     vld1.8 {\arg0[1]}, [\arg1], \arg2
@@ -115,15 +115,15 @@
     vswp d1, d2
     vadd.s16 q2, q0, q1 //{0,1,2,3,4,5,6,7}
     vsub.s16 q1, q0, q1 //{12,13,14,15,8,9,10,11}
-
+
     //Do the horizontal transform
     vtrn.32 q2, q1
     vadd.s16 q0, q2, q1
     vsub.s16 q1, q2, q1
-
+
     vtrn.16 q0, q1
     vadd.s16 q2, q0, q1
-    vsub.s16 q1, q0, q1
+    vsub.s16 q1, q0, q1
 
     vmov.s16 d0, d4
     vmov.s16 d1, d2
@@ -134,9 +134,9 @@
     vtrn.32 d0, d1 //{0,1,3,2}
     vaba.s16 \arg5, d0, \arg2 //16x16_v
     vaba.s16 \arg5, d1, \arg8
-    vaba.s16 \arg5, d5, \arg8
+    vaba.s16 \arg5, d5, \arg8
     vadd.u16 \arg5, d3
-
+
     //16x16_h
     vtrn.16 d4, d5 //{0,4,12,8}
     vaba.s16 \arg6, d4, \arg3 //16x16_h
@@ -145,42 +145,42 @@
     vadd.u16 d2, d3
     vadd.u16 d2, d5
     vadd.u16 \arg6, d2
-
+
     //16x16_dc_both
     vaba.s16 \arg7, d4, \arg4 //16x16_dc_both
     vadd.u16 \arg7, d2
 .endm
 #endif
 
-WELS_ASM_FUNC_BEGIN satd_intra_16x16_x3_opt_neon
+WELS_ASM_FUNC_BEGIN WelsIntra16x16Combined3Satd_neon
     stmdb sp!, {r4-r7, lr}
 
     //Get the top line data to 'q15'(16 bytes)
     sub r7, r0, r1
     vld1.8 {q15}, [r7]
-
+
     //Get the left colume data to 'q14' (16 bytes)
     sub r7, r0, #1
     GET_8BYTE_DATA_L0 d28, r7, r1
-    GET_8BYTE_DATA_L0 d29, r7, r1
-
+    GET_8BYTE_DATA_L0 d29, r7, r1
+
     //Calculate the mean value and save to 'q13->d27(reserve the d26)' (2 bytes)
     //Calculate the 16x16_dc_both mode SATD
     vaddl.u8 q0, d30, d31
     vaddl.u8 q1, d28, d29
     vadd.u16 q0, q1
     vadd.u16 d0, d1
     vpaddl.u16 d0, d0
     vpaddl.u32 d0, d0
-
-    //Calculate the mean value
+
+    //Calculate the mean value
     vrshr.u16 d0, #5
-    vshl.u16 d27, d0, #4
-
-
+    vshl.u16 d27, d0, #4
+
+
     //Calculate the 16x16_v mode SATD and save to "q11, 12"
     vshll.u8 q0, d30, #2
-    vshll.u8 q1, d31, #2
+    vshll.u8 q1, d31, #2
     vtrn.32 q0, q1
     vadd.s16 q2, q0, q1
     vsub.s16 q1, q0, q1
@@ -191,7 +191,7 @@ WELS_ASM_FUNC_BEGIN satd_intra_16x16_x3_opt_neon
     //{8,9,11,10, 12,13,15,14} q11
     //Calculate the 16x16_h mode SATD and save to "q9, q10"
     vshll.u8 q0, d28, #2
-    vshll.u8 q1, d29, #2
+    vshll.u8 q1, d29, #2
     vtrn.32 q0, q1
     vadd.s16 q2, q0, q1
     vsub.s16 q1, q0, q1
@@ -199,64 +199,64 @@ WELS_ASM_FUNC_BEGIN satd_intra_16x16_x3_opt_neon
     vadd.s16 q10, q2, q1
     vsub.s16 q9, q2, q1
     vtrn.32 q10, q9 //{0,1,3,2, 4,5,7,6} q10
-    //{8,9,11,10, 12,13,15,14} q9
-
+    //{8,9,11,10, 12,13,15,14} q9
+
     vmov.i32 d17, #0//Save the SATD of DC_BOTH
     vmov.i32 d16, #0//Save the SATD of H
     vmov.i32 d15, #0//Save the SATD of V
     vmov.i32 d14, #0//For zero D register
 
-    //Load the p_enc data and save to "q3 ~ q6"--- 16X4 bytes
+    //Load the p_enc data and save to "q3 ~ q6"--- 16X4 bytes
     vld1.32 {q3}, [r2], r3
     vld1.32 {q4}, [r2], r3
     vld1.32 {q5}, [r2], r3
-    vld1.32 {q6}, [r2], r3
+    vld1.32 {q6}, [r2], r3
     vtrn.32 q3, q4
-    vtrn.32 q5, q6
-
-    HDM_TRANSFORM_4X4_L0 d6, d10, d24, d20, d27, d15, d16, d17, d14
+    vtrn.32 q5, q6
+
+    HDM_TRANSFORM_4X4_L0 d6, d10, d24, d20, d27, d15, d16, d17, d14
     HDM_TRANSFORM_4X4_L0 d7, d11, d22, d20, d27, d15, d16, d17, d14
     HDM_TRANSFORM_4X4_L0 d8, d12, d25, d20, d27, d15, d16, d17, d14
-    HDM_TRANSFORM_4X4_L0 d9, d13, d23, d20, d27, d15, d16, d17, d14
+    HDM_TRANSFORM_4X4_L0 d9, d13, d23, d20, d27, d15, d16, d17, d14
 
-    //Load the p_enc data and save to "q3 ~ q6"--- 16X4 bytes
+    //Load the p_enc data and save to "q3 ~ q6"--- 16X4 bytes
     vld1.32 {q3}, [r2], r3
     vld1.32 {q4}, [r2], r3
     vld1.32 {q5}, [r2], r3
-    vld1.32 {q6}, [r2], r3
+    vld1.32 {q6}, [r2], r3
     vtrn.32 q3, q4
-    vtrn.32 q5, q6
-
-    HDM_TRANSFORM_4X4_L0 d6, d10, d24, d21, d27, d15, d16, d17, d14
+    vtrn.32 q5, q6
+
+    HDM_TRANSFORM_4X4_L0 d6, d10, d24, d21, d27, d15, d16, d17, d14
     HDM_TRANSFORM_4X4_L0 d7, d11, d22, d21, d27, d15, d16, d17, d14
     HDM_TRANSFORM_4X4_L0 d8, d12, d25, d21, d27, d15, d16, d17, d14
-    HDM_TRANSFORM_4X4_L0 d9, d13, d23, d21, d27, d15, d16, d17, d14
-
-    //Load the p_enc data and save to "q3 ~ q6"--- 16X4 bytes
+    HDM_TRANSFORM_4X4_L0 d9, d13, d23, d21, d27, d15, d16, d17, d14
+
+    //Load the p_enc data and save to "q3 ~ q6"--- 16X4 bytes
     vld1.32 {q3}, [r2], r3
     vld1.32 {q4}, [r2], r3
     vld1.32 {q5}, [r2], r3
-    vld1.32 {q6}, [r2], r3
+    vld1.32 {q6}, [r2], r3
     vtrn.32 q3, q4
-    vtrn.32 q5, q6
-
-    HDM_TRANSFORM_4X4_L0 d6, d10, d24, d18, d27, d15, d16, d17, d14
+    vtrn.32 q5, q6
+
+    HDM_TRANSFORM_4X4_L0 d6, d10, d24, d18, d27, d15, d16, d17, d14
     HDM_TRANSFORM_4X4_L0 d7, d11, d22, d18, d27, d15, d16, d17, d14
     HDM_TRANSFORM_4X4_L0 d8, d12, d25, d18, d27, d15, d16, d17, d14
-    HDM_TRANSFORM_4X4_L0 d9, d13, d23, d18, d27, d15, d16, d17, d14
-
-    //Load the p_enc data and save to "q3 ~ q6"--- 16X4 bytes
+    HDM_TRANSFORM_4X4_L0 d9, d13, d23, d18, d27, d15, d16, d17, d14
+
+    //Load the p_enc data and save to "q3 ~ q6"--- 16X4 bytes
     vld1.32 {q3}, [r2], r3
     vld1.32 {q4}, [r2], r3
     vld1.32 {q5}, [r2], r3
-    vld1.32 {q6}, [r2], r3
+    vld1.32 {q6}, [r2], r3
     vtrn.32 q3, q4
-    vtrn.32 q5, q6
-
-    HDM_TRANSFORM_4X4_L0 d6, d10, d24, d19, d27, d15, d16, d17, d14
+    vtrn.32 q5, q6
+
+    HDM_TRANSFORM_4X4_L0 d6, d10, d24, d19, d27, d15, d16, d17, d14
     HDM_TRANSFORM_4X4_L0 d7, d11, d22, d19, d27, d15, d16, d17, d14
     HDM_TRANSFORM_4X4_L0 d8, d12, d25, d19, d27, d15, d16, d17, d14
-    HDM_TRANSFORM_4X4_L0 d9, d13, d23, d19, d27, d15, d16, d17, d14
-
+    HDM_TRANSFORM_4X4_L0 d9, d13, d23, d19, d27, d15, d16, d17, d14
+
     //Get the data from stack
     ldr r5, [sp, #20] //the addr of Best_mode
     ldr r6, [sp, #24] //the value of i_lambda
 
     //vadd.u16 d24, d25
     vrshr.u16 d15, #1
     vpaddl.u16 d15, d15
     vpaddl.u32 d15, d15
     vmov.u32 r0, d15[0]
-
+
     //vadd.u16 d22, d23
     vrshr.u16 d16, #1
     vpaddl.u16 d16, d16
     vpaddl.u32 d16, d16
-    vmov.u32 r1, d16[0]
+    vmov.u32 r1, d16[0]
     add r1, r6, lsl #1
-
+
     //vadd.u16 d20, d21
     vrshr.u16 d17, #1
     vpaddl.u16 d17, d17
     vpaddl.u32 d17, d17
-    vmov.u32 r2, d17[0]
+    vmov.u32 r2, d17[0]
     add r2, r6, lsl #1
 
     mov r4, #0
@@ -295,60 +295,60 @@ WELS_ASM_FUNC_BEGIN satd_intra_16x16_x3_opt_neon
 
 WELS_ASM_FUNC_END
 
-WELS_ASM_FUNC_BEGIN sad_intra_16x16_x3_opt_neon
+WELS_ASM_FUNC_BEGIN WelsIntra16x16Combined3Sad_neon
     stmdb sp!, {r4-r7, lr}
-
+
     //Get the top line data to 'q15'(16 bytes)
     sub r4, r0, r1
     vld1.8 {q15}, [r4]
-
+
     //Get the left colume data to 'q14' (16 bytes)
     sub r4, r0, #1
     GET_8BYTE_DATA_L0 d28, r4, r1
-    GET_8BYTE_DATA_L0 d29, r4, r1
-
+    GET_8BYTE_DATA_L0 d29, r4, r1
+
     //Calculate the mean value and save to 'q13' (8 bytes)
-    //Calculate the 16x16_dc_both mode SATD
+    //Calculate the 16x16_dc_both mode SATD
     vaddl.u8 q0, d30, d31
     vaddl.u8 q1, d28, d29
     vadd.u16 q0, q1
     vadd.u16 d0, d1
     vpaddl.u16 d0, d0
     vpaddl.u32 d0, d0
-
-    //Calculate the mean value
+
+    //Calculate the mean value
     vrshr.u16 d0, d0, #5
     vdup.8 q13, d0[0]
-
+
     sub r4, r0, #1
-
+
     vmov.i32 q12, #0//Save the SATD of DC_BOTH
     vmov.i32 q11, #0//Save the SATD of H
     vmov.i32 q10, #0//Save the SATD of V
-
+
     mov lr, #16
sad_intra_16x16_x3_opt_loop0:
     //Get the left colume data to 'd0' (16 bytes)
-    vld1.8 {d0[]}, [r4], r1
+    vld1.8 {d0[]}, [r4], r1
 
-    //Load the p_enc data and save to "q1 ~ q2"--- 16X4 bytes
+    //Load the p_enc data and save to "q1 ~ q2"--- 16X4 bytes
     vld1.8 {q1}, [r2], r3
-
+
     subs lr, #1
 
     //Do the SAD for top colume
     vabal.u8 q12, d30, d2
-    vabal.u8 q12, d31, d3
+    vabal.u8 q12, d31, d3
 
     //Do the SAD for left colume
     vabal.u8 q11, d0, d2
-    vabal.u8 q11, d0, d3
+    vabal.u8 q11, d0, d3
 
     //Do the SAD for mean value
     vabal.u8 q10, d26, d2
-    vabal.u8 q10, d26, d3
-
+    vabal.u8 q10, d26, d3
+
     bne sad_intra_16x16_x3_opt_loop0
-
+
     //Get the data from stack
     ldr r5, [sp, #20] //the addr of Best_mode
     ldr r6, [sp, #24] //the value of i_lambda
@@ -357,19 +357,19 @@ sad_intra_16x16_x3_opt_loop0:
     vpaddl.u16 d24, d24
     vpaddl.u32 d24, d24
     vmov.u32 r0, d24[0]
-
+
     vadd.u16 d22, d23
     vpaddl.u16 d22, d22
     vpaddl.u32 d22, d22
-    vmov.u32 r1, d22[0]
+    vmov.u32 r1, d22[0]
     add r1, r6, lsl #1
-
+
     vadd.u16 d20, d21
     vpaddl.u16 d20, d20
     vpaddl.u32 d20, d20
-    vmov.u32 r2, d20[0]
+    vmov.u32 r2, d20[0]
     add r2, r6, lsl #1
-
+
     mov r4, #0
     cmp r1, r0
     movcc r0, r1
@@ -384,120 +384,120 @@ sad_intra_16x16_x3_opt_loop0:
 
 WELS_ASM_FUNC_END
 
-WELS_ASM_FUNC_BEGIN sad_intra_8x8_x3_opt_neon
+WELS_ASM_FUNC_BEGIN WelsIntra8x8Combined3Sad_neon
     stmdb sp!, {r4-r7, lr}
-
+
     //Get the data from stack
     ldr r4, [sp, #32] //p_dec_cr
     ldr r5, [sp, #36] //p_enc_cr
-
+
     //Get the left colume data to 'd28(cb), d30(cr)' (16 bytes)
     sub r6, r0, #1
     GET_8BYTE_DATA_L0 d28, r6, r1
-    sub r6, r4, #1
-    GET_8BYTE_DATA_L0 d30, r6, r1
-
+    sub r6, r4, #1
+    GET_8BYTE_DATA_L0 d30, r6, r1
+
     //Get the top line data to 'd29(cb), d31(cr)'(16 bytes)
     sub r6, r0, r1
     vld1.8 {d29}, [r6]
     sub r6, r4, r1
     vld1.8 {d31}, [r6]
-
+
     //Calculate the sum of left column and top row
     vmov.i32 q0, q14
     vpaddl.u8 q0, q0
     vpaddl.u16 q0, q0
     vadd.u32 d2, d0, d1 //'m1' save to d2
-    vrshr.u32 q0, q0, #2 //calculate 'm2','m3'
-    vrshr.u32 d2, d2, #3 //calculate 'm4'
-
-    //duplicate the 'mx' to a vector line
+    vrshr.u32 q0, q0, #2 //calculate 'm2','m3'
+    vrshr.u32 d2, d2, #3 //calculate 'm4'
+
+    //duplicate the 'mx' to a vector line
     vdup.8 d27, d2[0]
     vdup.8 d26, d1[4]
     vtrn.32 d27, d26
-
+
     vdup.8 d26, d0[4]
     vdup.8 d25, d2[4]
     vtrn.32 d26, d25 //Save to "d27, d26"
-
+
     vmov.i32 q0, q15
     vpaddl.u8 q0, q0
     vpaddl.u16 q0, q0
     vadd.u32 d2, d0, d1 //'m1' save to d2
-    vrshr.u32 q0, q0, #2 //calculate 'm2','m3'
-    vrshr.u32 d2, d2, #3 //calculate 'm4'
-
+    vrshr.u32 q0, q0, #2 //calculate 'm2','m3'
+    vrshr.u32 d2, d2, #3 //calculate 'm4'
+
     //duplicate the 'mx' to a vector line
     vdup.8 d25, d2[0]
     vdup.8 d24, d1[4]
     vtrn.32 d25, d24
-
+
     vdup.8 d24, d0[4]
     vdup.8 d23, d2[4]
     vtrn.32 d24, d23 //Save to "d25, d24"
-
+
     vmov.i32 q11, #0//Save the SATD of DC_BOTH
     vmov.i32 q10, #0//Save the SATD of H
     vmov.i32 q9 , #0//Save the SATD of V
 
     sub r6, r0, #1
-    sub r7, r4, #1
+    sub r7, r4, #1
     mov lr, #4
sad_intra_8x8_x3_opt_loop0:
 
-    //Load the p_enc data and save to "q1 ~ q2"--- 16X4 bytes
+    //Load the p_enc data and save to "q1 ~ q2"--- 16X4 bytes
     vld1.8 {d0}, [r2], r3
     vld1.8 {d1}, [r5], r3
-
+
     //Get the left colume data to 'd0' (16 bytes)
-    vld1.8 {d2[]}, [r6], r1
-    vld1.8 {d3[]}, [r7], r1
-
+    vld1.8 {d2[]}, [r6], r1
+    vld1.8 {d3[]}, [r7], r1
+
     subs lr, #1
-
+
     //Do the SAD for top colume
-    vabal.u8 q11, d29, d0
-    vabal.u8 q11, d31, d1
+    vabal.u8 q11, d29, d0
+    vabal.u8 q11, d31, d1
 
     //Do the SAD for left colume
     vabal.u8 q10, d2, d0
-    vabal.u8 q10, d3, d1
+    vabal.u8 q10, d3, d1
 
     //Do the SAD for mean value
     vabal.u8 q9, d27, d0
-    vabal.u8 q9, d25, d1
-
-
+    vabal.u8 q9, d25, d1
+
+
     bne sad_intra_8x8_x3_opt_loop0
 
     mov lr, #4
sad_intra_8x8_x3_opt_loop1:
 
-    //Load the p_enc data and save to "q1 ~ q2"--- 16X4 bytes
+    //Load the p_enc data and save to "q1 ~ q2"--- 16X4 bytes
     vld1.8 {d0}, [r2], r3
     vld1.8 {d1}, [r5], r3
-
+
     //Get the left colume data to 'd0' (16 bytes)
-    vld1.8 {d2[]}, [r6], r1
-    vld1.8 {d3[]}, [r7], r1
-
+    vld1.8 {d2[]}, [r6], r1
+    vld1.8 {d3[]}, [r7], r1
+
     subs lr, #1
-
+
     //Do the SAD for top colume
-    vabal.u8 q11, d29, d0
-    vabal.u8 q11, d31, d1
+    vabal.u8 q11, d29, d0
+    vabal.u8 q11, d31, d1
 
     //Do the SAD for left colume
     vabal.u8 q10, d2, d0
-    vabal.u8 q10, d3, d1
+    vabal.u8 q10, d3, d1
 
     //Do the SAD for mean value
     vabal.u8 q9, d26, d0
-    vabal.u8 q9, d24, d1
-
-
-    bne sad_intra_8x8_x3_opt_loop1
+    vabal.u8 q9, d24, d1
+
+
+    bne sad_intra_8x8_x3_opt_loop1
 
     //Get the data from stack
     ldr r5, [sp, #20] //the addr of Best_mode
     ldr r6, [sp, #24] //the value of i_lambda
@@ -505,13 +505,13 @@ sad_intra_8x8_x3_opt_loop1:
     vadd.u16 d22, d23
     vpaddl.u16 d22, d22
     vpaddl.u32 d22, d22
-    vmov.u32 r0, d22[0]
+    vmov.u32 r0, d22[0]
     add r0, r6, lsl #1
-
+
     vadd.u16 d20, d21
     vpaddl.u16 d20, d20
     vpaddl.u32 d20, d20
-    vmov.u32 r1, d20[0]
+    vmov.u32 r1, d20[0]
     add r1, r6, lsl #1
 
     vadd.u16 d18, d19
@@ -533,28 +533,28 @@ sad_intra_8x8_x3_opt_loop1:
 
 WELS_ASM_FUNC_END
 
-WELS_ASM_FUNC_BEGIN satd_intra_8x8_x3_opt_neon
+WELS_ASM_FUNC_BEGIN WelsIntra8x8Combined3Satd_neon
     stmdb sp!, {r4-r7, lr}
-
+
     //Get the data from stack
     ldr r4, [sp, #32] //p_dec_cr
     ldr r5, [sp, #36] //p_enc_cr
-
+
     //Get the top line data to 'd29(cb), d31(cr)'(16 bytes)
     sub r6, r0, r1
     vld1.8 {d29}, [r6]
     sub r6, r4, r1
     vld1.8 {d31}, [r6]
-
+
     //Get the left colume data to 'd28(cb), d30(cr)' (16 bytes)
     sub r6, r0, #1
     GET_8BYTE_DATA_L0 d28, r6, r1
-    sub r6, r4, #1
-    GET_8BYTE_DATA_L0 d30, r6, r1
-
+    sub r6, r4, #1
+    GET_8BYTE_DATA_L0 d30, r6, r1
+
     //Calculate the 16x16_v mode SATD and save to "q12, 13"
     vshll.u8 q0, d29, #2
-    vshll.u8 q1, d31, #2
+    vshll.u8 q1, d31, #2
     vtrn.32 q0, q1
     vadd.s16 q2, q0, q1
     vsub.s16 q1, q0, q1
@@ -565,7 +565,7 @@ WELS_ASM_FUNC_BEGIN satd_intra_8x8_x3_opt_neon
     //{8,9,11,10, 12,13,15,14} q12
     //Calculate the 16x16_h mode SATD and save to "q10, q11"
     vshll.u8 q0, d28, #2
-    vshll.u8 q1, d30, #2
+    vshll.u8 q1, d30, #2
     vtrn.32 q0, q1
     vadd.s16 q2, q0, q1
     vsub.s16 q1, q0, q1
@@ -573,69 +573,69 @@ WELS_ASM_FUNC_BEGIN satd_intra_8x8_x3_opt_neon
     vadd.s16 q11, q2, q1
     vsub.s16 q10, q2, q1
     vtrn.32 q11, q10 //{0,1,3,2, 4,5,7,6} q11
-    //{8,9,11,10, 12,13,15,14} q10
-
+    //{8,9,11,10, 12,13,15,14} q10
+
     //Calculate the sum of left column and top row
     //vmov.i32 q0, q14
     vpaddl.u8 q0, q14
     vpaddl.u16 q0, q0
-    vadd.u32 d2, d0, d1
+    vadd.u32 d2, d0, d1
     vpaddl.u8 q2, q15
     vpaddl.u16 q2, q2
-    vadd.u32 d3, d4, d5
-
+    vadd.u32 d3, d4, d5
+
     vtrn.32 q0, q2
     vrshr.u32 q1, #3
-    vrshr.u32 q2, #2
+    vrshr.u32 q2, #2
     vshll.u32 q9, d4, #4 // {2cb, 2cr} q9
     vshll.u32 q8, d5, #4 // {1cb, 1cr} q8
     vshll.u32 q7, d2, #4 // {0cb, 3cb} q7
     vshll.u32 q6, d3, #4 // {0cr, 3cr} q6
-
-
+
+
     vmov.i32 d28, #0//Save the SATD of DC_BOTH
     vmov.i32 d10, #0//Save the SATD of H
     vmov.i32 d11, #0//Save the SATD of V
     vmov.i32 d30, #0//For zero D register
 
-    //Load the p_enc data and save to "q3 ~ q6"--- 16X4 bytes
+    //Load the p_enc data and save to "q3 ~ q6"--- 16X4 bytes
     vld1.32 {d6}, [r2], r3
     vld1.32 {d7}, [r2], r3
     vld1.32 {d8}, [r2], r3
-    vld1.32 {d9}, [r2], r3
+    vld1.32 {d9}, [r2], r3
     vtrn.32 d6, d7
-    vtrn.32 d8, d9
+    vtrn.32 d8, d9
     HDM_TRANSFORM_4X4_L0 d6, d8, d26, d22, d14, d11, d10, d28, d30
     HDM_TRANSFORM_4X4_L0 d7, d9, d27, d22, d16, d11, d10, d28, d30
-
+
     vld1.32 {d6}, [r5], r3
     vld1.32 {d7}, [r5], r3
     vld1.32 {d8}, [r5], r3
-    vld1.32 {d9}, [r5], r3
+    vld1.32 {d9}, [r5], r3
     vtrn.32 d6, d7
-    vtrn.32 d8, d9
+    vtrn.32 d8, d9
     HDM_TRANSFORM_4X4_L0 d6, d8, d24, d20, d12, d11, d10, d28, d30
-    HDM_TRANSFORM_4X4_L0 d7, d9, d25, d20, d17, d11, d10, d28, d30
+    HDM_TRANSFORM_4X4_L0 d7, d9, d25, d20, d17, d11, d10, d28, d30
 
-    //Load the p_enc data and save to "q3 ~ q6"--- 16X4 bytes
+    //Load the p_enc data and save to "q3 ~ q6"--- 16X4 bytes
     vld1.32 {d6}, [r2], r3
     vld1.32 {d7}, [r2], r3
     vld1.32 {d8}, [r2], r3
-    vld1.32 {d9}, [r2], r3
+    vld1.32 {d9}, [r2], r3
     vtrn.32 d6, d7
-    vtrn.32 d8, d9
-    HDM_TRANSFORM_4X4_L0 d6, d8, d26, d23, d18, d11, d10, d28, d30
+    vtrn.32 d8, d9
+    HDM_TRANSFORM_4X4_L0 d6, d8, d26, d23, d18, d11, d10, d28, d30
     HDM_TRANSFORM_4X4_L0 d7, d9, d27, d23, d15, d11, d10, d28, d30
-
+
     vld1.32 {d6}, [r5], r3
     vld1.32 {d7}, [r5], r3
     vld1.32 {d8}, [r5], r3
-    vld1.32 {d9}, [r5], r3
+    vld1.32 {d9}, [r5], r3
     vtrn.32 d6, d7
-    vtrn.32 d8, d9
+    vtrn.32 d8, d9
     HDM_TRANSFORM_4X4_L0 d6, d8, d24, d21, d19, d11, d10, d28, d30
-    HDM_TRANSFORM_4X4_L0 d7, d9, d25, d21, d13, d11, d10, d28, d30
-
+    HDM_TRANSFORM_4X4_L0 d7, d9, d25, d21, d13, d11, d10, d28, d30
+
     //Get the data from stack
     ldr r5, [sp, #20] //the addr of Best_mode
     ldr r6, [sp, #24] //the value of i_lambda
@@ -643,13 +643,13 @@ WELS_ASM_FUNC_BEGIN satd_intra_8x8_x3_opt_neon
     vrshr.u16 d11, #1
     vpaddl.u16 d11, d11
     vpaddl.u32 d11, d11
-    vmov.u32 lr, d11[0]
+    vmov.u32 lr, d11[0]
     add lr, r6, lsl #1
-
+
     vrshr.u16 d10, #1
     vpaddl.u16 d10, d10
     vpaddl.u32 d10, d10
-    vmov.u32 r3, d10[0]
+    vmov.u32 r3, d10[0]
     add r3, r6, lsl #1
 
     vrshr.u16 d28, #1
@@ -672,31 +672,31 @@ WELS_ASM_FUNC_BEGIN satd_intra_8x8_x3_opt_neon
 
 WELS_ASM_FUNC_END
 
-WELS_ASM_FUNC_BEGIN satd_intra_4x4_x3_opt_neon
+WELS_ASM_FUNC_BEGIN WelsIntra4x4Combined3Satd_neon
     stmdb sp!, {r4-r7, lr}
 
     //Get the top line data to 'd31[0~3]'(4 bytes)
     sub r7, r0, r1
     vld1.32 {d31[0]}, [r7]
-
+
     //Get the left colume data to 'd31[4~7]' (4 bytes)
     sub r7, r0, #1
     vld1.8 {d31[4]}, [r7], r1
     vld1.8 {d31[5]}, [r7], r1
     vld1.8 {d31[6]}, [r7], r1
     vld1.8 {d31[7]}, [r7], r1
-
+
     //Calculate the mean value and save to 'd30' (2 bytes)
     vpaddl.u8 d0, d31
     vpaddl.u16 d0, d0
-    vpaddl.u32 d0, d0
+    vpaddl.u32 d0, d0
+    //Calculate the mean value
     vrshr.u16 d0, #3
-    vshl.u16 d30, d0, #4
-
+    vshl.u16 d30, d0, #4
+
     //Calculate the 16x16_v mode SATD and save to "d29"
-    //Calculate the 16x16_h mode SATD and save to "d28"
-    vshll.u8 q0, d31, #2
+    //Calculate the 16x16_h mode SATD and save to "d28"
+    vshll.u8 q0, d31, #2
     vtrn.32 d0, d1
     vadd.s16 d2, d0, d1
     vsub.s16 d1, d0, d1
@@ -710,12 +710,12 @@ WELS_ASM_FUNC_BEGIN satd_intra_4x4_x3_opt_neon
     vmov.i32 d26, #0//Save the SATD of H
     vmov.i32 d25, #0//Save the SATD of V
     vmov.i32 d24, #0//For zero D register
-
-    //Load the p_enc data and save to "d22,d23"--- 4X4 bytes
+
+    //Load the p_enc data and save to "d22,d23"--- 4X4 bytes
     vld1.32 {d23[0]}, [r2], r3
     vld1.32 {d23[1]}, [r2], r3
     vld1.32 {d22[0]}, [r2], r3
-    vld1.32 {d22[1]}, [r2], r3
+    vld1.32 {d22[1]}, [r2], r3
 
     HDM_TRANSFORM_4X4_L0 d23, d22, d29, d28, d30, d25, d26, d27, d24
 
@@ -723,17 +723,17 @@ WELS_ASM_FUNC_BEGIN satd_intra_4x4_x3_opt_neon
     ldr r5, [sp, #28] //the value of lambda2
     ldr r6, [sp, #32] //the value of lambda1
     ldr r7, [sp, #36] //the value of lambda0
-
+
     vrshr.u16 d25, #1
     vpaddl.u16 d25, d25
     vpaddl.u32 d25, d25
-    vmov.u32 r0, d25[0]
+    vmov.u32 r0, d25[0]
     add r0, r7
-
+
     vrshr.u16 d26, #1
     vpaddl.u16 d26, d26
     vpaddl.u32 d26, d26
-    vmov.u32 r1, d26[0]
+    vmov.u32 r1, d26[0]
     add r1, r6
 
     vrshr.u16 d27, #1
@@ -741,10 +741,10 @@ WELS_ASM_FUNC_BEGIN satd_intra_4x4_x3_opt_neon
     vpaddl.u32 d27, d27
     vmov.u32 r2, d27[0]
     add r2, r5
-
+
     ldr r5, [sp, #20] //p_dst
-    ldr r6, [sp, #24] //the addr of Best_mode
-
+    ldr r6, [sp, #24] //the addr of Best_mode
+
     mov r4, r0
     cmp r1, r4
     movcc r4, r1
@@ -770,8 +770,8 @@ satd_intra_4x4_x3_opt_jump0:
     vdup.8 d0, d31[4]
     vdup.8 d1, d31[5]
     vdup.8 d2, d31[6]
-    vdup.8 d3, d31[7]
-    vst4.32 {d0[0],d1[0],d2[0],d3[0]}, [r5]
+    vdup.8 d3, d31[7]
+    vst4.32 {d0[0],d1[0],d2[0],d3[0]}, [r5]
     bl satd_intra_4x4_x3_opt_end
 
satd_intra_4x4_x3_opt_jump1:
@@ -783,11 +783,11 @@ satd_intra_4x4_x3_opt_jump1:
     vst1.32 {d31[0]}, [r5]!
     vst1.32 {d31[0]}, [r5]!
-
+
satd_intra_4x4_x3_opt_end:
-    mov r0, r4
-
+    mov r0, r4
+
     ldmia sp!, {r4-r7, lr}
 
 WELS_ASM_FUNC_END
-#endif
\ No newline at end of file
+#endif
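
The `WelsIntra*Combined3Satd_neon` routines renamed above rate three intra modes at once by Hadamard-transforming the source block and comparing it against each flat predictor in the transform domain (`HDM_TRANSFORM_4X4_L0`), then adding the lambda bias before the `cmp`/`movcc` selection. The cost being approximated is ordinary 4x4 SATD; a direct, unoptimized sketch for reference:

#include <stdint.h>
#include <stdlib.h>

/* 4x4 SATD: sum of absolute 2-D Hadamard coefficients of the residual. */
static int Satd4x4(const uint8_t* src, int ss, const uint8_t* ref, int rs) {
    int d[16], m[16], sum = 0;
    for (int y = 0; y < 4; y++)
        for (int x = 0; x < 4; x++)
            d[4 * y + x] = src[y * ss + x] - ref[y * rs + x];
    for (int y = 0; y < 4; y++) {   /* horizontal 4-point Hadamard */
        int a = d[4*y+0] + d[4*y+3], b = d[4*y+1] + d[4*y+2];
        int c = d[4*y+0] - d[4*y+3], e = d[4*y+1] - d[4*y+2];
        m[4*y+0] = a + b; m[4*y+1] = c + e; m[4*y+2] = a - b; m[4*y+3] = c - e;
    }
    for (int x = 0; x < 4; x++) {   /* vertical pass plus accumulation */
        int a = m[x] + m[12+x], b = m[4+x] + m[8+x];
        int c = m[x] - m[12+x], e = m[4+x] - m[8+x];
        sum += abs(a + b) + abs(c + e) + abs(a - b) + abs(c - e);
    }
    return sum >> 1;                /* usual SATD normalization */
}
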
diff --git a/codec/encoder/core/arm/mc_neon.S b/codec/encoder/core/arm/mc_neon.S
old mode 100755
new mode 100644
index c81940d8..0c1b5d25
--- a/codec/encoder/core/arm/mc_neon.S
+++ b/codec/encoder/core/arm/mc_neon.S
@@ -1,1963 +1,1963 @@
-/*!
- * \copy
- *     Copyright (c)  2013, Cisco Systems
- *     All rights reserved.
- *
- *     Redistribution and use in source and binary forms, with or without
- *     modification, are permitted provided that the following conditions
- *     are met:
- *
- *        * Redistributions of source code must retain the above copyright
- *          notice, this list of conditions and the following disclaimer.
- *
- *        * Redistributions in binary form must reproduce the above copyright
- *          notice, this list of conditions and the following disclaimer in
- *          the documentation and/or other materials provided with the
- *          distribution.
- *
- *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- *     POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#ifdef HAVE_NEON
-.text
-#include "arm_arch_common_macro.S"
-
-#ifdef APPLE_IOS
-.macro AVERAGE_TWO_8BITS
-// {   // input:dst_d, src_d A and B; working: q13
-    vaddl.u8 q13, $2, $1
-    vrshrn.u16 $0, q13, #1
-// }
-.endm
-
-.macro FILTER_6TAG_8BITS
-// {   // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b;
-    vaddl.u8 q12, $0, $5 //q12=src[-2]+src[3]
-    vaddl.u8 q13, $2, $3 //src[0]+src[1]
-    vmla.u16 q12, q13, $7 //q12 += 20*(src[0]+src[1]), 2 cycles
-    vaddl.u8 q13, $1, $4 //src[-1]+src[2]
-    vmls.s16 q12, q13, $8 //q12 -= 5*(src[-1]+src[2]), 2 cycles
-    vqrshrun.s16 $6, q12, #5
-// }
-.endm
-
-.macro FILTER_SINGLE_TAG_8BITS // when width=17/9, used
-// {   // input: src_d{Y[0][1][2][3][4][5]X, the even of working_q2},
-    vrev64.8 $2, $0 // X[5][4][3][2][1][0]O
-    vaddl.u8 $3, $0, $2 // each 16bits, *[50][41][32][23][14][05]*
-    vmul.s16 $0, $2, $1 // 0+1*[50]-5*[41]+20[32]
-    vpadd.s16 $0, $0, $0
-    vpadd.s16 $0, $0, $0
-    vqrshrun.s16 $0, $4, #5
-// }
-.endm
-
-.macro FILTER_6TAG_8BITS_AVERAGE_WITH_0
-// {   // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b;
-    vaddl.u8 q12, $0, $5 //q12=src[-2]+src[3]
-    vaddl.u8 q13, $2, $3 //src[0]+src[1]
-    vmla.u16 q12, q13, $7 //q12 += 20*(src[0]+src[1]), 2 cycles
-    vaddl.u8 q13, $1, $4 //src[-1]+src[2]
-    vmls.s16 q12, q13, $8 //q12 -= 5*(src[-1]+src[2]), 2 cycles
-    vqrshrun.s16 $6, q12, #5
-    vaddl.u8 q13, $2, $6
-    vrshrn.u16 $6, q13, #1
-// }
-.endm
-
-.macro FILTER_6TAG_8BITS_AVERAGE_WITH_1
-// {   // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b;
-    vaddl.u8 q12, $0, $5 //q12=src[-2]+src[3]
-    vaddl.u8 q13, $2, $3 //src[0]+src[1]
-    vmla.u16 q12, q13, $7 //q12 += 20*(src[0]+src[1]), 2 cycles
-    vaddl.u8 q13, $1, $4 //src[-1]+src[2]
-    vmls.s16 q12, q13, $8 //q12 -= 5*(src[-1]+src[2]), 2 cycles
-    vqrshrun.s16 $6, q12, #5
-    vaddl.u8 q13, $3, $6
-    vrshrn.u16 $6, q13, #1
-// }
-.endm
-
-.macro FILTER_6TAG_8BITS_TO_16BITS
-// {   // input:d_src[-2], d_src[-1], d_src[0], d_src[1], d_src[2], d_src[3], dst_q,
-    vaddl.u8 $6, $0, $5 //dst_q=src[-2]+src[3]
-    vaddl.u8 q13, $2, $3 //src[0]+src[1]
-    vmla.u16 $6, q13, $7 //dst_q += 20*(src[0]+src[1]), 2 cycles
-    vaddl.u8 q13, $1, $4 //src[-1]+src[2]
-    vmls.s16 $6, q13, $8 //dst_q -= 5*(src[-1]+src[2]), 2 cycles
-// }
-.endm
-
-.macro FILTER_3_IN_16BITS_TO_8BITS
-// {   // input:a, b, c, dst_d;
-    vsub.s16 $0, $0, $1 //a-b
-    vshr.s16 $0, $0, #2 //(a-b)/4
-    vsub.s16 $0, $0, $1 //(a-b)/4-b
-    vadd.s16 $0, $0, $2 //(a-b)/4-b+c
-    vshr.s16 $0, $0, #2 //((a-b)/4-b+c)/4
-    vadd.s16 $0, $0, $2 //((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
-    vqrshrun.s16 $3, $0, #6 //(+32)>>6
-// }
-.endm
-
-.macro UNPACK_2_16BITS_TO_ABC
-// {   // input:q_src[-2:5], q_src[6:13](avail 8+5)/q_src[6:**](avail 4+5), dst_a
-    vext.16 $4, $0, $1, #2 //src[0]
-    vext.16 $3, $0, $1, #3 //src[1]
-    vadd.s16 $4, $3 //c=src[0]+src[1]
-
-    vext.16 $3, $0, $1, #1 //src[-1]
-    vext.16 $2, $0, $1, #4 //src[2]
-    vadd.s16 $3, $2 //b=src[-1]+src[2]
-
-    vext.16 $2, $0, $1, #5 //src[3]
-    vadd.s16 $2, $0 //a=src[-2]+src[3]
-// }
-.endm
-
-.macro UNPACK_1_IN_8x16BITS_TO_8BITS
-// {   // each 16bits; input: d_dst, d_src[0:3] (even), d_src[4:5]+%% (odd)
-    vext.16 $3, $3, $3, #7 // 0x????, [0][1][2][3][4][5],
-    vrev64.16 $1, $1
-    vadd.u16 $2, $1 // C[2+3],B[1+4],A[0+5],
-    vshr.s64 $1, $2, #16
-    vshr.s64 $0, $2, #32 // Output: C $2, B $1, A $0
-
-    vsub.s16 $0, $0, $1 //a-b
-    vshr.s16 $0, $0, #2 //(a-b)/4
-    vsub.s16 $0, $0, $1 //(a-b)/4-b
-    vadd.s16 $0, $0, $2 //(a-b)/4-b+c
-    vshr.s16 $0, $0, #2 //((a-b)/4-b+c)/4
-    vadd.s16 $1, $0, $2 //((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
-    vqrshrun.s16 $0, $3, #6 //(+32)>>6
-// }
-.endm
-#else
-.macro AVERAGE_TWO_8BITS arg0, arg1,arg2
-// {   // input:dst_d, src_d A and B; working: q13
-    vaddl.u8 q13, \arg2, \arg1
-    vrshrn.u16 \arg0, q13, #1
-// }
-.endm
-
-.macro FILTER_6TAG_8BITS arg0, arg1,arg2, arg3, arg4,arg5, arg6, arg7,arg8
-// {   // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b
-    vaddl.u8 q12, \arg0, \arg5 //q12=src[-2]+src[3]
-    vaddl.u8 q13, \arg2, \arg3 //src[0]+src[1]
-    vmla.u16 q12, q13, \arg7 //q12 += 20*(src[0]+src[1]), 2 cycles
-    vaddl.u8 q13, \arg1, \arg4 //src[-1]+src[2]
-    vmls.s16 q12, q13, \arg8 //q12 -= 5*(src[-1]+src[2]), 2 cycles
-    vqrshrun.s16 \arg6, q12, #5
-// }
-.endm
-
-.macro FILTER_SINGLE_TAG_8BITS arg0, arg1,arg2, arg3, arg4,arg5 // when width=17/9, used
-// {   // input: src_d{Y[0][1][2][3][4][5]X, the even of working_q2}
-    vrev64.8 \arg2, \arg0 // X[5][4][3][2][1][0]O
-    vaddl.u8 \arg3, \arg0, \arg2 // each 16bits, *[50][41][32][23][14][05]*
-    vmul.s16 \arg0, \arg2, \arg1 // 0+1*[50]-5*[41]+20[32]
-    vpadd.s16 \arg0, \arg0, \arg0
-    vpadd.s16 \arg0, \arg0, \arg0
-    vqrshrun.s16 \arg0, \arg4, #5
-// }
-.endm
-
-.macro FILTER_6TAG_8BITS_AVERAGE_WITH_0 arg0, arg1,arg2, arg3, arg4,arg5, arg6, arg7,arg8
-// {   // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d
-    vaddl.u8 q12, \arg0, \arg5 //q12=src[-2]+src[3]
-    vaddl.u8 q13, \arg2, \arg3 //src[0]+src[1]
-    vmla.u16 q12, q13, \arg7 //q12 += 20*(src[0]+src[1]), 2 cycles
-    vaddl.u8 q13, \arg1, \arg4 //src[-1]+src[2]
-    vmls.s16 q12, q13, \arg8 //q12 -= 5*(src[-1]+src[2]), 2 cycles
-    vqrshrun.s16 \arg6, q12, #5
-    vaddl.u8 q13, \arg2, \arg6
-    vrshrn.u16 \arg6, q13, #1
-// }
-.endm
-
-.macro FILTER_6TAG_8BITS_AVERAGE_WITH_1 arg0, arg1,arg2, arg3, arg4,arg5, arg6, arg7,arg8
-// {   // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d
-    vaddl.u8 q12, \arg0, \arg5 //q12=src[-2]+src[3]
-    vaddl.u8 q13, \arg2, \arg3 //src[0]+src[1]
-    vmla.u16 q12, q13, \arg7 //q12 += 20*(src[0]+src[1]), 2 cycles
-    vaddl.u8 q13, \arg1, \arg4 //src[-1]+src[2]
-    vmls.s16 q12, q13, \arg8 //q12 -= 5*(src[-1]+src[2]), 2 cycles
-    vqrshrun.s16 \arg6, q12, #5
-    vaddl.u8 q13, \arg3, \arg6
-    vrshrn.u16 \arg6, q13, #1
-// }
-.endm
-
-.macro FILTER_6TAG_8BITS_TO_16BITS arg0, arg1,arg2, arg3, arg4,arg5, arg6, arg7,arg8
-// {   // input:d_src[-2], d_src[-1], d_src[0], d_src[1], d_src[2], d_src[3]
-    vaddl.u8 \arg6, \arg0, \arg5 //dst_q=src[-2]+src[3]
-    vaddl.u8 q13, \arg2, \arg3 //src[0]+src[1]
-    vmla.u16 \arg6, q13, \arg7 //dst_q += 20*(src[0]+src[1]), 2 cycles
-    vaddl.u8 q13, \arg1, \arg4 //src[-1]+src[2]
-    vmls.s16 \arg6, q13, \arg8 //dst_q -= 5*(src[-1]+src[2]), 2 cycles
-// }
-.endm
-
-.macro FILTER_3_IN_16BITS_TO_8BITS arg0, arg1,arg2, arg3
-// {   // input:a, b, c, dst_d;
-    vsub.s16 \arg0, \arg0, \arg1 //a-b
-    vshr.s16 \arg0, \arg0, #2 //(a-b)/4
-    vsub.s16 \arg0, \arg0, \arg1 //(a-b)/4-b
-    vadd.s16 \arg0, \arg0, \arg2 //(a-b)/4-b+c
-    vshr.s16 \arg0, \arg0, #2 //((a-b)/4-b+c)/4
-    vadd.s16 \arg0, \arg0, \arg2 //((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
-    vqrshrun.s16 \arg3, \arg0, #6 //(+32)>>6
-// }
-.endm
-
-.macro UNPACK_2_16BITS_TO_ABC arg0, arg1,arg2, arg3, arg4
-// {   // input:q_src[-2:5], q_src[6:13](avail 8+5)/q_src[6:**](avail 4+5)
-    vext.16 \arg4, \arg0, \arg1, #2 //src[0]
-    vext.16 \arg3, \arg0, \arg1, #3 //src[1]
-    vadd.s16 \arg4, \arg3 //c=src[0]+src[1]
-
-    vext.16 \arg3, \arg0, \arg1, #1 //src[-1]
-    vext.16 \arg2, \arg0, \arg1, #4 //src[2]
-    vadd.s16 \arg3, \arg2 //b=src[-1]+src[2]
-
-    vext.16 \arg2, \arg0, \arg1, #5 //src[3]
-    vadd.s16 \arg2, \arg0 //a=src[-2]+src[3]
-// }
-.endm
-
-.macro UNPACK_1_IN_8x16BITS_TO_8BITS arg0, arg1,arg2, arg3
-// {   // each 16bits; input: d_dst, d_src[0:3] (even), d_src[4:5]+%% (odd)
-    vext.16 \arg3, \arg3, \arg3, #7 // 0x????, [0][1][2][3][4][5]
-    vrev64.16 \arg1, \arg1
-    vadd.u16 \arg2, \arg1 // C[2+3],B[1+4],A[0+5]
-    vshr.s64 \arg1, \arg2, #16
-    vshr.s64 \arg0, \arg2, #32 // Output: C \arg2, B \arg1, A \arg0
-
-    vsub.s16 \arg0, \arg0, \arg1 //a-b
-    vshr.s16 \arg0, \arg0, #2 //(a-b)/4
-    vsub.s16 \arg0, \arg0, \arg1 //(a-b)/4-b
-    vadd.s16 \arg0, \arg0, \arg2 //(a-b)/4-b+c
-    vshr.s16 \arg0, \arg0, #2 //((a-b)/4-b+c)/4
-    vadd.s16 \arg1, \arg0, \arg2 //((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
-    vqrshrun.s16 \arg0, \arg3, #6 //(+32)>>6
-// }
-.endm
-#endif
-
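
mc_neon.S is rewritten wholesale here (the single hunk spans the entire 1963-line file), so its old content appears as '-' lines; the excerpt ends before the '+' half begins. The macros just listed implement the H.264 six-tap luma interpolation filter with taps (1, -5, 20, 20, -5, 1) — see the `20*(src[0]+src[1])` and `5*(src[-1]+src[2])` comments — and the `_AVERAGE_WITH_0/1` variants then average the half-pel result with an adjacent sample to form quarter-pel positions. A scalar reference of both steps (helper names are illustrative, not from the source):

#include <stdint.h>

/* H.264 luma half-pel filter: taps (1,-5,20,20,-5,1), +16, >>5, clip.
 * src[0] and src[1] are the two samples under the '20' taps. */
static uint8_t Filter6Tap(const uint8_t* src) {
    int v = (src[-2] + src[3]) - 5 * (src[-1] + src[2]) + 20 * (src[0] + src[1]);
    v = (v + 16) >> 5;
    return (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v);
}

/* Quarter-pel positions: rounded average of the half-pel value and the
 * nearer sample, as in FILTER_6TAG_8BITS_AVERAGE_WITH_0/1. */
static uint8_t AvgRound(uint8_t a, uint8_t b) {
    return (uint8_t)((a + b + 1) >> 1);
}
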
-WELS_ASM_FUNC_BEGIN enc_mc_luma_w16_h_neon
-    push {r4}
-    ldr r4, [sp, #4]
-
-    sub r0, #2
-    vmov.u16 q14, #0x0014 // 20
-    vshr.u16 q15, q14, #2 // 5
-
-w16_h_mc_luma_loop:
-    vld1.u8 {d0,d1,d2}, [r0], r1 //only use 21(16+5); q0=src[-2]
-    pld [r0]
-    pld [r0, #16]
-
-    vext.8 q2, q0, q1, #1 //q2=src[-1]
-    vext.8 q3, q0, q1, #2 //q3=src[0]
-    vext.8 q4, q0, q1, #3 //q4=src[1]
-    vext.8 q5, q0, q1, #4 //q5=src[2]
-    vext.8 q6, q0, q1, #5 //q6=src[3]
-
-    FILTER_6TAG_8BITS d0, d4, d6, d8, d10, d12, d2, q14, q15
-
-    FILTER_6TAG_8BITS d1, d5, d7, d9, d11, d13, d3, q14, q15
-
-    sub r4, #1
-    vst1.u8 {d2, d3}, [r2], r3 //write 16Byte
-
-    cmp r4, #0
-    bne w16_h_mc_luma_loop
-    pop {r4}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN enc_mc_luma_w17_h_neon
-    push {r4-r5}
-    mov r4, #20
-    mov r5, #1
-    sub r4, r4, r4, lsl #(16-2)
-    lsl r5, #16
-    ror r4, #16
-    vmov d3, r5, r4 // 0x0014FFFB00010000
-
-    sub r3, #16
-    ldr r4, [sp, #8]
-
-    sub r0, #2
-    vmov.u16 q14, #0x0014 // 20
-    vshr.u16 q15, q14, #2 // 5
-
-w17_h_mc_luma_loop:
-    vld1.u8 {d0,d1,d2}, [r0], r1 //only use 22(17+5); q0=src[-2]
-
-    vext.8 q2, q0, q1, #1 //q2=src[-1]
-    vext.8 q3, q0, q1, #2 //q3=src[0]
-    vext.8 q4, q0, q1, #3 //q4=src[1]
-    vext.8 q5, q0, q1, #4 //q5=src[2]
-    vext.8 q6, q0, q1, #5 //q6=src[3]
-
-    FILTER_6TAG_8BITS d0, d4, d6, d8, d10, d12, d14, q14, q15
-
-    FILTER_6TAG_8BITS d1, d5, d7, d9, d11, d13, d15, q14, q15
-
-    vst1.u8 {d14, d15}, [r2]! //write [0:15] Byte
-
-    vsli.64 d2, d2, #8 // [0][1][2][3][4][5]XO-->O[0][1][2][3][4][5]X
-    FILTER_SINGLE_TAG_8BITS d2, d3, d14, q7, q1
-
-    vst1.u8 {d2[0]}, [r2], r3 //write 16th Byte
-
-    sub r4, #1
-    cmp r4, #0
-    bne w17_h_mc_luma_loop
-    pop {r4-r5}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN enc_mc_luma_w9_h_neon
-    push {r4-r5}
-    mov r4, #20
-    mov r5, #1
-    sub r4, r4, r4, lsl #(16-2)
-    lsl r5, #16
-    ror r4, #16
-    vmov d7, r5, r4 // 0x0014FFFB00010000
-
-    sub r3, #8
-    ldr r4, [sp, #8]
-
-    sub r0, #2
-    vmov.u16 q14, #0x0014 // 20
-    vshr.u16 q15, q14, #2 // 5
-
-w9_h_mc_luma_loop:
-    vld1.u8 {d0,d1}, [r0], r1 //only use 14(9+5); q0=src[-2]
-    pld [r0]
-
-    vext.8 d2, d0, d1, #1 //d2=src[-1]
-    vext.8 d3, d0, d1, #2 //d3=src[0]
-    vext.8 d4, d0, d1, #3 //d4=src[1]
-    vext.8 d5, d0, d1, #4 //d5=src[2]
-    vext.8 d6, d0, d1, #5 //d6=src[3]
-
-    FILTER_6TAG_8BITS d0, d2, d3, d4, d5, d6, d8, q14, q15
-
-    sub r4, #1
-    vst1.u8 {d8}, [r2]! //write [0:7] Byte
-
-    vsli.64 d2, d1, #8 // [0][1][2][3][4][5]XO-->O[0][1][2][3][4][5]X
-    FILTER_SINGLE_TAG_8BITS d2, d7, d14, q7, q1
-    vst1.u8 {d2[0]}, [r2], r3 //write 8th Byte
-
-    cmp r4, #0
-    bne w9_h_mc_luma_loop
-    pop {r4-r5}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN enc_mc_luma_w4_h_neon
-    push {r4, r5, r6}
-    ldr r6, [sp, #12]
-
-    sub r0, #2
-    vmov.u16 q14, #0x0014 // 20
-    vshr.u16 q15, q14, #2 // 5
-
-w4_h_mc_luma_loop:
-    vld1.u8 {d0, d1}, [r0], r1 //only use 9(4+5);d0: 1st row src[-2:5]
-    pld [r0]
-    vld1.u8 {d2, d3}, [r0], r1 //d2: 2nd row src[-2:5]
-    pld [r0]
-
-    vext.8 d4, d0, d1, #1 //d4: 1st row src[-1:6]
-    vext.8 d5, d2, d3, #1 //d5: 2nd row src[-1:6]
-    vext.8 q3, q2, q2, #1 //src[0:6 *]
-    vext.8 q4, q2, q2, #2 //src[1:6 * *]
-
-    vtrn.32 q3, q4 //q3::d6:1st row [0:3]+[1:4]; d7:2nd row [0:3]+[1:4]
-    vtrn.32 d6, d7 //d6:[0:3]; d7[1:4]
-    vtrn.32 d0, d2 //d0:[-2:1]; d2[2:5]
-    vtrn.32 d4, d5 //d4:[-1:2]; d5[3:6]
-
-    FILTER_6TAG_8BITS d0, d4, d6, d7, d2, d5, d1, q14, q15
-
-    vmov r4, r5, d1
-    str r4, [r2], r3
-    str r5, [r2], r3
-
-    sub r6, #2
-    cmp r6, #0
-    bne w4_h_mc_luma_loop
-
-    pop {r4, r5, r6}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN enc_mc_luma_w16_xy_10_neon
-    push {r4}
-    ldr r4, [sp, #4]
-
-    sub r0, #2
-    vmov.u16 q14, #0x0014 // 20
-    vshr.u16 q15, q14, #2 // 5
-
-w16_xy_10_mc_luma_loop:
-    vld1.u8 {d0,d1,d2}, [r0], r1 //only use 21(16+5); q0=src[-2]
-    pld [r0]
-    pld [r0, #16]
-
-    vext.8 q2, q0, q1, #1 //q2=src[-1]
-    vext.8 q3, q0, q1, #2 //q3=src[0]
-    vext.8 q4, q0, q1, #3 //q4=src[1]
-    vext.8 q5, q0, q1, #4 //q5=src[2]
-    vext.8 q6, q0, q1, #5 //q6=src[3]
-
-    FILTER_6TAG_8BITS_AVERAGE_WITH_0 d0, d4, d6, d8, d10, d12, d2, q14, q15
-
-    FILTER_6TAG_8BITS_AVERAGE_WITH_0 d1, d5, d7, d9, d11, d13, d3, q14, q15
-
-    sub r4, #1
-    vst1.u8 {d2, d3}, [r2], r3 //write 16Byte
-
-    cmp r4, #0
-    bne w16_xy_10_mc_luma_loop
-    pop {r4}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN enc_mc_luma_w8_xy_10_neon
-    push {r4}
-    ldr r4, [sp, #4]
-
-    sub r0, #2
-    vmov.u16 q14, #0x0014 // 20
-    vshr.u16 q15, q14, #2 // 5
-
-w8_xy_10_mc_luma_loop:
-    vld1.u8 {d0,d1}, [r0], r1 //only use 13(8+5); q0=src[-2]
-    pld [r0]
-
-    vext.8 d2, d0, d1, #1 //d2=src[-1]
-    vext.8 d3, d0, d1, #2 //d3=src[0]
-    vext.8 d4, d0, d1, #3 //d4=src[1]
-    vext.8 d5, d0, d1, #4 //d5=src[2]
-    vext.8 d6, d0, d1, #5 //d6=src[3]
-
-    FILTER_6TAG_8BITS_AVERAGE_WITH_0 d0, d2, d3, d4, d5, d6, d1, q14, q15
-
-    sub r4, #1
-    vst1.u8 {d1}, [r2], r3
-
-    cmp r4, #0
-    bne w8_xy_10_mc_luma_loop
-    pop {r4}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN enc_mc_luma_w4_xy_10_neon
-    push {r4, r5, r6}
-    ldr r6, [sp, #12]
-
-    sub r0, #2
-
-    vmov.u16 q14, #0x0014 // 20
-    vshr.u16 q15, q14, #2 // 5
-
-w4_xy_10_mc_luma_loop:
-    vld1.u8 {d0, d1}, [r0], r1 //only use 9(4+5);d0: 1st row src[-2:5]
-    pld [r0]
-    vld1.u8 {d2, d3}, [r0], r1 //d2: 2nd row src[-2:5]
-    pld [r0]
-
-    vext.8 d4, d0, d1, #1 //d4: 1st row src[-1:6]
-    vext.8 d5, d2, d3, #1 //d5: 2nd row src[-1:6]
-    vext.8 q3, q2, q2, #1 //src[0:6 *]
-    vext.8 q4, q2, q2, #2 //src[1:6 * *]
-
-    vtrn.32 q3, q4 //q3::d6:1st row [0:3]+[1:4]; d7:2nd row [0:3]+[1:4]
-    vtrn.32 d6, d7 //d6:[0:3]; d7[1:4]
-    vtrn.32 d0, d2 //d0:[-2:1]; d2[2:5]
-    vtrn.32 d4, d5 //d4:[-1:2]; d5[3:6]
-
-    FILTER_6TAG_8BITS_AVERAGE_WITH_0 d0, d4, d6, d7, d2, d5, d1, q14, q15
-
-    vmov r4, r5, d1
-    str r4, [r2], r3
-    str r5, [r2], r3
-
-    sub r6, #2
-    cmp r6, #0
-    bne w4_xy_10_mc_luma_loop
-
-    pop {r4, r5, r6}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN enc_mc_luma_w16_xy_30_neon
-    push {r4}
-    ldr r4, [sp, #4]
-
-    sub r0, #2
-    vmov.u16 q14, #0x0014 // 20
-    vshr.u16 q15, q14, #2 // 5
-
-w16_xy_30_mc_luma_loop:
-    vld1.u8 {d0,d1,d2}, [r0], r1 //only use 21(16+5); q0=src[-2]
-    pld [r0]
-    pld [r0, #16]
-
-    vext.8 q2, q0, q1, #1 //q2=src[-1]
-    vext.8 q3, q0, q1, #2 //q3=src[0]
-    vext.8 q4, q0, q1, #3 //q4=src[1]
-    vext.8 q5, q0, q1, #4 //q5=src[2]
-    vext.8 q6, q0, q1, #5 //q6=src[3]
-
-    FILTER_6TAG_8BITS_AVERAGE_WITH_1 d0, d4, d6, d8, d10, d12, d2, q14, q15
-
-    FILTER_6TAG_8BITS_AVERAGE_WITH_1 d1, d5, d7, d9, d11, d13, d3, q14, q15
-
-    sub r4, #1
-    vst1.u8 {d2, d3}, [r2], r3 //write 16Byte
-
-    cmp r4, #0
-    bne w16_xy_30_mc_luma_loop
-    pop {r4}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN enc_mc_luma_w8_xy_30_neon
-    push {r4}
-    ldr r4, [sp, #4]
-
-    sub r0, #2
-    vmov.u16 q14, #0x0014 // 20
-    vshr.u16 q15, q14, #2 // 5
-
-w8_xy_30_mc_luma_loop:
-    vld1.u8 {d0,d1}, [r0], r1 //only use 13(8+5); q0=src[-2]
-    pld [r0]
-
-    vext.8 d2, d0, d1, #1 //d2=src[-1]
-    vext.8 d3, d0, d1, #2 //d3=src[0]
-    vext.8 d4, d0, d1, #3 //d4=src[1]
-    vext.8 d5, d0, d1, #4 //d5=src[2]
-    vext.8 d6, d0, d1, #5 //d6=src[3]
-
-    FILTER_6TAG_8BITS_AVERAGE_WITH_1 d0, d2, d3, d4, d5, d6, d1, q14, q15
-
-    sub r4, #1
-    vst1.u8 {d1}, [r2], r3
-
-    cmp r4, #0
-    bne w8_xy_30_mc_luma_loop
-    pop {r4}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN enc_mc_luma_w4_xy_30_neon
-    push {r4, r5, r6}
-    ldr r6, [sp, #12]
-
-    sub r0, #2
-    vmov.u16 q14, #0x0014 // 20
-    vshr.u16 q15, q14, #2 // 5
-
-w4_xy_30_mc_luma_loop:
-    vld1.u8 {d0, d1}, [r0], r1 //only use 9(4+5);d0: 1st row src[-2:5]
-    pld [r0]
-    vld1.u8 {d2, d3}, [r0], r1 //d2: 2nd row src[-2:5]
-    pld [r0]
-
-    vext.8 d4, d0, d1, #1 //d4: 1st row src[-1:6]
-    vext.8 d5, d2, d3, #1 //d5: 2nd row src[-1:6]
-    vext.8 q3, q2, q2, #1 //src[0:6 *]
-    vext.8 q4, q2, q2, #2 //src[1:6 * *]
-
-    vtrn.32 q3, q4 //q3::d6:1st row [0:3]+[1:4]; d7:2nd row [0:3]+[1:4]
-    vtrn.32 d6, d7 //d6:[0:3]; d7[1:4]
-    vtrn.32 d0, d2 //d0:[-2:1]; d2[2:5]
-    vtrn.32 d4, d5 //d4:[-1:2]; d5[3:6]
-
-    FILTER_6TAG_8BITS_AVERAGE_WITH_1 d0, d4, d6, d7, d2, d5, d1, q14, q15
-
-    vmov r4, r5, d1
-    str r4, [r2], r3
-    str r5, [r2], r3
-
-    sub r6, #2
-    cmp r6, #0
-    bne w4_xy_30_mc_luma_loop
-
-    pop {r4, r5, r6}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN enc_mc_luma_w16_xy_01_neon
-    push {r4}
-    ldr r4, [sp, #4]
-
-    sub r0, r1, lsl #1 //src[-2*src_stride]
-    pld [r0]
-    pld [r0, r1]
-    vmov.u16 q14, #0x0014 // 20
-    vld1.u8 {q0}, [r0], r1 //q0=src[-2]
-    vld1.u8 {q1}, [r0], r1 //q1=src[-1]
-
-    pld [r0]
-    pld [r0, r1]
-    vshr.u16 q15, q14, #2 // 5
-    vld1.u8 {q2}, [r0], r1 //q2=src[0]
-    vld1.u8 {q3}, [r0], r1 //q3=src[1]
-    vld1.u8 {q4}, [r0], r1 //q4=src[2]
-
-w16_xy_01_luma_loop:
-
vld1.u8 {q5}, [r0], r1 //q5=src[3] - - FILTER_6TAG_8BITS_AVERAGE_WITH_0 d0, d2, d4, d6, d8, d10, d12, q14, q15 - pld [r0] - FILTER_6TAG_8BITS_AVERAGE_WITH_0 d1, d3, d5, d7, d9, d11, d13, q14, q15 - vld1.u8 {q0}, [r0], r1 //read 2nd row - vst1.u8 {q6}, [r2], r3 //write 1st 16Byte - - FILTER_6TAG_8BITS_AVERAGE_WITH_0 d2, d4, d6, d8, d10, d0, d12, q14, q15 - pld [r0] - FILTER_6TAG_8BITS_AVERAGE_WITH_0 d3, d5, d7, d9, d11, d1, d13, q14, q15 - vld1.u8 {q1}, [r0], r1 //read 3rd row - vst1.u8 {q6}, [r2], r3 //write 2nd 16Byte - - FILTER_6TAG_8BITS_AVERAGE_WITH_0 d4, d6, d8, d10, d0, d2, d12, q14, q15 - pld [r0] - FILTER_6TAG_8BITS_AVERAGE_WITH_0 d5, d7, d9, d11, d1, d3, d13, q14, q15 - vld1.u8 {q2}, [r0], r1 //read 4th row - vst1.u8 {q6}, [r2], r3 //write 3rd 16Byte - - FILTER_6TAG_8BITS_AVERAGE_WITH_0 d6, d8, d10, d0, d2, d4, d12, q14, q15 - pld [r0] - FILTER_6TAG_8BITS_AVERAGE_WITH_0 d7, d9, d11, d1, d3, d5, d13, q14, q15 - vld1.u8 {q3}, [r0], r1 //read 5th row - vst1.u8 {q6}, [r2], r3 //write 4th 16Byte - - FILTER_6TAG_8BITS_AVERAGE_WITH_0 d8, d10, d0, d2, d4, d6, d12, q14, q15 - pld [r0] - FILTER_6TAG_8BITS_AVERAGE_WITH_0 d9, d11, d1, d3, d5, d7, d13, q14, q15 - vld1.u8 {q4}, [r0], r1 //read 6th row - vst1.u8 {q6}, [r2], r3 //write 5th 16Byte - - FILTER_6TAG_8BITS_AVERAGE_WITH_0 d10, d0, d2, d4, d6, d8, d12, q14, q15 - pld [r0] - FILTER_6TAG_8BITS_AVERAGE_WITH_0 d11, d1, d3, d5, d7, d9, d13, q14, q15 - vld1.u8 {q5}, [r0], r1 //read 7th row - vst1.u8 {q6}, [r2], r3 //write 6th 16Byte - - FILTER_6TAG_8BITS_AVERAGE_WITH_0 d0, d2, d4, d6, d8, d10, d12, q14, q15 - pld [r0] - FILTER_6TAG_8BITS_AVERAGE_WITH_0 d1, d3, d5, d7, d9, d11, d13, q14, q15 - vld1.u8 {q0}, [r0], r1 //read 8th row - vst1.u8 {q6}, [r2], r3 //write 7th 16Byte - - FILTER_6TAG_8BITS_AVERAGE_WITH_0 d2, d4, d6, d8, d10, d0, d12, q14, q15 - pld [r0] - FILTER_6TAG_8BITS_AVERAGE_WITH_0 d3, d5, d7, d9, d11, d1, d13, q14, q15 - vst1.u8 {q6}, [r2], r3 //write 8th 16Byte - - //q2, q3, q4, q5, q0 --> q0~q4 - vswp q0, q4 - vswp q0, q2 - vmov q1, q3 - vmov q3, q5 //q0~q4 - - sub r4, #8 - cmp r4, #0 - bne w16_xy_01_luma_loop - pop {r4} -WELS_ASM_FUNC_END - - -WELS_ASM_FUNC_BEGIN enc_mc_luma_w8_xy_01_neon - push {r4} - ldr r4, [sp, #4] - - sub r0, r1, lsl #1 //src[-2*src_stride] - pld [r0] - pld [r0, r1] - vmov.u16 q14, #0x0014 // 20 - vld1.u8 {d0}, [r0], r1 //d0=src[-2] - vld1.u8 {d1}, [r0], r1 //d1=src[-1] - - pld [r0] - pld [r0, r1] - vshr.u16 q15, q14, #2 // 5 - vld1.u8 {d2}, [r0], r1 //d2=src[0] - vld1.u8 {d3}, [r0], r1 //d3=src[1] - - vld1.u8 {d4}, [r0], r1 //d4=src[2] - vld1.u8 {d5}, [r0], r1 //d5=src[3] - -w8_xy_01_mc_luma_loop: - - pld [r0] - FILTER_6TAG_8BITS_AVERAGE_WITH_0 d0, d1, d2, d3, d4, d5, d12, q14, q15 - vld1.u8 {d0}, [r0], r1 //read 2nd row - vst1.u8 {d12}, [r2], r3 //write 1st 8Byte - - pld [r0] - FILTER_6TAG_8BITS_AVERAGE_WITH_0 d1, d2, d3, d4, d5, d0, d12, q14, q15 - vld1.u8 {d1}, [r0], r1 //read 3rd row - vst1.u8 {d12}, [r2], r3 //write 2nd 8Byte - - pld [r0] - FILTER_6TAG_8BITS_AVERAGE_WITH_0 d2, d3, d4, d5, d0, d1, d12, q14, q15 - vld1.u8 {d2}, [r0], r1 //read 4th row - vst1.u8 {d12}, [r2], r3 //write 3rd 8Byte - - pld [r0] - FILTER_6TAG_8BITS_AVERAGE_WITH_0 d3, d4, d5, d0, d1, d2, d12, q14, q15 - vld1.u8 {d3}, [r0], r1 //read 5th row - vst1.u8 {d12}, [r2], r3 //write 4th 8Byte - - //d4, d5, d0, d1, d2, d3 --> d0, d1, d2, d3, d4, d5 - vswp q0, q2 - vswp q1, q2 - - sub r4, #4 - cmp r4, #0 - bne w8_xy_01_mc_luma_loop - - pop {r4} -WELS_ASM_FUNC_END - - -WELS_ASM_FUNC_BEGIN enc_mc_luma_w4_xy_01_neon - push {r4, r5, r6, 
r7} - sub r0, r1, lsl #1 //src[-2*src_stride] - pld [r0] - pld [r0, r1] - vmov.u16 q14, #0x0014 // 20 - ldr r4, [r0], r1 //r4=src[-2] - ldr r5, [r0], r1 //r5=src[-1] - - pld [r0] - pld [r0, r1] - vshr.u16 q15, q14, #2 // 5 - ldr r6, [r0], r1 //r6=src[0] - ldr r7, [r0], r1 //r7=src[1] - - vmov d0, r4, r5 - vmov d1, r5, r6 - vmov d2, r6, r7 - - ldr r4, [r0], r1 //r4=src[2] - vmov d3, r7, r4 - ldr r7, [sp, #16] - -w4_xy_01_mc_luma_loop: - - //using reserving r4 - ldr r5, [r0], r1 //r5=src[3] - ldr r6, [r0], r1 //r6=src[0] - vmov d4, r4, r5 - vmov d5, r5, r6 //reserved r6 - - FILTER_6TAG_8BITS_AVERAGE_WITH_0 d0, d1, d2, d3, d4, d5, d12, q14, q15 - vmov r4, r5, d12 - str r4, [r2], r3 //write 1st 4Byte - str r5, [r2], r3 //write 2nd 4Byte - - ldr r5, [r0], r1 //r5=src[1] - ldr r4, [r0], r1 //r4=src[2] - vmov d0, r6, r5 - vmov d1, r5, r4 //reserved r4 - - FILTER_6TAG_8BITS_AVERAGE_WITH_0 d2, d3, d4, d5, d0, d1, d12, q14, q15 - vmov r5, r6, d12 - str r5, [r2], r3 //write 3rd 4Byte - str r6, [r2], r3 //write 4th 4Byte - - //d4, d5, d0, d1 --> d0, d1, d2, d3 - vmov q1, q0 - vmov q0, q2 - - sub r7, #4 - cmp r7, #0 - bne w4_xy_01_mc_luma_loop - - pop {r4, r5, r6, r7} -WELS_ASM_FUNC_END - - -WELS_ASM_FUNC_BEGIN enc_mc_luma_w16_xy_03_neon - push {r4} - ldr r4, [sp, #4] - - sub r0, r1, lsl #1 //src[-2*src_stride] - pld [r0] - pld [r0, r1] - vmov.u16 q14, #0x0014 // 20 - vld1.u8 {q0}, [r0], r1 //q0=src[-2] - vld1.u8 {q1}, [r0], r1 //q1=src[-1] - - pld [r0] - pld [r0, r1] - vshr.u16 q15, q14, #2 // 5 - vld1.u8 {q2}, [r0], r1 //q2=src[0] - vld1.u8 {q3}, [r0], r1 //q3=src[1] - vld1.u8 {q4}, [r0], r1 //q4=src[2] - -w16_xy_03_luma_loop: - - vld1.u8 {q5}, [r0], r1 //q5=src[3] - - FILTER_6TAG_8BITS_AVERAGE_WITH_1 d0, d2, d4, d6, d8, d10, d12, q14, q15 - pld [r0] - FILTER_6TAG_8BITS_AVERAGE_WITH_1 d1, d3, d5, d7, d9, d11, d13, q14, q15 - vld1.u8 {q0}, [r0], r1 //read 2nd row - vst1.u8 {q6}, [r2], r3 //write 1st 16Byte - - FILTER_6TAG_8BITS_AVERAGE_WITH_1 d2, d4, d6, d8, d10, d0, d12, q14, q15 - pld [r0] - FILTER_6TAG_8BITS_AVERAGE_WITH_1 d3, d5, d7, d9, d11, d1, d13, q14, q15 - vld1.u8 {q1}, [r0], r1 //read 3rd row - vst1.u8 {q6}, [r2], r3 //write 2nd 16Byte - - FILTER_6TAG_8BITS_AVERAGE_WITH_1 d4, d6, d8, d10, d0, d2, d12, q14, q15 - pld [r0] - FILTER_6TAG_8BITS_AVERAGE_WITH_1 d5, d7, d9, d11, d1, d3, d13, q14, q15 - vld1.u8 {q2}, [r0], r1 //read 4th row - vst1.u8 {q6}, [r2], r3 //write 3rd 16Byte - - FILTER_6TAG_8BITS_AVERAGE_WITH_1 d6, d8, d10, d0, d2, d4, d12, q14, q15 - pld [r0] - FILTER_6TAG_8BITS_AVERAGE_WITH_1 d7, d9, d11, d1, d3, d5, d13, q14, q15 - vld1.u8 {q3}, [r0], r1 //read 5th row - vst1.u8 {q6}, [r2], r3 //write 4th 16Byte - - FILTER_6TAG_8BITS_AVERAGE_WITH_1 d8, d10, d0, d2, d4, d6, d12, q14, q15 - pld [r0] - FILTER_6TAG_8BITS_AVERAGE_WITH_1 d9, d11, d1, d3, d5, d7, d13, q14, q15 - vld1.u8 {q4}, [r0], r1 //read 6th row - vst1.u8 {q6}, [r2], r3 //write 5th 16Byte - - FILTER_6TAG_8BITS_AVERAGE_WITH_1 d10, d0, d2, d4, d6, d8, d12, q14, q15 - pld [r0] - FILTER_6TAG_8BITS_AVERAGE_WITH_1 d11, d1, d3, d5, d7, d9, d13, q14, q15 - vld1.u8 {q5}, [r0], r1 //read 7th row - vst1.u8 {q6}, [r2], r3 //write 6th 16Byte - - FILTER_6TAG_8BITS_AVERAGE_WITH_1 d0, d2, d4, d6, d8, d10, d12, q14, q15 - pld [r0] - FILTER_6TAG_8BITS_AVERAGE_WITH_1 d1, d3, d5, d7, d9, d11, d13, q14, q15 - vld1.u8 {q0}, [r0], r1 //read 8th row - vst1.u8 {q6}, [r2], r3 //write 7th 16Byte - - FILTER_6TAG_8BITS_AVERAGE_WITH_1 d2, d4, d6, d8, d10, d0, d12, q14, q15 - pld [r0] - FILTER_6TAG_8BITS_AVERAGE_WITH_1 d3, d5, d7, d9, d11, d1, d13, q14, 
q15 - vst1.u8 {q6}, [r2], r3 //write 8th 16Byte - - //q2, q3, q4, q5, q0 --> q0~q4 - vswp q0, q4 - vswp q0, q2 - vmov q1, q3 - vmov q3, q5 //q0~q4 - - sub r4, #8 - cmp r4, #0 - bne w16_xy_03_luma_loop - pop {r4} -WELS_ASM_FUNC_END - - -WELS_ASM_FUNC_BEGIN enc_mc_luma_w8_xy_03_neon - push {r4} - ldr r4, [sp, #4] - - sub r0, r1, lsl #1 //src[-2*src_stride] - pld [r0] - pld [r0, r1] - vmov.u16 q14, #0x0014 // 20 - vld1.u8 {d0}, [r0], r1 //d0=src[-2] - vld1.u8 {d1}, [r0], r1 //d1=src[-1] - - pld [r0] - pld [r0, r1] - vshr.u16 q15, q14, #2 // 5 - vld1.u8 {d2}, [r0], r1 //d2=src[0] - vld1.u8 {d3}, [r0], r1 //d3=src[1] - - vld1.u8 {d4}, [r0], r1 //d4=src[2] - vld1.u8 {d5}, [r0], r1 //d5=src[3] - -w8_xy_03_mc_luma_loop: - - pld [r0] - FILTER_6TAG_8BITS_AVERAGE_WITH_1 d0, d1, d2, d3, d4, d5, d12, q14, q15 - vld1.u8 {d0}, [r0], r1 //read 2nd row - vst1.u8 {d12}, [r2], r3 //write 1st 8Byte - - pld [r0] - FILTER_6TAG_8BITS_AVERAGE_WITH_1 d1, d2, d3, d4, d5, d0, d12, q14, q15 - vld1.u8 {d1}, [r0], r1 //read 3rd row - vst1.u8 {d12}, [r2], r3 //write 2nd 8Byte - - pld [r0] - FILTER_6TAG_8BITS_AVERAGE_WITH_1 d2, d3, d4, d5, d0, d1, d12, q14, q15 - vld1.u8 {d2}, [r0], r1 //read 4th row - vst1.u8 {d12}, [r2], r3 //write 3rd 8Byte - - pld [r0] - FILTER_6TAG_8BITS_AVERAGE_WITH_1 d3, d4, d5, d0, d1, d2, d12, q14, q15 - vld1.u8 {d3}, [r0], r1 //read 5th row - vst1.u8 {d12}, [r2], r3 //write 4th 8Byte - - //d4, d5, d0, d1, d2, d3 --> d0, d1, d2, d3, d4, d5 - vswp q0, q2 - vswp q1, q2 - - sub r4, #4 - cmp r4, #0 - bne w8_xy_03_mc_luma_loop - - pop {r4} - WELS_ASM_FUNC_END - - WELS_ASM_FUNC_BEGIN enc_mc_luma_w4_xy_03_neon - push {r4, r5, r6, r7} - sub r0, r1, lsl #1 //src[-2*src_stride] - pld [r0] - pld [r0, r1] - vmov.u16 q14, #0x0014 // 20 - ldr r4, [r0], r1 //r4=src[-2] - ldr r5, [r0], r1 //r5=src[-1] - - pld [r0] - pld [r0, r1] - vshr.u16 q15, q14, #2 // 5 - ldr r6, [r0], r1 //r6=src[0] - ldr r7, [r0], r1 //r7=src[1] - - vmov d0, r4, r5 - vmov d1, r5, r6 - vmov d2, r6, r7 - - ldr r4, [r0], r1 //r4=src[2] - vmov d3, r7, r4 - ldr r7, [sp, #16] - -w4_xy_03_mc_luma_loop: - - //using reserving r4 - ldr r5, [r0], r1 //r5=src[3] - ldr r6, [r0], r1 //r6=src[0] - vmov d4, r4, r5 - vmov d5, r5, r6 //reserved r6 - - FILTER_6TAG_8BITS_AVERAGE_WITH_1 d0, d1, d2, d3, d4, d5, d12, q14, q15 - vmov r4, r5, d12 - str r4, [r2], r3 //write 1st 4Byte - str r5, [r2], r3 //write 2nd 4Byte - - ldr r5, [r0], r1 //r5=src[1] - ldr r4, [r0], r1 //r4=src[2] - vmov d0, r6, r5 - vmov d1, r5, r4 //reserved r4 - - FILTER_6TAG_8BITS_AVERAGE_WITH_1 d2, d3, d4, d5, d0, d1, d12, q14, q15 - vmov r5, r6, d12 - str r5, [r2], r3 //write 3rd 4Byte - str r6, [r2], r3 //write 4th 4Byte - - //d4, d5, d0, d1 --> d0, d1, d2, d3 - vmov q1, q0 - vmov q0, q2 - - sub r7, #4 - cmp r7, #0 - bne w4_xy_03_mc_luma_loop - - pop {r4, r5, r6, r7} -WELS_ASM_FUNC_END - - -WELS_ASM_FUNC_BEGIN enc_mc_luma_w16_v_neon - push {r4} - ldr r4, [sp, #4] - - sub r0, r1, lsl #1 //src[-2*src_stride] - pld [r0] - pld [r0, r1] - vmov.u16 q14, #0x0014 // 20 - vld1.u8 {q0}, [r0], r1 //q0=src[-2] - vld1.u8 {q1}, [r0], r1 //q1=src[-1] - - pld [r0] - pld [r0, r1] - vshr.u16 q15, q14, #2 // 5 - vld1.u8 {q2}, [r0], r1 //q2=src[0] - vld1.u8 {q3}, [r0], r1 //q3=src[1] - vld1.u8 {q4}, [r0], r1 //q4=src[2] - -w16_v_mc_luma_loop: - - vld1.u8 {q5}, [r0], r1 //q5=src[3] - - FILTER_6TAG_8BITS d0, d2, d4, d6, d8, d10, d12, q14, q15 - pld [r0] - FILTER_6TAG_8BITS d1, d3, d5, d7, d9, d11, d13, q14, q15 - vld1.u8 {q0}, [r0], r1 //read 2nd row - vst1.u8 {q6}, [r2], r3 //write 1st 16Byte - - 
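The width-16 vertical loops (the v, xy_01 and xy_03 variants) keep five source rows live in q0-q4, emit eight output rows per iteration while loading the next row in the shadow of each filter, and rotate the registers (vswp/vmov) before looping. A scalar sketch of that sliding six-row window, with assumed helper names:

    #include <stdint.h>

    void vfilter_w16(const uint8_t* src, int src_stride,
                     uint8_t* dst, int dst_stride, int height) {
        const uint8_t* row[6];
        for (int i = 0; i < 6; i++)
            row[i] = src + (i - 2) * src_stride;   /* src[-2] .. src[3] */
        for (int y = 0; y < height; y++) {
            for (int x = 0; x < 16; x++) {
                int t = (row[0][x] + row[5][x])
                        - 5 * (row[1][x] + row[4][x])
                        + 20 * (row[2][x] + row[3][x]);
                t = (t + 16) >> 5;
                dst[x] = (uint8_t)(t < 0 ? 0 : (t > 255 ? 255 : t));
            }
            for (int i = 0; i < 5; i++)            /* rotate, as vswp/vmov */
                row[i] = row[i + 1];
            row[5] += src_stride;
            dst += dst_stride;
        }
    }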
FILTER_6TAG_8BITS d2, d4, d6, d8, d10, d0, d12, q14, q15 - pld [r0] - FILTER_6TAG_8BITS d3, d5, d7, d9, d11, d1, d13, q14, q15 - vld1.u8 {q1}, [r0], r1 //read 3rd row - vst1.u8 {q6}, [r2], r3 //write 2nd 16Byte - - FILTER_6TAG_8BITS d4, d6, d8, d10, d0, d2, d12, q14, q15 - pld [r0] - FILTER_6TAG_8BITS d5, d7, d9, d11, d1, d3, d13, q14, q15 - vld1.u8 {q2}, [r0], r1 //read 4th row - vst1.u8 {q6}, [r2], r3 //write 3rd 16Byte - - FILTER_6TAG_8BITS d6, d8, d10, d0, d2, d4, d12, q14, q15 - pld [r0] - FILTER_6TAG_8BITS d7, d9, d11, d1, d3, d5, d13, q14, q15 - vld1.u8 {q3}, [r0], r1 //read 5th row - vst1.u8 {q6}, [r2], r3 //write 4th 16Byte - - FILTER_6TAG_8BITS d8, d10, d0, d2, d4, d6, d12, q14, q15 - pld [r0] - FILTER_6TAG_8BITS d9, d11, d1, d3, d5, d7, d13, q14, q15 - vld1.u8 {q4}, [r0], r1 //read 6th row - vst1.u8 {q6}, [r2], r3 //write 5th 16Byte - - FILTER_6TAG_8BITS d10, d0, d2, d4, d6, d8, d12, q14, q15 - pld [r0] - FILTER_6TAG_8BITS d11, d1, d3, d5, d7, d9, d13, q14, q15 - vld1.u8 {q5}, [r0], r1 //read 7th row - vst1.u8 {q6}, [r2], r3 //write 6th 16Byte - - FILTER_6TAG_8BITS d0, d2, d4, d6, d8, d10, d12, q14, q15 - pld [r0] - FILTER_6TAG_8BITS d1, d3, d5, d7, d9, d11, d13, q14, q15 - vld1.u8 {q0}, [r0], r1 //read 8th row - vst1.u8 {q6}, [r2], r3 //write 7th 16Byte - - FILTER_6TAG_8BITS d2, d4, d6, d8, d10, d0, d12, q14, q15 - pld [r0] - FILTER_6TAG_8BITS d3, d5, d7, d9, d11, d1, d13, q14, q15 - vst1.u8 {q6}, [r2], r3 //write 8th 16Byte - - //q2, q3, q4, q5, q0 --> q0~q4 - vswp q0, q4 - vswp q0, q2 - vmov q1, q3 - vmov q3, q5 //q0~q4 - - sub r4, #8 - cmp r4, #0 - bne w16_v_mc_luma_loop - pop {r4} - WELS_ASM_FUNC_END - - WELS_ASM_FUNC_BEGIN enc_mc_luma_w17_v_neon - push {r4} - ldr r4, [sp, #4] - - sub r0, r1, lsl #1 //src[-2*src_stride] - pld [r0] - pld [r0, r1] - vmov.u16 q14, #0x0014 // 20 - vld1.u8 {q0}, [r0], r1 //q0=src[-2] - vld1.u8 {q1}, [r0], r1 //q1=src[-1] - - pld [r0] - pld [r0, r1] - vshr.u16 q15, q14, #2 // 5 - vld1.u8 {q2}, [r0], r1 //q2=src[0] - vld1.u8 {q3}, [r0], r1 //q3=src[1] - vld1.u8 {q4}, [r0], r1 //q4=src[2] - -w17_v_mc_luma_loop: - - vld1.u8 {q5}, [r0], r1 //q5=src[3] - - FILTER_6TAG_8BITS d0, d2, d4, d6, d8, d10, d12, q14, q15 - pld [r0] - FILTER_6TAG_8BITS d1, d3, d5, d7, d9, d11, d13, q14, q15 - vld1.u8 {q0}, [r0], r1 //read 2nd row - vst1.u8 {q6}, [r2], r3 //write 1st 16Byte - - FILTER_6TAG_8BITS d2, d4, d6, d8, d10, d0, d12, q14, q15 - pld [r0] - FILTER_6TAG_8BITS d3, d5, d7, d9, d11, d1, d13, q14, q15 - vld1.u8 {q1}, [r0], r1 //read 3rd row - vst1.u8 {q6}, [r2], r3 //write 2nd 16Byte - - FILTER_6TAG_8BITS d4, d6, d8, d10, d0, d2, d12, q14, q15 - pld [r0] - FILTER_6TAG_8BITS d5, d7, d9, d11, d1, d3, d13, q14, q15 - vld1.u8 {q2}, [r0], r1 //read 4th row - vst1.u8 {q6}, [r2], r3 //write 3rd 16Byte - - FILTER_6TAG_8BITS d6, d8, d10, d0, d2, d4, d12, q14, q15 - pld [r0] - FILTER_6TAG_8BITS d7, d9, d11, d1, d3, d5, d13, q14, q15 - vld1.u8 {q3}, [r0], r1 //read 5th row - vst1.u8 {q6}, [r2], r3 //write 4th 16Byte - - FILTER_6TAG_8BITS d8, d10, d0, d2, d4, d6, d12, q14, q15 - pld [r0] - FILTER_6TAG_8BITS d9, d11, d1, d3, d5, d7, d13, q14, q15 - vld1.u8 {q4}, [r0], r1 //read 6th row - vst1.u8 {q6}, [r2], r3 //write 5th 16Byte - - FILTER_6TAG_8BITS d10, d0, d2, d4, d6, d8, d12, q14, q15 - pld [r0] - FILTER_6TAG_8BITS d11, d1, d3, d5, d7, d9, d13, q14, q15 - vld1.u8 {q5}, [r0], r1 //read 7th row - vst1.u8 {q6}, [r2], r3 //write 6th 16Byte - - FILTER_6TAG_8BITS d0, d2, d4, d6, d8, d10, d12, q14, q15 - pld [r0] - FILTER_6TAG_8BITS d1, d3, d5, d7, d9, d11, d13, q14, q15 - 
vld1.u8 {q0}, [r0], r1 //read 8th row - vst1.u8 {q6}, [r2], r3 //write 7th 16Byte - - FILTER_6TAG_8BITS d2, d4, d6, d8, d10, d0, d12, q14, q15 - pld [r0] - FILTER_6TAG_8BITS d3, d5, d7, d9, d11, d1, d13, q14, q15 - vst1.u8 {q6}, [r2], r3 //write 8th 16Byte - - //q2, q3, q4, q5, q0 --> q0~q4 - vswp q0, q4 - vswp q0, q2 - vmov q1, q3 - vmov q3, q5 //q0~q4 - - sub r4, #8 - cmp r4, #1 - bne w17_v_mc_luma_loop - // the last 16Bytes - vld1.u8 {q5}, [r0], r1 //q5=src[3] - FILTER_6TAG_8BITS d0, d2, d4, d6, d8, d10, d12, q14, q15 - FILTER_6TAG_8BITS d1, d3, d5, d7, d9, d11, d13, q14, q15 - vst1.u8 {q6}, [r2], r3 //write 1st 16Byte - - pop {r4} -WELS_ASM_FUNC_END - - -WELS_ASM_FUNC_BEGIN enc_mc_luma_w9_v_neon - push {r4} - ldr r4, [sp, #4] - - sub r0, r1, lsl #1 //src[-2*src_stride] - pld [r0] - pld [r0, r1] - vmov.u16 q14, #0x0014 // 20 - vld1.u8 {d0}, [r0], r1 //d0=src[-2] - vld1.u8 {d1}, [r0], r1 //d1=src[-1] - - pld [r0] - pld [r0, r1] - vshr.u16 q15, q14, #2 // 5 - vld1.u8 {d2}, [r0], r1 //d2=src[0] - vld1.u8 {d3}, [r0], r1 //d3=src[1] - - vld1.u8 {d4}, [r0], r1 //d4=src[2] - vld1.u8 {d5}, [r0], r1 //d5=src[3] - -w9_v_mc_luma_loop: - - pld [r0] - FILTER_6TAG_8BITS d0, d1, d2, d3, d4, d5, d12, q14, q15 - vld1.u8 {d0}, [r0], r1 //read 2nd row - vst1.u8 {d12}, [r2], r3 //write 1st 8Byte - - pld [r0] - FILTER_6TAG_8BITS d1, d2, d3, d4, d5, d0, d12, q14, q15 - vld1.u8 {d1}, [r0], r1 //read 3rd row - vst1.u8 {d12}, [r2], r3 //write 2nd 8Byte - - pld [r0] - FILTER_6TAG_8BITS d2, d3, d4, d5, d0, d1, d12, q14, q15 - vld1.u8 {d2}, [r0], r1 //read 4th row - vst1.u8 {d12}, [r2], r3 //write 3rd 8Byte - - pld [r0] - FILTER_6TAG_8BITS d3, d4, d5, d0, d1, d2, d12, q14, q15 - vld1.u8 {d3}, [r0], r1 //read 5th row - vst1.u8 {d12}, [r2], r3 //write 4th 8Byte - - //d4, d5, d0, d1, d2, d3 --> d0, d1, d2, d3, d4, d5 - vswp q0, q2 - vswp q1, q2 - - sub r4, #4 - cmp r4, #1 - bne w9_v_mc_luma_loop - - FILTER_6TAG_8BITS d0, d1, d2, d3, d4, d5, d12, q14, q15 - vst1.u8 {d12}, [r2], r3 //write last 8Byte - - pop {r4} -WELS_ASM_FUNC_END - - -WELS_ASM_FUNC_BEGIN enc_mc_luma_w4_v_neon - push {r4, r5, r6, r7} - sub r0, r1, lsl #1 //src[-2*src_stride] - pld [r0] - pld [r0, r1] - vmov.u16 q14, #0x0014 // 20 - ldr r4, [r0], r1 //r4=src[-2] - ldr r5, [r0], r1 //r5=src[-1] - - pld [r0] - pld [r0, r1] - vshr.u16 q15, q14, #2 // 5 - ldr r6, [r0], r1 //r6=src[0] - ldr r7, [r0], r1 //r7=src[1] - - vmov d0, r4, r5 - vmov d1, r5, r6 - vmov d2, r6, r7 - - ldr r4, [r0], r1 //r4=src[2] - vmov d3, r7, r4 - ldr r7, [sp, #16] - -w4_v_mc_luma_loop: - -// pld [r0] - //using reserving r4 - ldr r5, [r0], r1 //r5=src[3] - ldr r6, [r0], r1 //r6=src[0] - vmov d4, r4, r5 - vmov d5, r5, r6 //reserved r6 - - FILTER_6TAG_8BITS d0, d1, d2, d3, d4, d5, d12, q14, q15 - vmov r4, r5, d12 - str r4, [r2], r3 //write 1st 4Byte - str r5, [r2], r3 //write 2nd 4Byte - - ldr r5, [r0], r1 //r5=src[1] - ldr r4, [r0], r1 //r4=src[2] - vmov d0, r6, r5 - vmov d1, r5, r4 //reserved r4 - - FILTER_6TAG_8BITS d2, d3, d4, d5, d0, d1, d12, q14, q15 - vmov r5, r6, d12 - str r5, [r2], r3 //write 3rd 4Byte - str r6, [r2], r3 //write 4th 4Byte - - //d4, d5, d0, d1 --> d0, d1, d2, d3 - vmov q1, q0 - vmov q0, q2 - - sub r7, #4 - cmp r7, #0 - bne w4_v_mc_luma_loop - - pop {r4, r5, r6, r7} -WELS_ASM_FUNC_END - - -WELS_ASM_FUNC_BEGIN enc_mc_luma_w16_hv_neon - push {r4} - ldr r4, [sp, #4] - - sub r0, #2 //src[-2] - sub r0, r1, lsl #1 //src[-2*src_stride-2] - pld [r0] - pld [r0, r1] - - vmov.u16 q14, #0x0014 // 20 - vld1.u8 {d0-d2}, [r0], r1 //use 21(16+5), =src[-2] - vld1.u8 {d3-d5}, 
[r0], r1 //use 21(16+5), =src[-1] - - pld [r0] - pld [r0, r1] - vshr.u16 q15, q14, #2 // 5 - - vld1.u8 {d6-d8}, [r0], r1 //use 21(16+5), =src[0] - vld1.u8 {d9-d11}, [r0], r1 //use 21(16+5), =src[1] - pld [r0] - pld [r0, r1] - vld1.u8 {d12-d14}, [r0], r1 //use 21(16+5), =src[2] - -w16_hv_mc_luma_loop: - - vld1.u8 {d15-d17}, [r0], r1 //use 21(16+5), =src[3] - //the 1st row - pld [r0] - // vertical filtered into q9/q10 - FILTER_6TAG_8BITS_TO_16BITS d0, d3, d6, d9, d12, d15, q9, q14, q15 // 8 avail - FILTER_6TAG_8BITS_TO_16BITS d1, d4, d7,d10, d13, d16,q10, q14, q15 // 8 avail - // horizon filtered - UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13 - FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d0 //output to q0[0] - // vertical filtered into q10/q11 - FILTER_6TAG_8BITS_TO_16BITS d2, d5, d8,d11, d14, d17,q11, q14, q15 // only 5 avail - // horizon filtered - UNPACK_2_16BITS_TO_ABC q10, q11, q9, q12, q13 - FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d1 //output to q0[1] - vst1.u8 {q0}, [r2], r3 //write 16Byte - - - vld1.u8 {d0-d2}, [r0], r1 //read 2nd row - //the 2nd row - pld [r0] - // vertical filtered into q9/q10 - FILTER_6TAG_8BITS_TO_16BITS d3, d6, d9, d12, d15, d0, q9, q14, q15 // 8 avail - FILTER_6TAG_8BITS_TO_16BITS d4, d7,d10, d13, d16, d1,q10, q14, q15 // 8 avail - // horizon filtered - UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13 - FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d3 //output to d3 - // vertical filtered into q10/q11 - FILTER_6TAG_8BITS_TO_16BITS d5, d8,d11, d14, d17, d2,q11, q14, q15 // only 5 avail - // horizon filtered - UNPACK_2_16BITS_TO_ABC q10, q11, q9, q12, q13 - FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d4 //output to d4 - vst1.u8 {d3, d4}, [r2], r3 //write 16Byte - - vld1.u8 {d3-d5}, [r0], r1 //read 3rd row - //the 3rd row - pld [r0] - // vertical filtered into q9/q10 - FILTER_6TAG_8BITS_TO_16BITS d6, d9, d12, d15, d0, d3, q9, q14, q15 // 8 avail - FILTER_6TAG_8BITS_TO_16BITS d7,d10, d13, d16, d1, d4,q10, q14, q15 // 8 avail - // horizon filtered - UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13 - FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d6 //output to d6 - // vertical filtered into q10/q11 - FILTER_6TAG_8BITS_TO_16BITS d8,d11, d14, d17, d2, d5,q11, q14, q15 // only 5 avail - // horizon filtered - UNPACK_2_16BITS_TO_ABC q10, q11, q9, q12, q13 - FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d7 //output to d7 - vst1.u8 {d6, d7}, [r2], r3 //write 16Byte - - vld1.u8 {d6-d8}, [r0], r1 //read 4th row - //the 4th row - pld [r0] - // vertical filtered into q9/q10 - FILTER_6TAG_8BITS_TO_16BITS d9, d12, d15, d0, d3, d6, q9, q14, q15 // 8 avail - FILTER_6TAG_8BITS_TO_16BITS d10, d13, d16, d1, d4, d7,q10, q14, q15 // 8 avail - // horizon filtered - UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13 - FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d9 //output to d9 - // vertical filtered into q10/q11 - FILTER_6TAG_8BITS_TO_16BITS d11, d14, d17, d2, d5, d8,q11, q14, q15 // only 5 avail - // horizon filtered - UNPACK_2_16BITS_TO_ABC q10, q11, q9, q12, q13 - FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d10 //output to d10 - vst1.u8 {d9, d10}, [r2], r3 //write 16Byte - - //d12~d17(q6~q8), d0~d8(q0~q3+d8), --> d0~d14 - vswp q0, q6 - vswp q6, q3 - vmov q5, q2 - vmov q2, q8 - - vmov d20,d8 - vmov q4, q1 - vmov q1, q7 - vmov d14,d20 - - sub r4, #4 - cmp r4, #0 - bne w16_hv_mc_luma_loop - pop {r4} -WELS_ASM_FUNC_END - - -WELS_ASM_FUNC_BEGIN enc_mc_luma_w17_hv_neon - push {r4} - ldr r4, [sp, #4] - - sub r0, #2 //src[-2] - sub r0, r1, lsl #1 //src[-2*src_stride-2] - pld [r0] - pld [r0, r1] - - vmov.u16 q14, 
#0x0014 // 20 - vld1.u8 {d0-d2}, [r0], r1 //use 21(17+5), =src[-2] - vld1.u8 {d3-d5}, [r0], r1 //use 21(17+5), =src[-1] - - pld [r0] - pld [r0, r1] - vshr.u16 q15, q14, #2 // 5 - - vld1.u8 {d6-d8}, [r0], r1 //use 21(17+5), =src[0] - vld1.u8 {d9-d11}, [r0], r1 //use 21(17+5), =src[1] - pld [r0] - pld [r0, r1] - vld1.u8 {d12-d14}, [r0], r1 //use 21(17+5), =src[2] - sub r3, #16 - -w17_hv_mc_luma_loop: - - vld1.u8 {d15-d17}, [r0], r1 //use 21(17+5), =src[3] - //the 1st row - pld [r0] - // vertical filtered into q9/q10 - FILTER_6TAG_8BITS_TO_16BITS d0, d3, d6, d9, d12, d15, q9, q14, q15 // 8 avail - FILTER_6TAG_8BITS_TO_16BITS d1, d4, d7,d10, d13, d16,q10, q14, q15 // 8 avail - // horizon filtered - UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13 - FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d0 //output to q0[0] - // vertical filtered into q10/q11 - FILTER_6TAG_8BITS_TO_16BITS d2, d5, d8,d11, d14, d17,q11, q14, q15 // only 6 avail - // horizon filtered - UNPACK_2_16BITS_TO_ABC q10, q11, q9, q12, q13 - FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d1 //output to q0[1] - vst1.u8 {d0, d1}, [r2]! //write 16Byte - UNPACK_1_IN_8x16BITS_TO_8BITS d2, d22, d23, q11 //output to d2[0] - vst1.u8 {d2[0]}, [r2], r3 //write 16th Byte - - vld1.u8 {d0-d2}, [r0], r1 //read 2nd row - //the 2nd row - pld [r0] - // vertical filtered into q9/q10 - FILTER_6TAG_8BITS_TO_16BITS d3, d6, d9, d12, d15, d0, q9, q14, q15 // 8 avail - FILTER_6TAG_8BITS_TO_16BITS d4, d7,d10, d13, d16, d1,q10, q14, q15 // 8 avail - // horizon filtered - UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13 - FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d3 //output to d3 - // vertical filtered into q10/q11 - FILTER_6TAG_8BITS_TO_16BITS d5, d8,d11, d14, d17, d2,q11, q14, q15 // only 6 avail - // horizon filtered - UNPACK_2_16BITS_TO_ABC q10, q11, q9, q12, q13 - FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d4 //output to d4 - vst1.u8 {d3, d4}, [r2]! //write 16Byte - UNPACK_1_IN_8x16BITS_TO_8BITS d5, d22, d23, q11 //output to d5[0] - vst1.u8 {d5[0]}, [r2], r3 //write 16th Byte - - vld1.u8 {d3-d5}, [r0], r1 //read 3rd row - //the 3rd row - pld [r0] - // vertical filtered into q9/q10 - FILTER_6TAG_8BITS_TO_16BITS d6, d9, d12, d15, d0, d3, q9, q14, q15 // 8 avail - FILTER_6TAG_8BITS_TO_16BITS d7,d10, d13, d16, d1, d4,q10, q14, q15 // 8 avail - // horizon filtered - UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13 - FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d6 //output to d6 - // vertical filtered into q10/q11 - FILTER_6TAG_8BITS_TO_16BITS d8,d11, d14, d17, d2, d5,q11, q14, q15 // only 6 avail - // horizon filtered - UNPACK_2_16BITS_TO_ABC q10, q11, q9, q12, q13 - FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d7 //output to d7 - vst1.u8 {d6, d7}, [r2]! 
//write 16Byte - UNPACK_1_IN_8x16BITS_TO_8BITS d8, d22, d23, q11 //output to d8[0] - vst1.u8 {d8[0]}, [r2], r3 //write 16th Byte - - vld1.u8 {d6-d8}, [r0], r1 //read 4th row - //the 4th row - pld [r0] - // vertical filtered into q9/q10 - FILTER_6TAG_8BITS_TO_16BITS d9, d12, d15, d0, d3, d6, q9, q14, q15 // 8 avail - FILTER_6TAG_8BITS_TO_16BITS d10, d13, d16, d1, d4, d7,q10, q14, q15 // 8 avail - // horizon filtered - UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13 - FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d9 //output to d9 - // vertical filtered into q10/q11 - FILTER_6TAG_8BITS_TO_16BITS d11, d14, d17, d2, d5, d8,q11, q14, q15 // only 6 avail - // horizon filtered - UNPACK_2_16BITS_TO_ABC q10, q11, q9, q12, q13 - FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d10 //output to d10 - vst1.u8 {d9, d10}, [r2], r3 //write 16Byte - UNPACK_1_IN_8x16BITS_TO_8BITS d11, d22, d23, q11 //output to d11[0] - vst1.u8 {d11[0]}, [r2], r3 //write 16th Byte - - //d12~d17(q6~q8), d0~d8(q0~q3+d8), --> d0~d14 - vswp q0, q6 - vswp q6, q3 - vmov q5, q2 - vmov q2, q8 - - vmov d20,d8 - vmov q4, q1 - vmov q1, q7 - vmov d14,d20 - - sub r4, #4 - cmp r4, #1 - bne w17_hv_mc_luma_loop - //the last row - vld1.u8 {d15-d17}, [r0], r1 //use 21(17+5), =src[3] - // vertical filtered into q9/q10 - FILTER_6TAG_8BITS_TO_16BITS d0, d3, d6, d9, d12, d15, q9, q14, q15 // 8 avail - FILTER_6TAG_8BITS_TO_16BITS d1, d4, d7,d10, d13, d16,q10, q14, q15 // 8 avail - // horizon filtered - UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13 - FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d0 //output to q0[0] - // vertical filtered into q10/q11 - FILTER_6TAG_8BITS_TO_16BITS d2, d5, d8,d11, d14, d17,q11, q14, q15 // only 6 avail - // horizon filtered - UNPACK_2_16BITS_TO_ABC q10, q11, q9, q12, q13 - FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d1 //output to q0[1] - vst1.u8 {q0}, [r2]! //write 16Byte - UNPACK_1_IN_8x16BITS_TO_8BITS d2, d22, d23, q11 //output to d2[0] - vst1.u8 {d2[0]}, [r2], r3 //write 16th Byte - - pop {r4} -WELS_ASM_FUNC_END - - -WELS_ASM_FUNC_BEGIN enc_mc_luma_w9_hv_neon - push {r4} - ldr r4, [sp, #4] - - sub r0, #2 //src[-2] - sub r0, r1, lsl #1 //src[-2*src_stride-2] - pld [r0] - pld [r0, r1] - - vmov.u16 q14, #0x0014 // 20 - vld1.u8 {q0}, [r0], r1 //use 14(9+5), =src[-2] - vld1.u8 {q1}, [r0], r1 //use 14(9+5), =src[-1] - - pld [r0] - pld [r0, r1] - vshr.u16 q15, q14, #2 // 5 - - vld1.u8 {q2}, [r0], r1 //use 14(9+5), =src[0] - vld1.u8 {q3}, [r0], r1 //use 14(9+5), =src[1] - pld [r0] - pld [r0, r1] - vld1.u8 {q4}, [r0], r1 //use 14(9+5), =src[2] - sub r3, #8 - -w9_hv_mc_luma_loop: - - vld1.u8 {q5}, [r0], r1 //use 14(9+5), =src[3] - //the 1st row - pld [r0] - // vertical filtered into q6/q7 - FILTER_6TAG_8BITS_TO_16BITS d0, d2, d4, d6, d8, d10, q6, q14, q15 // 8 avail - FILTER_6TAG_8BITS_TO_16BITS d1, d3, d5, d7, d9, d11, q7, q14, q15 // 6 avail - // horizon filtered - UNPACK_2_16BITS_TO_ABC q6, q7, q11, q12, q13 - FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d12 //output to q6[0] - vst1.u8 d12, [r2]! //write 8Byte - UNPACK_1_IN_8x16BITS_TO_8BITS d13, d14, d15, q7 //output to d13[0] - vst1.u8 {d13[0]}, [r2], r3 //write 8th Byte - - vld1.u8 {q0}, [r0], r1 //read 2nd row - //the 2nd row - pld [r0] - // vertical filtered into q6/q7 - FILTER_6TAG_8BITS_TO_16BITS d2, d4, d6, d8, d10, d0, q6, q14, q15 // 8 avail - FILTER_6TAG_8BITS_TO_16BITS d3, d5, d7, d9, d11, d1, q7, q14, q15 // 6 avail - // horizon filtered - UNPACK_2_16BITS_TO_ABC q6, q7, q11, q12, q13 - FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d12 //output to q6[0] - vst1.u8 d12, [r2]! 
//write 8Byte - UNPACK_1_IN_8x16BITS_TO_8BITS d13, d14, d15, q7 //output to d13[0] - vst1.u8 {d13[0]}, [r2], r3 //write 8th Byte - - vld1.u8 {q1}, [r0], r1 //read 3rd row - //the 3rd row - pld [r0] - // vertical filtered into q6/q7 - FILTER_6TAG_8BITS_TO_16BITS d4, d6, d8, d10, d0, d2, q6, q14, q15 // 8 avail - FILTER_6TAG_8BITS_TO_16BITS d5, d7, d9, d11, d1, d3, q7, q14, q15 // 6 avail - // horizon filtered - UNPACK_2_16BITS_TO_ABC q6, q7, q11, q12, q13 - FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d12 //output to q6[0] - vst1.u8 d12, [r2]! //write 8Byte - UNPACK_1_IN_8x16BITS_TO_8BITS d13, d14, d15, q7 //output to d13[0] - vst1.u8 {d13[0]}, [r2], r3 //write 8th Byte - - vld1.u8 {q2}, [r0], r1 //read 4th row - //the 4th row - pld [r0] - // vertical filtered into q6/q7 - FILTER_6TAG_8BITS_TO_16BITS d6, d8, d10, d0, d2, d4, q6, q14, q15 // 8 avail - FILTER_6TAG_8BITS_TO_16BITS d7, d9, d11, d1, d3, d5, q7, q14, q15 // 6 avail - // horizon filtered - UNPACK_2_16BITS_TO_ABC q6, q7, q11, q12, q13 - FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d12 //output to q6[0] - vst1.u8 d12, [r2]! //write 8Byte - UNPACK_1_IN_8x16BITS_TO_8BITS d13, d14, d15, q7 //output to d13[0] - vst1.u8 {d13[0]}, [r2], r3 //write 8th Byte - - //q4~q5, q0~q2, --> q0~q4 - vswp q0, q4 - vswp q2, q4 - vmov q3, q1 - vmov q1, q5 - - sub r4, #4 - cmp r4, #1 - bne w9_hv_mc_luma_loop - //the last row - vld1.u8 {q5}, [r0], r1 //use 14(9+5), =src[3] - // vertical filtered into q6/q7 - FILTER_6TAG_8BITS_TO_16BITS d0, d2, d4, d6, d8, d10, q6, q14, q15 // 8 avail - FILTER_6TAG_8BITS_TO_16BITS d1, d3, d5, d7, d9, d11, q7, q14, q15 // 6 avail - // horizon filtered - UNPACK_2_16BITS_TO_ABC q6, q7, q11, q12, q13 - FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d12 //output to q6[0] - vst1.u8 d12, [r2]! //write 8Byte - UNPACK_1_IN_8x16BITS_TO_8BITS d13, d14, d15, q7 //output to d13[0] - vst1.u8 {d13[0]}, [r2], r3 //write 8th Byte - pop {r4} -WELS_ASM_FUNC_END - - -WELS_ASM_FUNC_BEGIN enc_mc_luma_w4_hv_neon - push {r4 ,r5, r6} - ldr r6, [sp, #12] - - sub r0, #2 //src[-2] - sub r0, r1, lsl #1 //src[-2*src_stride-2] - pld [r0] - pld [r0, r1] - - vmov.u16 q14, #0x0014 // 20 - vld1.u8 {q0}, [r0], r1 //use 9(4+5), =src[-2] - vld1.u8 {q1}, [r0], r1 //use 9(4+5), =src[-1] - - pld [r0] - pld [r0, r1] - vshr.u16 q15, q14, #2 // 5 - - vld1.u8 {q2}, [r0], r1 //use 9(4+5), =src[0] - vld1.u8 {q3}, [r0], r1 //use 9(4+5), =src[1] - pld [r0] - pld [r0, r1] - vld1.u8 {q4}, [r0], r1 //use 9(4+5), =src[2] - -w4_hv_mc_luma_loop: - - vld1.u8 {q5}, [r0], r1 //use 9(4+5), =src[3] - vld1.u8 {q6}, [r0], r1 //use 9(4+5), =src[4] - - //the 1st&2nd row - pld [r0] - pld [r0, r1] - // vertical filtered - FILTER_6TAG_8BITS_TO_16BITS d0, d2, d4, d6, d8, d10, q7, q14, q15 // 8 avail - FILTER_6TAG_8BITS_TO_16BITS d1, d3, d5, d7, d9, d11, q8, q14, q15 // 1 avail - - FILTER_6TAG_8BITS_TO_16BITS d2, d4, d6, d8,d10, d12, q9, q14, q15 // 8 avail - FILTER_6TAG_8BITS_TO_16BITS d3, d5, d7, d9,d11, d13,q10, q14, q15 // 1 avail - // horizon filtered - UNPACK_2_16BITS_TO_ABC q7, q8, q11, q12, q13 //4 avail - UNPACK_2_16BITS_TO_ABC q9,q10, q0, q7, q8 //4 avail - - vmov d23, d0 - vmov d25, d14 - vmov d27, d16 - - FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d22 //output to q11[0] - vmov r4 ,r5, d22 - str r4, [r2], r3 //write 4Byte - str r5, [r2], r3 //write 4Byte - - //the 3rd&4th row - vld1.u8 {q0}, [r0], r1 //use 9(4+5), =src[3] - vld1.u8 {q1}, [r0], r1 //use 9(4+5), =src[4] - pld [r0] - pld [r0, r1] - // vertical filtered - FILTER_6TAG_8BITS_TO_16BITS d4, d6, d8, d10, d12, d0, q7, q14, q15 // 
8 avail - FILTER_6TAG_8BITS_TO_16BITS d5, d7, d9, d11, d13, d1, q8, q14, q15 // 1 avail - - FILTER_6TAG_8BITS_TO_16BITS d6, d8,d10, d12, d0, d2, q9, q14, q15 // 8 avail - FILTER_6TAG_8BITS_TO_16BITS d7, d9,d11, d13, d1, d3,q10, q14, q15 // 1 avail - // horizon filtered - UNPACK_2_16BITS_TO_ABC q7, q8, q11, q12, q13 //4 avail - UNPACK_2_16BITS_TO_ABC q9,q10, q2, q7, q8 //4 avail - - vmov d23, d4 - vmov d25, d14 - vmov d27, d16 - - FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d22 //output to q11[0] - vmov r4 ,r5, d22 - str r4, [r2], r3 //write 4Byte - str r5, [r2], r3 //write 4Byte - - //q4~q6, q0~q1, --> q0~q4 - vswp q4, q0 - vmov q3, q4 - vmov q4, q1 - vmov q1, q5 - vmov q2, q6 - - sub r6, #4 - cmp r6, #0 - bne w4_hv_mc_luma_loop - - pop {r4, r5, r6} -WELS_ASM_FUNC_END - - -WELS_ASM_FUNC_BEGIN enc_mc_copy_w16_neon - push {r4} - ldr r4, [sp, #4] -w16_copy_loop: - vld1.u8 {q0}, [r0], r1 - vld1.u8 {q1}, [r0], r1 - vst1.u8 {q0}, [r2], r3 - vst1.u8 {q1}, [r2], r3 - sub r4, #2 - cmp r4, #0 - bne w16_copy_loop - - pop {r4} -WELS_ASM_FUNC_END - - -WELS_ASM_FUNC_BEGIN enc_mc_copy_w8_neon - push {r4} - ldr r4, [sp, #4] -w8_copy_loop: - vld1.u8 {d0}, [r0], r1 - vld1.u8 {d1}, [r0], r1 - vst1.u8 {d0}, [r2], r3 - vst1.u8 {d1}, [r2], r3 - sub r4, #2 - cmp r4, #0 - bne w8_copy_loop - - pop {r4} -WELS_ASM_FUNC_END - - -WELS_ASM_FUNC_BEGIN enc_mc_copy_w4_neon - push {r4, r5, r6} - ldr r4, [sp, #12] -w4_copy_loop: - ldr r5, [r0], r1 - ldr r6, [r0], r1 - str r5, [r2], r3 - str r6, [r2], r3 - - sub r4, #2 - cmp r4, #0 - bne w4_copy_loop - - pop {r4, r5, r6} -WELS_ASM_FUNC_END - - -WELS_ASM_FUNC_BEGIN enc_pixel_avg_w16_neon - push {r4} - ldr r4, [sp, #4] -w16_pix_avg_loop: - vld1.u8 {q0}, [r2]! - vld1.u8 {q1}, [r3]! - vld1.u8 {q2}, [r2]! - vld1.u8 {q3}, [r3]! - - vld1.u8 {q4}, [r2]! - vld1.u8 {q5}, [r3]! - vld1.u8 {q6}, [r2]! - vld1.u8 {q7}, [r3]! 
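enc_pixel_avg_w16 and the enc_pix_avg_* routines here blend two predictions with a per-byte rounding average; AVERAGE_TWO_8BITS (vaddl.u8 followed by vrshrn.u16 #1) computes exactly (a + b + 1) >> 1. A scalar equivalent, with illustrative function names:

    #include <stdint.h>

    /* Per-byte rounding average, the scalar meaning of AVERAGE_TWO_8BITS. */
    static inline uint8_t avg_round(uint8_t a, uint8_t b) {
        return (uint8_t)(((unsigned)a + b + 1) >> 1);
    }

    /* Width-16 block average of two strided sources into dst. */
    void pixel_avg_w16(uint8_t* dst, int dst_stride,
                       const uint8_t* a, int a_stride,
                       const uint8_t* b, int b_stride, int height) {
        for (int y = 0; y < height; y++) {
            for (int x = 0; x < 16; x++)
                dst[x] = avg_round(a[x], b[x]);
            dst += dst_stride; a += a_stride; b += b_stride;
        }
    }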
- - AVERAGE_TWO_8BITS d0, d0, d2 - AVERAGE_TWO_8BITS d1, d1, d3 - vst1.u8 {q0}, [r0], r1 - - AVERAGE_TWO_8BITS d4, d4, d6 - AVERAGE_TWO_8BITS d5, d5, d7 - vst1.u8 {q2}, [r0], r1 - - AVERAGE_TWO_8BITS d8, d8, d10 - AVERAGE_TWO_8BITS d9, d9, d11 - vst1.u8 {q4}, [r0], r1 - - AVERAGE_TWO_8BITS d12, d12, d14 - AVERAGE_TWO_8BITS d13, d13, d15 - vst1.u8 {q6}, [r0], r1 - - sub r4, #4 - cmp r4, #0 - bne w16_pix_avg_loop - - pop {r4} -WELS_ASM_FUNC_END - - -WELS_ASM_FUNC_BEGIN enc_pix_avg_w16_neon - push {r4, r5, r6} - ldr r4, [sp, #12] - ldr r5, [sp, #16] - ldr r6, [sp, #20] - -enc_w16_pix_avg_loop: - vld1.u8 {q0}, [r2], r3 - vld1.u8 {q1}, [r4], r5 - vld1.u8 {q2}, [r2], r3 - vld1.u8 {q3}, [r4], r5 - - vld1.u8 {q4}, [r2], r3 - vld1.u8 {q5}, [r4], r5 - vld1.u8 {q6}, [r2], r3 - vld1.u8 {q7}, [r4], r5 - - AVERAGE_TWO_8BITS d0, d0, d2 - AVERAGE_TWO_8BITS d1, d1, d3 - vst1.u8 {q0}, [r0], r1 - - AVERAGE_TWO_8BITS d4, d4, d6 - AVERAGE_TWO_8BITS d5, d5, d7 - vst1.u8 {q2}, [r0], r1 - - AVERAGE_TWO_8BITS d8, d8, d10 - AVERAGE_TWO_8BITS d9, d9, d11 - vst1.u8 {q4}, [r0], r1 - - AVERAGE_TWO_8BITS d12, d12, d14 - AVERAGE_TWO_8BITS d13, d13, d15 - vst1.u8 {q6}, [r0], r1 - - sub r6, #4 - cmp r6, #0 - bne enc_w16_pix_avg_loop - - pop {r4, r5, r6} -WELS_ASM_FUNC_END - - -WELS_ASM_FUNC_BEGIN enc_pix_avg_w8_neon - push {r4, r5, r6} - ldr r4, [sp, #12] - ldr r5, [sp, #16] - ldr r6, [sp, #20] -enc_w8_pix_avg_loop: - - vld1.u8 {d0}, [r2], r3 - vld1.u8 {d2}, [r4], r5 - vld1.u8 {d1}, [r2], r3 - vld1.u8 {d3}, [r4], r5 - - AVERAGE_TWO_8BITS d0, d0, d2 - AVERAGE_TWO_8BITS d1, d1, d3 - vst1.u8 {d0}, [r0], r1 - vst1.u8 {d1}, [r0], r1 - - vld1.u8 {d4}, [r2], r3 - vld1.u8 {d6}, [r4], r5 - vld1.u8 {d5}, [r2], r3 - vld1.u8 {d7}, [r4], r5 - - AVERAGE_TWO_8BITS d4, d4, d6 - AVERAGE_TWO_8BITS d5, d5, d7 - vst1.u8 {d4}, [r0], r1 - vst1.u8 {d5}, [r0], r1 - - sub r6, #4 - cmp r6, #0 - bne enc_w8_pix_avg_loop - - pop {r4, r5, r6} -WELS_ASM_FUNC_END - - -WELS_ASM_FUNC_BEGIN enc_mc_chroma_w8_neon - - push {r4, r5} - ldr r4, [sp, #8] - ldr r5, [sp, #12] - vld1.u8 {d31}, [r4] //load A/B/C/D - vld1.u8 {q0}, [r0], r1 //src[x] - - vdup.u8 d28, d31[0] //A - vdup.u8 d29, d31[1] //B - vdup.u8 d30, d31[2] //C - vdup.u8 d31, d31[3] //D - - vext.u8 d1, d0, d1, #1 //src[x+1] - -w8_mc_chroma_loop: // each two pxl row - vld1.u8 {q1}, [r0], r1 //src[x+stride] - vld1.u8 {q2}, [r0], r1 //src[x+2*stride] - vext.u8 d3, d2, d3, #1 //src[x+stride+1] - vext.u8 d5, d4, d5, #1 //src[x+2*stride+1] - - vmull.u8 q3, d0, d28 //(src[x] * A) - vmlal.u8 q3, d1, d29 //+=(src[x+1] * B) - vmlal.u8 q3, d2, d30 //+=(src[x+stride] * C) - vmlal.u8 q3, d3, d31 //+=(src[x+stride+1] * D) - vrshrn.u16 d6, q3, #6 - vst1.u8 d6, [r2], r3 - - vmull.u8 q3, d2, d28 //(src[x] * A) - vmlal.u8 q3, d3, d29 //+=(src[x+1] * B) - vmlal.u8 q3, d4, d30 //+=(src[x+stride] * C) - vmlal.u8 q3, d5, d31 //+=(src[x+stride+1] * D) - vrshrn.u16 d6, q3, #6 - vst1.u8 d6, [r2], r3 - - vmov q0, q2 - sub r5, #2 - cmp r5, #0 - bne w8_mc_chroma_loop - - pop {r4, r5} -WELS_ASM_FUNC_END - - -WELS_ASM_FUNC_BEGIN enc_mc_chroma_w4_neon - - push {r4, r5, r6} - ldr r4, [sp, #12] - ldr r6, [sp, #16] - vld1.u8 {d31}, [r4] //load A/B/C/D - - vdup.u8 d28, d31[0] //A - vdup.u8 d29, d31[1] //B - vdup.u8 d30, d31[2] //C - vdup.u8 d31, d31[3] //D - -w4_mc_chroma_loop: // each two pxl row - vld1.u8 {d0}, [r0], r1 //a::src[x] - vld1.u8 {d2}, [r0], r1 //b::src[x+stride] - vld1.u8 {d4}, [r0] //c::src[x+2*stride] - - vshr.u64 d1, d0, #8 - vshr.u64 d3, d2, #8 - vshr.u64 d5, d4, #8 - - vmov q3, q1 //b::[0:7]+b::[1~8] - vtrn.32 q0, q1 
//d0{a::[0:3]+b::[0:3]}; d1{a::[1:4]+b::[1:4]}
-    vtrn.32 q3, q2    //d6{b::[0:3]+c::[0:3]}; d7{b::[1:4]+c::[1:4]}
-
-    vmull.u8 q1, d0, d28    //(src[x] * A)
-    vmlal.u8 q1, d1, d29    //+=(src[x+1] * B)
-    vmlal.u8 q1, d6, d30    //+=(src[x+stride] * C)
-    vmlal.u8 q1, d7, d31    //+=(src[x+stride+1] * D)
-
-    vrshrn.u16 d2, q1, #6
-    vmov r4, r5, d2
-    str r4, [r2], r3
-    str r5, [r2], r3
-
-    sub r6, #2
-    cmp r6, #0
-    bne w4_mc_chroma_loop
-
-    pop {r4, r5, r6}
-WELS_ASM_FUNC_END
-#endif
+/*!
+ * \copy
+ * Copyright (c) 2013, Cisco Systems
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ * + */ + +#ifdef HAVE_NEON +.text +#include "arm_arch_common_macro.S" + +#ifdef APPLE_IOS +.macro AVERAGE_TWO_8BITS +// { // input:dst_d, src_d A and B; working: q13 + vaddl.u8 q13, $2, $1 + vrshrn.u16 $0, q13, #1 +// } +.endm + +.macro FILTER_6TAG_8BITS +// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; + vaddl.u8 q12, $0, $5 //q12=src[-2]+src[3] + vaddl.u8 q13, $2, $3 //src[0]+src[1] + vmla.u16 q12, q13, $7 //q12 += 20*(src[0]+src[1]), 2 cycles + vaddl.u8 q13, $1, $4 //src[-1]+src[2] + vmls.s16 q12, q13, $8 //q12 -= 5*(src[-1]+src[2]), 2 cycles + vqrshrun.s16 $6, q12, #5 +// } +.endm + +.macro FILTER_SINGLE_TAG_8BITS // when width=17/9, used +// { // input: src_d{Y[0][1][2][3][4][5]X, the even of working_q2}, + vrev64.8 $2, $0 // X[5][4][3][2][1][0]O + vaddl.u8 $3, $0, $2 // each 16bits, *[50][41][32][23][14][05]* + vmul.s16 $0, $2, $1 // 0+1*[50]-5*[41]+20[32] + vpadd.s16 $0, $0, $0 + vpadd.s16 $0, $0, $0 + vqrshrun.s16 $0, $4, #5 +// } +.endm + +.macro FILTER_6TAG_8BITS_AVERAGE_WITH_0 +// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; + vaddl.u8 q12, $0, $5 //q12=src[-2]+src[3] + vaddl.u8 q13, $2, $3 //src[0]+src[1] + vmla.u16 q12, q13, $7 //q12 += 20*(src[0]+src[1]), 2 cycles + vaddl.u8 q13, $1, $4 //src[-1]+src[2] + vmls.s16 q12, q13, $8 //q12 -= 5*(src[-1]+src[2]), 2 cycles + vqrshrun.s16 $6, q12, #5 + vaddl.u8 q13, $2, $6 + vrshrn.u16 $6, q13, #1 +// } +.endm + +.macro FILTER_6TAG_8BITS_AVERAGE_WITH_1 +// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; + vaddl.u8 q12, $0, $5 //q12=src[-2]+src[3] + vaddl.u8 q13, $2, $3 //src[0]+src[1] + vmla.u16 q12, q13, $7 //q12 += 20*(src[0]+src[1]), 2 cycles + vaddl.u8 q13, $1, $4 //src[-1]+src[2] + vmls.s16 q12, q13, $8 //q12 -= 5*(src[-1]+src[2]), 2 cycles + vqrshrun.s16 $6, q12, #5 + vaddl.u8 q13, $3, $6 + vrshrn.u16 $6, q13, #1 +// } +.endm + +.macro FILTER_6TAG_8BITS_TO_16BITS +// { // input:d_src[-2], d_src[-1], d_src[0], d_src[1], d_src[2], d_src[3], dst_q, + vaddl.u8 $6, $0, $5 //dst_q=src[-2]+src[3] + vaddl.u8 q13, $2, $3 //src[0]+src[1] + vmla.u16 $6, q13, $7 //dst_q += 20*(src[0]+src[1]), 2 cycles + vaddl.u8 q13, $1, $4 //src[-1]+src[2] + vmls.s16 $6, q13, $8 //dst_q -= 5*(src[-1]+src[2]), 2 cycles +// } +.endm + +.macro FILTER_3_IN_16BITS_TO_8BITS +// { // input:a, b, c, dst_d; + vsub.s16 $0, $0, $1 //a-b + vshr.s16 $0, $0, #2 //(a-b)/4 + vsub.s16 $0, $0, $1 //(a-b)/4-b + vadd.s16 $0, $0, $2 //(a-b)/4-b+c + vshr.s16 $0, $0, #2 //((a-b)/4-b+c)/4 + vadd.s16 $0, $0, $2 //((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16 + vqrshrun.s16 $3, $0, #6 //(+32)>>6 +// } +.endm + +.macro UNPACK_2_16BITS_TO_ABC +// { // input:q_src[-2:5], q_src[6:13](avail 8+5)/q_src[6:**](avail 4+5), dst_a + vext.16 $4, $0, $1, #2 //src[0] + vext.16 $3, $0, $1, #3 //src[1] + vadd.s16 $4, $3 //c=src[0]+src[1] + + vext.16 $3, $0, $1, #1 //src[-1] + vext.16 $2, $0, $1, #4 //src[2] + vadd.s16 $3, $2 //b=src[-1]+src[2] + + vext.16 $2, $0, $1, #5 //src[3] + vadd.s16 $2, $0 //a=src[-2]+src[3] +// } +.endm + +.macro UNPACK_1_IN_8x16BITS_TO_8BITS +// { // each 16bits; input: d_dst, d_src[0:3] (even), d_src[4:5]+%% (odd) + vext.16 $3, $3, $3, #7 // 0x????, [0][1][2][3][4][5], + vrev64.16 $1, $1 + vadd.u16 $2, $1 // C[2+3],B[1+4],A[0+5], + vshr.s64 $1, $2, #16 + vshr.s64 $0, $2, #32 // Output: C $2, B $1, A $0 + + vsub.s16 $0, $0, $1 //a-b + vshr.s16 $0, $0, #2 //(a-b)/4 + vsub.s16 $0, $0, $1 //(a-b)/4-b + vadd.s16 $0, $0, $2 //(a-b)/4-b+c + vshr.s16 $0, $0, #2 
//((a-b)/4-b+c)/4 + vadd.s16 $1, $0, $2 //((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16 + vqrshrun.s16 $0, $3, #6 //(+32)>>6 +// } +.endm +#else +.macro AVERAGE_TWO_8BITS arg0, arg1,arg2 +// { // input:dst_d, src_d A and B; working: q13 + vaddl.u8 q13, \arg2, \arg1 + vrshrn.u16 \arg0, q13, #1 +// } +.endm + +.macro FILTER_6TAG_8BITS arg0, arg1,arg2, arg3, arg4,arg5, arg6, arg7,arg8 +// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b + vaddl.u8 q12, \arg0, \arg5 //q12=src[-2]+src[3] + vaddl.u8 q13, \arg2, \arg3 //src[0]+src[1] + vmla.u16 q12, q13, \arg7 //q12 += 20*(src[0]+src[1]), 2 cycles + vaddl.u8 q13, \arg1, \arg4 //src[-1]+src[2] + vmls.s16 q12, q13, \arg8 //q12 -= 5*(src[-1]+src[2]), 2 cycles + vqrshrun.s16 \arg6, q12, #5 +// } +.endm + +.macro FILTER_SINGLE_TAG_8BITS arg0, arg1,arg2, arg3, arg4,arg5 // when width=17/9, used +// { // input: src_d{Y[0][1][2][3][4][5]X, the even of working_q2} + vrev64.8 \arg2, \arg0 // X[5][4][3][2][1][0]O + vaddl.u8 \arg3, \arg0, \arg2 // each 16bits, *[50][41][32][23][14][05]* + vmul.s16 \arg0, \arg2, \arg1 // 0+1*[50]-5*[41]+20[32] + vpadd.s16 \arg0, \arg0, \arg0 + vpadd.s16 \arg0, \arg0, \arg0 + vqrshrun.s16 \arg0, \arg4, #5 +// } +.endm + +.macro FILTER_6TAG_8BITS_AVERAGE_WITH_0 arg0, arg1,arg2, arg3, arg4,arg5, arg6, arg7,arg8 +// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d + vaddl.u8 q12, \arg0, \arg5 //q12=src[-2]+src[3] + vaddl.u8 q13, \arg2, \arg3 //src[0]+src[1] + vmla.u16 q12, q13, \arg7 //q12 += 20*(src[0]+src[1]), 2 cycles + vaddl.u8 q13, \arg1, \arg4 //src[-1]+src[2] + vmls.s16 q12, q13, \arg8 //q12 -= 5*(src[-1]+src[2]), 2 cycles + vqrshrun.s16 \arg6, q12, #5 + vaddl.u8 q13, \arg2, \arg6 + vrshrn.u16 \arg6, q13, #1 +// } +.endm + +.macro FILTER_6TAG_8BITS_AVERAGE_WITH_1 arg0, arg1,arg2, arg3, arg4,arg5, arg6, arg7,arg8 +// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d + vaddl.u8 q12, \arg0, \arg5 //q12=src[-2]+src[3] + vaddl.u8 q13, \arg2, \arg3 //src[0]+src[1] + vmla.u16 q12, q13, \arg7 //q12 += 20*(src[0]+src[1]), 2 cycles + vaddl.u8 q13, \arg1, \arg4 //src[-1]+src[2] + vmls.s16 q12, q13, \arg8 //q12 -= 5*(src[-1]+src[2]), 2 cycles + vqrshrun.s16 \arg6, q12, #5 + vaddl.u8 q13, \arg3, \arg6 + vrshrn.u16 \arg6, q13, #1 +// } +.endm + +.macro FILTER_6TAG_8BITS_TO_16BITS arg0, arg1,arg2, arg3, arg4,arg5, arg6, arg7,arg8 +// { // input:d_src[-2], d_src[-1], d_src[0], d_src[1], d_src[2], d_src[3] + vaddl.u8 \arg6, \arg0, \arg5 //dst_q=src[-2]+src[3] + vaddl.u8 q13, \arg2, \arg3 //src[0]+src[1] + vmla.u16 \arg6, q13, \arg7 //dst_q += 20*(src[0]+src[1]), 2 cycles + vaddl.u8 q13, \arg1, \arg4 //src[-1]+src[2] + vmls.s16 \arg6, q13, \arg8 //dst_q -= 5*(src[-1]+src[2]), 2 cycles +// } +.endm + +.macro FILTER_3_IN_16BITS_TO_8BITS arg0, arg1,arg2, arg3 +// { // input:a, b, c, dst_d; + vsub.s16 \arg0, \arg0, \arg1 //a-b + vshr.s16 \arg0, \arg0, #2 //(a-b)/4 + vsub.s16 \arg0, \arg0, \arg1 //(a-b)/4-b + vadd.s16 \arg0, \arg0, \arg2 //(a-b)/4-b+c + vshr.s16 \arg0, \arg0, #2 //((a-b)/4-b+c)/4 + vadd.s16 \arg0, \arg0, \arg2 //((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16 + vqrshrun.s16 \arg3, \arg0, #6 //(+32)>>6 +// } +.endm + +.macro UNPACK_2_16BITS_TO_ABC arg0, arg1,arg2, arg3, arg4 +// { // input:q_src[-2:5], q_src[6:13](avail 8+5)/q_src[6:**](avail 4+5) + vext.16 \arg4, \arg0, \arg1, #2 //src[0] + vext.16 \arg3, \arg0, \arg1, #3 //src[1] + vadd.s16 \arg4, \arg3 //c=src[0]+src[1] + + vext.16 \arg3, \arg0, \arg1, #1 //src[-1] + vext.16 \arg2, \arg0, \arg1, #4 //src[2] + vadd.s16 \arg3, \arg2 
//b=src[-1]+src[2] + + vext.16 \arg2, \arg0, \arg1, #5 //src[3] + vadd.s16 \arg2, \arg0 //a=src[-2]+src[3] +// } +.endm + +.macro UNPACK_1_IN_8x16BITS_TO_8BITS arg0, arg1,arg2, arg3 +// { // each 16bits; input: d_dst, d_src[0:3] (even), d_src[4:5]+%% (odd) + vext.16 \arg3, \arg3, \arg3, #7 // 0x????, [0][1][2][3][4][5] + vrev64.16 \arg1, \arg1 + vadd.u16 \arg2, \arg1 // C[2+3],B[1+4],A[0+5] + vshr.s64 \arg1, \arg2, #16 + vshr.s64 \arg0, \arg2, #32 // Output: C \arg2, B \arg1, A \arg0 + + vsub.s16 \arg0, \arg0, \arg1 //a-b + vshr.s16 \arg0, \arg0, #2 //(a-b)/4 + vsub.s16 \arg0, \arg0, \arg1 //(a-b)/4-b + vadd.s16 \arg0, \arg0, \arg2 //(a-b)/4-b+c + vshr.s16 \arg0, \arg0, #2 //((a-b)/4-b+c)/4 + vadd.s16 \arg1, \arg0, \arg2 //((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16 + vqrshrun.s16 \arg0, \arg3, #6 //(+32)>>6 +// } +.endm +#endif + +WELS_ASM_FUNC_BEGIN enc_mc_luma_w16_h_neon + push {r4} + ldr r4, [sp, #4] + + sub r0, #2 + vmov.u16 q14, #0x0014 // 20 + vshr.u16 q15, q14, #2 // 5 + +w16_h_mc_luma_loop: + vld1.u8 {d0,d1,d2}, [r0], r1 //only use 21(16+5); q0=src[-2] + pld [r0] + pld [r0, #16] + + vext.8 q2, q0, q1, #1 //q2=src[-1] + vext.8 q3, q0, q1, #2 //q3=src[0] + vext.8 q4, q0, q1, #3 //q4=src[1] + vext.8 q5, q0, q1, #4 //q5=src[2] + vext.8 q6, q0, q1, #5 //q6=src[3] + + FILTER_6TAG_8BITS d0, d4, d6, d8, d10, d12, d2, q14, q15 + + FILTER_6TAG_8BITS d1, d5, d7, d9, d11, d13, d3, q14, q15 + + sub r4, #1 + vst1.u8 {d2, d3}, [r2], r3 //write 16Byte + + cmp r4, #0 + bne w16_h_mc_luma_loop + pop {r4} +WELS_ASM_FUNC_END + + +WELS_ASM_FUNC_BEGIN enc_mc_luma_w17_h_neon + push {r4-r5} + mov r4, #20 + mov r5, #1 + sub r4, r4, r4, lsl #(16-2) + lsl r5, #16 + ror r4, #16 + vmov d3, r5, r4 // 0x0014FFFB00010000 + + sub r3, #16 + ldr r4, [sp, #8] + + sub r0, #2 + vmov.u16 q14, #0x0014 // 20 + vshr.u16 q15, q14, #2 // 5 + +w17_h_mc_luma_loop: + vld1.u8 {d0,d1,d2}, [r0], r1 //only use 22(17+5); q0=src[-2] + + vext.8 q2, q0, q1, #1 //q2=src[-1] + vext.8 q3, q0, q1, #2 //q3=src[0] + vext.8 q4, q0, q1, #3 //q4=src[1] + vext.8 q5, q0, q1, #4 //q5=src[2] + vext.8 q6, q0, q1, #5 //q6=src[3] + + FILTER_6TAG_8BITS d0, d4, d6, d8, d10, d12, d14, q14, q15 + + FILTER_6TAG_8BITS d1, d5, d7, d9, d11, d13, d15, q14, q15 + + vst1.u8 {d14, d15}, [r2]! //write [0:15] Byte + + vsli.64 d2, d2, #8 // [0][1][2][3][4][5]XO-->O[0][1][2][3][4][5]X + FILTER_SINGLE_TAG_8BITS d2, d3, d14, q7, q1 + + vst1.u8 {d2[0]}, [r2], r3 //write 16th Byte + + sub r4, #1 + cmp r4, #0 + bne w17_h_mc_luma_loop + pop {r4-r5} +WELS_ASM_FUNC_END + + +WELS_ASM_FUNC_BEGIN enc_mc_luma_w9_h_neon + push {r4-r5} + mov r4, #20 + mov r5, #1 + sub r4, r4, r4, lsl #(16-2) + lsl r5, #16 + ror r4, #16 + vmov d7, r5, r4 // 0x0014FFFB00010000 + + sub r3, #8 + ldr r4, [sp, #8] + + sub r0, #2 + vmov.u16 q14, #0x0014 // 20 + vshr.u16 q15, q14, #2 // 5 + +w9_h_mc_luma_loop: + vld1.u8 {d0,d1}, [r0], r1 //only use 14(9+5); q0=src[-2] + pld [r0] + + vext.8 d2, d0, d1, #1 //d2=src[-1] + vext.8 d3, d0, d1, #2 //d3=src[0] + vext.8 d4, d0, d1, #3 //d4=src[1] + vext.8 d5, d0, d1, #4 //d5=src[2] + vext.8 d6, d0, d1, #5 //d6=src[3] + + FILTER_6TAG_8BITS d0, d2, d3, d4, d5, d6, d8, q14, q15 + + sub r4, #1 + vst1.u8 {d8}, [r2]! 
//write [0:7] Byte + + vsli.64 d2, d1, #8 // [0][1][2][3][4][5]XO-->O[0][1][2][3][4][5]X + FILTER_SINGLE_TAG_8BITS d2, d7, d14, q7, q1 + vst1.u8 {d2[0]}, [r2], r3 //write 8th Byte + + cmp r4, #0 + bne w9_h_mc_luma_loop + pop {r4-r5} +WELS_ASM_FUNC_END + + +WELS_ASM_FUNC_BEGIN enc_mc_luma_w4_h_neon + push {r4, r5, r6} + ldr r6, [sp, #12] + + sub r0, #2 + vmov.u16 q14, #0x0014 // 20 + vshr.u16 q15, q14, #2 // 5 + +w4_h_mc_luma_loop: + vld1.u8 {d0, d1}, [r0], r1 //only use 9(4+5);d0: 1st row src[-2:5] + pld [r0] + vld1.u8 {d2, d3}, [r0], r1 //d2: 2nd row src[-2:5] + pld [r0] + + vext.8 d4, d0, d1, #1 //d4: 1st row src[-1:6] + vext.8 d5, d2, d3, #1 //d5: 2nd row src[-1:6] + vext.8 q3, q2, q2, #1 //src[0:6 *] + vext.8 q4, q2, q2, #2 //src[1:6 * *] + + vtrn.32 q3, q4 //q3::d6:1st row [0:3]+[1:4]; d7:2nd row [0:3]+[1:4] + vtrn.32 d6, d7 //d6:[0:3]; d7[1:4] + vtrn.32 d0, d2 //d0:[-2:1]; d2[2:5] + vtrn.32 d4, d5 //d4:[-1:2]; d5[3:6] + + FILTER_6TAG_8BITS d0, d4, d6, d7, d2, d5, d1, q14, q15 + + vmov r4, r5, d1 + str r4, [r2], r3 + str r5, [r2], r3 + + sub r6, #2 + cmp r6, #0 + bne w4_h_mc_luma_loop + + pop {r4, r5, r6} +WELS_ASM_FUNC_END + + +WELS_ASM_FUNC_BEGIN enc_mc_luma_w16_xy_10_neon + push {r4} + ldr r4, [sp, #4] + + sub r0, #2 + vmov.u16 q14, #0x0014 // 20 + vshr.u16 q15, q14, #2 // 5 + +w16_xy_10_mc_luma_loop: + vld1.u8 {d0,d1,d2}, [r0], r1 //only use 21(16+5); q0=src[-2] + pld [r0] + pld [r0, #16] + + vext.8 q2, q0, q1, #1 //q2=src[-1] + vext.8 q3, q0, q1, #2 //q3=src[0] + vext.8 q4, q0, q1, #3 //q4=src[1] + vext.8 q5, q0, q1, #4 //q5=src[2] + vext.8 q6, q0, q1, #5 //q6=src[3] + + FILTER_6TAG_8BITS_AVERAGE_WITH_0 d0, d4, d6, d8, d10, d12, d2, q14, q15 + + FILTER_6TAG_8BITS_AVERAGE_WITH_0 d1, d5, d7, d9, d11, d13, d3, q14, q15 + + sub r4, #1 + vst1.u8 {d2, d3}, [r2], r3 //write 16Byte + + cmp r4, #0 + bne w16_xy_10_mc_luma_loop + pop {r4} +WELS_ASM_FUNC_END + + +WELS_ASM_FUNC_BEGIN enc_mc_luma_w8_xy_10_neon + push {r4} + ldr r4, [sp, #4] + + sub r0, #2 + vmov.u16 q14, #0x0014 // 20 + vshr.u16 q15, q14, #2 // 5 + +w8_xy_10_mc_luma_loop: + vld1.u8 {d0,d1}, [r0], r1 //only use 13(8+5); q0=src[-2] + pld [r0] + + vext.8 d2, d0, d1, #1 //d2=src[-1] + vext.8 d3, d0, d1, #2 //d3=src[0] + vext.8 d4, d0, d1, #3 //d4=src[1] + vext.8 d5, d0, d1, #4 //d5=src[2] + vext.8 d6, d0, d1, #5 //d6=src[3] + + FILTER_6TAG_8BITS_AVERAGE_WITH_0 d0, d2, d3, d4, d5, d6, d1, q14, q15 + + sub r4, #1 + vst1.u8 {d1}, [r2], r3 + + cmp r4, #0 + bne w8_xy_10_mc_luma_loop + pop {r4} +WELS_ASM_FUNC_END + + +WELS_ASM_FUNC_BEGIN enc_mc_luma_w4_xy_10_neon + push {r4, r5, r6} + ldr r6, [sp, #12] + + sub r0, #2 + vmov.u16 q14, #0x0014 // 20 + vshr.u16 q15, q14, #2 // 5 + +w4_xy_10_mc_luma_loop: + vld1.u8 {d0, d1}, [r0], r1 //only use 9(4+5);d0: 1st row src[-2:5] + pld [r0] + vld1.u8 {d2, d3}, [r0], r1 //d2: 2nd row src[-2:5] + pld [r0] + + vext.8 d4, d0, d1, #1 //d4: 1st row src[-1:6] + vext.8 d5, d2, d3, #1 //d5: 2nd row src[-1:6] + vext.8 q3, q2, q2, #1 //src[0:6 *] + vext.8 q4, q2, q2, #2 //src[1:6 * *] + + vtrn.32 q3, q4 //q3::d6:1st row [0:3]+[1:4]; d7:2nd row [0:3]+[1:4] + vtrn.32 d6, d7 //d6:[0:3]; d7[1:4] + vtrn.32 d0, d2 //d0:[-2:1]; d2[2:5] + vtrn.32 d4, d5 //d4:[-1:2]; d5[3:6] + + FILTER_6TAG_8BITS_AVERAGE_WITH_0 d0, d4, d6, d7, d2, d5, d1, q14, q15 + + vmov r4, r5, d1 + str r4, [r2], r3 + str r5, [r2], r3 + + sub r6, #2 + cmp r6, #0 + bne w4_xy_10_mc_luma_loop + + pop {r4, r5, r6} +WELS_ASM_FUNC_END + + +WELS_ASM_FUNC_BEGIN enc_mc_luma_w16_xy_30_neon + push {r4} + ldr r4, [sp, #4] + + sub r0, #2 + vmov.u16 q14, #0x0014 
// 20 + vshr.u16 q15, q14, #2 // 5 + +w16_xy_30_mc_luma_loop: + vld1.u8 {d0,d1,d2}, [r0], r1 //only use 21(16+5); q0=src[-2] + pld [r0] + pld [r0, #16] + + vext.8 q2, q0, q1, #1 //q2=src[-1] + vext.8 q3, q0, q1, #2 //q3=src[0] + vext.8 q4, q0, q1, #3 //q4=src[1] + vext.8 q5, q0, q1, #4 //q5=src[2] + vext.8 q6, q0, q1, #5 //q6=src[3] + + FILTER_6TAG_8BITS_AVERAGE_WITH_1 d0, d4, d6, d8, d10, d12, d2, q14, q15 + + FILTER_6TAG_8BITS_AVERAGE_WITH_1 d1, d5, d7, d9, d11, d13, d3, q14, q15 + + sub r4, #1 + vst1.u8 {d2, d3}, [r2], r3 //write 16Byte + + cmp r4, #0 + bne w16_xy_30_mc_luma_loop + pop {r4} +WELS_ASM_FUNC_END + + +WELS_ASM_FUNC_BEGIN enc_mc_luma_w8_xy_30_neon + push {r4} + ldr r4, [sp, #4] + + sub r0, #2 + vmov.u16 q14, #0x0014 // 20 + vshr.u16 q15, q14, #2 // 5 + +w8_xy_30_mc_luma_loop: + vld1.u8 {d0,d1}, [r0], r1 //only use 13(8+5); q0=src[-2] + pld [r0] + + vext.8 d2, d0, d1, #1 //d2=src[-1] + vext.8 d3, d0, d1, #2 //d3=src[0] + vext.8 d4, d0, d1, #3 //d4=src[1] + vext.8 d5, d0, d1, #4 //d5=src[2] + vext.8 d6, d0, d1, #5 //d6=src[3] + + FILTER_6TAG_8BITS_AVERAGE_WITH_1 d0, d2, d3, d4, d5, d6, d1, q14, q15 + + sub r4, #1 + vst1.u8 {d1}, [r2], r3 + + cmp r4, #0 + bne w8_xy_30_mc_luma_loop + pop {r4} +WELS_ASM_FUNC_END + + +WELS_ASM_FUNC_BEGIN enc_mc_luma_w4_xy_30_neon + push {r4, r5, r6} + ldr r6, [sp, #12] + + sub r0, #2 + vmov.u16 q14, #0x0014 // 20 + vshr.u16 q15, q14, #2 // 5 + +w4_xy_30_mc_luma_loop: + vld1.u8 {d0, d1}, [r0], r1 //only use 9(4+5);d0: 1st row src[-2:5] + pld [r0] + vld1.u8 {d2, d3}, [r0], r1 //d2: 2nd row src[-2:5] + pld [r0] + + vext.8 d4, d0, d1, #1 //d4: 1st row src[-1:6] + vext.8 d5, d2, d3, #1 //d5: 2nd row src[-1:6] + vext.8 q3, q2, q2, #1 //src[0:6 *] + vext.8 q4, q2, q2, #2 //src[1:6 * *] + + vtrn.32 q3, q4 //q3::d6:1st row [0:3]+[1:4]; d7:2nd row [0:3]+[1:4] + vtrn.32 d6, d7 //d6:[0:3]; d7[1:4] + vtrn.32 d0, d2 //d0:[-2:1]; d2[2:5] + vtrn.32 d4, d5 //d4:[-1:2]; d5[3:6] + + FILTER_6TAG_8BITS_AVERAGE_WITH_1 d0, d4, d6, d7, d2, d5, d1, q14, q15 + + vmov r4, r5, d1 + str r4, [r2], r3 + str r5, [r2], r3 + + sub r6, #2 + cmp r6, #0 + bne w4_xy_30_mc_luma_loop + + pop {r4, r5, r6} +WELS_ASM_FUNC_END + + +WELS_ASM_FUNC_BEGIN enc_mc_luma_w16_xy_01_neon + push {r4} + ldr r4, [sp, #4] + + sub r0, r1, lsl #1 //src[-2*src_stride] + pld [r0] + pld [r0, r1] + vmov.u16 q14, #0x0014 // 20 + vld1.u8 {q0}, [r0], r1 //q0=src[-2] + vld1.u8 {q1}, [r0], r1 //q1=src[-1] + + pld [r0] + pld [r0, r1] + vshr.u16 q15, q14, #2 // 5 + vld1.u8 {q2}, [r0], r1 //q2=src[0] + vld1.u8 {q3}, [r0], r1 //q3=src[1] + vld1.u8 {q4}, [r0], r1 //q4=src[2] + +w16_xy_01_luma_loop: + + vld1.u8 {q5}, [r0], r1 //q5=src[3] + + FILTER_6TAG_8BITS_AVERAGE_WITH_0 d0, d2, d4, d6, d8, d10, d12, q14, q15 + pld [r0] + FILTER_6TAG_8BITS_AVERAGE_WITH_0 d1, d3, d5, d7, d9, d11, d13, q14, q15 + vld1.u8 {q0}, [r0], r1 //read 2nd row + vst1.u8 {q6}, [r2], r3 //write 1st 16Byte + + FILTER_6TAG_8BITS_AVERAGE_WITH_0 d2, d4, d6, d8, d10, d0, d12, q14, q15 + pld [r0] + FILTER_6TAG_8BITS_AVERAGE_WITH_0 d3, d5, d7, d9, d11, d1, d13, q14, q15 + vld1.u8 {q1}, [r0], r1 //read 3rd row + vst1.u8 {q6}, [r2], r3 //write 2nd 16Byte + + FILTER_6TAG_8BITS_AVERAGE_WITH_0 d4, d6, d8, d10, d0, d2, d12, q14, q15 + pld [r0] + FILTER_6TAG_8BITS_AVERAGE_WITH_0 d5, d7, d9, d11, d1, d3, d13, q14, q15 + vld1.u8 {q2}, [r0], r1 //read 4th row + vst1.u8 {q6}, [r2], r3 //write 3rd 16Byte + + FILTER_6TAG_8BITS_AVERAGE_WITH_0 d6, d8, d10, d0, d2, d4, d12, q14, q15 + pld [r0] + FILTER_6TAG_8BITS_AVERAGE_WITH_0 d7, d9, d11, d1, d3, d5, d13, q14, q15 + 
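In enc_mc_luma_w17_h_neon and enc_mc_luma_w9_h_neon above, the r4/r5 arithmetic materializes d3/d7 = 0x0014FFFB00010000, i.e. the s16 lanes {0, 1, -5, 20}. Reversing the six edge samples (vrev64.8) and adding them pairwise (vaddl.u8) folds the symmetric kernel (1, -5, 20, 20, -5, 1) into three taps, so a single 4-lane vmul.s16 plus two vpadd reductions filters the odd last column. A scalar model of FILTER_SINGLE_TAG_8BITS (helper name assumed):

    #include <stdint.h>

    /* Last-column tap for width 17/9: pair s[i] with s[5-i] so the
     * 6-tap kernel collapses to the packed constant {0, 1, -5, 20}. */
    static uint8_t edge_tap6(const uint8_t s[6]) {
        int t = 1 * (s[0] + s[5]) - 5 * (s[1] + s[4]) + 20 * (s[2] + s[3]);
        t = (t + 16) >> 5;             /* vqrshrun.s16 #5: round + saturate */
        return (uint8_t)(t < 0 ? 0 : (t > 255 ? 255 : t));
    }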
vld1.u8 {q3}, [r0], r1 //read 5th row + vst1.u8 {q6}, [r2], r3 //write 4th 16Byte + + FILTER_6TAG_8BITS_AVERAGE_WITH_0 d8, d10, d0, d2, d4, d6, d12, q14, q15 + pld [r0] + FILTER_6TAG_8BITS_AVERAGE_WITH_0 d9, d11, d1, d3, d5, d7, d13, q14, q15 + vld1.u8 {q4}, [r0], r1 //read 6th row + vst1.u8 {q6}, [r2], r3 //write 5th 16Byte + + FILTER_6TAG_8BITS_AVERAGE_WITH_0 d10, d0, d2, d4, d6, d8, d12, q14, q15 + pld [r0] + FILTER_6TAG_8BITS_AVERAGE_WITH_0 d11, d1, d3, d5, d7, d9, d13, q14, q15 + vld1.u8 {q5}, [r0], r1 //read 7th row + vst1.u8 {q6}, [r2], r3 //write 6th 16Byte + + FILTER_6TAG_8BITS_AVERAGE_WITH_0 d0, d2, d4, d6, d8, d10, d12, q14, q15 + pld [r0] + FILTER_6TAG_8BITS_AVERAGE_WITH_0 d1, d3, d5, d7, d9, d11, d13, q14, q15 + vld1.u8 {q0}, [r0], r1 //read 8th row + vst1.u8 {q6}, [r2], r3 //write 7th 16Byte + + FILTER_6TAG_8BITS_AVERAGE_WITH_0 d2, d4, d6, d8, d10, d0, d12, q14, q15 + pld [r0] + FILTER_6TAG_8BITS_AVERAGE_WITH_0 d3, d5, d7, d9, d11, d1, d13, q14, q15 + vst1.u8 {q6}, [r2], r3 //write 8th 16Byte + + //q2, q3, q4, q5, q0 --> q0~q4 + vswp q0, q4 + vswp q0, q2 + vmov q1, q3 + vmov q3, q5 //q0~q4 + + sub r4, #8 + cmp r4, #0 + bne w16_xy_01_luma_loop + pop {r4} +WELS_ASM_FUNC_END + + +WELS_ASM_FUNC_BEGIN enc_mc_luma_w8_xy_01_neon + push {r4} + ldr r4, [sp, #4] + + sub r0, r1, lsl #1 //src[-2*src_stride] + pld [r0] + pld [r0, r1] + vmov.u16 q14, #0x0014 // 20 + vld1.u8 {d0}, [r0], r1 //d0=src[-2] + vld1.u8 {d1}, [r0], r1 //d1=src[-1] + + pld [r0] + pld [r0, r1] + vshr.u16 q15, q14, #2 // 5 + vld1.u8 {d2}, [r0], r1 //d2=src[0] + vld1.u8 {d3}, [r0], r1 //d3=src[1] + + vld1.u8 {d4}, [r0], r1 //d4=src[2] + vld1.u8 {d5}, [r0], r1 //d5=src[3] + +w8_xy_01_mc_luma_loop: + + pld [r0] + FILTER_6TAG_8BITS_AVERAGE_WITH_0 d0, d1, d2, d3, d4, d5, d12, q14, q15 + vld1.u8 {d0}, [r0], r1 //read 2nd row + vst1.u8 {d12}, [r2], r3 //write 1st 8Byte + + pld [r0] + FILTER_6TAG_8BITS_AVERAGE_WITH_0 d1, d2, d3, d4, d5, d0, d12, q14, q15 + vld1.u8 {d1}, [r0], r1 //read 3rd row + vst1.u8 {d12}, [r2], r3 //write 2nd 8Byte + + pld [r0] + FILTER_6TAG_8BITS_AVERAGE_WITH_0 d2, d3, d4, d5, d0, d1, d12, q14, q15 + vld1.u8 {d2}, [r0], r1 //read 4th row + vst1.u8 {d12}, [r2], r3 //write 3rd 8Byte + + pld [r0] + FILTER_6TAG_8BITS_AVERAGE_WITH_0 d3, d4, d5, d0, d1, d2, d12, q14, q15 + vld1.u8 {d3}, [r0], r1 //read 5th row + vst1.u8 {d12}, [r2], r3 //write 4th 8Byte + + //d4, d5, d0, d1, d2, d3 --> d0, d1, d2, d3, d4, d5 + vswp q0, q2 + vswp q1, q2 + + sub r4, #4 + cmp r4, #0 + bne w8_xy_01_mc_luma_loop + + pop {r4} +WELS_ASM_FUNC_END + + +WELS_ASM_FUNC_BEGIN enc_mc_luma_w4_xy_01_neon + push {r4, r5, r6, r7} + sub r0, r1, lsl #1 //src[-2*src_stride] + pld [r0] + pld [r0, r1] + vmov.u16 q14, #0x0014 // 20 + ldr r4, [r0], r1 //r4=src[-2] + ldr r5, [r0], r1 //r5=src[-1] + + pld [r0] + pld [r0, r1] + vshr.u16 q15, q14, #2 // 5 + ldr r6, [r0], r1 //r6=src[0] + ldr r7, [r0], r1 //r7=src[1] + + vmov d0, r4, r5 + vmov d1, r5, r6 + vmov d2, r6, r7 + + ldr r4, [r0], r1 //r4=src[2] + vmov d3, r7, r4 + ldr r7, [sp, #16] + +w4_xy_01_mc_luma_loop: + + //using reserving r4 + ldr r5, [r0], r1 //r5=src[3] + ldr r6, [r0], r1 //r6=src[0] + vmov d4, r4, r5 + vmov d5, r5, r6 //reserved r6 + + FILTER_6TAG_8BITS_AVERAGE_WITH_0 d0, d1, d2, d3, d4, d5, d12, q14, q15 + vmov r4, r5, d12 + str r4, [r2], r3 //write 1st 4Byte + str r5, [r2], r3 //write 2nd 4Byte + + ldr r5, [r0], r1 //r5=src[1] + ldr r4, [r0], r1 //r4=src[2] + vmov d0, r6, r5 + vmov d1, r5, r4 //reserved r4 + + FILTER_6TAG_8BITS_AVERAGE_WITH_0 d2, d3, d4, d5, d0, d1, d12, q14, q15 + 
vmov r5, r6, d12 + str r5, [r2], r3 //write 3rd 4Byte + str r6, [r2], r3 //write 4th 4Byte + + //d4, d5, d0, d1 --> d0, d1, d2, d3 + vmov q1, q0 + vmov q0, q2 + + sub r7, #4 + cmp r7, #0 + bne w4_xy_01_mc_luma_loop + + pop {r4, r5, r6, r7} +WELS_ASM_FUNC_END + + +WELS_ASM_FUNC_BEGIN enc_mc_luma_w16_xy_03_neon + push {r4} + ldr r4, [sp, #4] + + sub r0, r1, lsl #1 //src[-2*src_stride] + pld [r0] + pld [r0, r1] + vmov.u16 q14, #0x0014 // 20 + vld1.u8 {q0}, [r0], r1 //q0=src[-2] + vld1.u8 {q1}, [r0], r1 //q1=src[-1] + + pld [r0] + pld [r0, r1] + vshr.u16 q15, q14, #2 // 5 + vld1.u8 {q2}, [r0], r1 //q2=src[0] + vld1.u8 {q3}, [r0], r1 //q3=src[1] + vld1.u8 {q4}, [r0], r1 //q4=src[2] + +w16_xy_03_luma_loop: + + vld1.u8 {q5}, [r0], r1 //q5=src[3] + + FILTER_6TAG_8BITS_AVERAGE_WITH_1 d0, d2, d4, d6, d8, d10, d12, q14, q15 + pld [r0] + FILTER_6TAG_8BITS_AVERAGE_WITH_1 d1, d3, d5, d7, d9, d11, d13, q14, q15 + vld1.u8 {q0}, [r0], r1 //read 2nd row + vst1.u8 {q6}, [r2], r3 //write 1st 16Byte + + FILTER_6TAG_8BITS_AVERAGE_WITH_1 d2, d4, d6, d8, d10, d0, d12, q14, q15 + pld [r0] + FILTER_6TAG_8BITS_AVERAGE_WITH_1 d3, d5, d7, d9, d11, d1, d13, q14, q15 + vld1.u8 {q1}, [r0], r1 //read 3rd row + vst1.u8 {q6}, [r2], r3 //write 2nd 16Byte + + FILTER_6TAG_8BITS_AVERAGE_WITH_1 d4, d6, d8, d10, d0, d2, d12, q14, q15 + pld [r0] + FILTER_6TAG_8BITS_AVERAGE_WITH_1 d5, d7, d9, d11, d1, d3, d13, q14, q15 + vld1.u8 {q2}, [r0], r1 //read 4th row + vst1.u8 {q6}, [r2], r3 //write 3rd 16Byte + + FILTER_6TAG_8BITS_AVERAGE_WITH_1 d6, d8, d10, d0, d2, d4, d12, q14, q15 + pld [r0] + FILTER_6TAG_8BITS_AVERAGE_WITH_1 d7, d9, d11, d1, d3, d5, d13, q14, q15 + vld1.u8 {q3}, [r0], r1 //read 5th row + vst1.u8 {q6}, [r2], r3 //write 4th 16Byte + + FILTER_6TAG_8BITS_AVERAGE_WITH_1 d8, d10, d0, d2, d4, d6, d12, q14, q15 + pld [r0] + FILTER_6TAG_8BITS_AVERAGE_WITH_1 d9, d11, d1, d3, d5, d7, d13, q14, q15 + vld1.u8 {q4}, [r0], r1 //read 6th row + vst1.u8 {q6}, [r2], r3 //write 5th 16Byte + + FILTER_6TAG_8BITS_AVERAGE_WITH_1 d10, d0, d2, d4, d6, d8, d12, q14, q15 + pld [r0] + FILTER_6TAG_8BITS_AVERAGE_WITH_1 d11, d1, d3, d5, d7, d9, d13, q14, q15 + vld1.u8 {q5}, [r0], r1 //read 7th row + vst1.u8 {q6}, [r2], r3 //write 6th 16Byte + + FILTER_6TAG_8BITS_AVERAGE_WITH_1 d0, d2, d4, d6, d8, d10, d12, q14, q15 + pld [r0] + FILTER_6TAG_8BITS_AVERAGE_WITH_1 d1, d3, d5, d7, d9, d11, d13, q14, q15 + vld1.u8 {q0}, [r0], r1 //read 8th row + vst1.u8 {q6}, [r2], r3 //write 7th 16Byte + + FILTER_6TAG_8BITS_AVERAGE_WITH_1 d2, d4, d6, d8, d10, d0, d12, q14, q15 + pld [r0] + FILTER_6TAG_8BITS_AVERAGE_WITH_1 d3, d5, d7, d9, d11, d1, d13, q14, q15 + vst1.u8 {q6}, [r2], r3 //write 8th 16Byte + + //q2, q3, q4, q5, q0 --> q0~q4 + vswp q0, q4 + vswp q0, q2 + vmov q1, q3 + vmov q3, q5 //q0~q4 + + sub r4, #8 + cmp r4, #0 + bne w16_xy_03_luma_loop + pop {r4} +WELS_ASM_FUNC_END + + +WELS_ASM_FUNC_BEGIN enc_mc_luma_w8_xy_03_neon + push {r4} + ldr r4, [sp, #4] + + sub r0, r1, lsl #1 //src[-2*src_stride] + pld [r0] + pld [r0, r1] + vmov.u16 q14, #0x0014 // 20 + vld1.u8 {d0}, [r0], r1 //d0=src[-2] + vld1.u8 {d1}, [r0], r1 //d1=src[-1] + + pld [r0] + pld [r0, r1] + vshr.u16 q15, q14, #2 // 5 + vld1.u8 {d2}, [r0], r1 //d2=src[0] + vld1.u8 {d3}, [r0], r1 //d3=src[1] + + vld1.u8 {d4}, [r0], r1 //d4=src[2] + vld1.u8 {d5}, [r0], r1 //d5=src[3] + +w8_xy_03_mc_luma_loop: + + pld [r0] + FILTER_6TAG_8BITS_AVERAGE_WITH_1 d0, d1, d2, d3, d4, d5, d12, q14, q15 + vld1.u8 {d0}, [r0], r1 //read 2nd row + vst1.u8 {d12}, [r2], r3 //write 1st 8Byte + + pld [r0] + 
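// Note: xy == 03 (vertical three-quarter position) reuses the same six-tap
// kernel as xy == 01; only the final rounding average changes, pairing the
// half-pel result with the fourth tap (src[1], the integer row below):
//   out = (Clip1((a - 5*b + 20*c + 20*d - 5*e + f + 16) >> 5) + d + 1) >> 1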
FILTER_6TAG_8BITS_AVERAGE_WITH_1 d1, d2, d3, d4, d5, d0, d12, q14, q15 + vld1.u8 {d1}, [r0], r1 //read 3rd row + vst1.u8 {d12}, [r2], r3 //write 2nd 8Byte + + pld [r0] + FILTER_6TAG_8BITS_AVERAGE_WITH_1 d2, d3, d4, d5, d0, d1, d12, q14, q15 + vld1.u8 {d2}, [r0], r1 //read 4th row + vst1.u8 {d12}, [r2], r3 //write 3rd 8Byte + + pld [r0] + FILTER_6TAG_8BITS_AVERAGE_WITH_1 d3, d4, d5, d0, d1, d2, d12, q14, q15 + vld1.u8 {d3}, [r0], r1 //read 5th row + vst1.u8 {d12}, [r2], r3 //write 4th 8Byte + + //d4, d5, d0, d1, d2, d3 --> d0, d1, d2, d3, d4, d5 + vswp q0, q2 + vswp q1, q2 + + sub r4, #4 + cmp r4, #0 + bne w8_xy_03_mc_luma_loop + + pop {r4} + WELS_ASM_FUNC_END + + WELS_ASM_FUNC_BEGIN enc_mc_luma_w4_xy_03_neon + push {r4, r5, r6, r7} + sub r0, r1, lsl #1 //src[-2*src_stride] + pld [r0] + pld [r0, r1] + vmov.u16 q14, #0x0014 // 20 + ldr r4, [r0], r1 //r4=src[-2] + ldr r5, [r0], r1 //r5=src[-1] + + pld [r0] + pld [r0, r1] + vshr.u16 q15, q14, #2 // 5 + ldr r6, [r0], r1 //r6=src[0] + ldr r7, [r0], r1 //r7=src[1] + + vmov d0, r4, r5 + vmov d1, r5, r6 + vmov d2, r6, r7 + + ldr r4, [r0], r1 //r4=src[2] + vmov d3, r7, r4 + ldr r7, [sp, #16] + +w4_xy_03_mc_luma_loop: + + //using reserving r4 + ldr r5, [r0], r1 //r5=src[3] + ldr r6, [r0], r1 //r6=src[0] + vmov d4, r4, r5 + vmov d5, r5, r6 //reserved r6 + + FILTER_6TAG_8BITS_AVERAGE_WITH_1 d0, d1, d2, d3, d4, d5, d12, q14, q15 + vmov r4, r5, d12 + str r4, [r2], r3 //write 1st 4Byte + str r5, [r2], r3 //write 2nd 4Byte + + ldr r5, [r0], r1 //r5=src[1] + ldr r4, [r0], r1 //r4=src[2] + vmov d0, r6, r5 + vmov d1, r5, r4 //reserved r4 + + FILTER_6TAG_8BITS_AVERAGE_WITH_1 d2, d3, d4, d5, d0, d1, d12, q14, q15 + vmov r5, r6, d12 + str r5, [r2], r3 //write 3rd 4Byte + str r6, [r2], r3 //write 4th 4Byte + + //d4, d5, d0, d1 --> d0, d1, d2, d3 + vmov q1, q0 + vmov q0, q2 + + sub r7, #4 + cmp r7, #0 + bne w4_xy_03_mc_luma_loop + + pop {r4, r5, r6, r7} +WELS_ASM_FUNC_END + + +WELS_ASM_FUNC_BEGIN enc_mc_luma_w16_v_neon + push {r4} + ldr r4, [sp, #4] + + sub r0, r1, lsl #1 //src[-2*src_stride] + pld [r0] + pld [r0, r1] + vmov.u16 q14, #0x0014 // 20 + vld1.u8 {q0}, [r0], r1 //q0=src[-2] + vld1.u8 {q1}, [r0], r1 //q1=src[-1] + + pld [r0] + pld [r0, r1] + vshr.u16 q15, q14, #2 // 5 + vld1.u8 {q2}, [r0], r1 //q2=src[0] + vld1.u8 {q3}, [r0], r1 //q3=src[1] + vld1.u8 {q4}, [r0], r1 //q4=src[2] + +w16_v_mc_luma_loop: + + vld1.u8 {q5}, [r0], r1 //q5=src[3] + + FILTER_6TAG_8BITS d0, d2, d4, d6, d8, d10, d12, q14, q15 + pld [r0] + FILTER_6TAG_8BITS d1, d3, d5, d7, d9, d11, d13, q14, q15 + vld1.u8 {q0}, [r0], r1 //read 2nd row + vst1.u8 {q6}, [r2], r3 //write 1st 16Byte + + FILTER_6TAG_8BITS d2, d4, d6, d8, d10, d0, d12, q14, q15 + pld [r0] + FILTER_6TAG_8BITS d3, d5, d7, d9, d11, d1, d13, q14, q15 + vld1.u8 {q1}, [r0], r1 //read 3rd row + vst1.u8 {q6}, [r2], r3 //write 2nd 16Byte + + FILTER_6TAG_8BITS d4, d6, d8, d10, d0, d2, d12, q14, q15 + pld [r0] + FILTER_6TAG_8BITS d5, d7, d9, d11, d1, d3, d13, q14, q15 + vld1.u8 {q2}, [r0], r1 //read 4th row + vst1.u8 {q6}, [r2], r3 //write 3rd 16Byte + + FILTER_6TAG_8BITS d6, d8, d10, d0, d2, d4, d12, q14, q15 + pld [r0] + FILTER_6TAG_8BITS d7, d9, d11, d1, d3, d5, d13, q14, q15 + vld1.u8 {q3}, [r0], r1 //read 5th row + vst1.u8 {q6}, [r2], r3 //write 4th 16Byte + + FILTER_6TAG_8BITS d8, d10, d0, d2, d4, d6, d12, q14, q15 + pld [r0] + FILTER_6TAG_8BITS d9, d11, d1, d3, d5, d7, d13, q14, q15 + vld1.u8 {q4}, [r0], r1 //read 6th row + vst1.u8 {q6}, [r2], r3 //write 5th 16Byte + + FILTER_6TAG_8BITS d10, d0, d2, d4, d6, d8, d12, q14, q15 + 
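// Note: the plain FILTER_6TAG_8BITS variant is the half-pel filter (xy == 02)
// with no quarter-pel average; q14 (20) and q15 (5, materialised above as
// q14 >> 2 to save a constant load) stay resident for the whole loop, so each
// invocation is, per pixel:
//   out = Clip1((a - 5*b + 20*c + 20*d - 5*e + f + 16) >> 5)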
pld [r0] + FILTER_6TAG_8BITS d11, d1, d3, d5, d7, d9, d13, q14, q15 + vld1.u8 {q5}, [r0], r1 //read 7th row + vst1.u8 {q6}, [r2], r3 //write 6th 16Byte + + FILTER_6TAG_8BITS d0, d2, d4, d6, d8, d10, d12, q14, q15 + pld [r0] + FILTER_6TAG_8BITS d1, d3, d5, d7, d9, d11, d13, q14, q15 + vld1.u8 {q0}, [r0], r1 //read 8th row + vst1.u8 {q6}, [r2], r3 //write 7th 16Byte + + FILTER_6TAG_8BITS d2, d4, d6, d8, d10, d0, d12, q14, q15 + pld [r0] + FILTER_6TAG_8BITS d3, d5, d7, d9, d11, d1, d13, q14, q15 + vst1.u8 {q6}, [r2], r3 //write 8th 16Byte + + //q2, q3, q4, q5, q0 --> q0~q4 + vswp q0, q4 + vswp q0, q2 + vmov q1, q3 + vmov q3, q5 //q0~q4 + + sub r4, #8 + cmp r4, #0 + bne w16_v_mc_luma_loop + pop {r4} + WELS_ASM_FUNC_END + + WELS_ASM_FUNC_BEGIN enc_mc_luma_w17_v_neon + push {r4} + ldr r4, [sp, #4] + + sub r0, r1, lsl #1 //src[-2*src_stride] + pld [r0] + pld [r0, r1] + vmov.u16 q14, #0x0014 // 20 + vld1.u8 {q0}, [r0], r1 //q0=src[-2] + vld1.u8 {q1}, [r0], r1 //q1=src[-1] + + pld [r0] + pld [r0, r1] + vshr.u16 q15, q14, #2 // 5 + vld1.u8 {q2}, [r0], r1 //q2=src[0] + vld1.u8 {q3}, [r0], r1 //q3=src[1] + vld1.u8 {q4}, [r0], r1 //q4=src[2] + +w17_v_mc_luma_loop: + + vld1.u8 {q5}, [r0], r1 //q5=src[3] + + FILTER_6TAG_8BITS d0, d2, d4, d6, d8, d10, d12, q14, q15 + pld [r0] + FILTER_6TAG_8BITS d1, d3, d5, d7, d9, d11, d13, q14, q15 + vld1.u8 {q0}, [r0], r1 //read 2nd row + vst1.u8 {q6}, [r2], r3 //write 1st 16Byte + + FILTER_6TAG_8BITS d2, d4, d6, d8, d10, d0, d12, q14, q15 + pld [r0] + FILTER_6TAG_8BITS d3, d5, d7, d9, d11, d1, d13, q14, q15 + vld1.u8 {q1}, [r0], r1 //read 3rd row + vst1.u8 {q6}, [r2], r3 //write 2nd 16Byte + + FILTER_6TAG_8BITS d4, d6, d8, d10, d0, d2, d12, q14, q15 + pld [r0] + FILTER_6TAG_8BITS d5, d7, d9, d11, d1, d3, d13, q14, q15 + vld1.u8 {q2}, [r0], r1 //read 4th row + vst1.u8 {q6}, [r2], r3 //write 3rd 16Byte + + FILTER_6TAG_8BITS d6, d8, d10, d0, d2, d4, d12, q14, q15 + pld [r0] + FILTER_6TAG_8BITS d7, d9, d11, d1, d3, d5, d13, q14, q15 + vld1.u8 {q3}, [r0], r1 //read 5th row + vst1.u8 {q6}, [r2], r3 //write 4th 16Byte + + FILTER_6TAG_8BITS d8, d10, d0, d2, d4, d6, d12, q14, q15 + pld [r0] + FILTER_6TAG_8BITS d9, d11, d1, d3, d5, d7, d13, q14, q15 + vld1.u8 {q4}, [r0], r1 //read 6th row + vst1.u8 {q6}, [r2], r3 //write 5th 16Byte + + FILTER_6TAG_8BITS d10, d0, d2, d4, d6, d8, d12, q14, q15 + pld [r0] + FILTER_6TAG_8BITS d11, d1, d3, d5, d7, d9, d13, q14, q15 + vld1.u8 {q5}, [r0], r1 //read 7th row + vst1.u8 {q6}, [r2], r3 //write 6th 16Byte + + FILTER_6TAG_8BITS d0, d2, d4, d6, d8, d10, d12, q14, q15 + pld [r0] + FILTER_6TAG_8BITS d1, d3, d5, d7, d9, d11, d13, q14, q15 + vld1.u8 {q0}, [r0], r1 //read 8th row + vst1.u8 {q6}, [r2], r3 //write 7th 16Byte + + FILTER_6TAG_8BITS d2, d4, d6, d8, d10, d0, d12, q14, q15 + pld [r0] + FILTER_6TAG_8BITS d3, d5, d7, d9, d11, d1, d13, q14, q15 + vst1.u8 {q6}, [r2], r3 //write 8th 16Byte + + //q2, q3, q4, q5, q0 --> q0~q4 + vswp q0, q4 + vswp q0, q2 + vmov q1, q3 + vmov q3, q5 //q0~q4 + + sub r4, #8 + cmp r4, #1 + bne w17_v_mc_luma_loop + // the last 16Bytes + vld1.u8 {q5}, [r0], r1 //q5=src[3] + FILTER_6TAG_8BITS d0, d2, d4, d6, d8, d10, d12, q14, q15 + FILTER_6TAG_8BITS d1, d3, d5, d7, d9, d11, d13, q14, q15 + vst1.u8 {q6}, [r2], r3 //write 1st 16Byte + + pop {r4} +WELS_ASM_FUNC_END + + +WELS_ASM_FUNC_BEGIN enc_mc_luma_w9_v_neon + push {r4} + ldr r4, [sp, #4] + + sub r0, r1, lsl #1 //src[-2*src_stride] + pld [r0] + pld [r0, r1] + vmov.u16 q14, #0x0014 // 20 + vld1.u8 {d0}, [r0], r1 //d0=src[-2] + vld1.u8 {d1}, [r0], r1 //d1=src[-1] + + 
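// Note: the w17/w9 variants appear to exist so the encoder can cache
// interpolated planes one sample wider/taller than the block: their loops
// count down to 1 (cmp r4, #1) rather than 0, and a single-row epilogue
// after the loop emits the extra trailing row.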
pld [r0] + pld [r0, r1] + vshr.u16 q15, q14, #2 // 5 + vld1.u8 {d2}, [r0], r1 //d2=src[0] + vld1.u8 {d3}, [r0], r1 //d3=src[1] + + vld1.u8 {d4}, [r0], r1 //d4=src[2] + vld1.u8 {d5}, [r0], r1 //d5=src[3] + +w9_v_mc_luma_loop: + + pld [r0] + FILTER_6TAG_8BITS d0, d1, d2, d3, d4, d5, d12, q14, q15 + vld1.u8 {d0}, [r0], r1 //read 2nd row + vst1.u8 {d12}, [r2], r3 //write 1st 8Byte + + pld [r0] + FILTER_6TAG_8BITS d1, d2, d3, d4, d5, d0, d12, q14, q15 + vld1.u8 {d1}, [r0], r1 //read 3rd row + vst1.u8 {d12}, [r2], r3 //write 2nd 8Byte + + pld [r0] + FILTER_6TAG_8BITS d2, d3, d4, d5, d0, d1, d12, q14, q15 + vld1.u8 {d2}, [r0], r1 //read 4th row + vst1.u8 {d12}, [r2], r3 //write 3rd 8Byte + + pld [r0] + FILTER_6TAG_8BITS d3, d4, d5, d0, d1, d2, d12, q14, q15 + vld1.u8 {d3}, [r0], r1 //read 5th row + vst1.u8 {d12}, [r2], r3 //write 4th 8Byte + + //d4, d5, d0, d1, d2, d3 --> d0, d1, d2, d3, d4, d5 + vswp q0, q2 + vswp q1, q2 + + sub r4, #4 + cmp r4, #1 + bne w9_v_mc_luma_loop + + FILTER_6TAG_8BITS d0, d1, d2, d3, d4, d5, d12, q14, q15 + vst1.u8 {d12}, [r2], r3 //write last 8Byte + + pop {r4} +WELS_ASM_FUNC_END + + +WELS_ASM_FUNC_BEGIN enc_mc_luma_w4_v_neon + push {r4, r5, r6, r7} + sub r0, r1, lsl #1 //src[-2*src_stride] + pld [r0] + pld [r0, r1] + vmov.u16 q14, #0x0014 // 20 + ldr r4, [r0], r1 //r4=src[-2] + ldr r5, [r0], r1 //r5=src[-1] + + pld [r0] + pld [r0, r1] + vshr.u16 q15, q14, #2 // 5 + ldr r6, [r0], r1 //r6=src[0] + ldr r7, [r0], r1 //r7=src[1] + + vmov d0, r4, r5 + vmov d1, r5, r6 + vmov d2, r6, r7 + + ldr r4, [r0], r1 //r4=src[2] + vmov d3, r7, r4 + ldr r7, [sp, #16] + +w4_v_mc_luma_loop: + +// pld [r0] + //using reserving r4 + ldr r5, [r0], r1 //r5=src[3] + ldr r6, [r0], r1 //r6=src[0] + vmov d4, r4, r5 + vmov d5, r5, r6 //reserved r6 + + FILTER_6TAG_8BITS d0, d1, d2, d3, d4, d5, d12, q14, q15 + vmov r4, r5, d12 + str r4, [r2], r3 //write 1st 4Byte + str r5, [r2], r3 //write 2nd 4Byte + + ldr r5, [r0], r1 //r5=src[1] + ldr r4, [r0], r1 //r4=src[2] + vmov d0, r6, r5 + vmov d1, r5, r4 //reserved r4 + + FILTER_6TAG_8BITS d2, d3, d4, d5, d0, d1, d12, q14, q15 + vmov r5, r6, d12 + str r5, [r2], r3 //write 3rd 4Byte + str r6, [r2], r3 //write 4th 4Byte + + //d4, d5, d0, d1 --> d0, d1, d2, d3 + vmov q1, q0 + vmov q0, q2 + + sub r7, #4 + cmp r7, #0 + bne w4_v_mc_luma_loop + + pop {r4, r5, r6, r7} +WELS_ASM_FUNC_END + + +WELS_ASM_FUNC_BEGIN enc_mc_luma_w16_hv_neon + push {r4} + ldr r4, [sp, #4] + + sub r0, #2 //src[-2] + sub r0, r1, lsl #1 //src[-2*src_stride-2] + pld [r0] + pld [r0, r1] + + vmov.u16 q14, #0x0014 // 20 + vld1.u8 {d0-d2}, [r0], r1 //use 21(16+5), =src[-2] + vld1.u8 {d3-d5}, [r0], r1 //use 21(16+5), =src[-1] + + pld [r0] + pld [r0, r1] + vshr.u16 q15, q14, #2 // 5 + + vld1.u8 {d6-d8}, [r0], r1 //use 21(16+5), =src[0] + vld1.u8 {d9-d11}, [r0], r1 //use 21(16+5), =src[1] + pld [r0] + pld [r0, r1] + vld1.u8 {d12-d14}, [r0], r1 //use 21(16+5), =src[2] + +w16_hv_mc_luma_loop: + + vld1.u8 {d15-d17}, [r0], r1 //use 21(16+5), =src[3] + //the 1st row + pld [r0] + // vertical filtered into q9/q10 + FILTER_6TAG_8BITS_TO_16BITS d0, d3, d6, d9, d12, d15, q9, q14, q15 // 8 avail + FILTER_6TAG_8BITS_TO_16BITS d1, d4, d7,d10, d13, d16,q10, q14, q15 // 8 avail + // horizon filtered + UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13 + FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d0 //output to q0[0] + // vertical filtered into q10/q11 + FILTER_6TAG_8BITS_TO_16BITS d2, d5, d8,d11, d14, d17,q11, q14, q15 // only 5 avail + // horizon filtered + UNPACK_2_16BITS_TO_ABC q10, q11, q9, q12, q13 + 
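// Note: the hv (xy == 22) path filters in two stages. The vertical six-tap
// first produces 16-bit intermediates; UNPACK_2_16BITS_TO_ABC then appears
// to gather the three tap-pair sums A = s0+s5, B = s1+s4, C = s2+s3, so that
// FILTER_3_IN_16BITS_TO_8BITS can finish with the wider rounding term:
//   out = Clip1((A - 5*B + 20*C + 512) >> 10)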
FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d1 //output to q0[1] + vst1.u8 {q0}, [r2], r3 //write 16Byte + + + vld1.u8 {d0-d2}, [r0], r1 //read 2nd row + //the 2nd row + pld [r0] + // vertical filtered into q9/q10 + FILTER_6TAG_8BITS_TO_16BITS d3, d6, d9, d12, d15, d0, q9, q14, q15 // 8 avail + FILTER_6TAG_8BITS_TO_16BITS d4, d7,d10, d13, d16, d1,q10, q14, q15 // 8 avail + // horizon filtered + UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13 + FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d3 //output to d3 + // vertical filtered into q10/q11 + FILTER_6TAG_8BITS_TO_16BITS d5, d8,d11, d14, d17, d2,q11, q14, q15 // only 5 avail + // horizon filtered + UNPACK_2_16BITS_TO_ABC q10, q11, q9, q12, q13 + FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d4 //output to d4 + vst1.u8 {d3, d4}, [r2], r3 //write 16Byte + + vld1.u8 {d3-d5}, [r0], r1 //read 3rd row + //the 3rd row + pld [r0] + // vertical filtered into q9/q10 + FILTER_6TAG_8BITS_TO_16BITS d6, d9, d12, d15, d0, d3, q9, q14, q15 // 8 avail + FILTER_6TAG_8BITS_TO_16BITS d7,d10, d13, d16, d1, d4,q10, q14, q15 // 8 avail + // horizon filtered + UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13 + FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d6 //output to d6 + // vertical filtered into q10/q11 + FILTER_6TAG_8BITS_TO_16BITS d8,d11, d14, d17, d2, d5,q11, q14, q15 // only 5 avail + // horizon filtered + UNPACK_2_16BITS_TO_ABC q10, q11, q9, q12, q13 + FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d7 //output to d7 + vst1.u8 {d6, d7}, [r2], r3 //write 16Byte + + vld1.u8 {d6-d8}, [r0], r1 //read 4th row + //the 4th row + pld [r0] + // vertical filtered into q9/q10 + FILTER_6TAG_8BITS_TO_16BITS d9, d12, d15, d0, d3, d6, q9, q14, q15 // 8 avail + FILTER_6TAG_8BITS_TO_16BITS d10, d13, d16, d1, d4, d7,q10, q14, q15 // 8 avail + // horizon filtered + UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13 + FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d9 //output to d9 + // vertical filtered into q10/q11 + FILTER_6TAG_8BITS_TO_16BITS d11, d14, d17, d2, d5, d8,q11, q14, q15 // only 5 avail + // horizon filtered + UNPACK_2_16BITS_TO_ABC q10, q11, q9, q12, q13 + FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d10 //output to d10 + vst1.u8 {d9, d10}, [r2], r3 //write 16Byte + + //d12~d17(q6~q8), d0~d8(q0~q3+d8), --> d0~d14 + vswp q0, q6 + vswp q6, q3 + vmov q5, q2 + vmov q2, q8 + + vmov d20,d8 + vmov q4, q1 + vmov q1, q7 + vmov d14,d20 + + sub r4, #4 + cmp r4, #0 + bne w16_hv_mc_luma_loop + pop {r4} +WELS_ASM_FUNC_END + + +WELS_ASM_FUNC_BEGIN enc_mc_luma_w17_hv_neon + push {r4} + ldr r4, [sp, #4] + + sub r0, #2 //src[-2] + sub r0, r1, lsl #1 //src[-2*src_stride-2] + pld [r0] + pld [r0, r1] + + vmov.u16 q14, #0x0014 // 20 + vld1.u8 {d0-d2}, [r0], r1 //use 21(17+5), =src[-2] + vld1.u8 {d3-d5}, [r0], r1 //use 21(17+5), =src[-1] + + pld [r0] + pld [r0, r1] + vshr.u16 q15, q14, #2 // 5 + + vld1.u8 {d6-d8}, [r0], r1 //use 21(17+5), =src[0] + vld1.u8 {d9-d11}, [r0], r1 //use 21(17+5), =src[1] + pld [r0] + pld [r0, r1] + vld1.u8 {d12-d14}, [r0], r1 //use 21(17+5), =src[2] + sub r3, #16 + +w17_hv_mc_luma_loop: + + vld1.u8 {d15-d17}, [r0], r1 //use 21(17+5), =src[3] + //the 1st row + pld [r0] + // vertical filtered into q9/q10 + FILTER_6TAG_8BITS_TO_16BITS d0, d3, d6, d9, d12, d15, q9, q14, q15 // 8 avail + FILTER_6TAG_8BITS_TO_16BITS d1, d4, d7,d10, d13, d16,q10, q14, q15 // 8 avail + // horizon filtered + UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13 + FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d0 //output to q0[0] + // vertical filtered into q10/q11 + FILTER_6TAG_8BITS_TO_16BITS d2, d5, d8,d11, d14, d17,q11, 
q14, q15 // only 6 avail + // horizon filtered + UNPACK_2_16BITS_TO_ABC q10, q11, q9, q12, q13 + FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d1 //output to q0[1] + vst1.u8 {d0, d1}, [r2]! //write 16Byte + UNPACK_1_IN_8x16BITS_TO_8BITS d2, d22, d23, q11 //output to d2[0] + vst1.u8 {d2[0]}, [r2], r3 //write 16th Byte + + vld1.u8 {d0-d2}, [r0], r1 //read 2nd row + //the 2nd row + pld [r0] + // vertical filtered into q9/q10 + FILTER_6TAG_8BITS_TO_16BITS d3, d6, d9, d12, d15, d0, q9, q14, q15 // 8 avail + FILTER_6TAG_8BITS_TO_16BITS d4, d7,d10, d13, d16, d1,q10, q14, q15 // 8 avail + // horizon filtered + UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13 + FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d3 //output to d3 + // vertical filtered into q10/q11 + FILTER_6TAG_8BITS_TO_16BITS d5, d8,d11, d14, d17, d2,q11, q14, q15 // only 6 avail + // horizon filtered + UNPACK_2_16BITS_TO_ABC q10, q11, q9, q12, q13 + FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d4 //output to d4 + vst1.u8 {d3, d4}, [r2]! //write 16Byte + UNPACK_1_IN_8x16BITS_TO_8BITS d5, d22, d23, q11 //output to d5[0] + vst1.u8 {d5[0]}, [r2], r3 //write 16th Byte + + vld1.u8 {d3-d5}, [r0], r1 //read 3rd row + //the 3rd row + pld [r0] + // vertical filtered into q9/q10 + FILTER_6TAG_8BITS_TO_16BITS d6, d9, d12, d15, d0, d3, q9, q14, q15 // 8 avail + FILTER_6TAG_8BITS_TO_16BITS d7,d10, d13, d16, d1, d4,q10, q14, q15 // 8 avail + // horizon filtered + UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13 + FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d6 //output to d6 + // vertical filtered into q10/q11 + FILTER_6TAG_8BITS_TO_16BITS d8,d11, d14, d17, d2, d5,q11, q14, q15 // only 6 avail + // horizon filtered + UNPACK_2_16BITS_TO_ABC q10, q11, q9, q12, q13 + FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d7 //output to d7 + vst1.u8 {d6, d7}, [r2]! 
//write 16Byte + UNPACK_1_IN_8x16BITS_TO_8BITS d8, d22, d23, q11 //output to d8[0] + vst1.u8 {d8[0]}, [r2], r3 //write 16th Byte + + vld1.u8 {d6-d8}, [r0], r1 //read 4th row + //the 4th row + pld [r0] + // vertical filtered into q9/q10 + FILTER_6TAG_8BITS_TO_16BITS d9, d12, d15, d0, d3, d6, q9, q14, q15 // 8 avail + FILTER_6TAG_8BITS_TO_16BITS d10, d13, d16, d1, d4, d7,q10, q14, q15 // 8 avail + // horizon filtered + UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13 + FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d9 //output to d9 + // vertical filtered into q10/q11 + FILTER_6TAG_8BITS_TO_16BITS d11, d14, d17, d2, d5, d8,q11, q14, q15 // only 6 avail + // horizon filtered + UNPACK_2_16BITS_TO_ABC q10, q11, q9, q12, q13 + FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d10 //output to d10 + vst1.u8 {d9, d10}, [r2], r3 //write 16Byte + UNPACK_1_IN_8x16BITS_TO_8BITS d11, d22, d23, q11 //output to d11[0] + vst1.u8 {d11[0]}, [r2], r3 //write 16th Byte + + //d12~d17(q6~q8), d0~d8(q0~q3+d8), --> d0~d14 + vswp q0, q6 + vswp q6, q3 + vmov q5, q2 + vmov q2, q8 + + vmov d20,d8 + vmov q4, q1 + vmov q1, q7 + vmov d14,d20 + + sub r4, #4 + cmp r4, #1 + bne w17_hv_mc_luma_loop + //the last row + vld1.u8 {d15-d17}, [r0], r1 //use 21(17+5), =src[3] + // vertical filtered into q9/q10 + FILTER_6TAG_8BITS_TO_16BITS d0, d3, d6, d9, d12, d15, q9, q14, q15 // 8 avail + FILTER_6TAG_8BITS_TO_16BITS d1, d4, d7,d10, d13, d16,q10, q14, q15 // 8 avail + // horizon filtered + UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13 + FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d0 //output to q0[0] + // vertical filtered into q10/q11 + FILTER_6TAG_8BITS_TO_16BITS d2, d5, d8,d11, d14, d17,q11, q14, q15 // only 6 avail + // horizon filtered + UNPACK_2_16BITS_TO_ABC q10, q11, q9, q12, q13 + FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d1 //output to q0[1] + vst1.u8 {q0}, [r2]! //write 16Byte + UNPACK_1_IN_8x16BITS_TO_8BITS d2, d22, d23, q11 //output to d2[0] + vst1.u8 {d2[0]}, [r2], r3 //write 16th Byte + + pop {r4} +WELS_ASM_FUNC_END + + +WELS_ASM_FUNC_BEGIN enc_mc_luma_w9_hv_neon + push {r4} + ldr r4, [sp, #4] + + sub r0, #2 //src[-2] + sub r0, r1, lsl #1 //src[-2*src_stride-2] + pld [r0] + pld [r0, r1] + + vmov.u16 q14, #0x0014 // 20 + vld1.u8 {q0}, [r0], r1 //use 14(9+5), =src[-2] + vld1.u8 {q1}, [r0], r1 //use 14(9+5), =src[-1] + + pld [r0] + pld [r0, r1] + vshr.u16 q15, q14, #2 // 5 + + vld1.u8 {q2}, [r0], r1 //use 14(9+5), =src[0] + vld1.u8 {q3}, [r0], r1 //use 14(9+5), =src[1] + pld [r0] + pld [r0, r1] + vld1.u8 {q4}, [r0], r1 //use 14(9+5), =src[2] + sub r3, #8 + +w9_hv_mc_luma_loop: + + vld1.u8 {q5}, [r0], r1 //use 14(9+5), =src[3] + //the 1st row + pld [r0] + // vertical filtered into q6/q7 + FILTER_6TAG_8BITS_TO_16BITS d0, d2, d4, d6, d8, d10, q6, q14, q15 // 8 avail + FILTER_6TAG_8BITS_TO_16BITS d1, d3, d5, d7, d9, d11, q7, q14, q15 // 6 avail + // horizon filtered + UNPACK_2_16BITS_TO_ABC q6, q7, q11, q12, q13 + FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d12 //output to q6[0] + vst1.u8 d12, [r2]! //write 8Byte + UNPACK_1_IN_8x16BITS_TO_8BITS d13, d14, d15, q7 //output to d13[0] + vst1.u8 {d13[0]}, [r2], r3 //write 8th Byte + + vld1.u8 {q0}, [r0], r1 //read 2nd row + //the 2nd row + pld [r0] + // vertical filtered into q6/q7 + FILTER_6TAG_8BITS_TO_16BITS d2, d4, d6, d8, d10, d0, q6, q14, q15 // 8 avail + FILTER_6TAG_8BITS_TO_16BITS d3, d5, d7, d9, d11, d1, q7, q14, q15 // 6 avail + // horizon filtered + UNPACK_2_16BITS_TO_ABC q6, q7, q11, q12, q13 + FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d12 //output to q6[0] + vst1.u8 d12, [r2]! 
//write 8Byte + UNPACK_1_IN_8x16BITS_TO_8BITS d13, d14, d15, q7 //output to d13[0] + vst1.u8 {d13[0]}, [r2], r3 //write 8th Byte + + vld1.u8 {q1}, [r0], r1 //read 3rd row + //the 3rd row + pld [r0] + // vertical filtered into q6/q7 + FILTER_6TAG_8BITS_TO_16BITS d4, d6, d8, d10, d0, d2, q6, q14, q15 // 8 avail + FILTER_6TAG_8BITS_TO_16BITS d5, d7, d9, d11, d1, d3, q7, q14, q15 // 6 avail + // horizon filtered + UNPACK_2_16BITS_TO_ABC q6, q7, q11, q12, q13 + FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d12 //output to q6[0] + vst1.u8 d12, [r2]! //write 8Byte + UNPACK_1_IN_8x16BITS_TO_8BITS d13, d14, d15, q7 //output to d13[0] + vst1.u8 {d13[0]}, [r2], r3 //write 8th Byte + + vld1.u8 {q2}, [r0], r1 //read 4th row + //the 4th row + pld [r0] + // vertical filtered into q6/q7 + FILTER_6TAG_8BITS_TO_16BITS d6, d8, d10, d0, d2, d4, q6, q14, q15 // 8 avail + FILTER_6TAG_8BITS_TO_16BITS d7, d9, d11, d1, d3, d5, q7, q14, q15 // 6 avail + // horizon filtered + UNPACK_2_16BITS_TO_ABC q6, q7, q11, q12, q13 + FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d12 //output to q6[0] + vst1.u8 d12, [r2]! //write 8Byte + UNPACK_1_IN_8x16BITS_TO_8BITS d13, d14, d15, q7 //output to d13[0] + vst1.u8 {d13[0]}, [r2], r3 //write 8th Byte + + //q4~q5, q0~q2, --> q0~q4 + vswp q0, q4 + vswp q2, q4 + vmov q3, q1 + vmov q1, q5 + + sub r4, #4 + cmp r4, #1 + bne w9_hv_mc_luma_loop + //the last row + vld1.u8 {q5}, [r0], r1 //use 14(9+5), =src[3] + // vertical filtered into q6/q7 + FILTER_6TAG_8BITS_TO_16BITS d0, d2, d4, d6, d8, d10, q6, q14, q15 // 8 avail + FILTER_6TAG_8BITS_TO_16BITS d1, d3, d5, d7, d9, d11, q7, q14, q15 // 6 avail + // horizon filtered + UNPACK_2_16BITS_TO_ABC q6, q7, q11, q12, q13 + FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d12 //output to q6[0] + vst1.u8 d12, [r2]! //write 8Byte + UNPACK_1_IN_8x16BITS_TO_8BITS d13, d14, d15, q7 //output to d13[0] + vst1.u8 {d13[0]}, [r2], r3 //write 8th Byte + pop {r4} +WELS_ASM_FUNC_END + + +WELS_ASM_FUNC_BEGIN enc_mc_luma_w4_hv_neon + push {r4 ,r5, r6} + ldr r6, [sp, #12] + + sub r0, #2 //src[-2] + sub r0, r1, lsl #1 //src[-2*src_stride-2] + pld [r0] + pld [r0, r1] + + vmov.u16 q14, #0x0014 // 20 + vld1.u8 {q0}, [r0], r1 //use 9(4+5), =src[-2] + vld1.u8 {q1}, [r0], r1 //use 9(4+5), =src[-1] + + pld [r0] + pld [r0, r1] + vshr.u16 q15, q14, #2 // 5 + + vld1.u8 {q2}, [r0], r1 //use 9(4+5), =src[0] + vld1.u8 {q3}, [r0], r1 //use 9(4+5), =src[1] + pld [r0] + pld [r0, r1] + vld1.u8 {q4}, [r0], r1 //use 9(4+5), =src[2] + +w4_hv_mc_luma_loop: + + vld1.u8 {q5}, [r0], r1 //use 9(4+5), =src[3] + vld1.u8 {q6}, [r0], r1 //use 9(4+5), =src[4] + + //the 1st&2nd row + pld [r0] + pld [r0, r1] + // vertical filtered + FILTER_6TAG_8BITS_TO_16BITS d0, d2, d4, d6, d8, d10, q7, q14, q15 // 8 avail + FILTER_6TAG_8BITS_TO_16BITS d1, d3, d5, d7, d9, d11, q8, q14, q15 // 1 avail + + FILTER_6TAG_8BITS_TO_16BITS d2, d4, d6, d8,d10, d12, q9, q14, q15 // 8 avail + FILTER_6TAG_8BITS_TO_16BITS d3, d5, d7, d9,d11, d13,q10, q14, q15 // 1 avail + // horizon filtered + UNPACK_2_16BITS_TO_ABC q7, q8, q11, q12, q13 //4 avail + UNPACK_2_16BITS_TO_ABC q9,q10, q0, q7, q8 //4 avail + + vmov d23, d0 + vmov d25, d14 + vmov d27, d16 + + FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d22 //output to q11[0] + vmov r4 ,r5, d22 + str r4, [r2], r3 //write 4Byte + str r5, [r2], r3 //write 4Byte + + //the 3rd&4th row + vld1.u8 {q0}, [r0], r1 //use 9(4+5), =src[3] + vld1.u8 {q1}, [r0], r1 //use 9(4+5), =src[4] + pld [r0] + pld [r0, r1] + // vertical filtered + FILTER_6TAG_8BITS_TO_16BITS d4, d6, d8, d10, d12, d0, q7, q14, q15 // 
8 avail + FILTER_6TAG_8BITS_TO_16BITS d5, d7, d9, d11, d13, d1, q8, q14, q15 // 1 avail + + FILTER_6TAG_8BITS_TO_16BITS d6, d8,d10, d12, d0, d2, q9, q14, q15 // 8 avail + FILTER_6TAG_8BITS_TO_16BITS d7, d9,d11, d13, d1, d3,q10, q14, q15 // 1 avail + // horizon filtered + UNPACK_2_16BITS_TO_ABC q7, q8, q11, q12, q13 //4 avail + UNPACK_2_16BITS_TO_ABC q9,q10, q2, q7, q8 //4 avail + + vmov d23, d4 + vmov d25, d14 + vmov d27, d16 + + FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d22 //output to q11[0] + vmov r4 ,r5, d22 + str r4, [r2], r3 //write 4Byte + str r5, [r2], r3 //write 4Byte + + //q4~q6, q0~q1, --> q0~q4 + vswp q4, q0 + vmov q3, q4 + vmov q4, q1 + vmov q1, q5 + vmov q2, q6 + + sub r6, #4 + cmp r6, #0 + bne w4_hv_mc_luma_loop + + pop {r4, r5, r6} +WELS_ASM_FUNC_END + + +WELS_ASM_FUNC_BEGIN enc_mc_copy_w16_neon + push {r4} + ldr r4, [sp, #4] +w16_copy_loop: + vld1.u8 {q0}, [r0], r1 + vld1.u8 {q1}, [r0], r1 + vst1.u8 {q0}, [r2], r3 + vst1.u8 {q1}, [r2], r3 + sub r4, #2 + cmp r4, #0 + bne w16_copy_loop + + pop {r4} +WELS_ASM_FUNC_END + + +WELS_ASM_FUNC_BEGIN enc_mc_copy_w8_neon + push {r4} + ldr r4, [sp, #4] +w8_copy_loop: + vld1.u8 {d0}, [r0], r1 + vld1.u8 {d1}, [r0], r1 + vst1.u8 {d0}, [r2], r3 + vst1.u8 {d1}, [r2], r3 + sub r4, #2 + cmp r4, #0 + bne w8_copy_loop + + pop {r4} +WELS_ASM_FUNC_END + + +WELS_ASM_FUNC_BEGIN enc_mc_copy_w4_neon + push {r4, r5, r6} + ldr r4, [sp, #12] +w4_copy_loop: + ldr r5, [r0], r1 + ldr r6, [r0], r1 + str r5, [r2], r3 + str r6, [r2], r3 + + sub r4, #2 + cmp r4, #0 + bne w4_copy_loop + + pop {r4, r5, r6} +WELS_ASM_FUNC_END + + +WELS_ASM_FUNC_BEGIN enc_pixel_avg_w16_neon + push {r4} + ldr r4, [sp, #4] +w16_pix_avg_loop: + vld1.u8 {q0}, [r2]! + vld1.u8 {q1}, [r3]! + vld1.u8 {q2}, [r2]! + vld1.u8 {q3}, [r3]! + + vld1.u8 {q4}, [r2]! + vld1.u8 {q5}, [r3]! + vld1.u8 {q6}, [r2]! + vld1.u8 {q7}, [r3]! 
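// Note: AVERAGE_TWO_8BITS is the standard H.264 rounding average,
//   out = (a + b + 1) >> 1
// (effectively a vrhadd.u8); these *_pix_avg_* helpers combine two already
// interpolated blocks, e.g. for the diagonal quarter-pel positions.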
+ + AVERAGE_TWO_8BITS d0, d0, d2 + AVERAGE_TWO_8BITS d1, d1, d3 + vst1.u8 {q0}, [r0], r1 + + AVERAGE_TWO_8BITS d4, d4, d6 + AVERAGE_TWO_8BITS d5, d5, d7 + vst1.u8 {q2}, [r0], r1 + + AVERAGE_TWO_8BITS d8, d8, d10 + AVERAGE_TWO_8BITS d9, d9, d11 + vst1.u8 {q4}, [r0], r1 + + AVERAGE_TWO_8BITS d12, d12, d14 + AVERAGE_TWO_8BITS d13, d13, d15 + vst1.u8 {q6}, [r0], r1 + + sub r4, #4 + cmp r4, #0 + bne w16_pix_avg_loop + + pop {r4} +WELS_ASM_FUNC_END + + +WELS_ASM_FUNC_BEGIN enc_pix_avg_w16_neon + push {r4, r5, r6} + ldr r4, [sp, #12] + ldr r5, [sp, #16] + ldr r6, [sp, #20] + +enc_w16_pix_avg_loop: + vld1.u8 {q0}, [r2], r3 + vld1.u8 {q1}, [r4], r5 + vld1.u8 {q2}, [r2], r3 + vld1.u8 {q3}, [r4], r5 + + vld1.u8 {q4}, [r2], r3 + vld1.u8 {q5}, [r4], r5 + vld1.u8 {q6}, [r2], r3 + vld1.u8 {q7}, [r4], r5 + + AVERAGE_TWO_8BITS d0, d0, d2 + AVERAGE_TWO_8BITS d1, d1, d3 + vst1.u8 {q0}, [r0], r1 + + AVERAGE_TWO_8BITS d4, d4, d6 + AVERAGE_TWO_8BITS d5, d5, d7 + vst1.u8 {q2}, [r0], r1 + + AVERAGE_TWO_8BITS d8, d8, d10 + AVERAGE_TWO_8BITS d9, d9, d11 + vst1.u8 {q4}, [r0], r1 + + AVERAGE_TWO_8BITS d12, d12, d14 + AVERAGE_TWO_8BITS d13, d13, d15 + vst1.u8 {q6}, [r0], r1 + + sub r6, #4 + cmp r6, #0 + bne enc_w16_pix_avg_loop + + pop {r4, r5, r6} +WELS_ASM_FUNC_END + + +WELS_ASM_FUNC_BEGIN enc_pix_avg_w8_neon + push {r4, r5, r6} + ldr r4, [sp, #12] + ldr r5, [sp, #16] + ldr r6, [sp, #20] +enc_w8_pix_avg_loop: + + vld1.u8 {d0}, [r2], r3 + vld1.u8 {d2}, [r4], r5 + vld1.u8 {d1}, [r2], r3 + vld1.u8 {d3}, [r4], r5 + + AVERAGE_TWO_8BITS d0, d0, d2 + AVERAGE_TWO_8BITS d1, d1, d3 + vst1.u8 {d0}, [r0], r1 + vst1.u8 {d1}, [r0], r1 + + vld1.u8 {d4}, [r2], r3 + vld1.u8 {d6}, [r4], r5 + vld1.u8 {d5}, [r2], r3 + vld1.u8 {d7}, [r4], r5 + + AVERAGE_TWO_8BITS d4, d4, d6 + AVERAGE_TWO_8BITS d5, d5, d7 + vst1.u8 {d4}, [r0], r1 + vst1.u8 {d5}, [r0], r1 + + sub r6, #4 + cmp r6, #0 + bne enc_w8_pix_avg_loop + + pop {r4, r5, r6} +WELS_ASM_FUNC_END + + +WELS_ASM_FUNC_BEGIN enc_mc_chroma_w8_neon + + push {r4, r5} + ldr r4, [sp, #8] + ldr r5, [sp, #12] + vld1.u8 {d31}, [r4] //load A/B/C/D + vld1.u8 {q0}, [r0], r1 //src[x] + + vdup.u8 d28, d31[0] //A + vdup.u8 d29, d31[1] //B + vdup.u8 d30, d31[2] //C + vdup.u8 d31, d31[3] //D + + vext.u8 d1, d0, d1, #1 //src[x+1] + +w8_mc_chroma_loop: // each two pxl row + vld1.u8 {q1}, [r0], r1 //src[x+stride] + vld1.u8 {q2}, [r0], r1 //src[x+2*stride] + vext.u8 d3, d2, d3, #1 //src[x+stride+1] + vext.u8 d5, d4, d5, #1 //src[x+2*stride+1] + + vmull.u8 q3, d0, d28 //(src[x] * A) + vmlal.u8 q3, d1, d29 //+=(src[x+1] * B) + vmlal.u8 q3, d2, d30 //+=(src[x+stride] * C) + vmlal.u8 q3, d3, d31 //+=(src[x+stride+1] * D) + vrshrn.u16 d6, q3, #6 + vst1.u8 d6, [r2], r3 + + vmull.u8 q3, d2, d28 //(src[x] * A) + vmlal.u8 q3, d3, d29 //+=(src[x+1] * B) + vmlal.u8 q3, d4, d30 //+=(src[x+stride] * C) + vmlal.u8 q3, d5, d31 //+=(src[x+stride+1] * D) + vrshrn.u16 d6, q3, #6 + vst1.u8 d6, [r2], r3 + + vmov q0, q2 + sub r5, #2 + cmp r5, #0 + bne w8_mc_chroma_loop + + pop {r4, r5} +WELS_ASM_FUNC_END + + +WELS_ASM_FUNC_BEGIN enc_mc_chroma_w4_neon + + push {r4, r5, r6} + ldr r4, [sp, #12] + ldr r6, [sp, #16] + vld1.u8 {d31}, [r4] //load A/B/C/D + + vdup.u8 d28, d31[0] //A + vdup.u8 d29, d31[1] //B + vdup.u8 d30, d31[2] //C + vdup.u8 d31, d31[3] //D + +w4_mc_chroma_loop: // each two pxl row + vld1.u8 {d0}, [r0], r1 //a::src[x] + vld1.u8 {d2}, [r0], r1 //b::src[x+stride] + vld1.u8 {d4}, [r0] //c::src[x+2*stride] + + vshr.u64 d1, d0, #8 + vshr.u64 d3, d2, #8 + vshr.u64 d5, d4, #8 + + vmov q3, q1 //b::[0:7]+b::[1~8] + vtrn.32 q0, q1 
//d0{a::[0:3]+b::[0:3]}; d1{a::[1:4]+b::[1:4]} + vtrn.32 q3, q2 //d6{b::[0:3]+c::[0:3]}; d7{b::[1:4]+c::[1:4]} + + vmull.u8 q1, d0, d28 //(src[x] * A) + vmlal.u8 q1, d1, d29 //+=(src[x+1] * B) + vmlal.u8 q1, d6, d30 //+=(src[x+stride] * C) + vmlal.u8 q1, d7, d31 //+=(src[x+stride+1] * D) + + vrshrn.u16 d2, q1, #6 + vmov r4, r5, d2 + str r4, [r2], r3 + str r5, [r2], r3 + + sub r6, #2 + cmp r6, #0 + bne w4_mc_chroma_loop + + pop {r4, r5, r6} +WELS_ASM_FUNC_END +#endif diff --git a/codec/encoder/core/arm/memory_neon.S b/codec/encoder/core/arm/memory_neon.S old mode 100755 new mode 100644 index 7b65d490..1def1b6e --- a/codec/encoder/core/arm/memory_neon.S +++ b/codec/encoder/core/arm/memory_neon.S @@ -60,4 +60,4 @@ mem_zero_24_neon_start: vst1.64 {d0}, [r0]! WELS_ASM_FUNC_END -#endif \ No newline at end of file +#endif diff --git a/codec/encoder/core/arm/pixel_neon.S b/codec/encoder/core/arm/pixel_neon.S old mode 100755 new mode 100644 index 792fba34..be423a57 --- a/codec/encoder/core/arm/pixel_neon.S +++ b/codec/encoder/core/arm/pixel_neon.S @@ -35,73 +35,73 @@ #include "arm_arch_common_macro.S" .macro SATD_16x4 - vld1.64 {q0}, [r0,:128], r1 - vld1.64 {q1}, [r2], r3 + vld1.64 {q0}, [r0,:128], r1 + vld1.64 {q1}, [r2], r3 - vsubl.u8 q4, d0, d2 - vld1.64 {q2}, [r0,:128], r1 + vsubl.u8 q4, d0, d2 + vld1.64 {q2}, [r0,:128], r1 - vsubl.u8 q6, d1, d3 - vld1.64 {q3}, [r2], r3 + vsubl.u8 q6, d1, d3 + vld1.64 {q3}, [r2], r3 - vsubl.u8 q5, d4, d6 - vld1.64 {q0}, [r0,:128], r1 + vsubl.u8 q5, d4, d6 + vld1.64 {q0}, [r0,:128], r1 - vsubl.u8 q7, d5, d7 + vsubl.u8 q7, d5, d7 vld1.64 {q1}, [r2], r3 vsubl.u8 q8, d0, d2 - vld1.64 {q2}, [r0,:128], r1 + vld1.64 {q2}, [r0,:128], r1 vsubl.u8 q10, d1, d3 - vadd.s16 q0, q4, q5 + vadd.s16 q0, q4, q5 - vld1.64 {q3}, [r2], r3 - vsub.s16 q1, q4, q5 + vld1.64 {q3}, [r2], r3 + vsub.s16 q1, q4, q5 - vsubl.u8 q9, d4, d6 - vsubl.u8 q11, d5, d7 + vsubl.u8 q9, d4, d6 + vsubl.u8 q11, d5, d7 - vadd.s16 q2, q8, q9 - vsub.s16 q3, q8, q9 + vadd.s16 q2, q8, q9 + vsub.s16 q3, q8, q9 - vadd.s16 q4, q6, q7 + vadd.s16 q4, q6, q7 vsub.s16 q5, q6, q7 - vadd.s16 q6, q10, q11 - vsub.s16 q7, q10, q11 + vadd.s16 q6, q10, q11 + vsub.s16 q7, q10, q11 - vadd.s16 q8, q0, q2 - vsub.s16 q10, q0, q2 + vadd.s16 q8, q0, q2 + vsub.s16 q10, q0, q2 - vadd.s16 q9, q4, q6 - vsub.s16 q11, q4, q6 + vadd.s16 q9, q4, q6 + vsub.s16 q11, q4, q6 - vsub.s16 q0, q1, q3 - vadd.s16 q2, q1, q3 + vsub.s16 q0, q1, q3 + vadd.s16 q2, q1, q3 - vsub.s16 q1, q5, q7 - vadd.s16 q3, q5, q7 + vsub.s16 q1, q5, q7 + vadd.s16 q3, q5, q7 - vtrn.16 q8, q10 - vtrn.16 q9, q11 + vtrn.16 q8, q10 + vtrn.16 q9, q11 - vadd.s16 q4, q8, q10 - vabd.s16 q6, q8, q10 + vadd.s16 q4, q8, q10 + vabd.s16 q6, q8, q10 - vadd.s16 q5, q9, q11 - vabd.s16 q7, q9, q11 + vadd.s16 q5, q9, q11 + vabd.s16 q7, q9, q11 vabs.s16 q4, q4 vabs.s16 q5, q5 - vtrn.16 q0, q2 - vtrn.16 q1, q3 + vtrn.16 q0, q2 + vtrn.16 q1, q3 - vadd.s16 q8, q0, q2 - vabd.s16 q10, q0, q2 + vadd.s16 q8, q0, q2 + vabd.s16 q10, q0, q2 - vadd.s16 q9, q1, q3 + vadd.s16 q9, q1, q3 vabd.s16 q11, q1, q3 vabs.s16 q8, q8 @@ -128,31 +128,31 @@ vld1.64 {d1}, [r2], r3 vld1.64 {d2}, [r0,:64], r1 - vsubl.u8 q4, d0, d1 + vsubl.u8 q4, d0, d1 vld1.64 {d3}, [r2], r3 - vsubl.u8 q5, d2, d3 + vsubl.u8 q5, d2, d3 vld1.64 {d4}, [r0,:64], r1 vld1.64 {d5}, [r2], r3 - vadd.s16 q8, q4, q5 - vsubl.u8 q6, d4, d5 + vadd.s16 q8, q4, q5 + vsubl.u8 q6, d4, d5 vld1.64 {d6}, [r0,:64], r1 vld1.64 {d7}, [r2], r3 - vsubl.u8 q7, d6, d7 - vsub.s16 q9, q4, q5 + vsubl.u8 q7, d6, d7 + vsub.s16 q9, q4, q5 - vadd.s16 q10, q6, q7 - 
vsub.s16 q11, q6, q7 + vadd.s16 q10, q6, q7 + vsub.s16 q11, q6, q7 - vadd.s16 q0, q8, q10 - vsub.s16 q1, q8, q10 + vadd.s16 q0, q8, q10 + vsub.s16 q1, q8, q10 - vsub.s16 q2, q9, q11 - vadd.s16 q3, q9, q11 + vsub.s16 q2, q9, q11 + vadd.s16 q3, q9, q11 vtrn.16 q0, q1 vtrn.16 q2, q3 @@ -220,7 +220,7 @@ .endm -WELS_ASM_FUNC_BEGIN pixel_sad_16x16_neon +WELS_ASM_FUNC_BEGIN WelsSampleSad16x16_neon vld1.64 {q0}, [r0, :128], r1 vld1.64 {q1}, [r2], r3 @@ -260,7 +260,7 @@ WELS_ASM_FUNC_BEGIN pixel_sad_16x16_neon WELS_ASM_FUNC_END -WELS_ASM_FUNC_BEGIN pixel_sad_16x8_neon +WELS_ASM_FUNC_BEGIN WelsSampleSad16x8_neon vld1.64 {q0}, [r0, :128], r1 vld1.64 {q1}, [r2], r3 @@ -298,7 +298,7 @@ WELS_ASM_FUNC_BEGIN pixel_sad_16x8_neon WELS_ASM_FUNC_END -WELS_ASM_FUNC_BEGIN pixel_sad_8x16_neon +WELS_ASM_FUNC_BEGIN WelsSampleSad8x16_neon vld1.64 {d0}, [r0, :64], r1 vld1.64 {d1}, [r2], r3 @@ -332,7 +332,7 @@ WELS_ASM_FUNC_BEGIN pixel_sad_8x16_neon WELS_ASM_FUNC_END -WELS_ASM_FUNC_BEGIN pixel_sad_8x8_neon +WELS_ASM_FUNC_BEGIN WelsSampleSad8x8_neon vld1.64 {d0}, [r0, :64], r1 vld1.64 {d1}, [r2], r3 @@ -364,7 +364,7 @@ WELS_ASM_FUNC_BEGIN pixel_sad_8x8_neon WELS_ASM_FUNC_END -WELS_ASM_FUNC_BEGIN pixel_sad_4x4_neon +WELS_ASM_FUNC_BEGIN WelsSampleSad4x4_neon stmdb sp!, {r4-r5, lr} //Loading a horizontal line data (4 bytes) @@ -376,23 +376,23 @@ WELS_ASM_FUNC_BEGIN pixel_sad_4x4_neon //line 1 ldr r4, [r0], r1 ldr r5, [r2], r3 - usada8 lr, r4, r5, lr + usada8 lr, r4, r5, lr - //line 2 + //line 2 ldr r4, [r0], r1 ldr r5, [r2], r3 - usada8 lr, r4, r5, lr - + usada8 lr, r4, r5, lr + //line 3 ldr r4, [r0] ldr r5, [r2] - usada8 r0, r4, r5, lr + usada8 r0, r4, r5, lr ldmia sp!, {r4-r5, lr} WELS_ASM_FUNC_END -WELS_ASM_FUNC_BEGIN pixel_sad_4_16x16_neon +WELS_ASM_FUNC_BEGIN WelsSampleSadFour16x16_neon stmdb sp!, {r4-r5, lr} @@ -400,30 +400,30 @@ WELS_ASM_FUNC_BEGIN pixel_sad_4_16x16_neon sub r4, r2, #1 add r5, r2, #1 sub r2, r3 - + //Loading a horizontal line data (16 bytes) vld1.8 {q0}, [r0], r1 //save pix1 - + vld1.8 {q1}, [r2], r3 //save pix2 - stride vld1.8 {q6}, [r2], r3 //save pix2 vld1.8 {q2}, [r2], r3 //save pix2 + stride - + vld1.8 {q3}, [r4], r3 //save pix2 - 1 - vld1.8 {q4}, [r5], r3 //save pix2 + 1 - + vld1.8 {q4}, [r5], r3 //save pix2 + 1 + //Do the SAD for 16 bytes vabdl.u8 q15, d0, d2 vabal.u8 q15, d1, d3 - + vabdl.u8 q13, d0, d4 vabal.u8 q13, d1, d5 - + vabdl.u8 q11, d0, d6 vabal.u8 q11, d1, d7 - + vabdl.u8 q9, d0, d8 - vabal.u8 q9, d1, d9 - + vabal.u8 q9, d1, d9 + mov lr, #15 pixel_sad_4_16x16_loop_0: @@ -436,13 +436,13 @@ pixel_sad_4_16x16_loop_0: vabal.u8 q15, d1, d3 vld1.8 {q3}, [r4], r3 //save pix2 - 1 vabal.u8 q13, d0, d4 - vld1.8 {q4}, [r5], r3 //save pix2 + 1 + vld1.8 {q4}, [r5], r3 //save pix2 + 1 vabal.u8 q13, d1, d5 subs lr, #1 vabal.u8 q11, d0, d6 vabal.u8 q11, d1, d7 - + vabal.u8 q9, d0, d8 vabal.u8 q9, d1, d9 @@ -451,18 +451,18 @@ pixel_sad_4_16x16_loop_0: //Save SAD to 'r0' ldr r0, [sp, #12] - + vadd.u16 d0, d30, d31 vadd.u16 d1, d26, d27 vadd.u16 d2, d22, d23 vadd.u16 d3, d18, d19 - + vpaddl.u16 q0, q0 vpaddl.u16 q1, q1 - + vpaddl.u32 q0, q0 vpaddl.u32 q1, q1 - + vshl.u32 q0, #4 vshl.u32 q1, #4 vst4.32 {d0[0],d1[0],d2[0],d3[0]}, [r0] @@ -471,37 +471,37 @@ pixel_sad_4_16x16_loop_0: WELS_ASM_FUNC_END -WELS_ASM_FUNC_BEGIN pixel_sad_4_16x8_neon +WELS_ASM_FUNC_BEGIN WelsSampleSadFour16x8_neon stmdb sp!, {r4-r5, lr} - + //Generate the pix2 start addr sub r4, r2, #1 add r5, r2, #1 sub r2, r3 - + //Loading a horizontal line data (16 bytes) vld1.8 {q0}, [r0], r1 //save pix1 - + vld1.8 {q1}, [r2], r3 //save 
pix2 - stride vld1.8 {q6}, [r2], r3 //save pix2 vld1.8 {q2}, [r2], r3 //save pix2 + stride - + vld1.8 {q3}, [r4], r3 //save pix2 - 1 - vld1.8 {q4}, [r5], r3 //save pix2 + 1 - + vld1.8 {q4}, [r5], r3 //save pix2 + 1 + //Do the SAD for 16 bytes vabdl.u8 q15, d0, d2 vabal.u8 q15, d1, d3 - + vabdl.u8 q13, d0, d4 vabal.u8 q13, d1, d5 - + vabdl.u8 q11, d0, d6 vabal.u8 q11, d1, d7 - + vabdl.u8 q9, d0, d8 - vabal.u8 q9, d1, d9 - + vabal.u8 q9, d1, d9 + mov lr, #7 pixel_sad_4_16x8_loop_0: @@ -514,67 +514,67 @@ pixel_sad_4_16x8_loop_0: vabal.u8 q15, d1, d3 vld1.8 {q3}, [r4], r3 //save pix2 - 1 vabal.u8 q13, d0, d4 - vld1.8 {q4}, [r5], r3 //save pix2 + 1 + vld1.8 {q4}, [r5], r3 //save pix2 + 1 vabal.u8 q13, d1, d5 subs lr, #1 vabal.u8 q11, d0, d6 vabal.u8 q11, d1, d7 - + vabal.u8 q9, d0, d8 vabal.u8 q9, d1, d9 - + bne pixel_sad_4_16x8_loop_0 //Save SAD to 'r0' ldr r0, [sp, #12] - + vadd.u16 d0, d30, d31 vadd.u16 d1, d26, d27 vadd.u16 d2, d22, d23 vadd.u16 d3, d18, d19 - + vpaddl.u16 q0, q0 vpaddl.u16 q1, q1 - + vpaddl.u32 q0, q0 vpaddl.u32 q1, q1 - + vshl.u32 q0, #4 vshl.u32 q1, #4 vst4.32 {d0[0],d1[0],d2[0],d3[0]}, [r0] - + ldmia sp!, {r4-r5, lr} WELS_ASM_FUNC_END - -WELS_ASM_FUNC_BEGIN pixel_sad_4_8x16_neon + +WELS_ASM_FUNC_BEGIN WelsSampleSadFour8x16_neon stmdb sp!, {r4-r5, lr} - + //Generate the pix2 start addr sub r4, r2, #1 add r5, r2, #1 sub r2, r3 - + //Loading a horizontal line data (8 bytes) vld1.8 {d0}, [r0], r1 //save pix1 - + vld1.8 {d1}, [r2], r3 //save pix2 - stride vld1.8 {d6}, [r2], r3 //save pix2 vld1.8 {d2}, [r2], r3 //save pix2 + stride - + vld1.8 {d3}, [r4], r3 //save pix2 - 1 - vld1.8 {d4}, [r5], r3 //save pix2 + 1 - + vld1.8 {d4}, [r5], r3 //save pix2 + 1 + //Do the SAD for 8 bytes vabdl.u8 q15, d0, d1 vabdl.u8 q14, d0, d2 vabdl.u8 q13, d0, d3 - vabdl.u8 q12, d0, d4 - + vabdl.u8 q12, d0, d4 + mov lr, #15 pixel_sad_4_8x16_loop_0: - + //Loading a horizontal line data (8 bytes) vld1.8 {d0}, [r0], r1 //save pix1 vmov.8 d1, d6 //save pix2 - stride @@ -582,7 +582,7 @@ pixel_sad_4_8x16_loop_0: vld1.8 {d2}, [r2], r3 //save pix2 + stride vld1.8 {d3}, [r4], r3 //save pix2 - 1 vabal.u8 q15, d0, d1 - + vld1.8 {d4}, [r5], r3 //save pix2 + 1 //Do the SAD for 8 bytes vabal.u8 q14, d0, d2 @@ -594,50 +594,50 @@ pixel_sad_4_8x16_loop_0: //Save SAD to 'r0' ldr r0, [sp, #12] - + vadd.u16 d0, d30, d31 vadd.u16 d1, d28, d29 vadd.u16 d2, d26, d27 vadd.u16 d3, d24, d25 - + vpaddl.u16 q0, q0 vpaddl.u16 q1, q1 - + vpaddl.u32 q0, q0 vpaddl.u32 q1, q1 - + vshl.u32 q0, #4 vshl.u32 q1, #4 vst4.32 {d0[0],d1[0],d2[0],d3[0]}, [r0] - + ldmia sp!, {r4-r5, lr} WELS_ASM_FUNC_END -WELS_ASM_FUNC_BEGIN pixel_sad_4_8x8_neon +WELS_ASM_FUNC_BEGIN WelsSampleSadFour8x8_neon stmdb sp!, {r4-r5, lr} - + //Generate the pix2 start addr sub r4, r2, #1 add r5, r2, #1 sub r2, r3 - + //Loading a horizontal line data (8 bytes) vld1.8 {d0}, [r0], r1 //save pix1 - + vld1.8 {d1}, [r2], r3 //save pix2 - stride vld1.8 {d6}, [r2], r3 //save pix2 vld1.8 {d2}, [r2], r3 //save pix2 + stride - + vld1.8 {d3}, [r4], r3 //save pix2 - 1 - vld1.8 {d4}, [r5], r3 //save pix2 + 1 - + vld1.8 {d4}, [r5], r3 //save pix2 + 1 + //Do the SAD for 8 bytes vabdl.u8 q15, d0, d1 vabdl.u8 q14, d0, d2 vabdl.u8 q13, d0, d3 - vabdl.u8 q12, d0, d4 - + vabdl.u8 q12, d0, d4 + mov lr, #7 pixel_sad_4_8x8_loop_0: @@ -648,7 +648,7 @@ pixel_sad_4_8x8_loop_0: vld1.8 {d2}, [r2], r3 //save pix2 + stride vld1.8 {d3}, [r4], r3 //save pix2 - 1 vabal.u8 q15, d0, d1 - + vld1.8 {d4}, [r5], r3 //save pix2 + 1 //Do the SAD for 8 bytes vabal.u8 q14, d0, d2 @@ -659,84 +659,84 @@ 
pixel_sad_4_8x8_loop_0: //Save SAD to 'r0' ldr r0, [sp, #12] - + vadd.u16 d0, d30, d31 vadd.u16 d1, d28, d29 vadd.u16 d2, d26, d27 vadd.u16 d3, d24, d25 - + vpaddl.u16 q0, q0 vpaddl.u16 q1, q1 - + vpaddl.u32 q0, q0 vpaddl.u32 q1, q1 - + vshl.u32 q0, #4 vshl.u32 q1, #4 vst4.32 {d0[0],d1[0],d2[0],d3[0]}, [r0] - + ldmia sp!, {r4-r5, lr} WELS_ASM_FUNC_END -WELS_ASM_FUNC_BEGIN pixel_sad_4_4x4_neon +WELS_ASM_FUNC_BEGIN WelsSampleSadFour4x4_neon vld1.32 {d0[0]}, [r0], r1 vld1.32 {d0[1]}, [r0], r1 vld1.32 {d1[0]}, [r0], r1 vld1.32 {d1[1]}, [r0] - - + + sub r0, r2, r3 vld1.32 {d2[0]}, [r0], r3 vld1.32 {d2[1]}, [r0], r3 vld1.32 {d3[0]}, [r0], r3 vld1.32 {d3[1]}, [r0], r3 vld1.32 {d4[0]}, [r0], r3 - vld1.32 {d4[1]}, [r0] - - sub r0, r2, #1 + vld1.32 {d4[1]}, [r0] + + sub r0, r2, #1 vld1.32 {d5[0]}, [r0], r3 vld1.32 {d5[1]}, [r0], r3 vld1.32 {d6[0]}, [r0], r3 - vld1.32 {d6[1]}, [r0] - - add r0, r2, #1 + vld1.32 {d6[1]}, [r0] + + add r0, r2, #1 vld1.32 {d7[0]}, [r0], r3 vld1.32 {d7[1]}, [r0], r3 vld1.32 {d8[0]}, [r0], r3 vld1.32 {d8[1]}, [r0] - + vabdl.u8 q15, d0, d2 vabdl.u8 q14, d1, d3 - + vabdl.u8 q13, d0, d3 vabdl.u8 q12, d1, d4 - + vabdl.u8 q11, d0, d5 vabdl.u8 q10, d1, d6 - + vabdl.u8 q9, d0, d7 vabdl.u8 q8, d1, d8 - + //Save SAD to 'r4' ldr r0, [sp] vadd.u16 q0, q14, q15 vadd.u16 q1, q12, q13 vadd.u16 q2, q10, q11 vadd.u16 q3, q8 , q9 - + vadd.u16 d0, d1 vadd.u16 d1, d2, d3 vadd.u16 d2, d4, d5 vadd.u16 d3, d6, d7 - + vpaddl.u16 q0, q0 vpaddl.u16 q1, q1 - + vpaddl.u32 q0, q0 vpaddl.u32 q1, q1 - + vshl.u32 q0, #4 vshl.u32 q1, #4 vst4.32 {d0[0],d1[0],d2[0],d3[0]}, [r0] @@ -744,7 +744,7 @@ WELS_ASM_FUNC_BEGIN pixel_sad_4_4x4_neon WELS_ASM_FUNC_END -WELS_ASM_FUNC_BEGIN pixel_satd_16x16_neon +WELS_ASM_FUNC_BEGIN WelsSampleSatd16x16_neon SATD_16x4 vadd.u16 q15, q0, q2 @@ -769,7 +769,7 @@ WELS_ASM_FUNC_BEGIN pixel_satd_16x16_neon WELS_ASM_FUNC_END -WELS_ASM_FUNC_BEGIN pixel_satd_16x8_neon +WELS_ASM_FUNC_BEGIN WelsSampleSatd16x8_neon SATD_16x4 vadd.u16 q15, q0, q2 @@ -786,7 +786,7 @@ WELS_ASM_FUNC_BEGIN pixel_satd_16x8_neon WELS_ASM_FUNC_END -WELS_ASM_FUNC_BEGIN pixel_satd_8x16_neon +WELS_ASM_FUNC_BEGIN WelsSampleSatd8x16_neon SATD_8x4 vadd.u16 q15, q0, q1 @@ -811,7 +811,7 @@ WELS_ASM_FUNC_BEGIN pixel_satd_8x16_neon WELS_ASM_FUNC_END -WELS_ASM_FUNC_BEGIN pixel_satd_8x8_neon +WELS_ASM_FUNC_BEGIN WelsSampleSatd8x8_neon SATD_8x4 vadd.u16 q15, q0, q1 @@ -828,7 +828,7 @@ WELS_ASM_FUNC_BEGIN pixel_satd_8x8_neon WELS_ASM_FUNC_END -WELS_ASM_FUNC_BEGIN pixel_satd_4x4_neon +WELS_ASM_FUNC_BEGIN WelsSampleSatd4x4_neon //Load the pix1 data --- 16 bytes vld1.32 {d0[0]}, [r0], r1 @@ -836,11 +836,11 @@ WELS_ASM_FUNC_BEGIN pixel_satd_4x4_neon vld1.32 {d1[0]}, [r0], r1 vld1.32 {d1[1]}, [r0] - //Load the pix2 data --- 16 bytes + //Load the pix2 data --- 16 bytes vld1.32 {d2[0]}, [r2], r3 vld1.32 {d2[1]}, [r2], r3 vld1.32 {d3[0]}, [r2], r3 - vld1.32 {d3[1]}, [r2] + vld1.32 {d3[1]}, [r2] //Get the difference vsubl.u8 q15, d0, d2 //{0,1,2,3,4,5,6,7} @@ -861,15 +861,15 @@ WELS_ASM_FUNC_BEGIN pixel_satd_4x4_neon vtrn.16 q13, q12 vadd.s16 q15, q13, q12 - //Do the SAD - vabs.s16 q15, q15 + //Do the SAD + vabs.s16 q15, q15 vabd.s16 q14, q13, q12 vadd.u16 q0, q15, q14 vrhadd.u16 d0, d1 - vpaddl.u16 d0, d0 - vpaddl.u32 d0, d0 + vpaddl.u16 d0, d0 + vpaddl.u32 d0, d0 vmov.u32 r0, d0[0] diff --git a/codec/encoder/core/arm/reconstruct_neon.S b/codec/encoder/core/arm/reconstruct_neon.S old mode 100755 new mode 100644 index 3a5964ae..c4250693 --- a/codec/encoder/core/arm/reconstruct_neon.S +++ b/codec/encoder/core/arm/reconstruct_neon.S 
@@ -1,1312 +1,1312 @@ -/*! - * \copy - * Copyright (c) 2013, Cisco Systems - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS - * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE - * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, - * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, - * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN - * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - * - */ - -#ifdef HAVE_NEON -.text -#include "arm_arch_common_macro.S" - -#ifdef APPLE_IOS -.macro LORD_ALIGNED_DATA_WITH_STRIDE -// { // input: $0~$3, src*, src_stride - vld1.64 {$0}, [$4,:128], $5 - vld1.64 {$1}, [$4,:128], $5 - vld1.64 {$2}, [$4,:128], $5 - vld1.64 {$3}, [$4,:128], $5 -// } -.endm - -.macro STORE_ALIGNED_DATA_WITH_STRIDE -// { // input: $0~$3, dst*, dst_stride - vst1.64 {$0}, [$4,:128], $5 - vst1.64 {$1}, [$4,:128], $5 - vst1.64 {$2}, [$4,:128], $5 - vst1.64 {$3}, [$4,:128], $5 -// } -.endm - -.macro LORD_UNALIGNED_DATA_WITH_STRIDE -// { // input: $0~$3, src*, src_stride - vld1.64 {$0}, [$4], $5 - vld1.64 {$1}, [$4], $5 - vld1.64 {$2}, [$4], $5 - vld1.64 {$3}, [$4], $5 -// } -.endm - -.macro STORE_UNALIGNED_DATA_WITH_STRIDE -// { // input: $0~$3, dst*, dst_stride - vst1.64 {$0}, [$4], $5 - vst1.64 {$1}, [$4], $5 - vst1.64 {$2}, [$4], $5 - vst1.64 {$3}, [$4], $5 -// } -.endm - -.macro LOAD_4x4_DATA_FOR_DCT -// { // input: $0~$3, src1*, src1_stride, src2*, src2_stride - vld2.16 {$0[0],$1[0]}, [$4], $5 - vld2.16 {$2[0],$3[0]}, [$6], $7 - vld2.16 {$0[1],$1[1]}, [$4], $5 - vld2.16 {$2[1],$3[1]}, [$6], $7 - - vld2.16 {$0[2],$1[2]}, [$4], $5 - vld2.16 {$2[2],$3[2]}, [$6], $7 - vld2.16 {$0[3],$1[3]}, [$4], $5 - vld2.16 {$2[3],$3[3]}, [$6], $7 -// } -.endm - -.macro LOAD_8x8_DATA_FOR_DCT -// { // input: $0~$3, src1*, src2*; untouched r2:src1_stride &r4:src2_stride - vld1.64 {$0}, [$8], r2 - vld1.64 {$4}, [$9], r4 - vld1.64 {$1}, [$8], r2 - vld1.64 {$5}, [$9], r4 - - vld1.64 {$2}, [$8], r2 - vld1.64 {$6}, [$9], r4 - vld1.64 {$3}, [$8], r2 - vld1.64 {$7}, [$9], r4 -// } -.endm - -.macro DCT_ROW_TRANSFORM_TOTAL_16BITS -// { // input: src_d[0]~[3], working: [4]~[7] - vadd.s16 $4, $0, $3 //int16 s[0] = data[i] + data[i3]; - vsub.s16 $7, $0, $3 //int16 s[3] = data[i] - data[i3]; - vadd.s16 $5, $1, $2 //int16 s[1] = data[i1] + data[i2]; - vsub.s16 $6, $1, $2 //int16 s[2] = data[i1] - data[i2]; - - vadd.s16 $0, $4, $5 //int16 dct[i ] = s[0] + s[1]; - vsub.s16 $2, $4, $5 //int16 dct[i2] = s[0] - s[1]; - vshl.s16 $1, $7, #1 - vshl.s16 $3, $6, #1 - vadd.s16 $1, $1, $6 //int16 dct[i1] = 
(s[3] << 1) + s[2]; - vsub.s16 $3, $7, $3 //int16 dct[i3] = s[3] - (s[2] << 1); -// } -.endm - -.macro MATRIX_TRANSFORM_EACH_16BITS -// { // input & output: src_d[0]~[3];[0 1 2 3]+[4 5 6 7]+[8 9 10 11]+[12 13 14 15] - vtrn.s16 $0, $1 //[0 1 2 3]+[4 5 6 7]-->[0 4 2 6]+[1 5 3 7] - vtrn.s16 $2, $3 //[8 9 10 11]+[12 13 14 15]-->[8 12 10 14]+[9 13 11 15] - vtrn.32 $0, $2 //[0 4 2 6]+[8 12 10 14]-->[0 4 8 12]+[2 6 10 14] - vtrn.32 $1, $3 //[1 5 3 7]+[9 13 11 15]-->[1 5 9 13]+[3 7 11 15] -// } -.endm - -.macro NEWQUANT_COEF_EACH_16BITS // if coef <= 0, - coef; else , coef; -// { // input: coef, ff (dst), ff_d0, ff_d1, mf_d0, md_d1 - veor.s16 $6, $6 // init 0 , and keep 0; - vaba.s16 $1, $0, $6 // f + abs(coef - 0) - vmull.s16 $7, $2, $4 - vmull.s16 $8, $3, $5 - vshr.s32 $7, #16 - vshr.s32 $8, #16 - vmovn.s32 $2, $7 - vmovn.s32 $3, $8 - - vcgt.s16 $7, $0, #0 // if true, location of coef == 11111111 - vbif.s16 $6, $1, $7 // if (x<0) reserved part; else keep 0 untouched - vshl.s16 $6, #1 - vsub.s16 $1, $1, $6 // if x > 0, -= 0; else x-= 2x -// } -.endm - -.macro NEWQUANT_COEF_EACH_16BITS_MAX // if coef <= 0, - coef; else , coef; -// { // input: coef, ff (dst), ff_d0, ff_d1, mf_d0(max), md_d1 - veor.s16 $6, $6 // init 0 , and keep 0; - vaba.s16 $1, $0, $6 // f + abs(coef - 0) - vmull.s16 $7, $2, $4 - vmull.s16 $8, $3, $5 - vshr.s32 $7, #16 - vshr.s32 $8, #16 - vmovn.s32 $2, $7 - vmovn.s32 $3, $8 - - vcgt.s16 $7, $0, #0 // if true, location of coef == 11111111 - vbif.s16 $6, $1, $7 // if (x<0) reserved part; else keep 0 untouched - vshl.s16 $6, #1 - vmax.s16 $9, $2, $3 - vsub.s16 $1, $1, $6 // if x > 0, -= 0; else x-= 2x -// } -.endm - -.macro QUANT_DUALWORD_COEF_EACH_16BITS // if coef <= 0, - coef; else , coef; -// { // input: coef, ff (dst), mf , working_d (all 0), working_q - vaba.s16 $1, $0, $3 // f + abs(coef - 0) - vmull.s16 $4, $1, $2 // *= mf - vshr.s32 $4, #16 - vmovn.s32 $1, $4 // >> 16 - - vcgt.s16 $2, $0, #0 // if true, location of coef == 11111111 - vbif.s16 $3, $1, $2 // if (x<0) reserved part; else keep 0 untouched - vshl.s16 $3, #1 - vsub.s16 $1, $1, $3 // if x > 0, -= 0; else x-= 2x -// } -.endm - -.macro DC_ZERO_COUNT_IN_DUALWORD -// { // input: coef, dst_d, working_d (all 0x01) - vceq.s16 $1, $0, #0 - vand.s16 $1, $2 - vpadd.s16 $1, $1, $1 - vpadd.s16 $1, $1, $1 -// } -.endm - -.macro SELECT_MAX_IN_ABS_COEF -// { // input: coef_0, coef_1, max_q (identy to follow two) - vmax.s16 $2, $0, $1 // max 1st in $3 & max 2nd in $4 - vpmax.s16 $3, $3, $4 // max 1st in $3[0][1] & max 2nd in $3[2][3] - vpmax.s16 $3, $3, $4 // max 1st in $3[0][1] -// } -.endm - -.macro ZERO_COUNT_IN_2_QUARWORD -// { // input: coef_0 (identy to $3 $4), coef_1(identy to $5 $6), mask_q - vceq.s16 $0, #0 - vceq.s16 $1, #0 - vand.s16 $0, $2 - vand.s16 $1, $2 - - vpadd.s16 $3, $3, $5 - vpadd.s16 $4, $4, $6 - vpadd.s16 $3, $3, $4 // 8-->4 - vpadd.s16 $3, $3, $3 - vpadd.s16 $3, $3, $3 -// } -.endm - -.macro HDM_QUANT_2x2_TOTAL_16BITS -// { // input: src_d[0]~[3], working_d, dst_d - vshr.s64 $1, $0, #32 - vadd.s16 $2, $0, $1 // [0] = rs[0] + rs[32];[1] = rs[16] + rs[48]; - vsub.s16 $1, $0, $1 // [0] = rs[0] - rs[32];[1] = rs[16] - rs[48]; - vtrn.s16 $2, $1 - vtrn.s32 $2, $1 -// } -.endm - -.macro IHDM_4x4_TOTAL_16BITS -// { // input: each src_d[0]~[3](dst), working_q0, working_q1, working_q2 - vshr.s64 $1, $0, #32 - vadd.s16 $2, $0, $1 // [0] = rs[0] + rs[2];[1] = rs[1] + rs[3]; - vsub.s16 $1, $0, $1 // [0] = rs[0] - rs[2];[1] = rs[1] - rs[3]; - vtrn.s16 $2, $1 - vrev32.16 $1, $1 - vtrn.s32 $2, $1 // [0] = rs[0] + 
rs[2];[1] = rs[0] - rs[2];[2] = rs[1] - rs[3];[3] = rs[1] + rs[3]; - - vrev64.16 $1, $2 - vadd.s16 $0, $2, $1 // [0] = rs[0] + rs[3];[1] = rs[1] + rs[2]; - vsub.s16 $1, $2, $1 - vrev32.16 $1, $1 // [0] = rs[1] - rs[2];[1] = rs[0] - rs[3]; - vtrn.s32 $0, $1 // [0] = rs[0] + rs[3];[1] = rs[1] + rs[2];[2] = rs[1] - rs[2];[3] = rs[0] - rs[3]; -// } -.endm - -.macro MB_PRED_8BITS_ADD_DCT_16BITS_CLIP -// { // input: pred_d[0]/[1](output), dct_q0/1, working_q0/1; - vmovl.u8 $4,$0 - vmovl.u8 $5,$1 - vadd.s16 $4,$2 - vadd.s16 $5,$3 - vqmovun.s16 $0,$4 - vqmovun.s16 $1,$5 -// } -.endm - -.macro ROW_TRANSFORM_1_STEP_TOTAL_16BITS -// { // input: src_d[0]~[3], output: e_d[0]~[3]; - vadd.s16 $4, $0, $2 //int16 e[i][0] = src[0] + src[2]; - vsub.s16 $5, $0, $2 //int16 e[i][1] = src[0] - src[2]; - vshr.s16 $6, $1, #1 - vshr.s16 $7, $3, #1 - vsub.s16 $6, $6, $3 //int16 e[i][2] = (src[1]>>1)-src[3]; - vadd.s16 $7, $1, $7 //int16 e[i][3] = src[1] + (src[3]>>1); -// } -.endm - -.macro TRANSFORM_TOTAL_16BITS // both row & col transform used -// { // output: f_q[0]~[3], input: e_q[0]~[3]; - vadd.s16 $0, $4, $7 //int16 f[i][0] = e[i][0] + e[i][3]; - vadd.s16 $1, $5, $6 //int16 f[i][1] = e[i][1] + e[i][2]; - vsub.s16 $2, $5, $6 //int16 f[i][2] = e[i][1] - e[i][2]; - vsub.s16 $3, $4, $7 //int16 f[i][3] = e[i][0] - e[i][3]; -// } -.endm - - -.macro ROW_TRANSFORM_0_STEP -// { // input: src_d[0]~[3], output: e_q[0]~[3]; - vaddl.s16 $4, $0, $2 //int32 e[i][0] = src[0] + src[2]; - vsubl.s16 $5, $0, $2 //int32 e[i][1] = src[0] - src[2]; - vsubl.s16 $6, $1, $3 //int32 e[i][2] = src[1] - src[3]; - vaddl.s16 $7, $1, $3 //int32 e[i][3] = src[1] + src[3]; -// } -.endm - -.macro ROW_TRANSFORM_1_STEP -// { // input: src_d[0]~[3], output: e_q[0]~[3]; working: $8 $9 - vaddl.s16 $4, $0, $2 //int32 e[i][0] = src[0] + src[2]; - vsubl.s16 $5, $0, $2 //int32 e[i][1] = src[0] - src[2]; - vshr.s16 $8, $1, #1 - vshr.s16 $9, $3, #1 - vsubl.s16 $6, $8, $3 //int32 e[i][2] = (src[1]>>1)-src[3]; - vaddl.s16 $7, $1, $9 //int32 e[i][3] = src[1] + (src[3]>>1); -// } -.endm - -.macro TRANSFORM_4BYTES // both row & col transform used -// { // output: f_q[0]~[3], input: e_q[0]~[3]; - vadd.s32 $0, $4, $7 //int16 f[i][0] = e[i][0] + e[i][3]; - vadd.s32 $1, $5, $6 //int16 f[i][1] = e[i][1] + e[i][2]; - vsub.s32 $2, $5, $6 //int16 f[i][2] = e[i][1] - e[i][2]; - vsub.s32 $3, $4, $7 //int16 f[i][3] = e[i][0] - e[i][3]; -// } -.endm - -.macro COL_TRANSFORM_0_STEP -// { // input: src_q[0]~[3], output: e_q[0]~[3]; - vadd.s32 $4, $0, $2 //int32 e[0][j] = f[0][j] + f[2][j]; - vsub.s32 $5, $0, $2 //int32 e[1][j] = f[0][j] - f[2][j]; - vsub.s32 $6, $1, $3 //int32 e[2][j] = (f[1][j]>>1) - f[3][j]; - vadd.s32 $7, $1, $3 //int32 e[3][j] = f[1][j] + (f[3][j]>>1); -// } -.endm - -.macro COL_TRANSFORM_1_STEP -// { // input: src_q[0]~[3], output: e_q[0]~[3]; - vadd.s32 $4, $0, $2 //int32 e[0][j] = f[0][j] + f[2][j]; - vsub.s32 $5, $0, $2 //int32 e[1][j] = f[0][j] - f[2][j]; - vshr.s32 $6, $1, #1 - vshr.s32 $7, $3, #1 - vsub.s32 $6, $6, $3 //int32 e[2][j] = (f[1][j]>>1) - f[3][j]; - vadd.s32 $7, $1, $7 //int32 e[3][j] = f[1][j] + (f[3][j]>>1); -// } -.endm -#else -.macro LORD_ALIGNED_DATA_WITH_STRIDE arg0, arg1, arg2, arg3, arg4, arg5 -// { // input: \arg0~\arg3, src*, src_stride - vld1.64 {\arg0}, [\arg4,:128], \arg5 - vld1.64 {\arg1}, [\arg4,:128], \arg5 - vld1.64 {\arg2}, [\arg4,:128], \arg5 - vld1.64 {\arg3}, [\arg4,:128], \arg5 -// } -.endm - -.macro STORE_ALIGNED_DATA_WITH_STRIDE arg0, arg1, arg2, arg3, arg4, arg5 -// { // input: \arg0~\arg3, dst*, dst_stride - 
vst1.64 {\arg0}, [\arg4,:128], \arg5 - vst1.64 {\arg1}, [\arg4,:128], \arg5 - vst1.64 {\arg2}, [\arg4,:128], \arg5 - vst1.64 {\arg3}, [\arg4,:128], \arg5 -// } -.endm - -.macro LORD_UNALIGNED_DATA_WITH_STRIDE arg0, arg1, arg2, arg3, arg4, arg5 -// { // input: \arg0~\arg3, src*, src_stride - vld1.64 {\arg0}, [\arg4], \arg5 - vld1.64 {\arg1}, [\arg4], \arg5 - vld1.64 {\arg2}, [\arg4], \arg5 - vld1.64 {\arg3}, [\arg4], \arg5 -// } -.endm - -.macro STORE_UNALIGNED_DATA_WITH_STRIDE arg0, arg1, arg2, arg3, arg4, arg5 -// { // input: \arg0~\arg3, dst*, dst_stride - vst1.64 {\arg0}, [\arg4], \arg5 - vst1.64 {\arg1}, [\arg4], \arg5 - vst1.64 {\arg2}, [\arg4], \arg5 - vst1.64 {\arg3}, [\arg4], \arg5 -// } -.endm - -.macro LOAD_4x4_DATA_FOR_DCT arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7 -// { // input: \arg0~\arg3, src1*, src1_stride, src2*, src2_stride - vld2.16 {\arg0[0],\arg1[0]}, [\arg4], \arg5 - vld2.16 {\arg2[0],\arg3[0]}, [\arg6], \arg7 - vld2.16 {\arg0[1],\arg1[1]}, [\arg4], \arg5 - vld2.16 {\arg2[1],\arg3[1]}, [\arg6], \arg7 - - vld2.16 {\arg0[2],\arg1[2]}, [\arg4], \arg5 - vld2.16 {\arg2[2],\arg3[2]}, [\arg6], \arg7 - vld2.16 {\arg0[3],\arg1[3]}, [\arg4], \arg5 - vld2.16 {\arg2[3],\arg3[3]}, [\arg6], \arg7 -// } -.endm - -.macro LOAD_8x8_DATA_FOR_DCT arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9 -// { // input: \arg0~\arg3, src1*, src2*; untouched r2:src1_stride &r4:src2_stride - vld1.64 {\arg0}, [\arg8], r2 - vld1.64 {\arg4}, [\arg9], r4 - vld1.64 {\arg1}, [\arg8], r2 - vld1.64 {\arg5}, [\arg9], r4 - - vld1.64 {\arg2}, [\arg8], r2 - vld1.64 {\arg6}, [\arg9], r4 - vld1.64 {\arg3}, [\arg8], r2 - vld1.64 {\arg7}, [\arg9], r4 -// } -.endm - -.macro DCT_ROW_TRANSFORM_TOTAL_16BITS arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7 -// { // input: src_d[0]~[3], working: [4]~[7] - vadd.s16 \arg4, \arg0, \arg3 //int16 s[0] = data[i] + data[i3]; - vsub.s16 \arg7, \arg0, \arg3 //int16 s[3] = data[i] - data[i3]; - vadd.s16 \arg5, \arg1, \arg2 //int16 s[1] = data[i1] + data[i2]; - vsub.s16 \arg6, \arg1, \arg2 //int16 s[2] = data[i1] - data[i2]; - - vadd.s16 \arg0, \arg4, \arg5 //int16 dct[i ] = s[0] + s[1]; - vsub.s16 \arg2, \arg4, \arg5 //int16 dct[i2] = s[0] - s[1]; - vshl.s16 \arg1, \arg7, #1 - vshl.s16 \arg3, \arg6, #1 - vadd.s16 \arg1, \arg1, \arg6 //int16 dct[i1] = (s[3] << 1) + s[2]; - vsub.s16 \arg3, \arg7, \arg3 //int16 dct[i3] = s[3] - (s[2] << 1); -// } -.endm - -.macro MATRIX_TRANSFORM_EACH_16BITS arg0, arg1, arg2, arg3 -// { // input & output: src_d[0]~[3];[0 1 2 3]+[4 5 6 7]+[8 9 10 11]+[12 13 14 15] - vtrn.s16 \arg0, \arg1 //[0 1 2 3]+[4 5 6 7]-->[0 4 2 6]+[1 5 3 7] - vtrn.s16 \arg2, \arg3 //[8 9 10 11]+[12 13 14 15]-->[8 12 10 14]+[9 13 11 15] - vtrn.32 \arg0, \arg2 //[0 4 2 6]+[8 12 10 14]-->[0 4 8 12]+[2 6 10 14] - vtrn.32 \arg1, \arg3 //[1 5 3 7]+[9 13 11 15]-->[1 5 9 13]+[3 7 11 15] -// } -.endm - -.macro NEWQUANT_COEF_EACH_16BITS arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8 -// { // input: coef, ff (dst), ff_d0, ff_d1, mf_d0, md_d1 - veor.s16 \arg6, \arg6 // init 0 , and keep 0; - vaba.s16 \arg1, \arg0, \arg6 // f + abs(coef - 0) - vmull.s16 \arg7, \arg2, \arg4 - vmull.s16 \arg8, \arg3, \arg5 - vshr.s32 \arg7, #16 - vshr.s32 \arg8, #16 - vmovn.s32 \arg2, \arg7 - vmovn.s32 \arg3, \arg8 - - vcgt.s16 \arg7, \arg0, #0 // if true, location of coef == 11111111 - vbif.s16 \arg6, \arg1, \arg7 // if (x<0) reserved part; else keep 0 untouched - vshl.s16 \arg6, #1 - vsub.s16 \arg1, \arg1, \arg6 // if x > 0, -= 0; else x-= 2x -// } -.endm - -.macro 
NEWQUANT_COEF_EACH_16BITS_MAX arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9 -// { // input: coef, ff (dst), ff_d0, ff_d1, mf_d0(max), md_d1 - veor.s16 \arg6, \arg6 // init 0 , and keep 0; - vaba.s16 \arg1, \arg0, \arg6 // f + abs(coef - 0) - vmull.s16 \arg7, \arg2, \arg4 - vmull.s16 \arg8, \arg3, \arg5 - vshr.s32 \arg7, #16 - vshr.s32 \arg8, #16 - vmovn.s32 \arg2, \arg7 - vmovn.s32 \arg3, \arg8 - - vcgt.s16 \arg7, \arg0, #0 // if true, location of coef == 11111111 - vbif.s16 \arg6, \arg1, \arg7 // if (x<0) reserved part; else keep 0 untouched - vshl.s16 \arg6, #1 - vmax.s16 \arg9, \arg2, \arg3 - vsub.s16 \arg1, \arg1, \arg6 // if x > 0, -= 0; else x-= 2x -// } -.endm - -.macro QUANT_DUALWORD_COEF_EACH_16BITS arg0, arg1, arg2, arg3, arg4 -// { // input: coef, ff (dst), mf , working_d (all 0), working_q - vaba.s16 \arg1, \arg0, \arg3 // f + abs(coef - 0) - vmull.s16 \arg4, \arg1, \arg2 // *= mf - vshr.s32 \arg4, #16 - vmovn.s32 \arg1, \arg4 // >> 16 - - vcgt.s16 \arg2, \arg0, #0 // if true, location of coef == 11111111 - vbif.s16 \arg3, \arg1, \arg2 // if (x<0) reserved part; else keep 0 untouched - vshl.s16 \arg3, #1 - vsub.s16 \arg1, \arg1, \arg3 // if x > 0, -= 0; else x-= 2x -// } -.endm - -.macro DC_ZERO_COUNT_IN_DUALWORD arg0, arg1, arg2 -// { // input: coef, dst_d, working_d (all 0x01) - vceq.s16 \arg1, \arg0, #0 - vand.s16 \arg1, \arg2 - vpadd.s16 \arg1, \arg1, \arg1 - vpadd.s16 \arg1, \arg1, \arg1 -// } -.endm - -.macro SELECT_MAX_IN_ABS_COEF arg0, arg1, arg2, arg3, arg4 -// { // input: coef_0, coef_1, max_q (identy to follow two), output: max_d0, max_d1 - vmax.s16 \arg2, \arg0, \arg1 // max 1st in \arg3 & max 2nd in \arg4 - vpmax.s16 \arg3, \arg3, \arg4 // max 1st in \arg3[0][1] & max 2nd in \arg3[2][3] - vpmax.s16 \arg3, \arg3, \arg4 // max 1st in \arg3[0][1] -// } -.endm - -.macro ZERO_COUNT_IN_2_QUARWORD arg0, arg1, arg2, arg3, arg4, arg5, arg6 -// { // input: coef_0 (identy to \arg3 \arg4), coef_1(identy to \arg5 \arg6), mask_q - vceq.s16 \arg0, #0 - vceq.s16 \arg1, #0 - vand.s16 \arg0, \arg2 - vand.s16 \arg1, \arg2 - - vpadd.s16 \arg3, \arg3, \arg5 - vpadd.s16 \arg4, \arg4, \arg6 - vpadd.s16 \arg3, \arg3, \arg4 // 8-->4 - vpadd.s16 \arg3, \arg3, \arg3 - vpadd.s16 \arg3, \arg3, \arg3 -// } -.endm - -.macro HDM_QUANT_2x2_TOTAL_16BITS arg0, arg1, arg2 -// { // input: src_d[0]~[3], working_d, dst_d - vshr.s64 \arg1, \arg0, #32 - vadd.s16 \arg2, \arg0, \arg1 // [0] = rs[0] + rs[32];[1] = rs[16] + rs[48]; - vsub.s16 \arg1, \arg0, \arg1 // [0] = rs[0] - rs[32];[1] = rs[16] - rs[48]; - vtrn.s16 \arg2, \arg1 - vtrn.s32 \arg2, \arg1 -// } -.endm - -.macro IHDM_4x4_TOTAL_16BITS arg0, arg1, arg2 -// { // input: each src_d[0]~[3](dst), working_q0, working_q1, working_q2 - vshr.s64 \arg1, \arg0, #32 - vadd.s16 \arg2, \arg0, \arg1 // [0] = rs[0] + rs[2];[1] = rs[1] + rs[3]; - vsub.s16 \arg1, \arg0, \arg1 // [0] = rs[0] - rs[2];[1] = rs[1] - rs[3]; - vtrn.s16 \arg2, \arg1 - vrev32.16 \arg1, \arg1 - vtrn.s32 \arg2, \arg1 // [0] = rs[0] + rs[2];[1] = rs[0] - rs[2];[2] = rs[1] - rs[3];[3] = rs[1] + rs[3]; - - vrev64.16 \arg1, \arg2 - vadd.s16 \arg0, \arg2, \arg1 // [0] = rs[0] + rs[3];[1] = rs[1] + rs[2]; - vsub.s16 \arg1, \arg2, \arg1 - vrev32.16 \arg1, \arg1 // [0] = rs[1] - rs[2];[1] = rs[0] - rs[3]; - vtrn.s32 \arg0, \arg1 // [0] = rs[0] + rs[3];[1] = rs[1] + rs[2];[2] = rs[1] - rs[2];[3] = rs[0] - rs[3]; -// } -.endm - -.macro MB_PRED_8BITS_ADD_DCT_16BITS_CLIP arg0, arg1, arg2, arg3, arg4, arg5 -// { // input: pred_d[0]/[1](output), dct_q0/1, working_q0/1; - vmovl.u8 
\arg4,\arg0 - vmovl.u8 \arg5,\arg1 - vadd.s16 \arg4,\arg2 - vadd.s16 \arg5,\arg3 - vqmovun.s16 \arg0,\arg4 - vqmovun.s16 \arg1,\arg5 -// } -.endm - -.macro ROW_TRANSFORM_1_STEP_TOTAL_16BITS arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7 -// { // input: src_d[0]~[3], output: e_d[0]~[3]; - vadd.s16 \arg4, \arg0, \arg2 //int16 e[i][0] = src[0] + src[2]; - vsub.s16 \arg5, \arg0, \arg2 //int16 e[i][1] = src[0] - src[2]; - vshr.s16 \arg6, \arg1, #1 - vshr.s16 \arg7, \arg3, #1 - vsub.s16 \arg6, \arg6, \arg3 //int16 e[i][2] = (src[1]>>1)-src[3]; - vadd.s16 \arg7, \arg1, \arg7 //int16 e[i][3] = src[1] + (src[3]>>1); -// } -.endm - -.macro TRANSFORM_TOTAL_16BITS arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7 // both row & col transform used -// { // output: f_q[0]~[3], input: e_q[0]~[3]; - vadd.s16 \arg0, \arg4, \arg7 //int16 f[i][0] = e[i][0] + e[i][3]; - vadd.s16 \arg1, \arg5, \arg6 //int16 f[i][1] = e[i][1] + e[i][2]; - vsub.s16 \arg2, \arg5, \arg6 //int16 f[i][2] = e[i][1] - e[i][2]; - vsub.s16 \arg3, \arg4, \arg7 //int16 f[i][3] = e[i][0] - e[i][3]; -// } -.endm - - -.macro ROW_TRANSFORM_0_STEP arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7 -// { // input: src_d[0]~[3], output: e_q[0]~[3]; - vaddl.s16 \arg4, \arg0, \arg2 //int32 e[i][0] = src[0] + src[2]; - vsubl.s16 \arg5, \arg0, \arg2 //int32 e[i][1] = src[0] - src[2]; - vsubl.s16 \arg6, \arg1, \arg3 //int32 e[i][2] = src[1] - src[3]; - vaddl.s16 \arg7, \arg1, \arg3 //int32 e[i][3] = src[1] + src[3]; -// } -.endm - -.macro ROW_TRANSFORM_1_STEP arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9 -// { // input: src_d[0]~[3], output: e_q[0]~[3]; working: \arg8 \arg9 - vaddl.s16 \arg4, \arg0, \arg2 //int32 e[i][0] = src[0] + src[2]; - vsubl.s16 \arg5, \arg0, \arg2 //int32 e[i][1] = src[0] - src[2]; - vshr.s16 \arg8, \arg1, #1 - vshr.s16 \arg9, \arg3, #1 - vsubl.s16 \arg6, \arg8, \arg3 //int32 e[i][2] = (src[1]>>1)-src[3]; - vaddl.s16 \arg7, \arg1, \arg9 //int32 e[i][3] = src[1] + (src[3]>>1); -// } -.endm - -.macro TRANSFORM_4BYTES arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7 // both row & col transform used -// { // output: f_q[0]~[3], input: e_q[0]~[3]; - vadd.s32 \arg0, \arg4, \arg7 //int16 f[i][0] = e[i][0] + e[i][3]; - vadd.s32 \arg1, \arg5, \arg6 //int16 f[i][1] = e[i][1] + e[i][2]; - vsub.s32 \arg2, \arg5, \arg6 //int16 f[i][2] = e[i][1] - e[i][2]; - vsub.s32 \arg3, \arg4, \arg7 //int16 f[i][3] = e[i][0] - e[i][3]; -// } -.endm - -.macro COL_TRANSFORM_0_STEP arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7 -// { // input: src_q[0]~[3], output: e_q[0]~[3]; - vadd.s32 \arg4, \arg0, \arg2 //int32 e[0][j] = f[0][j] + f[2][j]; - vsub.s32 \arg5, \arg0, \arg2 //int32 e[1][j] = f[0][j] - f[2][j]; - vsub.s32 \arg6, \arg1, \arg3 //int32 e[2][j] = (f[1][j]>>1) - f[3][j]; - vadd.s32 \arg7, \arg1, \arg3 //int32 e[3][j] = f[1][j] + (f[3][j]>>1); -// } -.endm - -.macro COL_TRANSFORM_1_STEP arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7 -// { // input: src_q[0]~[3], output: e_q[0]~[3]; - vadd.s32 \arg4, \arg0, \arg2 //int32 e[0][j] = f[0][j] + f[2][j]; - vsub.s32 \arg5, \arg0, \arg2 //int32 e[1][j] = f[0][j] - f[2][j]; - vshr.s32 \arg6, \arg1, #1 - vshr.s32 \arg7, \arg3, #1 - vsub.s32 \arg6, \arg6, \arg3 //int32 e[2][j] = (f[1][j]>>1) - f[3][j]; - vadd.s32 \arg7, \arg1, \arg7 //int32 e[3][j] = f[1][j] + (f[3][j]>>1); -// } -.endm -#endif - - -WELS_ASM_FUNC_BEGIN WelsCopy8x8_neon - - LORD_UNALIGNED_DATA_WITH_STRIDE d0, d1, d2, d3, r2, r3 - - STORE_UNALIGNED_DATA_WITH_STRIDE d0, d1, d2, d3, r0, r1 - - LORD_UNALIGNED_DATA_WITH_STRIDE d4, d5, d6, d7, 
r2, r3 - - STORE_UNALIGNED_DATA_WITH_STRIDE d4, d5, d6, d7, r0, r1 - -WELS_ASM_FUNC_END - - -WELS_ASM_FUNC_BEGIN WelsCopy16x16_neon - - LORD_ALIGNED_DATA_WITH_STRIDE q0, q1, q2, q3, r2, r3 - - STORE_ALIGNED_DATA_WITH_STRIDE q0, q1, q2, q3, r0, r1 - - LORD_ALIGNED_DATA_WITH_STRIDE q4, q5, q6, q7, r2, r3 - - STORE_ALIGNED_DATA_WITH_STRIDE q4, q5, q6, q7, r0, r1 - - LORD_ALIGNED_DATA_WITH_STRIDE q0, q1, q2, q3, r2, r3 - - STORE_ALIGNED_DATA_WITH_STRIDE q0, q1, q2, q3, r0, r1 - - LORD_ALIGNED_DATA_WITH_STRIDE q4, q5, q6, q7, r2, r3 - - STORE_ALIGNED_DATA_WITH_STRIDE q4, q5, q6, q7, r0, r1 - -WELS_ASM_FUNC_END - - -WELS_ASM_FUNC_BEGIN WelsCopy16x16NotAligned_neon - - LORD_UNALIGNED_DATA_WITH_STRIDE q0, q1, q2, q3, r2, r3 - - STORE_UNALIGNED_DATA_WITH_STRIDE q0, q1, q2, q3, r0, r1 - - LORD_UNALIGNED_DATA_WITH_STRIDE q4, q5, q6, q7, r2, r3 - - STORE_UNALIGNED_DATA_WITH_STRIDE q4, q5, q6, q7, r0, r1 - - LORD_UNALIGNED_DATA_WITH_STRIDE q0, q1, q2, q3, r2, r3 - - STORE_UNALIGNED_DATA_WITH_STRIDE q0, q1, q2, q3, r0, r1 - - LORD_UNALIGNED_DATA_WITH_STRIDE q4, q5, q6, q7, r2, r3 - - STORE_UNALIGNED_DATA_WITH_STRIDE q4, q5, q6, q7, r0, r1 - -WELS_ASM_FUNC_END - - -WELS_ASM_FUNC_BEGIN WelsCopy16x8NotAligned_neon - - LORD_UNALIGNED_DATA_WITH_STRIDE q0, q1, q2, q3, r2, r3 - - STORE_UNALIGNED_DATA_WITH_STRIDE q0, q1, q2, q3, r0, r1 - - LORD_UNALIGNED_DATA_WITH_STRIDE q4, q5, q6, q7, r2, r3 - - STORE_UNALIGNED_DATA_WITH_STRIDE q4, q5, q6, q7, r0, r1 - -WELS_ASM_FUNC_END - - -WELS_ASM_FUNC_BEGIN WelsCopy8x16_neon - - LORD_UNALIGNED_DATA_WITH_STRIDE d0, d1, d2, d3, r2, r3 - - STORE_UNALIGNED_DATA_WITH_STRIDE d0, d1, d2, d3, r0, r1 - - LORD_UNALIGNED_DATA_WITH_STRIDE d4, d5, d6, d7, r2, r3 - - STORE_UNALIGNED_DATA_WITH_STRIDE d4, d5, d6, d7, r0, r1 - - LORD_UNALIGNED_DATA_WITH_STRIDE d0, d1, d2, d3, r2, r3 - - STORE_UNALIGNED_DATA_WITH_STRIDE d0, d1, d2, d3, r0, r1 - - LORD_UNALIGNED_DATA_WITH_STRIDE d4, d5, d6, d7, r2, r3 - - STORE_UNALIGNED_DATA_WITH_STRIDE d4, d5, d6, d7, r0, r1 - -WELS_ASM_FUNC_END - - - -WELS_ASM_FUNC_BEGIN WelsDctT4_neon - push {r4} - ldr r4, [sp, #4] - - LOAD_4x4_DATA_FOR_DCT d4, d5, d6, d7, r1, r2, r3, r4 - - vsubl.u8 q0, d4, d6 - vsubl.u8 q1, d5, d7 - vtrn.s32 q0, q1 - vswp d1, d2 - - // horizontal transform - DCT_ROW_TRANSFORM_TOTAL_16BITS d0, d1, d2, d3, d4, d5, d6, d7 - - // transform element - MATRIX_TRANSFORM_EACH_16BITS d0, d1, d2, d3 - - // vertical transform - DCT_ROW_TRANSFORM_TOTAL_16BITS d0, d1, d2, d3, d4, d5, d6, d7 - - // transform element - MATRIX_TRANSFORM_EACH_16BITS d0, d1, d2, d3 - - vst1.s16 {q0, q1}, [r0]! - - pop {r4} -WELS_ASM_FUNC_END - - -WELS_ASM_FUNC_BEGIN WelsDctFourT4_neon - push {r4} - ldr r4, [sp, #4] - - LOAD_8x8_DATA_FOR_DCT d8, d9, d10, d11, d12, d13, d14, d15, r1, r3 - - vsubl.u8 q0, d8, d12 - vsubl.u8 q1, d9, d13 - vsubl.u8 q2, d10, d14 - vsubl.u8 q3, d11, d15 - MATRIX_TRANSFORM_EACH_16BITS q0, q1, q2, q3 - - // horizontal transform - DCT_ROW_TRANSFORM_TOTAL_16BITS q0, q1, q2, q3, q4, q5, q6, q7 - - // transform element - MATRIX_TRANSFORM_EACH_16BITS q0, q1, q2, q3 - - // vertical transform - DCT_ROW_TRANSFORM_TOTAL_16BITS q0, q1, q2, q3, q4, q5, q6, q7 - - vswp d1, d2 - vswp d5, d6 - vswp q1, q2 - vst1.s16 {q0, q1}, [r0]! - vst1.s16 {q2, q3}, [r0]! 
- - //////////////// - LOAD_8x8_DATA_FOR_DCT d8, d9, d10, d11, d12, d13, d14, d15, r1, r3 - - vsubl.u8 q0, d8, d12 - vsubl.u8 q1, d9, d13 - vsubl.u8 q2, d10, d14 - vsubl.u8 q3, d11, d15 - MATRIX_TRANSFORM_EACH_16BITS q0, q1, q2, q3 - - // horizontal transform - DCT_ROW_TRANSFORM_TOTAL_16BITS q0, q1, q2, q3, q4, q5, q6, q7 - - // transform element - MATRIX_TRANSFORM_EACH_16BITS q0, q1, q2, q3 - - // vertical transform - DCT_ROW_TRANSFORM_TOTAL_16BITS q0, q1, q2, q3, q4, q5, q6, q7 - - vswp d1, d2 - vswp d5, d6 - vswp q1, q2 - vst1.s16 {q0, q1}, [r0]! - vst1.s16 {q2, q3}, [r0]! - - pop {r4} -WELS_ASM_FUNC_END - - -WELS_ASM_FUNC_BEGIN WelsQuant4x4_neon - vld1.s16 {q2}, [r1] - vld1.s16 {q0, q1}, [r0] - vld1.s16 {q3}, [r2] - - vmov q4, q2 - - NEWQUANT_COEF_EACH_16BITS q0, q2, d4, d5, d6, d7, q5, q6, q7 - vst1.s16 {q2}, [r0]! - - NEWQUANT_COEF_EACH_16BITS q1, q4, d8, d9, d6, d7, q5, q6, q7 - vst1.s16 {q4}, [r0]! - -WELS_ASM_FUNC_END - - -WELS_ASM_FUNC_BEGIN WelsQuant4x4Dc_neon - - vld1.s16 {q0, q1}, [r0] - vdup.s16 q2, r1 // even ff range [0, 768] - vdup.s16 q3, r2 - - vmov q4, q2 - - NEWQUANT_COEF_EACH_16BITS q0, q2, d4, d5, d6, d7, q5, q6, q7 - vst1.s16 {q2}, [r0]! - - NEWQUANT_COEF_EACH_16BITS q1, q4, d8, d9, d6, d7, q5, q6, q7 - vst1.s16 {q4}, [r0]! - -WELS_ASM_FUNC_END - - -WELS_ASM_FUNC_BEGIN WelsQuantFour4x4_neon - vld1.s16 {q2}, [r1] - vld1.s16 {q3}, [r2] - mov r1, r0 - - vld1.s16 {q0, q1}, [r0]! - vmov q4, q2 - NEWQUANT_COEF_EACH_16BITS q0, q4, d8, d9, d6, d7, q5, q6, q7 - vst1.s16 {q4}, [r1]! - vmov q4, q2 - NEWQUANT_COEF_EACH_16BITS q1, q4, d8, d9, d6, d7, q5, q6, q7 - vst1.s16 {q4}, [r1]! - - vld1.s16 {q0, q1}, [r0]! - vmov q4, q2 - NEWQUANT_COEF_EACH_16BITS q0, q4, d8, d9, d6, d7, q5, q6, q7 - vst1.s16 {q4}, [r1]! - vmov q4, q2 - NEWQUANT_COEF_EACH_16BITS q1, q4, d8, d9, d6, d7, q5, q6, q7 - vst1.s16 {q4}, [r1]! - - vld1.s16 {q0, q1}, [r0]! - vmov q4, q2 - NEWQUANT_COEF_EACH_16BITS q0, q4, d8, d9, d6, d7, q5, q6, q7 - vst1.s16 {q4}, [r1]! - vmov q4, q2 - NEWQUANT_COEF_EACH_16BITS q1, q4, d8, d9, d6, d7, q5, q6, q7 - vst1.s16 {q4}, [r1]! - - vld1.s16 {q0, q1}, [r0]! - vmov q4, q2 - NEWQUANT_COEF_EACH_16BITS q0, q4, d8, d9, d6, d7, q5, q6, q7 - vst1.s16 {q4}, [r1]! - vmov q4, q2 - NEWQUANT_COEF_EACH_16BITS q1, q4, d8, d9, d6, d7, q5, q6, q7 - vst1.s16 {q4}, [r1]! - -WELS_ASM_FUNC_END - - -WELS_ASM_FUNC_BEGIN WelsQuantFour4x4Max_neon - vld1.s16 {q2}, [r1] - vld1.s16 {q3}, [r2] - mov r1, r0 - - vld1.s16 {q0, q1}, [r0]! - vmov q4, q2 - NEWQUANT_COEF_EACH_16BITS_MAX q0, q4, d8, d9, d6, d7, q5, q6, q7, d18 - vst1.s16 {q4}, [r1]! - vmov q8, q2 - NEWQUANT_COEF_EACH_16BITS_MAX q1, q8, d16, d17, d6, d7, q5, q6, q7, d20 - vst1.s16 {q8}, [r1]! // then 1st 16 elem in d18 & d20 - - vld1.s16 {q0, q1}, [r0]! - vmov q4, q2 - NEWQUANT_COEF_EACH_16BITS_MAX q0, q4, d8, d9, d6, d7, q5, q6, q7, d19 - vst1.s16 {q4}, [r1]! - vmov q8, q2 - NEWQUANT_COEF_EACH_16BITS_MAX q1, q8, d16, d17, d6, d7, q5, q6, q7, d21 - vst1.s16 {q8}, [r1]! // then 2nd 16 elem in d19 & d21 - - SELECT_MAX_IN_ABS_COEF q9, q10, q0, d0, d1 - vst1.s32 {d0[0]}, [r3]! - - /////////// - vld1.s16 {q0, q1}, [r0]! - vmov q4, q2 - NEWQUANT_COEF_EACH_16BITS_MAX q0, q4, d8, d9, d6, d7, q5, q6, q7, d18 - vst1.s16 {q4}, [r1]! - vmov q8, q2 - NEWQUANT_COEF_EACH_16BITS_MAX q1, q8, d16, d17, d6, d7, q5, q6, q7, d20 - vst1.s16 {q8}, [r1]! // then 3rd 16 elem in d18 & d20 - - vld1.s16 {q0, q1}, [r0]! - vmov q4, q2 - NEWQUANT_COEF_EACH_16BITS_MAX q0, q4, d8, d9, d6, d7, q5, q6, q7, d19 - vst1.s16 {q4}, [r1]! 
- vmov q8, q2 - NEWQUANT_COEF_EACH_16BITS_MAX q1, q8, d16, d17, d6, d7, q5, q6, q7, d21 - vst1.s16 {q8}, [r1]! // then 4th 16 elem in d19 & d21 - - SELECT_MAX_IN_ABS_COEF q9, q10, q0, d0, d1 - vst1.s32 {d0[0]}, [r3]! - -WELS_ASM_FUNC_END - - -WELS_ASM_FUNC_BEGIN WelsHadamardT4Dc_neon - push {r2,r3} - mov r2, #64 // 2*16*sizeof(int16_t) - add r3, r1, #32 - - vld1.s16 {d0}, [r1], r2 - vld1.s16 {d1}, [r3], r2 - vld1.s16 {d4}, [r1], r2 - vld1.s16 {d5}, [r3], r2 - vld1.s16 {d2}, [r1], r2 - vld1.s16 {d3}, [r3], r2 - vld1.s16 {d6}, [r1], r2 - vld1.s16 {d7}, [r3], r2 - vtrn.16 q0, q2 // d0[0 4], d1[1 5] - vtrn.16 q1, q3 // d2[2 6], d3[3 7] - - vld1.s16 {d8}, [r1], r2 - vld1.s16 {d9}, [r3], r2 - vld1.s16 {d12}, [r1], r2 - vld1.s16 {d13}, [r3], r2 - vld1.s16 {d10}, [r1], r2 - vld1.s16 {d11}, [r3], r2 - vld1.s16 {d14}, [r1], r2 - vld1.s16 {d15}, [r3], r2 - vtrn.16 q4, q6 // d8[08 12], d9[09 13] - vtrn.16 q5, q7 //d10[10 14],d11[11 15] - - vtrn.32 q0, q4 // d0 [0 4 08 12] = dct[idx], d1[1 5 09 13] = dct[idx+16] - vtrn.32 q1, q5 // d2 [2 6 10 14] = dct[idx+64], d3[3 7 11 15] = dct[idx+80] - - ROW_TRANSFORM_0_STEP d0, d1, d3, d2, q4, q7, q6, q5 - - TRANSFORM_4BYTES q0, q1, q3, q2, q4, q7, q6, q5 - - // transform element 32bits - vtrn.s32 q0, q1 //[0 1 2 3]+[4 5 6 7]-->[0 4 2 6]+[1 5 3 7] - vtrn.s32 q2, q3 //[8 9 10 11]+[12 13 14 15]-->[8 12 10 14]+[9 13 11 15] - vswp d1, d4 //[0 4 2 6]+[8 12 10 14]-->[0 4 8 12]+[2 6 10 14] - vswp d3, d6 //[1 5 3 7]+[9 13 11 15]-->[1 5 9 13]+[3 7 11 15] - - COL_TRANSFORM_0_STEP q0, q1, q3, q2, q4, q7, q6, q5 - - TRANSFORM_4BYTES q0, q1, q3, q2, q4, q7, q6, q5 - - vrshrn.s32 d8, q0, #1 - vrshrn.s32 d9, q1, #1 - vrshrn.s32 d10, q2, #1 - vrshrn.s32 d11, q3, #1 - vst1.16 {q4, q5}, [r0] //store - - pop {r2,r3} -WELS_ASM_FUNC_END - - -WELS_ASM_FUNC_BEGIN WelsHadamardQuant2x2_neon - - vdup.s16 d1, r1 //ff - vdup.s16 d2, r2 //mf - veor d3, d3 - - mov r1, #32 - mov r2, r0 - - vld1.s16 {d0[0]}, [r0], r1 //rs[00] - vst1.s16 {d3[0]}, [r2], r1 //rs[00]=0 - vld1.s16 {d0[1]}, [r0], r1 //rs[16] - vst1.s16 {d3[0]}, [r2], r1 //rs[16]=0 - vld1.s16 {d0[2]}, [r0], r1 //rs[32] - vst1.s16 {d3[0]}, [r2], r1 //rs[32]=0 - vld1.s16 {d0[3]}, [r0], r1 //rs[48] - vst1.s16 {d3[0]}, [r2], r1 //rs[48]=0 - - HDM_QUANT_2x2_TOTAL_16BITS d0, d4, d5 // output d5 - - HDM_QUANT_2x2_TOTAL_16BITS d5, d4, d0 // output d0 - - QUANT_DUALWORD_COEF_EACH_16BITS d0, d1, d2, d3, q2 - - vst1.s16 d1, [r3] // store to dct - ldr r2, [sp, #0] - vst1.s16 d1, [r2] // store to block - - mov r1, #1 - vdup.s16 d3, r1 - DC_ZERO_COUNT_IN_DUALWORD d1, d0, d3 - - vmov r0, r1, d0 - and r0, #0x07 // range [0~4] - rsb r0, #4 -WELS_ASM_FUNC_END - - -WELS_ASM_FUNC_BEGIN WelsHadamardQuant2x2SkipKernel_neon - - vdup.s16 d3, r1 - mov r1, #32 - vld1.s16 {d0[0]}, [r0], r1 //rs[00] - vld1.s16 {d0[1]}, [r0], r1 //rs[16] - vld1.s16 {d0[2]}, [r0], r1 //rs[32] - vld1.s16 {d0[3]}, [r0], r1 //rs[48] - - HDM_QUANT_2x2_TOTAL_16BITS d0, d1, d2 // output d2 - - HDM_QUANT_2x2_TOTAL_16BITS d2, d1, d0 // output d0 - - vabs.s16 d1, d0 - vcgt.s16 d1, d1, d3 // abs(dct[i])>threshold; - vmov r0, r1, d1 - orr r0, r1 -WELS_ASM_FUNC_END - - -WELS_ASM_FUNC_BEGIN WelsGetNoneZeroCount_neon - push {r1} - vld1.s16 {q0, q1}, [r0] - vmov.s16 q8, #1 - - ZERO_COUNT_IN_2_QUARWORD q0, q1, q8, d0, d1, d2, d3 - vmov r0, r1, d0 - and r0, #0x1F // range [0~16] - rsb r0, #16 - pop {r1} -WELS_ASM_FUNC_END - - -WELS_ASM_FUNC_BEGIN WelsDequant4x4_neon - vld1.s16 {q0, q1}, [r0] - vld1.u16 {q2}, [r1] - - vmul.s16 q4, q0, q2 - vmul.s16 q5, q1, q2 - - vst1.s16 {q4, q5}, [r0] 
-WELS_ASM_FUNC_END - - -WELS_ASM_FUNC_BEGIN WelsDequantFour4x4_neon - vld1.u16 {q8}, [r1] - mov r1, r0 - vld1.s16 {q0, q1}, [r0]! - vld1.s16 {q2, q3}, [r0]! - vmul.s16 q0, q0, q8 - vld1.s16 {q4, q5}, [r0]! - vmul.s16 q1, q1, q8 - vld1.s16 {q6, q7}, [r0]! - - vst1.s16 {q0, q1}, [r1]! - - vmul.s16 q2, q2, q8 - vmul.s16 q3, q3, q8 - vmul.s16 q4, q4, q8 - vst1.s16 {q2, q3}, [r1]! - - vmul.s16 q5, q5, q8 - vmul.s16 q6, q6, q8 - vmul.s16 q7, q7, q8 - vst1.s16 {q4, q5}, [r1]! - vst1.s16 {q6, q7}, [r1]! - -WELS_ASM_FUNC_END - - -WELS_ASM_FUNC_BEGIN WelsDequantIHadamard4x4_neon - - vld1.s16 {q0, q1}, [r0] - vdup.s16 q4, r1 - - IHDM_4x4_TOTAL_16BITS q0, q2, q3 - IHDM_4x4_TOTAL_16BITS q1, q2, q3 - - MATRIX_TRANSFORM_EACH_16BITS d0, d1, d2, d3 - - IHDM_4x4_TOTAL_16BITS q0, q2, q3 - vmul.s16 q0, q4 - - IHDM_4x4_TOTAL_16BITS q1, q2, q3 - vmul.s16 q1, q4 - - MATRIX_TRANSFORM_EACH_16BITS d0, d1, d2, d3 - vst1.s16 {q0, q1}, [r0] -WELS_ASM_FUNC_END - - -WELS_ASM_FUNC_BEGIN WelsIDctT4Rec_neon - vld1.u32 {d14[0]}, [r2], r3 - push {r4} - ldr r4, [sp, #4] - vld1.u32 {d14[1]}, [r2], r3 - - vld4.s16 {d0, d1, d2, d3}, [r4] // cost 3 cycles! - vld1.u32 {d15[0]}, [r2], r3 - vld1.u32 {d15[1]}, [r2], r3 // q7 is pred - - ROW_TRANSFORM_1_STEP_TOTAL_16BITS d0, d1, d2, d3, d4, d5, d6, d7 - - TRANSFORM_TOTAL_16BITS d0, d1, d2, d3, d4, d5, d6, d7 - - MATRIX_TRANSFORM_EACH_16BITS d0, d1, d2, d3 - - ROW_TRANSFORM_1_STEP_TOTAL_16BITS d0, d1, d2, d3, d4, d5, d6, d7 - - TRANSFORM_TOTAL_16BITS d0, d1, d2, d3, d4, d5, d6, d7 - vrshr.s16 d0, d0, #6 - vrshr.s16 d1, d1, #6 - vrshr.s16 d2, d2, #6 - vrshr.s16 d3, d3, #6 - - //after rounding 6, clip into [0, 255] - vmovl.u8 q2,d14 - vadd.s16 q0,q2 - vqmovun.s16 d14,q0 - vst1.32 {d14[0]},[r0],r1 - vst1.32 {d14[1]},[r0],r1 - - vmovl.u8 q2,d15 - vadd.s16 q1,q2 - vqmovun.s16 d15,q1 - vst1.32 {d15[0]},[r0],r1 - vst1.32 {d15[1]},[r0] - - pop {r4} -WELS_ASM_FUNC_END - - -WELS_ASM_FUNC_BEGIN WelsIDctFourT4Rec_neon - - vld1.u64 {d16}, [r2], r3 - push {r4} - ldr r4, [sp, #4] - vld1.u64 {d17}, [r2], r3 - - vld4.s16 {d0, d1, d2, d3}, [r4]! // cost 3 cycles! - vld1.u64 {d18}, [r2], r3 - vld1.u64 {d19}, [r2], r3 - vld4.s16 {d4, d5, d6, d7}, [r4]! // cost 3 cycles! - vswp d1, d4 - vswp d3, d6 - vswp q1, q2 // q0~q3 - - ROW_TRANSFORM_1_STEP_TOTAL_16BITS q0, q1, q2, q3, q4, q5, q6, q7 - - TRANSFORM_TOTAL_16BITS q0, q1, q2, q3, q4, q5, q6, q7 - - MATRIX_TRANSFORM_EACH_16BITS q0, q1, q2, q3 - - ROW_TRANSFORM_1_STEP_TOTAL_16BITS q0, q1, q2, q3, q4, q5, q6, q7 - - TRANSFORM_TOTAL_16BITS q0, q1, q2, q3, q4, q5, q6, q7 - vrshr.s16 q0, q0, #6 - vrshr.s16 q1, q1, #6 - vrshr.s16 q2, q2, #6 - vrshr.s16 q3, q3, #6 - - //after rounding 6, clip into [0, 255] - vmovl.u8 q4,d16 - vadd.s16 q0,q4 - vqmovun.s16 d16,q0 - vst1.u8 {d16},[r0],r1 - - vmovl.u8 q4,d17 - vadd.s16 q1,q4 - vqmovun.s16 d17,q1 - vst1.u8 {d17},[r0],r1 - - vmovl.u8 q4,d18 - vadd.s16 q2,q4 - vqmovun.s16 d18,q2 - vst1.u8 {d18},[r0],r1 - - vmovl.u8 q4,d19 - vadd.s16 q3,q4 - vqmovun.s16 d19,q3 - vst1.u8 {d19},[r0],r1 - - vld1.u64 {d16}, [r2], r3 - vld1.u64 {d17}, [r2], r3 - - vld4.s16 {d0, d1, d2, d3}, [r4]! // cost 3 cycles! - vld1.u64 {d18}, [r2], r3 - vld1.u64 {d19}, [r2], r3 - vld4.s16 {d4, d5, d6, d7}, [r4]! // cost 3 cycles! 
- vswp d1, d4 - vswp d3, d6 - vswp q1, q2 // q0~q3 - - ROW_TRANSFORM_1_STEP_TOTAL_16BITS q0, q1, q2, q3, q4, q5, q6, q7 - - TRANSFORM_TOTAL_16BITS q0, q1, q2, q3, q4, q5, q6, q7 - - MATRIX_TRANSFORM_EACH_16BITS q0, q1, q2, q3 - - ROW_TRANSFORM_1_STEP_TOTAL_16BITS q0, q1, q2, q3, q4, q5, q6, q7 - - TRANSFORM_TOTAL_16BITS q0, q1, q2, q3, q4, q5, q6, q7 - vrshr.s16 q0, q0, #6 - vrshr.s16 q1, q1, #6 - vrshr.s16 q2, q2, #6 - vrshr.s16 q3, q3, #6 - - //after rounding 6, clip into [0, 255] - vmovl.u8 q4,d16 - vadd.s16 q0,q4 - vqmovun.s16 d16,q0 - vst1.u8 {d16},[r0],r1 - - vmovl.u8 q4,d17 - vadd.s16 q1,q4 - vqmovun.s16 d17,q1 - vst1.u8 {d17},[r0],r1 - - vmovl.u8 q4,d18 - vadd.s16 q2,q4 - vqmovun.s16 d18,q2 - vst1.u8 {d18},[r0],r1 - - vmovl.u8 q4,d19 - vadd.s16 q3,q4 - vqmovun.s16 d19,q3 - vst1.u8 {d19},[r0],r1 - - pop {r4} -WELS_ASM_FUNC_END - - -WELS_ASM_FUNC_BEGIN WelsIDctRecI16x16Dc_neon - push {r4} - ldr r4, [sp, #4] - - vld1.s16 {q8,q9}, [r4] - vrshr.s16 q8, q8, #6 - vrshr.s16 q9, q9, #6 - - vdup.s16 d20, d16[0] - vdup.s16 d21, d16[1] - vdup.s16 d22, d16[2] - vdup.s16 d23, d16[3] - - vld1.u8 {q0}, [r2], r3 - MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13 - vst1.u8 {q0}, [r0], r1 - - vld1.u8 {q0}, [r2], r3 - MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13 - vst1.u8 {q0}, [r0], r1 - - vld1.u8 {q0}, [r2], r3 - MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13 - vst1.u8 {q0}, [r0], r1 - - vld1.u8 {q0}, [r2], r3 - MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13 - vst1.u8 {q0}, [r0], r1 - - vdup.s16 d20, d17[0] - vdup.s16 d21, d17[1] - vdup.s16 d22, d17[2] - vdup.s16 d23, d17[3] - - vld1.u8 {q0}, [r2], r3 - MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13 - vst1.u8 {q0}, [r0], r1 - - vld1.u8 {q0}, [r2], r3 - MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13 - vst1.u8 {q0}, [r0], r1 - - vld1.u8 {q0}, [r2], r3 - MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13 - vst1.u8 {q0}, [r0], r1 - - vld1.u8 {q0}, [r2], r3 - MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13 - vst1.u8 {q0}, [r0], r1 - - vdup.s16 d20, d18[0] - vdup.s16 d21, d18[1] - vdup.s16 d22, d18[2] - vdup.s16 d23, d18[3] - - vld1.u8 {q0}, [r2], r3 - MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13 - vst1.u8 {q0}, [r0], r1 - - vld1.u8 {q0}, [r2], r3 - MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13 - vst1.u8 {q0}, [r0], r1 - - vld1.u8 {q0}, [r2], r3 - MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13 - vst1.u8 {q0}, [r0], r1 - - vld1.u8 {q0}, [r2], r3 - MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13 - vst1.u8 {q0}, [r0], r1 - - vdup.s16 d20, d19[0] - vdup.s16 d21, d19[1] - vdup.s16 d22, d19[2] - vdup.s16 d23, d19[3] - - vld1.u8 {q0}, [r2], r3 - MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13 - vst1.u8 {q0}, [r0], r1 - - vld1.u8 {q0}, [r2], r3 - MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13 - vst1.u8 {q0}, [r0], r1 - - vld1.u8 {q0}, [r2], r3 - MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13 - vst1.u8 {q0}, [r0], r1 - - vld1.u8 {q0}, [r2], r3 - MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13 - vst1.u8 {q0}, [r0], r1 - - pop {r4} -WELS_ASM_FUNC_END -#endif +/*! + * \copy + * Copyright (c) 2013, Cisco Systems + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + */ + +#ifdef HAVE_NEON +.text +#include "arm_arch_common_macro.S" + +#ifdef APPLE_IOS +.macro LORD_ALIGNED_DATA_WITH_STRIDE +// { // input: $0~$3, src*, src_stride + vld1.64 {$0}, [$4,:128], $5 + vld1.64 {$1}, [$4,:128], $5 + vld1.64 {$2}, [$4,:128], $5 + vld1.64 {$3}, [$4,:128], $5 +// } +.endm + +.macro STORE_ALIGNED_DATA_WITH_STRIDE +// { // input: $0~$3, dst*, dst_stride + vst1.64 {$0}, [$4,:128], $5 + vst1.64 {$1}, [$4,:128], $5 + vst1.64 {$2}, [$4,:128], $5 + vst1.64 {$3}, [$4,:128], $5 +// } +.endm + +.macro LORD_UNALIGNED_DATA_WITH_STRIDE +// { // input: $0~$3, src*, src_stride + vld1.64 {$0}, [$4], $5 + vld1.64 {$1}, [$4], $5 + vld1.64 {$2}, [$4], $5 + vld1.64 {$3}, [$4], $5 +// } +.endm + +.macro STORE_UNALIGNED_DATA_WITH_STRIDE +// { // input: $0~$3, dst*, dst_stride + vst1.64 {$0}, [$4], $5 + vst1.64 {$1}, [$4], $5 + vst1.64 {$2}, [$4], $5 + vst1.64 {$3}, [$4], $5 +// } +.endm + +.macro LOAD_4x4_DATA_FOR_DCT +// { // input: $0~$3, src1*, src1_stride, src2*, src2_stride + vld2.16 {$0[0],$1[0]}, [$4], $5 + vld2.16 {$2[0],$3[0]}, [$6], $7 + vld2.16 {$0[1],$1[1]}, [$4], $5 + vld2.16 {$2[1],$3[1]}, [$6], $7 + + vld2.16 {$0[2],$1[2]}, [$4], $5 + vld2.16 {$2[2],$3[2]}, [$6], $7 + vld2.16 {$0[3],$1[3]}, [$4], $5 + vld2.16 {$2[3],$3[3]}, [$6], $7 +// } +.endm + +.macro LOAD_8x8_DATA_FOR_DCT +// { // input: $0~$3, src1*, src2*; untouched r2:src1_stride &r4:src2_stride + vld1.64 {$0}, [$8], r2 + vld1.64 {$4}, [$9], r4 + vld1.64 {$1}, [$8], r2 + vld1.64 {$5}, [$9], r4 + + vld1.64 {$2}, [$8], r2 + vld1.64 {$6}, [$9], r4 + vld1.64 {$3}, [$8], r2 + vld1.64 {$7}, [$9], r4 +// } +.endm + +.macro DCT_ROW_TRANSFORM_TOTAL_16BITS +// { // input: src_d[0]~[3], working: [4]~[7] + vadd.s16 $4, $0, $3 //int16 s[0] = data[i] + data[i3]; + vsub.s16 $7, $0, $3 //int16 s[3] = data[i] - data[i3]; + vadd.s16 $5, $1, $2 //int16 s[1] = data[i1] + data[i2]; + vsub.s16 $6, $1, $2 //int16 s[2] = data[i1] - data[i2]; + + vadd.s16 $0, $4, $5 //int16 dct[i ] = s[0] + s[1]; + vsub.s16 $2, $4, $5 //int16 dct[i2] = s[0] - s[1]; + vshl.s16 $1, $7, #1 + vshl.s16 $3, $6, #1 + vadd.s16 $1, $1, $6 //int16 dct[i1] = (s[3] << 1) + s[2]; + vsub.s16 $3, $7, $3 //int16 dct[i3] = s[3] - (s[2] << 1); +// } +.endm + 
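
The DCT_ROW_TRANSFORM_TOTAL_16BITS macro that closes just above is the 4-point butterfly of the H.264 forward integer transform; its inline comments already spell the arithmetic out. As a scalar reference (a sketch only; DctPass4 is a hypothetical helper name, not a function in this codebase), one pass looks like:

    #include <stdint.h>

    /* One 4-point pass of the H.264 forward integer transform, mirroring
     * the s[0]..s[3] comments in DCT_ROW_TRANSFORM_TOTAL_16BITS. The full
     * 4x4 transform runs this once over rows and once over columns. */
    static void DctPass4 (int16_t d[4]) {
      int16_t s0 = d[0] + d[3];
      int16_t s3 = d[0] - d[3];
      int16_t s1 = d[1] + d[2];
      int16_t s2 = d[1] - d[2];
      d[0] = s0 + s1;
      d[2] = s0 - s1;
      d[1] = (int16_t) ((s3 << 1) + s2);
      d[3] = (int16_t) (s3 - (s2 << 1));
    }
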
+.macro MATRIX_TRANSFORM_EACH_16BITS +// { // input & output: src_d[0]~[3];[0 1 2 3]+[4 5 6 7]+[8 9 10 11]+[12 13 14 15] + vtrn.s16 $0, $1 //[0 1 2 3]+[4 5 6 7]-->[0 4 2 6]+[1 5 3 7] + vtrn.s16 $2, $3 //[8 9 10 11]+[12 13 14 15]-->[8 12 10 14]+[9 13 11 15] + vtrn.32 $0, $2 //[0 4 2 6]+[8 12 10 14]-->[0 4 8 12]+[2 6 10 14] + vtrn.32 $1, $3 //[1 5 3 7]+[9 13 11 15]-->[1 5 9 13]+[3 7 11 15] +// } +.endm + +.macro NEWQUANT_COEF_EACH_16BITS // if coef <= 0, - coef; else , coef; +// { // input: coef, ff (dst), ff_d0, ff_d1, mf_d0, md_d1 + veor.s16 $6, $6 // init 0 , and keep 0; + vaba.s16 $1, $0, $6 // f + abs(coef - 0) + vmull.s16 $7, $2, $4 + vmull.s16 $8, $3, $5 + vshr.s32 $7, #16 + vshr.s32 $8, #16 + vmovn.s32 $2, $7 + vmovn.s32 $3, $8 + + vcgt.s16 $7, $0, #0 // if true, location of coef == 11111111 + vbif.s16 $6, $1, $7 // if (x<0) reserved part; else keep 0 untouched + vshl.s16 $6, #1 + vsub.s16 $1, $1, $6 // if x > 0, -= 0; else x-= 2x +// } +.endm + +.macro NEWQUANT_COEF_EACH_16BITS_MAX // if coef <= 0, - coef; else , coef; +// { // input: coef, ff (dst), ff_d0, ff_d1, mf_d0(max), md_d1 + veor.s16 $6, $6 // init 0 , and keep 0; + vaba.s16 $1, $0, $6 // f + abs(coef - 0) + vmull.s16 $7, $2, $4 + vmull.s16 $8, $3, $5 + vshr.s32 $7, #16 + vshr.s32 $8, #16 + vmovn.s32 $2, $7 + vmovn.s32 $3, $8 + + vcgt.s16 $7, $0, #0 // if true, location of coef == 11111111 + vbif.s16 $6, $1, $7 // if (x<0) reserved part; else keep 0 untouched + vshl.s16 $6, #1 + vmax.s16 $9, $2, $3 + vsub.s16 $1, $1, $6 // if x > 0, -= 0; else x-= 2x +// } +.endm + +.macro QUANT_DUALWORD_COEF_EACH_16BITS // if coef <= 0, - coef; else , coef; +// { // input: coef, ff (dst), mf , working_d (all 0), working_q + vaba.s16 $1, $0, $3 // f + abs(coef - 0) + vmull.s16 $4, $1, $2 // *= mf + vshr.s32 $4, #16 + vmovn.s32 $1, $4 // >> 16 + + vcgt.s16 $2, $0, #0 // if true, location of coef == 11111111 + vbif.s16 $3, $1, $2 // if (x<0) reserved part; else keep 0 untouched + vshl.s16 $3, #1 + vsub.s16 $1, $1, $3 // if x > 0, -= 0; else x-= 2x +// } +.endm + +.macro DC_ZERO_COUNT_IN_DUALWORD +// { // input: coef, dst_d, working_d (all 0x01) + vceq.s16 $1, $0, #0 + vand.s16 $1, $2 + vpadd.s16 $1, $1, $1 + vpadd.s16 $1, $1, $1 +// } +.endm + +.macro SELECT_MAX_IN_ABS_COEF +// { // input: coef_0, coef_1, max_q (identy to follow two) + vmax.s16 $2, $0, $1 // max 1st in $3 & max 2nd in $4 + vpmax.s16 $3, $3, $4 // max 1st in $3[0][1] & max 2nd in $3[2][3] + vpmax.s16 $3, $3, $4 // max 1st in $3[0][1] +// } +.endm + +.macro ZERO_COUNT_IN_2_QUARWORD +// { // input: coef_0 (identy to $3 $4), coef_1(identy to $5 $6), mask_q + vceq.s16 $0, #0 + vceq.s16 $1, #0 + vand.s16 $0, $2 + vand.s16 $1, $2 + + vpadd.s16 $3, $3, $5 + vpadd.s16 $4, $4, $6 + vpadd.s16 $3, $3, $4 // 8-->4 + vpadd.s16 $3, $3, $3 + vpadd.s16 $3, $3, $3 +// } +.endm + +.macro HDM_QUANT_2x2_TOTAL_16BITS +// { // input: src_d[0]~[3], working_d, dst_d + vshr.s64 $1, $0, #32 + vadd.s16 $2, $0, $1 // [0] = rs[0] + rs[32];[1] = rs[16] + rs[48]; + vsub.s16 $1, $0, $1 // [0] = rs[0] - rs[32];[1] = rs[16] - rs[48]; + vtrn.s16 $2, $1 + vtrn.s32 $2, $1 +// } +.endm + +.macro IHDM_4x4_TOTAL_16BITS +// { // input: each src_d[0]~[3](dst), working_q0, working_q1, working_q2 + vshr.s64 $1, $0, #32 + vadd.s16 $2, $0, $1 // [0] = rs[0] + rs[2];[1] = rs[1] + rs[3]; + vsub.s16 $1, $0, $1 // [0] = rs[0] - rs[2];[1] = rs[1] - rs[3]; + vtrn.s16 $2, $1 + vrev32.16 $1, $1 + vtrn.s32 $2, $1 // [0] = rs[0] + rs[2];[1] = rs[0] - rs[2];[2] = rs[1] - rs[3];[3] = rs[1] + rs[3]; + + vrev64.16 $1, $2 + vadd.s16 $0, 
$2, $1 // [0] = rs[0] + rs[3];[1] = rs[1] + rs[2]; + vsub.s16 $1, $2, $1 + vrev32.16 $1, $1 // [0] = rs[1] - rs[2];[1] = rs[0] - rs[3]; + vtrn.s32 $0, $1 // [0] = rs[0] + rs[3];[1] = rs[1] + rs[2];[2] = rs[1] - rs[2];[3] = rs[0] - rs[3]; +// } +.endm + +.macro MB_PRED_8BITS_ADD_DCT_16BITS_CLIP +// { // input: pred_d[0]/[1](output), dct_q0/1, working_q0/1; + vmovl.u8 $4,$0 + vmovl.u8 $5,$1 + vadd.s16 $4,$2 + vadd.s16 $5,$3 + vqmovun.s16 $0,$4 + vqmovun.s16 $1,$5 +// } +.endm + +.macro ROW_TRANSFORM_1_STEP_TOTAL_16BITS +// { // input: src_d[0]~[3], output: e_d[0]~[3]; + vadd.s16 $4, $0, $2 //int16 e[i][0] = src[0] + src[2]; + vsub.s16 $5, $0, $2 //int16 e[i][1] = src[0] - src[2]; + vshr.s16 $6, $1, #1 + vshr.s16 $7, $3, #1 + vsub.s16 $6, $6, $3 //int16 e[i][2] = (src[1]>>1)-src[3]; + vadd.s16 $7, $1, $7 //int16 e[i][3] = src[1] + (src[3]>>1); +// } +.endm + +.macro TRANSFORM_TOTAL_16BITS // both row & col transform used +// { // output: f_q[0]~[3], input: e_q[0]~[3]; + vadd.s16 $0, $4, $7 //int16 f[i][0] = e[i][0] + e[i][3]; + vadd.s16 $1, $5, $6 //int16 f[i][1] = e[i][1] + e[i][2]; + vsub.s16 $2, $5, $6 //int16 f[i][2] = e[i][1] - e[i][2]; + vsub.s16 $3, $4, $7 //int16 f[i][3] = e[i][0] - e[i][3]; +// } +.endm + + +.macro ROW_TRANSFORM_0_STEP +// { // input: src_d[0]~[3], output: e_q[0]~[3]; + vaddl.s16 $4, $0, $2 //int32 e[i][0] = src[0] + src[2]; + vsubl.s16 $5, $0, $2 //int32 e[i][1] = src[0] - src[2]; + vsubl.s16 $6, $1, $3 //int32 e[i][2] = src[1] - src[3]; + vaddl.s16 $7, $1, $3 //int32 e[i][3] = src[1] + src[3]; +// } +.endm + +.macro ROW_TRANSFORM_1_STEP +// { // input: src_d[0]~[3], output: e_q[0]~[3]; working: $8 $9 + vaddl.s16 $4, $0, $2 //int32 e[i][0] = src[0] + src[2]; + vsubl.s16 $5, $0, $2 //int32 e[i][1] = src[0] - src[2]; + vshr.s16 $8, $1, #1 + vshr.s16 $9, $3, #1 + vsubl.s16 $6, $8, $3 //int32 e[i][2] = (src[1]>>1)-src[3]; + vaddl.s16 $7, $1, $9 //int32 e[i][3] = src[1] + (src[3]>>1); +// } +.endm + +.macro TRANSFORM_4BYTES // both row & col transform used +// { // output: f_q[0]~[3], input: e_q[0]~[3]; + vadd.s32 $0, $4, $7 //int16 f[i][0] = e[i][0] + e[i][3]; + vadd.s32 $1, $5, $6 //int16 f[i][1] = e[i][1] + e[i][2]; + vsub.s32 $2, $5, $6 //int16 f[i][2] = e[i][1] - e[i][2]; + vsub.s32 $3, $4, $7 //int16 f[i][3] = e[i][0] - e[i][3]; +// } +.endm + +.macro COL_TRANSFORM_0_STEP +// { // input: src_q[0]~[3], output: e_q[0]~[3]; + vadd.s32 $4, $0, $2 //int32 e[0][j] = f[0][j] + f[2][j]; + vsub.s32 $5, $0, $2 //int32 e[1][j] = f[0][j] - f[2][j]; + vsub.s32 $6, $1, $3 //int32 e[2][j] = (f[1][j]>>1) - f[3][j]; + vadd.s32 $7, $1, $3 //int32 e[3][j] = f[1][j] + (f[3][j]>>1); +// } +.endm + +.macro COL_TRANSFORM_1_STEP +// { // input: src_q[0]~[3], output: e_q[0]~[3]; + vadd.s32 $4, $0, $2 //int32 e[0][j] = f[0][j] + f[2][j]; + vsub.s32 $5, $0, $2 //int32 e[1][j] = f[0][j] - f[2][j]; + vshr.s32 $6, $1, #1 + vshr.s32 $7, $3, #1 + vsub.s32 $6, $6, $3 //int32 e[2][j] = (f[1][j]>>1) - f[3][j]; + vadd.s32 $7, $1, $7 //int32 e[3][j] = f[1][j] + (f[3][j]>>1); +// } +.endm +#else +.macro LORD_ALIGNED_DATA_WITH_STRIDE arg0, arg1, arg2, arg3, arg4, arg5 +// { // input: \arg0~\arg3, src*, src_stride + vld1.64 {\arg0}, [\arg4,:128], \arg5 + vld1.64 {\arg1}, [\arg4,:128], \arg5 + vld1.64 {\arg2}, [\arg4,:128], \arg5 + vld1.64 {\arg3}, [\arg4,:128], \arg5 +// } +.endm + +.macro STORE_ALIGNED_DATA_WITH_STRIDE arg0, arg1, arg2, arg3, arg4, arg5 +// { // input: \arg0~\arg3, dst*, dst_stride + vst1.64 {\arg0}, [\arg4,:128], \arg5 + vst1.64 {\arg1}, [\arg4,:128], \arg5 + vst1.64 {\arg2}, 
[\arg4,:128], \arg5 + vst1.64 {\arg3}, [\arg4,:128], \arg5 +// } +.endm + +.macro LORD_UNALIGNED_DATA_WITH_STRIDE arg0, arg1, arg2, arg3, arg4, arg5 +// { // input: \arg0~\arg3, src*, src_stride + vld1.64 {\arg0}, [\arg4], \arg5 + vld1.64 {\arg1}, [\arg4], \arg5 + vld1.64 {\arg2}, [\arg4], \arg5 + vld1.64 {\arg3}, [\arg4], \arg5 +// } +.endm + +.macro STORE_UNALIGNED_DATA_WITH_STRIDE arg0, arg1, arg2, arg3, arg4, arg5 +// { // input: \arg0~\arg3, dst*, dst_stride + vst1.64 {\arg0}, [\arg4], \arg5 + vst1.64 {\arg1}, [\arg4], \arg5 + vst1.64 {\arg2}, [\arg4], \arg5 + vst1.64 {\arg3}, [\arg4], \arg5 +// } +.endm + +.macro LOAD_4x4_DATA_FOR_DCT arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7 +// { // input: \arg0~\arg3, src1*, src1_stride, src2*, src2_stride + vld2.16 {\arg0[0],\arg1[0]}, [\arg4], \arg5 + vld2.16 {\arg2[0],\arg3[0]}, [\arg6], \arg7 + vld2.16 {\arg0[1],\arg1[1]}, [\arg4], \arg5 + vld2.16 {\arg2[1],\arg3[1]}, [\arg6], \arg7 + + vld2.16 {\arg0[2],\arg1[2]}, [\arg4], \arg5 + vld2.16 {\arg2[2],\arg3[2]}, [\arg6], \arg7 + vld2.16 {\arg0[3],\arg1[3]}, [\arg4], \arg5 + vld2.16 {\arg2[3],\arg3[3]}, [\arg6], \arg7 +// } +.endm + +.macro LOAD_8x8_DATA_FOR_DCT arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9 +// { // input: \arg0~\arg3, src1*, src2*; untouched r2:src1_stride &r4:src2_stride + vld1.64 {\arg0}, [\arg8], r2 + vld1.64 {\arg4}, [\arg9], r4 + vld1.64 {\arg1}, [\arg8], r2 + vld1.64 {\arg5}, [\arg9], r4 + + vld1.64 {\arg2}, [\arg8], r2 + vld1.64 {\arg6}, [\arg9], r4 + vld1.64 {\arg3}, [\arg8], r2 + vld1.64 {\arg7}, [\arg9], r4 +// } +.endm + +.macro DCT_ROW_TRANSFORM_TOTAL_16BITS arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7 +// { // input: src_d[0]~[3], working: [4]~[7] + vadd.s16 \arg4, \arg0, \arg3 //int16 s[0] = data[i] + data[i3]; + vsub.s16 \arg7, \arg0, \arg3 //int16 s[3] = data[i] - data[i3]; + vadd.s16 \arg5, \arg1, \arg2 //int16 s[1] = data[i1] + data[i2]; + vsub.s16 \arg6, \arg1, \arg2 //int16 s[2] = data[i1] - data[i2]; + + vadd.s16 \arg0, \arg4, \arg5 //int16 dct[i ] = s[0] + s[1]; + vsub.s16 \arg2, \arg4, \arg5 //int16 dct[i2] = s[0] - s[1]; + vshl.s16 \arg1, \arg7, #1 + vshl.s16 \arg3, \arg6, #1 + vadd.s16 \arg1, \arg1, \arg6 //int16 dct[i1] = (s[3] << 1) + s[2]; + vsub.s16 \arg3, \arg7, \arg3 //int16 dct[i3] = s[3] - (s[2] << 1); +// } +.endm + +.macro MATRIX_TRANSFORM_EACH_16BITS arg0, arg1, arg2, arg3 +// { // input & output: src_d[0]~[3];[0 1 2 3]+[4 5 6 7]+[8 9 10 11]+[12 13 14 15] + vtrn.s16 \arg0, \arg1 //[0 1 2 3]+[4 5 6 7]-->[0 4 2 6]+[1 5 3 7] + vtrn.s16 \arg2, \arg3 //[8 9 10 11]+[12 13 14 15]-->[8 12 10 14]+[9 13 11 15] + vtrn.32 \arg0, \arg2 //[0 4 2 6]+[8 12 10 14]-->[0 4 8 12]+[2 6 10 14] + vtrn.32 \arg1, \arg3 //[1 5 3 7]+[9 13 11 15]-->[1 5 9 13]+[3 7 11 15] +// } +.endm + +.macro NEWQUANT_COEF_EACH_16BITS arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8 +// { // input: coef, ff (dst), ff_d0, ff_d1, mf_d0, md_d1 + veor.s16 \arg6, \arg6 // init 0 , and keep 0; + vaba.s16 \arg1, \arg0, \arg6 // f + abs(coef - 0) + vmull.s16 \arg7, \arg2, \arg4 + vmull.s16 \arg8, \arg3, \arg5 + vshr.s32 \arg7, #16 + vshr.s32 \arg8, #16 + vmovn.s32 \arg2, \arg7 + vmovn.s32 \arg3, \arg8 + + vcgt.s16 \arg7, \arg0, #0 // if true, location of coef == 11111111 + vbif.s16 \arg6, \arg1, \arg7 // if (x<0) reserved part; else keep 0 untouched + vshl.s16 \arg6, #1 + vsub.s16 \arg1, \arg1, \arg6 // if x > 0, -= 0; else x-= 2x +// } +.endm + +.macro NEWQUANT_COEF_EACH_16BITS_MAX arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9 +// { // input: coef, ff 
(dst), ff_d0, ff_d1, mf_d0(max), md_d1 + veor.s16 \arg6, \arg6 // init 0 , and keep 0; + vaba.s16 \arg1, \arg0, \arg6 // f + abs(coef - 0) + vmull.s16 \arg7, \arg2, \arg4 + vmull.s16 \arg8, \arg3, \arg5 + vshr.s32 \arg7, #16 + vshr.s32 \arg8, #16 + vmovn.s32 \arg2, \arg7 + vmovn.s32 \arg3, \arg8 + + vcgt.s16 \arg7, \arg0, #0 // if true, location of coef == 11111111 + vbif.s16 \arg6, \arg1, \arg7 // if (x<0) reserved part; else keep 0 untouched + vshl.s16 \arg6, #1 + vmax.s16 \arg9, \arg2, \arg3 + vsub.s16 \arg1, \arg1, \arg6 // if x > 0, -= 0; else x-= 2x +// } +.endm + +.macro QUANT_DUALWORD_COEF_EACH_16BITS arg0, arg1, arg2, arg3, arg4 +// { // input: coef, ff (dst), mf , working_d (all 0), working_q + vaba.s16 \arg1, \arg0, \arg3 // f + abs(coef - 0) + vmull.s16 \arg4, \arg1, \arg2 // *= mf + vshr.s32 \arg4, #16 + vmovn.s32 \arg1, \arg4 // >> 16 + + vcgt.s16 \arg2, \arg0, #0 // if true, location of coef == 11111111 + vbif.s16 \arg3, \arg1, \arg2 // if (x<0) reserved part; else keep 0 untouched + vshl.s16 \arg3, #1 + vsub.s16 \arg1, \arg1, \arg3 // if x > 0, -= 0; else x-= 2x +// } +.endm + +.macro DC_ZERO_COUNT_IN_DUALWORD arg0, arg1, arg2 +// { // input: coef, dst_d, working_d (all 0x01) + vceq.s16 \arg1, \arg0, #0 + vand.s16 \arg1, \arg2 + vpadd.s16 \arg1, \arg1, \arg1 + vpadd.s16 \arg1, \arg1, \arg1 +// } +.endm + +.macro SELECT_MAX_IN_ABS_COEF arg0, arg1, arg2, arg3, arg4 +// { // input: coef_0, coef_1, max_q (identy to follow two), output: max_d0, max_d1 + vmax.s16 \arg2, \arg0, \arg1 // max 1st in \arg3 & max 2nd in \arg4 + vpmax.s16 \arg3, \arg3, \arg4 // max 1st in \arg3[0][1] & max 2nd in \arg3[2][3] + vpmax.s16 \arg3, \arg3, \arg4 // max 1st in \arg3[0][1] +// } +.endm + +.macro ZERO_COUNT_IN_2_QUARWORD arg0, arg1, arg2, arg3, arg4, arg5, arg6 +// { // input: coef_0 (identy to \arg3 \arg4), coef_1(identy to \arg5 \arg6), mask_q + vceq.s16 \arg0, #0 + vceq.s16 \arg1, #0 + vand.s16 \arg0, \arg2 + vand.s16 \arg1, \arg2 + + vpadd.s16 \arg3, \arg3, \arg5 + vpadd.s16 \arg4, \arg4, \arg6 + vpadd.s16 \arg3, \arg3, \arg4 // 8-->4 + vpadd.s16 \arg3, \arg3, \arg3 + vpadd.s16 \arg3, \arg3, \arg3 +// } +.endm + +.macro HDM_QUANT_2x2_TOTAL_16BITS arg0, arg1, arg2 +// { // input: src_d[0]~[3], working_d, dst_d + vshr.s64 \arg1, \arg0, #32 + vadd.s16 \arg2, \arg0, \arg1 // [0] = rs[0] + rs[32];[1] = rs[16] + rs[48]; + vsub.s16 \arg1, \arg0, \arg1 // [0] = rs[0] - rs[32];[1] = rs[16] - rs[48]; + vtrn.s16 \arg2, \arg1 + vtrn.s32 \arg2, \arg1 +// } +.endm + +.macro IHDM_4x4_TOTAL_16BITS arg0, arg1, arg2 +// { // input: each src_d[0]~[3](dst), working_q0, working_q1, working_q2 + vshr.s64 \arg1, \arg0, #32 + vadd.s16 \arg2, \arg0, \arg1 // [0] = rs[0] + rs[2];[1] = rs[1] + rs[3]; + vsub.s16 \arg1, \arg0, \arg1 // [0] = rs[0] - rs[2];[1] = rs[1] - rs[3]; + vtrn.s16 \arg2, \arg1 + vrev32.16 \arg1, \arg1 + vtrn.s32 \arg2, \arg1 // [0] = rs[0] + rs[2];[1] = rs[0] - rs[2];[2] = rs[1] - rs[3];[3] = rs[1] + rs[3]; + + vrev64.16 \arg1, \arg2 + vadd.s16 \arg0, \arg2, \arg1 // [0] = rs[0] + rs[3];[1] = rs[1] + rs[2]; + vsub.s16 \arg1, \arg2, \arg1 + vrev32.16 \arg1, \arg1 // [0] = rs[1] - rs[2];[1] = rs[0] - rs[3]; + vtrn.s32 \arg0, \arg1 // [0] = rs[0] + rs[3];[1] = rs[1] + rs[2];[2] = rs[1] - rs[2];[3] = rs[0] - rs[3]; +// } +.endm + +.macro MB_PRED_8BITS_ADD_DCT_16BITS_CLIP arg0, arg1, arg2, arg3, arg4, arg5 +// { // input: pred_d[0]/[1](output), dct_q0/1, working_q0/1; + vmovl.u8 \arg4,\arg0 + vmovl.u8 \arg5,\arg1 + vadd.s16 \arg4,\arg2 + vadd.s16 \arg5,\arg3 + vqmovun.s16 \arg0,\arg4 + vqmovun.s16 
\arg1,\arg5 +// } +.endm + +.macro ROW_TRANSFORM_1_STEP_TOTAL_16BITS arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7 +// { // input: src_d[0]~[3], output: e_d[0]~[3]; + vadd.s16 \arg4, \arg0, \arg2 //int16 e[i][0] = src[0] + src[2]; + vsub.s16 \arg5, \arg0, \arg2 //int16 e[i][1] = src[0] - src[2]; + vshr.s16 \arg6, \arg1, #1 + vshr.s16 \arg7, \arg3, #1 + vsub.s16 \arg6, \arg6, \arg3 //int16 e[i][2] = (src[1]>>1)-src[3]; + vadd.s16 \arg7, \arg1, \arg7 //int16 e[i][3] = src[1] + (src[3]>>1); +// } +.endm + +.macro TRANSFORM_TOTAL_16BITS arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7 // both row & col transform used +// { // output: f_q[0]~[3], input: e_q[0]~[3]; + vadd.s16 \arg0, \arg4, \arg7 //int16 f[i][0] = e[i][0] + e[i][3]; + vadd.s16 \arg1, \arg5, \arg6 //int16 f[i][1] = e[i][1] + e[i][2]; + vsub.s16 \arg2, \arg5, \arg6 //int16 f[i][2] = e[i][1] - e[i][2]; + vsub.s16 \arg3, \arg4, \arg7 //int16 f[i][3] = e[i][0] - e[i][3]; +// } +.endm + + +.macro ROW_TRANSFORM_0_STEP arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7 +// { // input: src_d[0]~[3], output: e_q[0]~[3]; + vaddl.s16 \arg4, \arg0, \arg2 //int32 e[i][0] = src[0] + src[2]; + vsubl.s16 \arg5, \arg0, \arg2 //int32 e[i][1] = src[0] - src[2]; + vsubl.s16 \arg6, \arg1, \arg3 //int32 e[i][2] = src[1] - src[3]; + vaddl.s16 \arg7, \arg1, \arg3 //int32 e[i][3] = src[1] + src[3]; +// } +.endm + +.macro ROW_TRANSFORM_1_STEP arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9 +// { // input: src_d[0]~[3], output: e_q[0]~[3]; working: \arg8 \arg9 + vaddl.s16 \arg4, \arg0, \arg2 //int32 e[i][0] = src[0] + src[2]; + vsubl.s16 \arg5, \arg0, \arg2 //int32 e[i][1] = src[0] - src[2]; + vshr.s16 \arg8, \arg1, #1 + vshr.s16 \arg9, \arg3, #1 + vsubl.s16 \arg6, \arg8, \arg3 //int32 e[i][2] = (src[1]>>1)-src[3]; + vaddl.s16 \arg7, \arg1, \arg9 //int32 e[i][3] = src[1] + (src[3]>>1); +// } +.endm + +.macro TRANSFORM_4BYTES arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7 // both row & col transform used +// { // output: f_q[0]~[3], input: e_q[0]~[3]; + vadd.s32 \arg0, \arg4, \arg7 //int16 f[i][0] = e[i][0] + e[i][3]; + vadd.s32 \arg1, \arg5, \arg6 //int16 f[i][1] = e[i][1] + e[i][2]; + vsub.s32 \arg2, \arg5, \arg6 //int16 f[i][2] = e[i][1] - e[i][2]; + vsub.s32 \arg3, \arg4, \arg7 //int16 f[i][3] = e[i][0] - e[i][3]; +// } +.endm + +.macro COL_TRANSFORM_0_STEP arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7 +// { // input: src_q[0]~[3], output: e_q[0]~[3]; + vadd.s32 \arg4, \arg0, \arg2 //int32 e[0][j] = f[0][j] + f[2][j]; + vsub.s32 \arg5, \arg0, \arg2 //int32 e[1][j] = f[0][j] - f[2][j]; + vsub.s32 \arg6, \arg1, \arg3 //int32 e[2][j] = (f[1][j]>>1) - f[3][j]; + vadd.s32 \arg7, \arg1, \arg3 //int32 e[3][j] = f[1][j] + (f[3][j]>>1); +// } +.endm + +.macro COL_TRANSFORM_1_STEP arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7 +// { // input: src_q[0]~[3], output: e_q[0]~[3]; + vadd.s32 \arg4, \arg0, \arg2 //int32 e[0][j] = f[0][j] + f[2][j]; + vsub.s32 \arg5, \arg0, \arg2 //int32 e[1][j] = f[0][j] - f[2][j]; + vshr.s32 \arg6, \arg1, #1 + vshr.s32 \arg7, \arg3, #1 + vsub.s32 \arg6, \arg6, \arg3 //int32 e[2][j] = (f[1][j]>>1) - f[3][j]; + vadd.s32 \arg7, \arg1, \arg7 //int32 e[3][j] = f[1][j] + (f[3][j]>>1); +// } +.endm +#endif + + +WELS_ASM_FUNC_BEGIN WelsCopy8x8_neon + + LORD_UNALIGNED_DATA_WITH_STRIDE d0, d1, d2, d3, r2, r3 + + STORE_UNALIGNED_DATA_WITH_STRIDE d0, d1, d2, d3, r0, r1 + + LORD_UNALIGNED_DATA_WITH_STRIDE d4, d5, d6, d7, r2, r3 + + STORE_UNALIGNED_DATA_WITH_STRIDE d4, d5, d6, d7, r0, r1 + +WELS_ASM_FUNC_END + + +WELS_ASM_FUNC_BEGIN 
WelsCopy16x16_neon + + LORD_ALIGNED_DATA_WITH_STRIDE q0, q1, q2, q3, r2, r3 + + STORE_ALIGNED_DATA_WITH_STRIDE q0, q1, q2, q3, r0, r1 + + LORD_ALIGNED_DATA_WITH_STRIDE q4, q5, q6, q7, r2, r3 + + STORE_ALIGNED_DATA_WITH_STRIDE q4, q5, q6, q7, r0, r1 + + LORD_ALIGNED_DATA_WITH_STRIDE q0, q1, q2, q3, r2, r3 + + STORE_ALIGNED_DATA_WITH_STRIDE q0, q1, q2, q3, r0, r1 + + LORD_ALIGNED_DATA_WITH_STRIDE q4, q5, q6, q7, r2, r3 + + STORE_ALIGNED_DATA_WITH_STRIDE q4, q5, q6, q7, r0, r1 + +WELS_ASM_FUNC_END + + +WELS_ASM_FUNC_BEGIN WelsCopy16x16NotAligned_neon + + LORD_UNALIGNED_DATA_WITH_STRIDE q0, q1, q2, q3, r2, r3 + + STORE_UNALIGNED_DATA_WITH_STRIDE q0, q1, q2, q3, r0, r1 + + LORD_UNALIGNED_DATA_WITH_STRIDE q4, q5, q6, q7, r2, r3 + + STORE_UNALIGNED_DATA_WITH_STRIDE q4, q5, q6, q7, r0, r1 + + LORD_UNALIGNED_DATA_WITH_STRIDE q0, q1, q2, q3, r2, r3 + + STORE_UNALIGNED_DATA_WITH_STRIDE q0, q1, q2, q3, r0, r1 + + LORD_UNALIGNED_DATA_WITH_STRIDE q4, q5, q6, q7, r2, r3 + + STORE_UNALIGNED_DATA_WITH_STRIDE q4, q5, q6, q7, r0, r1 + +WELS_ASM_FUNC_END + + +WELS_ASM_FUNC_BEGIN WelsCopy16x8NotAligned_neon + + LORD_UNALIGNED_DATA_WITH_STRIDE q0, q1, q2, q3, r2, r3 + + STORE_UNALIGNED_DATA_WITH_STRIDE q0, q1, q2, q3, r0, r1 + + LORD_UNALIGNED_DATA_WITH_STRIDE q4, q5, q6, q7, r2, r3 + + STORE_UNALIGNED_DATA_WITH_STRIDE q4, q5, q6, q7, r0, r1 + +WELS_ASM_FUNC_END + + +WELS_ASM_FUNC_BEGIN WelsCopy8x16_neon + + LORD_UNALIGNED_DATA_WITH_STRIDE d0, d1, d2, d3, r2, r3 + + STORE_UNALIGNED_DATA_WITH_STRIDE d0, d1, d2, d3, r0, r1 + + LORD_UNALIGNED_DATA_WITH_STRIDE d4, d5, d6, d7, r2, r3 + + STORE_UNALIGNED_DATA_WITH_STRIDE d4, d5, d6, d7, r0, r1 + + LORD_UNALIGNED_DATA_WITH_STRIDE d0, d1, d2, d3, r2, r3 + + STORE_UNALIGNED_DATA_WITH_STRIDE d0, d1, d2, d3, r0, r1 + + LORD_UNALIGNED_DATA_WITH_STRIDE d4, d5, d6, d7, r2, r3 + + STORE_UNALIGNED_DATA_WITH_STRIDE d4, d5, d6, d7, r0, r1 + +WELS_ASM_FUNC_END + + + +WELS_ASM_FUNC_BEGIN WelsDctT4_neon + push {r4} + ldr r4, [sp, #4] + + LOAD_4x4_DATA_FOR_DCT d4, d5, d6, d7, r1, r2, r3, r4 + + vsubl.u8 q0, d4, d6 + vsubl.u8 q1, d5, d7 + vtrn.s32 q0, q1 + vswp d1, d2 + + // horizontal transform + DCT_ROW_TRANSFORM_TOTAL_16BITS d0, d1, d2, d3, d4, d5, d6, d7 + + // transform element + MATRIX_TRANSFORM_EACH_16BITS d0, d1, d2, d3 + + // vertical transform + DCT_ROW_TRANSFORM_TOTAL_16BITS d0, d1, d2, d3, d4, d5, d6, d7 + + // transform element + MATRIX_TRANSFORM_EACH_16BITS d0, d1, d2, d3 + + vst1.s16 {q0, q1}, [r0]! + + pop {r4} +WELS_ASM_FUNC_END + + +WELS_ASM_FUNC_BEGIN WelsDctFourT4_neon + push {r4} + ldr r4, [sp, #4] + + LOAD_8x8_DATA_FOR_DCT d8, d9, d10, d11, d12, d13, d14, d15, r1, r3 + + vsubl.u8 q0, d8, d12 + vsubl.u8 q1, d9, d13 + vsubl.u8 q2, d10, d14 + vsubl.u8 q3, d11, d15 + MATRIX_TRANSFORM_EACH_16BITS q0, q1, q2, q3 + + // horizontal transform + DCT_ROW_TRANSFORM_TOTAL_16BITS q0, q1, q2, q3, q4, q5, q6, q7 + + // transform element + MATRIX_TRANSFORM_EACH_16BITS q0, q1, q2, q3 + + // vertical transform + DCT_ROW_TRANSFORM_TOTAL_16BITS q0, q1, q2, q3, q4, q5, q6, q7 + + vswp d1, d2 + vswp d5, d6 + vswp q1, q2 + vst1.s16 {q0, q1}, [r0]! + vst1.s16 {q2, q3}, [r0]! 
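
WelsDctT4_neon above drives that butterfly on the difference of two pixel blocks: load a 4x4 tile from each source, subtract, transform rows, transpose, transform columns, store 16 coefficients. A scalar model, assuming the (pDct, pPixel1, iStride1, pPixel2, iStride2) argument order implied by the loads, and reusing the DctPass4 helper sketched earlier:

    #include <stdint.h>

    /* Scalar sketch of what WelsDctT4_neon computes: a 4x4 residual
     * followed by a 2-D forward transform. DctPass4 is the 4-point
     * butterfly shown after DCT_ROW_TRANSFORM_TOTAL_16BITS. */
    static void DctT4_c (int16_t* pDct, const uint8_t* pPix1, int32_t iStride1,
                         const uint8_t* pPix2, int32_t iStride2) {
      int16_t d[4][4];
      for (int i = 0; i < 4; i++)              /* residual between the blocks */
        for (int j = 0; j < 4; j++)
          d[i][j] = (int16_t) (pPix1[i * iStride1 + j] - pPix2[i * iStride2 + j]);
      for (int i = 0; i < 4; i++)              /* horizontal transform */
        DctPass4 (d[i]);
      for (int j = 0; j < 4; j++) {            /* vertical transform */
        int16_t c[4] = { d[0][j], d[1][j], d[2][j], d[3][j] };
        DctPass4 (c);
        for (int i = 0; i < 4; i++) d[i][j] = c[i];
      }
      for (int i = 0; i < 4; i++)
        for (int j = 0; j < 4; j++)
          *pDct++ = d[i][j];
    }

WelsDctFourT4_neon runs the same passes at q-register width, covering two 4x4 blocks per pass; the //////////////// section that follows handles the remaining half of the 8x8 area.
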
+ + //////////////// + LOAD_8x8_DATA_FOR_DCT d8, d9, d10, d11, d12, d13, d14, d15, r1, r3 + + vsubl.u8 q0, d8, d12 + vsubl.u8 q1, d9, d13 + vsubl.u8 q2, d10, d14 + vsubl.u8 q3, d11, d15 + MATRIX_TRANSFORM_EACH_16BITS q0, q1, q2, q3 + + // horizontal transform + DCT_ROW_TRANSFORM_TOTAL_16BITS q0, q1, q2, q3, q4, q5, q6, q7 + + // transform element + MATRIX_TRANSFORM_EACH_16BITS q0, q1, q2, q3 + + // vertical transform + DCT_ROW_TRANSFORM_TOTAL_16BITS q0, q1, q2, q3, q4, q5, q6, q7 + + vswp d1, d2 + vswp d5, d6 + vswp q1, q2 + vst1.s16 {q0, q1}, [r0]! + vst1.s16 {q2, q3}, [r0]! + + pop {r4} +WELS_ASM_FUNC_END + + +WELS_ASM_FUNC_BEGIN WelsQuant4x4_neon + vld1.s16 {q2}, [r1] + vld1.s16 {q0, q1}, [r0] + vld1.s16 {q3}, [r2] + + vmov q4, q2 + + NEWQUANT_COEF_EACH_16BITS q0, q2, d4, d5, d6, d7, q5, q6, q7 + vst1.s16 {q2}, [r0]! + + NEWQUANT_COEF_EACH_16BITS q1, q4, d8, d9, d6, d7, q5, q6, q7 + vst1.s16 {q4}, [r0]! + +WELS_ASM_FUNC_END + + +WELS_ASM_FUNC_BEGIN WelsQuant4x4Dc_neon + + vld1.s16 {q0, q1}, [r0] + vdup.s16 q2, r1 // even ff range [0, 768] + vdup.s16 q3, r2 + + vmov q4, q2 + + NEWQUANT_COEF_EACH_16BITS q0, q2, d4, d5, d6, d7, q5, q6, q7 + vst1.s16 {q2}, [r0]! + + NEWQUANT_COEF_EACH_16BITS q1, q4, d8, d9, d6, d7, q5, q6, q7 + vst1.s16 {q4}, [r0]! + +WELS_ASM_FUNC_END + + +WELS_ASM_FUNC_BEGIN WelsQuantFour4x4_neon + vld1.s16 {q2}, [r1] + vld1.s16 {q3}, [r2] + mov r1, r0 + + vld1.s16 {q0, q1}, [r0]! + vmov q4, q2 + NEWQUANT_COEF_EACH_16BITS q0, q4, d8, d9, d6, d7, q5, q6, q7 + vst1.s16 {q4}, [r1]! + vmov q4, q2 + NEWQUANT_COEF_EACH_16BITS q1, q4, d8, d9, d6, d7, q5, q6, q7 + vst1.s16 {q4}, [r1]! + + vld1.s16 {q0, q1}, [r0]! + vmov q4, q2 + NEWQUANT_COEF_EACH_16BITS q0, q4, d8, d9, d6, d7, q5, q6, q7 + vst1.s16 {q4}, [r1]! + vmov q4, q2 + NEWQUANT_COEF_EACH_16BITS q1, q4, d8, d9, d6, d7, q5, q6, q7 + vst1.s16 {q4}, [r1]! + + vld1.s16 {q0, q1}, [r0]! + vmov q4, q2 + NEWQUANT_COEF_EACH_16BITS q0, q4, d8, d9, d6, d7, q5, q6, q7 + vst1.s16 {q4}, [r1]! + vmov q4, q2 + NEWQUANT_COEF_EACH_16BITS q1, q4, d8, d9, d6, d7, q5, q6, q7 + vst1.s16 {q4}, [r1]! + + vld1.s16 {q0, q1}, [r0]! + vmov q4, q2 + NEWQUANT_COEF_EACH_16BITS q0, q4, d8, d9, d6, d7, q5, q6, q7 + vst1.s16 {q4}, [r1]! + vmov q4, q2 + NEWQUANT_COEF_EACH_16BITS q1, q4, d8, d9, d6, d7, q5, q6, q7 + vst1.s16 {q4}, [r1]! + +WELS_ASM_FUNC_END + + +WELS_ASM_FUNC_BEGIN WelsQuantFour4x4Max_neon + vld1.s16 {q2}, [r1] + vld1.s16 {q3}, [r2] + mov r1, r0 + + vld1.s16 {q0, q1}, [r0]! + vmov q4, q2 + NEWQUANT_COEF_EACH_16BITS_MAX q0, q4, d8, d9, d6, d7, q5, q6, q7, d18 + vst1.s16 {q4}, [r1]! + vmov q8, q2 + NEWQUANT_COEF_EACH_16BITS_MAX q1, q8, d16, d17, d6, d7, q5, q6, q7, d20 + vst1.s16 {q8}, [r1]! // then 1st 16 elem in d18 & d20 + + vld1.s16 {q0, q1}, [r0]! + vmov q4, q2 + NEWQUANT_COEF_EACH_16BITS_MAX q0, q4, d8, d9, d6, d7, q5, q6, q7, d19 + vst1.s16 {q4}, [r1]! + vmov q8, q2 + NEWQUANT_COEF_EACH_16BITS_MAX q1, q8, d16, d17, d6, d7, q5, q6, q7, d21 + vst1.s16 {q8}, [r1]! // then 2nd 16 elem in d19 & d21 + + SELECT_MAX_IN_ABS_COEF q9, q10, q0, d0, d1 + vst1.s32 {d0[0]}, [r3]! + + /////////// + vld1.s16 {q0, q1}, [r0]! + vmov q4, q2 + NEWQUANT_COEF_EACH_16BITS_MAX q0, q4, d8, d9, d6, d7, q5, q6, q7, d18 + vst1.s16 {q4}, [r1]! + vmov q8, q2 + NEWQUANT_COEF_EACH_16BITS_MAX q1, q8, d16, d17, d6, d7, q5, q6, q7, d20 + vst1.s16 {q8}, [r1]! // then 3rd 16 elem in d18 & d20 + + vld1.s16 {q0, q1}, [r0]! + vmov q4, q2 + NEWQUANT_COEF_EACH_16BITS_MAX q0, q4, d8, d9, d6, d7, q5, q6, q7, d19 + vst1.s16 {q4}, [r1]! 
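
Every quantizer above reduces to one per-coefficient rule, spelled out in NEWQUANT_COEF_EACH_16BITS: add the rounding offset to the magnitude, multiply, shift, then restore the sign (the vbif/vshl/vsub tail is the branch-free form of the trailing comment "if x > 0, -= 0; else x -= 2x"). A scalar sketch, assuming ff holds the per-position rounding offset and mf the multiplier:

    #include <stdint.h>

    /* One coefficient of WelsQuant4x4 and friends, as encoded by
     * NEWQUANT_COEF_EACH_16BITS: level = ((ff + |coef|) * mf) >> 16,
     * negated when coef is not positive. QuantOne is a hypothetical name. */
    static int16_t QuantOne (int16_t coef, int16_t ff, int16_t mf) {
      int32_t a = (coef < 0) ? -coef : coef;   /* vaba: ff + abs(coef - 0) */
      int16_t level = (int16_t) (((ff + a) * mf) >> 16);
      return (coef > 0) ? level : (int16_t) -level;
    }

The _MAX variant interleaves a running vmax of the quantized magnitudes, which is how WelsQuantFour4x4Max collects the per-block maximum it stores through r3.
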
+ vmov q8, q2 + NEWQUANT_COEF_EACH_16BITS_MAX q1, q8, d16, d17, d6, d7, q5, q6, q7, d21 + vst1.s16 {q8}, [r1]! // then 4th 16 elem in d19 & d21 + + SELECT_MAX_IN_ABS_COEF q9, q10, q0, d0, d1 + vst1.s32 {d0[0]}, [r3]! + +WELS_ASM_FUNC_END + + +WELS_ASM_FUNC_BEGIN WelsHadamardT4Dc_neon + push {r2,r3} + mov r2, #64 // 2*16*sizeof(int16_t) + add r3, r1, #32 + + vld1.s16 {d0}, [r1], r2 + vld1.s16 {d1}, [r3], r2 + vld1.s16 {d4}, [r1], r2 + vld1.s16 {d5}, [r3], r2 + vld1.s16 {d2}, [r1], r2 + vld1.s16 {d3}, [r3], r2 + vld1.s16 {d6}, [r1], r2 + vld1.s16 {d7}, [r3], r2 + vtrn.16 q0, q2 // d0[0 4], d1[1 5] + vtrn.16 q1, q3 // d2[2 6], d3[3 7] + + vld1.s16 {d8}, [r1], r2 + vld1.s16 {d9}, [r3], r2 + vld1.s16 {d12}, [r1], r2 + vld1.s16 {d13}, [r3], r2 + vld1.s16 {d10}, [r1], r2 + vld1.s16 {d11}, [r3], r2 + vld1.s16 {d14}, [r1], r2 + vld1.s16 {d15}, [r3], r2 + vtrn.16 q4, q6 // d8[08 12], d9[09 13] + vtrn.16 q5, q7 //d10[10 14],d11[11 15] + + vtrn.32 q0, q4 // d0 [0 4 08 12] = dct[idx], d1[1 5 09 13] = dct[idx+16] + vtrn.32 q1, q5 // d2 [2 6 10 14] = dct[idx+64], d3[3 7 11 15] = dct[idx+80] + + ROW_TRANSFORM_0_STEP d0, d1, d3, d2, q4, q7, q6, q5 + + TRANSFORM_4BYTES q0, q1, q3, q2, q4, q7, q6, q5 + + // transform element 32bits + vtrn.s32 q0, q1 //[0 1 2 3]+[4 5 6 7]-->[0 4 2 6]+[1 5 3 7] + vtrn.s32 q2, q3 //[8 9 10 11]+[12 13 14 15]-->[8 12 10 14]+[9 13 11 15] + vswp d1, d4 //[0 4 2 6]+[8 12 10 14]-->[0 4 8 12]+[2 6 10 14] + vswp d3, d6 //[1 5 3 7]+[9 13 11 15]-->[1 5 9 13]+[3 7 11 15] + + COL_TRANSFORM_0_STEP q0, q1, q3, q2, q4, q7, q6, q5 + + TRANSFORM_4BYTES q0, q1, q3, q2, q4, q7, q6, q5 + + vrshrn.s32 d8, q0, #1 + vrshrn.s32 d9, q1, #1 + vrshrn.s32 d10, q2, #1 + vrshrn.s32 d11, q3, #1 + vst1.16 {q4, q5}, [r0] //store + + pop {r2,r3} +WELS_ASM_FUNC_END + + +WELS_ASM_FUNC_BEGIN WelsHadamardQuant2x2_neon + + vdup.s16 d1, r1 //ff + vdup.s16 d2, r2 //mf + veor d3, d3 + + mov r1, #32 + mov r2, r0 + + vld1.s16 {d0[0]}, [r0], r1 //rs[00] + vst1.s16 {d3[0]}, [r2], r1 //rs[00]=0 + vld1.s16 {d0[1]}, [r0], r1 //rs[16] + vst1.s16 {d3[0]}, [r2], r1 //rs[16]=0 + vld1.s16 {d0[2]}, [r0], r1 //rs[32] + vst1.s16 {d3[0]}, [r2], r1 //rs[32]=0 + vld1.s16 {d0[3]}, [r0], r1 //rs[48] + vst1.s16 {d3[0]}, [r2], r1 //rs[48]=0 + + HDM_QUANT_2x2_TOTAL_16BITS d0, d4, d5 // output d5 + + HDM_QUANT_2x2_TOTAL_16BITS d5, d4, d0 // output d0 + + QUANT_DUALWORD_COEF_EACH_16BITS d0, d1, d2, d3, q2 + + vst1.s16 d1, [r3] // store to dct + ldr r2, [sp, #0] + vst1.s16 d1, [r2] // store to block + + mov r1, #1 + vdup.s16 d3, r1 + DC_ZERO_COUNT_IN_DUALWORD d1, d0, d3 + + vmov r0, r1, d0 + and r0, #0x07 // range [0~4] + rsb r0, #4 +WELS_ASM_FUNC_END + + +WELS_ASM_FUNC_BEGIN WelsHadamardQuant2x2SkipKernel_neon + + vdup.s16 d3, r1 + mov r1, #32 + vld1.s16 {d0[0]}, [r0], r1 //rs[00] + vld1.s16 {d0[1]}, [r0], r1 //rs[16] + vld1.s16 {d0[2]}, [r0], r1 //rs[32] + vld1.s16 {d0[3]}, [r0], r1 //rs[48] + + HDM_QUANT_2x2_TOTAL_16BITS d0, d1, d2 // output d2 + + HDM_QUANT_2x2_TOTAL_16BITS d2, d1, d0 // output d0 + + vabs.s16 d1, d0 + vcgt.s16 d1, d1, d3 // abs(dct[i])>threshold; + vmov r0, r1, d1 + orr r0, r1 +WELS_ASM_FUNC_END + + +WELS_ASM_FUNC_BEGIN WelsGetNoneZeroCount_neon + push {r1} + vld1.s16 {q0, q1}, [r0] + vmov.s16 q8, #1 + + ZERO_COUNT_IN_2_QUARWORD q0, q1, q8, d0, d1, d2, d3 + vmov r0, r1, d0 + and r0, #0x1F // range [0~16] + rsb r0, #16 + pop {r1} +WELS_ASM_FUNC_END + + +WELS_ASM_FUNC_BEGIN WelsDequant4x4_neon + vld1.s16 {q0, q1}, [r0] + vld1.u16 {q2}, [r1] + + vmul.s16 q4, q0, q2 + vmul.s16 q5, q1, q2 + + vst1.s16 {q4, q5}, [r0] 
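
WelsGetNoneZeroCount_neon above counts zeros rather than non-zeros: ZERO_COUNT_IN_2_QUARWORD folds the per-lane (coef == 0) masks with vpadd, and the final rsb turns the zero count into 16 minus it. WelsDequant4x4_neon, whose WELS_ASM_FUNC_END follows, is by contrast a plain per-position multiply in which one 8-entry factor row scales both halves of the 16 coefficients. Scalar sketches of both, with hypothetical names:

    #include <stdint.h>

    /* 16 - (number of zero coefficients), as WelsGetNoneZeroCount returns. */
    static int32_t GetNoneZeroCount_c (const int16_t* pCoef) {
      int32_t iZeros = 0;
      for (int i = 0; i < 16; i++)
        iZeros += (pCoef[i] == 0);
      return 16 - iZeros;
    }

    /* WelsDequant4x4: the same 8 dequant factors scale each half of the
     * block, matching the two vmul.s16 against q2 in the NEON code. */
    static void Dequant4x4_c (int16_t* pDct, const uint16_t* pMF) {
      for (int i = 0; i < 16; i++)
        pDct[i] = (int16_t) (pDct[i] * pMF[i & 7]);
    }
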
+WELS_ASM_FUNC_END + + +WELS_ASM_FUNC_BEGIN WelsDequantFour4x4_neon + vld1.u16 {q8}, [r1] + mov r1, r0 + vld1.s16 {q0, q1}, [r0]! + vld1.s16 {q2, q3}, [r0]! + vmul.s16 q0, q0, q8 + vld1.s16 {q4, q5}, [r0]! + vmul.s16 q1, q1, q8 + vld1.s16 {q6, q7}, [r0]! + + vst1.s16 {q0, q1}, [r1]! + + vmul.s16 q2, q2, q8 + vmul.s16 q3, q3, q8 + vmul.s16 q4, q4, q8 + vst1.s16 {q2, q3}, [r1]! + + vmul.s16 q5, q5, q8 + vmul.s16 q6, q6, q8 + vmul.s16 q7, q7, q8 + vst1.s16 {q4, q5}, [r1]! + vst1.s16 {q6, q7}, [r1]! + +WELS_ASM_FUNC_END + + +WELS_ASM_FUNC_BEGIN WelsDequantIHadamard4x4_neon + + vld1.s16 {q0, q1}, [r0] + vdup.s16 q4, r1 + + IHDM_4x4_TOTAL_16BITS q0, q2, q3 + IHDM_4x4_TOTAL_16BITS q1, q2, q3 + + MATRIX_TRANSFORM_EACH_16BITS d0, d1, d2, d3 + + IHDM_4x4_TOTAL_16BITS q0, q2, q3 + vmul.s16 q0, q4 + + IHDM_4x4_TOTAL_16BITS q1, q2, q3 + vmul.s16 q1, q4 + + MATRIX_TRANSFORM_EACH_16BITS d0, d1, d2, d3 + vst1.s16 {q0, q1}, [r0] +WELS_ASM_FUNC_END + + +WELS_ASM_FUNC_BEGIN WelsIDctT4Rec_neon + vld1.u32 {d14[0]}, [r2], r3 + push {r4} + ldr r4, [sp, #4] + vld1.u32 {d14[1]}, [r2], r3 + + vld4.s16 {d0, d1, d2, d3}, [r4] // cost 3 cycles! + vld1.u32 {d15[0]}, [r2], r3 + vld1.u32 {d15[1]}, [r2], r3 // q7 is pred + + ROW_TRANSFORM_1_STEP_TOTAL_16BITS d0, d1, d2, d3, d4, d5, d6, d7 + + TRANSFORM_TOTAL_16BITS d0, d1, d2, d3, d4, d5, d6, d7 + + MATRIX_TRANSFORM_EACH_16BITS d0, d1, d2, d3 + + ROW_TRANSFORM_1_STEP_TOTAL_16BITS d0, d1, d2, d3, d4, d5, d6, d7 + + TRANSFORM_TOTAL_16BITS d0, d1, d2, d3, d4, d5, d6, d7 + vrshr.s16 d0, d0, #6 + vrshr.s16 d1, d1, #6 + vrshr.s16 d2, d2, #6 + vrshr.s16 d3, d3, #6 + + //after rounding 6, clip into [0, 255] + vmovl.u8 q2,d14 + vadd.s16 q0,q2 + vqmovun.s16 d14,q0 + vst1.32 {d14[0]},[r0],r1 + vst1.32 {d14[1]},[r0],r1 + + vmovl.u8 q2,d15 + vadd.s16 q1,q2 + vqmovun.s16 d15,q1 + vst1.32 {d15[0]},[r0],r1 + vst1.32 {d15[1]},[r0] + + pop {r4} +WELS_ASM_FUNC_END + + +WELS_ASM_FUNC_BEGIN WelsIDctFourT4Rec_neon + + vld1.u64 {d16}, [r2], r3 + push {r4} + ldr r4, [sp, #4] + vld1.u64 {d17}, [r2], r3 + + vld4.s16 {d0, d1, d2, d3}, [r4]! // cost 3 cycles! + vld1.u64 {d18}, [r2], r3 + vld1.u64 {d19}, [r2], r3 + vld4.s16 {d4, d5, d6, d7}, [r4]! // cost 3 cycles! + vswp d1, d4 + vswp d3, d6 + vswp q1, q2 // q0~q3 + + ROW_TRANSFORM_1_STEP_TOTAL_16BITS q0, q1, q2, q3, q4, q5, q6, q7 + + TRANSFORM_TOTAL_16BITS q0, q1, q2, q3, q4, q5, q6, q7 + + MATRIX_TRANSFORM_EACH_16BITS q0, q1, q2, q3 + + ROW_TRANSFORM_1_STEP_TOTAL_16BITS q0, q1, q2, q3, q4, q5, q6, q7 + + TRANSFORM_TOTAL_16BITS q0, q1, q2, q3, q4, q5, q6, q7 + vrshr.s16 q0, q0, #6 + vrshr.s16 q1, q1, #6 + vrshr.s16 q2, q2, #6 + vrshr.s16 q3, q3, #6 + + //after rounding 6, clip into [0, 255] + vmovl.u8 q4,d16 + vadd.s16 q0,q4 + vqmovun.s16 d16,q0 + vst1.u8 {d16},[r0],r1 + + vmovl.u8 q4,d17 + vadd.s16 q1,q4 + vqmovun.s16 d17,q1 + vst1.u8 {d17},[r0],r1 + + vmovl.u8 q4,d18 + vadd.s16 q2,q4 + vqmovun.s16 d18,q2 + vst1.u8 {d18},[r0],r1 + + vmovl.u8 q4,d19 + vadd.s16 q3,q4 + vqmovun.s16 d19,q3 + vst1.u8 {d19},[r0],r1 + + vld1.u64 {d16}, [r2], r3 + vld1.u64 {d17}, [r2], r3 + + vld4.s16 {d0, d1, d2, d3}, [r4]! // cost 3 cycles! + vld1.u64 {d18}, [r2], r3 + vld1.u64 {d19}, [r2], r3 + vld4.s16 {d4, d5, d6, d7}, [r4]! // cost 3 cycles! 
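Note: WelsIDctT4Rec_neon above is the standard H.264 4x4 inverse transform followed by reconstruction. A scalar sketch of the butterfly that the ROW_TRANSFORM_1_STEP_TOTAL_16BITS / TRANSFORM_TOTAL_16BITS pair is assumed to implement (the macro bodies are defined elsewhere in this file):

    #include <cstdint>
    // One pass of the H.264 inverse 4x4 butterfly; applied to rows, then columns.
    static void IdctPassSketch (const int16_t s[4], int16_t d[4]) {
      const int e0 = s[0] + s[2];          // a + c
      const int e1 = s[0] - s[2];          // a - c
      const int e2 = (s[1] >> 1) - s[3];   // (b >> 1) - d
      const int e3 = s[1] + (s[3] >> 1);   // b + (d >> 1)
      d[0] = (int16_t) (e0 + e3);
      d[1] = (int16_t) (e1 + e2);
      d[2] = (int16_t) (e1 - e2);
      d[3] = (int16_t) (e0 - e3);
    }

After both passes the residual is rounded with (x + 32) >> 6 (the vrshr.s16 #6 lines) and added to the prediction with unsigned saturation (vqmovun.s16), which is the clip into [0, 255] that the comments refer to.
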
+ vswp d1, d4 + vswp d3, d6 + vswp q1, q2 // q0~q3 + + ROW_TRANSFORM_1_STEP_TOTAL_16BITS q0, q1, q2, q3, q4, q5, q6, q7 + + TRANSFORM_TOTAL_16BITS q0, q1, q2, q3, q4, q5, q6, q7 + + MATRIX_TRANSFORM_EACH_16BITS q0, q1, q2, q3 + + ROW_TRANSFORM_1_STEP_TOTAL_16BITS q0, q1, q2, q3, q4, q5, q6, q7 + + TRANSFORM_TOTAL_16BITS q0, q1, q2, q3, q4, q5, q6, q7 + vrshr.s16 q0, q0, #6 + vrshr.s16 q1, q1, #6 + vrshr.s16 q2, q2, #6 + vrshr.s16 q3, q3, #6 + + //after rounding 6, clip into [0, 255] + vmovl.u8 q4,d16 + vadd.s16 q0,q4 + vqmovun.s16 d16,q0 + vst1.u8 {d16},[r0],r1 + + vmovl.u8 q4,d17 + vadd.s16 q1,q4 + vqmovun.s16 d17,q1 + vst1.u8 {d17},[r0],r1 + + vmovl.u8 q4,d18 + vadd.s16 q2,q4 + vqmovun.s16 d18,q2 + vst1.u8 {d18},[r0],r1 + + vmovl.u8 q4,d19 + vadd.s16 q3,q4 + vqmovun.s16 d19,q3 + vst1.u8 {d19},[r0],r1 + + pop {r4} +WELS_ASM_FUNC_END + + +WELS_ASM_FUNC_BEGIN WelsIDctRecI16x16Dc_neon + push {r4} + ldr r4, [sp, #4] + + vld1.s16 {q8,q9}, [r4] + vrshr.s16 q8, q8, #6 + vrshr.s16 q9, q9, #6 + + vdup.s16 d20, d16[0] + vdup.s16 d21, d16[1] + vdup.s16 d22, d16[2] + vdup.s16 d23, d16[3] + + vld1.u8 {q0}, [r2], r3 + MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13 + vst1.u8 {q0}, [r0], r1 + + vld1.u8 {q0}, [r2], r3 + MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13 + vst1.u8 {q0}, [r0], r1 + + vld1.u8 {q0}, [r2], r3 + MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13 + vst1.u8 {q0}, [r0], r1 + + vld1.u8 {q0}, [r2], r3 + MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13 + vst1.u8 {q0}, [r0], r1 + + vdup.s16 d20, d17[0] + vdup.s16 d21, d17[1] + vdup.s16 d22, d17[2] + vdup.s16 d23, d17[3] + + vld1.u8 {q0}, [r2], r3 + MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13 + vst1.u8 {q0}, [r0], r1 + + vld1.u8 {q0}, [r2], r3 + MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13 + vst1.u8 {q0}, [r0], r1 + + vld1.u8 {q0}, [r2], r3 + MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13 + vst1.u8 {q0}, [r0], r1 + + vld1.u8 {q0}, [r2], r3 + MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13 + vst1.u8 {q0}, [r0], r1 + + vdup.s16 d20, d18[0] + vdup.s16 d21, d18[1] + vdup.s16 d22, d18[2] + vdup.s16 d23, d18[3] + + vld1.u8 {q0}, [r2], r3 + MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13 + vst1.u8 {q0}, [r0], r1 + + vld1.u8 {q0}, [r2], r3 + MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13 + vst1.u8 {q0}, [r0], r1 + + vld1.u8 {q0}, [r2], r3 + MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13 + vst1.u8 {q0}, [r0], r1 + + vld1.u8 {q0}, [r2], r3 + MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13 + vst1.u8 {q0}, [r0], r1 + + vdup.s16 d20, d19[0] + vdup.s16 d21, d19[1] + vdup.s16 d22, d19[2] + vdup.s16 d23, d19[3] + + vld1.u8 {q0}, [r2], r3 + MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13 + vst1.u8 {q0}, [r0], r1 + + vld1.u8 {q0}, [r2], r3 + MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13 + vst1.u8 {q0}, [r0], r1 + + vld1.u8 {q0}, [r2], r3 + MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13 + vst1.u8 {q0}, [r0], r1 + + vld1.u8 {q0}, [r2], r3 + MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13 + vst1.u8 {q0}, [r0], r1 + + pop {r4} +WELS_ASM_FUNC_END +#endif diff --git a/codec/encoder/core/inc/sample.h b/codec/encoder/core/inc/sample.h index fccc9842..48f7eac8 100644 --- a/codec/encoder/core/inc/sample.h +++ b/codec/encoder/core/inc/sample.h @@ -110,6 +110,33 @@ int32_t WelsIntraChroma8x8Combined3Satd_sse41 (uint8_t*, int32_t, uint8_t*, int3 
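Note: WelsIDctRecI16x16Dc_neon, just before this hunk, skips the full transform: each of the sixteen 4x4 blocks adds a single rounded DC term to its prediction. A scalar sketch — the row-major layout of the 16 DC values behind r4 is an assumption:

    #include <cstdint>
    static inline uint8_t Clip255 (int v) { return (uint8_t) (v < 0 ? 0 : (v > 255 ? 255 : v)); }
    // DC-only I16x16 reconstruction: rec = clip(pred + ((dc + 32) >> 6)).
    static void IdctRecI16x16DcSketch (uint8_t* pRec, int32_t iRecStride,
                                       const uint8_t* pPred, int32_t iPredStride,
                                       const int16_t pDct[16]) {
      for (int y = 0; y < 16; ++y)
        for (int x = 0; x < 16; ++x) {
          const int iDc = (pDct[(y >> 2) * 4 + (x >> 2)] + 32) >> 6;  // vrshr.s16 #6
          pRec[y * iRecStride + x] = Clip255 (pPred[y * iPredStride + x] + iDc);
        }
    }
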
#endif//X86_ASM +#if defined (HAVE_NEON) + +int32_t WelsSampleSad4x4_neon (uint8_t*, int32_t, uint8_t*, int32_t); +int32_t WelsSampleSad16x16_neon (uint8_t*, int32_t, uint8_t*, int32_t); +int32_t WelsSampleSad16x8_neon (uint8_t*, int32_t, uint8_t*, int32_t); +int32_t WelsSampleSad8x16_neon (uint8_t*, int32_t, uint8_t*, int32_t); +int32_t WelsSampleSad8x8_neon (uint8_t*, int32_t, uint8_t*, int32_t); + +void WelsSampleSadFour16x16_neon (uint8_t*, int32_t, uint8_t*, int32_t, int32_t*); +void WelsSampleSadFour16x8_neon (uint8_t*, int32_t, uint8_t*, int32_t, int32_t*); +void WelsSampleSadFour8x16_neon (uint8_t*, int32_t, uint8_t*, int32_t, int32_t*); +void WelsSampleSadFour8x8_neon (uint8_t*, int32_t, uint8_t*, int32_t, int32_t*); +void WelsSampleSadFour4x4_neon (uint8_t*, int32_t, uint8_t*, int32_t, int32_t*); + +int32_t WelsSampleSatd8x8_neon (uint8_t*, int32_t, uint8_t*, int32_t); +int32_t WelsSampleSatd16x8_neon (uint8_t*, int32_t, uint8_t*, int32_t); +int32_t WelsSampleSatd8x16_neon (uint8_t*, int32_t, uint8_t*, int32_t); +int32_t WelsSampleSatd16x16_neon (uint8_t*, int32_t, uint8_t*, int32_t); +int32_t WelsSampleSatd4x4_neon (uint8_t*, int32_t, uint8_t*, int32_t); + +int32_t WelsIntra16x16Combined3Satd_neon (uint8_t*, int32_t, uint8_t*, int32_t, int32_t*, int32_t, uint8_t*); +int32_t WelsIntra16x16Combined3Sad_neon (uint8_t*, int32_t, uint8_t*, int32_t, int32_t*, int32_t, uint8_t*); +int32_t WelsIntra8x8Combined3Satd_neon (uint8_t*, int32_t, uint8_t*, int32_t, int32_t*, int32_t, uint8_t*, uint8_t*, uint8_t*); +int32_t WelsIntra8x8Combined3Sad_neon (uint8_t*, int32_t, uint8_t*, int32_t, int32_t*, int32_t, uint8_t*, uint8_t*, uint8_t*); +int32_t WelsIntra4x4Combined3Satd_neon (uint8_t*, int32_t, uint8_t*, int32_t, uint8_t*, int32_t*, int32_t, int32_t, int32_t); + +#endif #if defined(__cplusplus) } diff --git a/codec/encoder/core/src/sample.cpp b/codec/encoder/core/src/sample.cpp index 51e74502..38fef9f8 100644 --- a/codec/encoder/core/src/sample.cpp +++ b/codec/encoder/core/src/sample.cpp @@ -482,6 +482,33 @@ void WelsInitSampleSadFunc (SWelsFuncPtrList* pFuncList, uint32_t uiCpuFlag) { #endif //(X86_ASM) +#if defined (HAVE_NEON) + if (uiCpuFlag & WELS_CPU_NEON) { + pFuncList->sSampleDealingFuncs.pfSampleSad[BLOCK_4x4 ] = WelsSampleSad4x4_neon; + pFuncList->sSampleDealingFuncs.pfSampleSad[BLOCK_16x16] = WelsSampleSad16x16_neon; + pFuncList->sSampleDealingFuncs.pfSampleSad[BLOCK_16x8 ] = WelsSampleSad16x8_neon; + pFuncList->sSampleDealingFuncs.pfSampleSad[BLOCK_8x16] = WelsSampleSad8x16_neon; + pFuncList->sSampleDealingFuncs.pfSampleSad[BLOCK_8x8] = WelsSampleSad8x8_neon; + + pFuncList->sSampleDealingFuncs.pfSample4Sad[BLOCK_16x16] = WelsSampleSadFour16x16_neon; + pFuncList->sSampleDealingFuncs.pfSample4Sad[BLOCK_16x8] = WelsSampleSadFour16x8_neon; + pFuncList->sSampleDealingFuncs.pfSample4Sad[BLOCK_8x16] = WelsSampleSadFour8x16_neon; + pFuncList->sSampleDealingFuncs.pfSample4Sad[BLOCK_8x8] = WelsSampleSadFour8x8_neon; + pFuncList->sSampleDealingFuncs.pfSample4Sad[BLOCK_4x4] = WelsSampleSadFour4x4_neon; + + pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_4x4 ] = WelsSampleSatd4x4_neon; + pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_8x8 ] = WelsSampleSatd8x8_neon; + pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_8x16 ] = WelsSampleSatd8x16_neon; + pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_16x8 ] = WelsSampleSatd16x8_neon; + pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_16x16] = WelsSampleSatd16x16_neon; + + pFuncList->sSampleDealingFuncs.pfIntra4x4Combined3Satd = 
WelsIntra4x4Combined3Satd_neon; + pFuncList->sSampleDealingFuncs.pfIntra8x8Combined3Satd = WelsIntra8x8Combined3Satd_neon; + pFuncList->sSampleDealingFuncs.pfIntra8x8Combined3Sad = WelsIntra8x8Combined3Sad_neon; + pFuncList->sSampleDealingFuncs.pfIntra16x16Combined3Satd = WelsIntra16x16Combined3Satd_neon; + pFuncList->sSampleDealingFuncs.pfIntra16x16Combined3Sad = WelsIntra16x16Combined3Sad_neon; + } +#endif } } // namespace WelsSVCEnc diff --git a/codec/processing/src/adaptivequantization/AdaptiveQuantization.cpp b/codec/processing/src/adaptivequantization/AdaptiveQuantization.cpp index 6f0154ba..efc0ff10 100644 --- a/codec/processing/src/adaptivequantization/AdaptiveQuantization.cpp +++ b/codec/processing/src/adaptivequantization/AdaptiveQuantization.cpp @@ -231,6 +231,11 @@ void CAdaptiveQuantization::WelsInitVarFunc (PVarFunc& pfVar, int32_t iCpuFlag) pfVar = SampleVariance16x16_sse2; } #endif +#ifdef HAVE_NEON + if (iCpuFlag & WELS_CPU_NEON) { + pfVar = SampleVariance16x16_neon; + } +#endif } void SampleVariance16x16_c (uint8_t* pRefY, int32_t iRefStride, uint8_t* pSrcY, int32_t iSrcStride, diff --git a/codec/processing/src/adaptivequantization/AdaptiveQuantization.h b/codec/processing/src/adaptivequantization/AdaptiveQuantization.h index 205c2971..612e22c3 100644 --- a/codec/processing/src/adaptivequantization/AdaptiveQuantization.h +++ b/codec/processing/src/adaptivequantization/AdaptiveQuantization.h @@ -62,6 +62,11 @@ VarFunc SampleVariance16x16_sse2; WELSVP_EXTERN_C_END #endif +#ifdef HAVE_NEON +WELSVP_EXTERN_C_BEGIN +VarFunc SampleVariance16x16_neon; +WELSVP_EXTERN_C_END +#endif class CAdaptiveQuantization : public IStrategy { public: diff --git a/codec/processing/src/arm/adaptive_quantization.S b/codec/processing/src/arm/adaptive_quantization.S old mode 100755 new mode 100644 index b52fb3ef..9aa4f07b --- a/codec/processing/src/arm/adaptive_quantization.S +++ b/codec/processing/src/arm/adaptive_quantization.S @@ -35,7 +35,7 @@ #include "arm_arch_common_macro.S" #ifdef APPLE_IOS -.macro SQR_ADD_16BYTES +.macro SQR_ADD_16BYTES vmull.u8 q3, $0, $0 vmull.u8 q8, $1, $1 vpadal.u16 $2, q3 @@ -51,23 +51,23 @@ #endif -WELS_ASM_FUNC_BEGIN pixel_var_16x16_neon +WELS_ASM_FUNC_BEGIN SampleVariance16x16_neon stmdb sp!, {r4} vld1.8 {q15}, [r0], r1 //save the ref data (16bytes) vld1.8 {q14}, [r2], r3 //save the src data (16bytes) - - - vabd.u8 q13, q14, q15 + + + vabd.u8 q13, q14, q15 vmull.u8 q12, d27, d27 vmull.u8 q11, d26, d26 vaddl.u16 q12, d24, d25 vpadal.u16 q12, q11 //sqr - vaddl.u8 q13, d26, d27 //sum - + vaddl.u8 q13, d26, d27 //sum + vaddl.u8 q10, d28, d29 //sum_cur - + vmull.u8 q9, d29, d29 vmull.u8 q8, d28, d28 vaddl.u16 q9, d18, d19 //sqr_cur @@ -78,35 +78,35 @@ pixel_var_16x16_loop0: vld1.8 {q0}, [r0], r1 //save the ref data (16bytes) vld1.8 {q1}, [r2], r3 //save the src data (16bytes) - + vabd.u8 q2, q0, q1 - + //q10 save sum_cur vpadal.u8 q10, q1 //q12 save sqr SQR_ADD_16BYTES d4, d5, q12 - + //q13 save sum vpadal.u8 q13, q2 subs r4, #1 - - //q9 save sqr_cur - SQR_ADD_16BYTES d2, d3, q9 - - bne pixel_var_16x16_loop0 - + + //q9 save sqr_cur + SQR_ADD_16BYTES d2, d3, q9 + + bne pixel_var_16x16_loop0 + vadd.u16 d0, d26, d27 //sum - vadd.u16 d1, d20, d21 //sum_cur + vadd.u16 d1, d20, d21 //sum_cur vpaddl.u16 q0, q0 vadd.u32 d2, d24, d25 //sqr vadd.u32 d3, d18, d19 //sqr_cur vpadd.u32 d0, d0, d1 vpadd.u32 d1, d2, d3 - + ldr r4, [sp, #4] - + vshr.u32 q0, q0, #8 vmul.u32 d0, d0 vsub.u32 d0, d1, d0 @@ -117,4 +117,4 @@ pixel_var_16x16_loop0: WELS_ASM_FUNC_END -#endif \ No newline at end 
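Note: SampleVariance16x16_neon above is a one-pass variance. It accumulates sum and sum-of-squares for the ref and src blocks in parallel, then finishes with var = sqsum/256 − (sum/256)², which is what the vshr.u32 #8 / vmul / vsub tail computes. A single-block scalar sketch; how the ref and cur values are combined downstream is outside this hunk:

    #include <cstdint>
    // One-pass 16x16 variance: E[x^2] - E[x]^2, with /256 done as >> 8.
    static uint32_t Variance16x16Sketch (const uint8_t* pBlk, int32_t iStride) {
      uint32_t uiSum = 0, uiSqSum = 0;
      for (int y = 0; y < 16; ++y)
        for (int x = 0; x < 16; ++x) {
          const uint32_t p = pBlk[y * iStride + x];
          uiSum   += p;
          uiSqSum += p * p;
        }
      const uint32_t uiMean = uiSum >> 8;        // vshr.u32 #8
      return (uiSqSum >> 8) - uiMean * uiMean;   // vmul + vsub
    }
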
of file +#endif diff --git a/codec/processing/src/arm/down_sample_neon.S b/codec/processing/src/arm/down_sample_neon.S old mode 100755 new mode 100644 index ff379030..208169ec --- a/codec/processing/src/arm/down_sample_neon.S +++ b/codec/processing/src/arm/down_sample_neon.S @@ -35,29 +35,29 @@ #include "arm_arch_common_macro.S" -WELS_ASM_FUNC_BEGIN comp_ds_bilinear_neon +WELS_ASM_FUNC_BEGIN DyadicBilinearDownsampler_neon stmdb sp!, {r4-r8, lr} - + //Get the width and height ldr r4, [sp, #24] //src_width ldr r5, [sp, #28] //src_height - + //Initialize the register mov r6, r2 mov r8, r0 mov lr, #0 - lsr r5, #1 - + lsr r5, #1 + //Save the tailer for the unasigned size mla r7, r1, r5, r0 vld1.32 {q15}, [r7] - + add r7, r2, r3 //processing a colume data -comp_ds_bilinear_loop0: +comp_ds_bilinear_loop0: vld1.8 {q0,q1}, [r2]! - vld1.8 {q2,q3}, [r7]! + vld1.8 {q2,q3}, [r7]! vpaddl.u8 q0, q0 vpaddl.u8 q1, q1 vpaddl.u8 q2, q2 @@ -70,9 +70,9 @@ comp_ds_bilinear_loop0: vrhadd.u16 q1, q3 vmovn.u16 d0, q0 vmovn.u16 d1, q1 - vst1.32 {q0}, [r0]! + vst1.32 {q0}, [r0]! add lr, #32 - + cmp lr, r4 movcs lr, #0 addcs r6, r3, lsl #1 @@ -82,10 +82,10 @@ comp_ds_bilinear_loop0: movcs r0, r8 subscs r5, #1 bne comp_ds_bilinear_loop0 - + //restore the tailer for the unasigned size vst1.32 {q15}, [r0] - + ldmia sp!, {r4-r8,lr} WELS_ASM_FUNC_END @@ -96,29 +96,29 @@ WELS_ASM_FUNC_BEGIN comp_ds_bilinear_w_x8_neon //Get the width and height ldr r4, [sp, #20] //src_width ldr r5, [sp, #24] //src_height - + //Get the difference - sub lr, r3, r4 + sub lr, r3, r4 sub r1, r1, r4, lsr #1 - + lsr r5, #1 - + //processing a colume data -comp_ds_bilinear_w_x8_loop0: - +comp_ds_bilinear_w_x8_loop0: + lsr r6, r4, #3 add r7, r2, r3 //processing a line data comp_ds_bilinear_w_x8_loop1: - + vld1.8 {d0}, [r2]! - vld1.8 {d1}, [r7]! + vld1.8 {d1}, [r7]! vpaddl.u8 q0, q0 vrshr.u16 q0, #1 vrhadd.u16 d0, d1 - + vmovn.u16 d0, q0 - vst1.32 {d0[0]}, [r0]! + vst1.32 {d0[0]}, [r0]! subs r6, #1 bne comp_ds_bilinear_w_x8_loop1 @@ -126,7 +126,7 @@ comp_ds_bilinear_w_x8_loop1: add r0, r1 subs r5, #1 bne comp_ds_bilinear_w_x8_loop0 - + ldmia sp!, {r4-r7,lr} WELS_ASM_FUNC_END @@ -137,31 +137,31 @@ WELS_ASM_FUNC_BEGIN comp_ds_bilinear_w_x16_neon //Get the width and height ldr r4, [sp, #20] //src_width ldr r5, [sp, #24] //src_height - + //Get the difference - sub lr, r3, r4 + sub lr, r3, r4 sub r1, r1, r4, lsr #1 - + lsr r5, #1 - + //processing a colume data -comp_ds_bilinear_w_x16_loop0: - +comp_ds_bilinear_w_x16_loop0: + lsr r6, r4, #4 add r7, r2, r3 //processing a line data comp_ds_bilinear_w_x16_loop1: - + vld1.8 {q0}, [r2]! - vld1.8 {q1}, [r7]! + vld1.8 {q1}, [r7]! vpaddl.u8 q0, q0 vpaddl.u8 q1, q1 vrshr.u16 q0, #1 vrshr.u16 q1, #1 vrhadd.u16 q0, q1 - + vmovn.u16 d0, q0 - vst1.32 {d0}, [r0]! + vst1.32 {d0}, [r0]! subs r6, #1 bne comp_ds_bilinear_w_x16_loop1 @@ -169,34 +169,34 @@ comp_ds_bilinear_w_x16_loop1: add r0, r1 subs r5, #1 bne comp_ds_bilinear_w_x16_loop0 - + ldmia sp!, {r4-r7,lr} WELS_ASM_FUNC_END -WELS_ASM_FUNC_BEGIN comp_ds_bilinear_w_x32_neon +WELS_ASM_FUNC_BEGIN DyadicBilinearDownsamplerWidthx32_neon stmdb sp!, {r4-r7, lr} //Get the width and height ldr r4, [sp, #20] //src_width ldr r5, [sp, #24] //src_height - + //Get the difference - sub lr, r3, r4 + sub lr, r3, r4 sub r1, r1, r4, lsr #1 - + lsr r5, #1 - + //processing a colume data -comp_ds_bilinear_w_x32_loop0: - +comp_ds_bilinear_w_x32_loop0: + lsr r6, r4, #5 add r7, r2, r3 //processing a line data comp_ds_bilinear_w_x32_loop1: - + vld1.8 {q0,q1}, [r2]! - vld1.8 {q2,q3}, [r7]! 
+ vld1.8 {q2,q3}, [r7]! vpaddl.u8 q0, q0 vpaddl.u8 q1, q1 vpaddl.u8 q2, q2 @@ -207,10 +207,10 @@ comp_ds_bilinear_w_x32_loop1: vrshr.u16 q3, #1 vrhadd.u16 q0, q2 vrhadd.u16 q1, q3 - + vmovn.u16 d0, q0 vmovn.u16 d1, q1 - vst1.32 {q0}, [r0]! + vst1.32 {q0}, [r0]! subs r6, #1 bne comp_ds_bilinear_w_x32_loop1 @@ -218,14 +218,14 @@ comp_ds_bilinear_w_x32_loop1: add r0, r1 subs r5, #1 bne comp_ds_bilinear_w_x32_loop0 - + ldmia sp!, {r4-r7,lr} WELS_ASM_FUNC_END -WELS_ASM_FUNC_BEGIN general_ds_bilinear_accurate_neon +WELS_ASM_FUNC_BEGIN GeneralBilinearAccurateDownsampler_neon stmdb sp!, {r4-r12, lr} - + //Get the data from stack ldr r4, [sp, #40] //the addr of src ldr r5, [sp, #44] //the value of src_stride @@ -245,11 +245,11 @@ WELS_ASM_FUNC_BEGIN general_ds_bilinear_accurate_neon and r9, r7, r10 // r9 vinc(scaleY mod 32767) mov r11, #-1 mul r11, r9 // r11 -vinc - + vdup.s16 d2, r9 vdup.s16 d3, r11 vext.8 d5, d3, d2, #4 // vinc vinc -vinc -vinc - + mov r11, #0x40000000 mov r12, #0x4000 sub r12, #1 @@ -261,13 +261,13 @@ WELS_ASM_FUNC_BEGIN general_ds_bilinear_accurate_neon sub r11, #1 vdup.s16 d9, r11 vext.8 d7, d9, d8, #4 //init v 16384 16384 16383 16383 - - veor q14, q14 - sub r1, r2 // stride - width + + veor q14, q14 + sub r1, r2 // stride - width mov r8, #16384 // yInverse sub r3, #1 - -_HEIGHT: + +_HEIGHT: ldr r4, [sp, #40] //the addr of src mov r11, r8 lsr r11, #15 @@ -275,8 +275,8 @@ _HEIGHT: add r11, r4 // get current row address mov r12, r11 add r12, r5 - - mov r9, #16384 // xInverse + + mov r9, #16384 // xInverse sub r10, r2, #1 vmov.s16 d6, d1 @@ -288,21 +288,21 @@ _WIDTH: add r4, r12,lr vld2.8 {d28[4],d29[4]}, [r4] //q14: 000d000b000c000a; vzip.32 d28, d29 //q14: 000d000c000b000a; - - vmull.u16 q13, d6, d7 //q13: init u * init v + + vmull.u16 q13, d6, d7 //q13: init u * init v vmull.u32 q12, d26,d28 vmlal.u32 q12, d27,d29 vqadd.u64 d24, d24,d25 vrshr.u64 d24, #30 vst1.8 {d24[0]}, [r0]! 
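Note: the vmull.u16 / vmull.u32 / vqadd.u64 / vrshr.u64 #30 sequence above is a fixed-point bilinear tap: the four neighbour pixels are weighted by products of the horizontal and vertical phases and renormalised by the rounding shift. A scalar sketch, assuming 15-bit phases (consistent with the mod-32767 masking of the scale increments — the bit widths are an assumption):

    #include <cstdint>
    // Fixed-point bilinear tap: u, v in [0, 1<<15]; the four weights sum to
    // 1<<30, so the rounded >>30 matches vrshr.u64 #30. Names are illustrative.
    static uint8_t BilinearTapSketch (uint8_t a, uint8_t b, uint8_t c, uint8_t d,
                                      uint32_t u, uint32_t v) {
      const uint64_t ku = (1u << 15) - u, kv = (1u << 15) - v;
      const uint64_t acc = ku * kv * a + (uint64_t) u * kv * b
                         + ku * (uint64_t) v * c + (uint64_t) u * v * d;
      return (uint8_t) ((acc + (1ull << 29)) >> 30);
    }
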
- add r9, r6 + add r9, r6 vadd.u16 d6, d0 // inc u vshl.u16 d6, #1 vshr.u16 d6, #1 subs r10, #1 bne _WIDTH - + WIDTH_END: lsr r9, #15 add r4,r11,r9 @@ -317,26 +317,26 @@ WIDTH_END: subs r3, #1 bne _HEIGHT -LAST_ROW: +LAST_ROW: ldr r4, [sp, #40] //the addr of src lsr r8, #15 mul r8, r5 - add r4, r8 // get current row address + add r4, r8 // get current row address mov r9, #16384 _LAST_ROW_WIDTH: mov r11, r9 lsr r11, #15 - + add r3, r4,r11 vld1.8 {d0[0]}, [r3] - vst1.8 {d0[0]}, [r0] - add r0, #1 - add r9, r6 + vst1.8 {d0[0]}, [r0] + add r0, #1 + add r9, r6 subs r2, #1 bne _LAST_ROW_WIDTH - + ldmia sp!, {r4-r12, lr} WELS_ASM_FUNC_END -#endif \ No newline at end of file +#endif diff --git a/codec/processing/src/arm/pixel_sad_neon.S b/codec/processing/src/arm/pixel_sad_neon.S old mode 100755 new mode 100644 index c7d4a37b..da1010cf --- a/codec/processing/src/arm/pixel_sad_neon.S +++ b/codec/processing/src/arm/pixel_sad_neon.S @@ -35,24 +35,24 @@ #include "arm_arch_common_macro.S" -WELS_ASM_FUNC_BEGIN pixel_sad_8x8_neon +WELS_ASM_FUNC_BEGIN WelsSampleSad8x8_neon stmdb sp!, {lr} //Loading a horizontal line data (8 bytes) - vld1.8 {d0}, [r0], r1 + vld1.8 {d0}, [r0], r1 vld1.8 {d1}, [r2], r3 - + //Do the SAD for 8 bytes vabdl.u8 q1, d0, d1 - + mov lr, #7 pixel_sad_8x8_loop0: //Loading a horizontal line data (8 bytes) - vld1.8 {d0}, [r0], r1 + vld1.8 {d0}, [r0], r1 vld1.8 {d1}, [r2], r3 subs lr, #1 - + //Do the SAD for 8 bytes vabal.u8 q1, d0, d1 bne pixel_sad_8x8_loop0 @@ -65,4 +65,4 @@ pixel_sad_8x8_loop0: ldmia sp!, {lr} WELS_ASM_FUNC_END -#endif \ No newline at end of file +#endif diff --git a/codec/processing/src/arm/vaa_calc_neon.S b/codec/processing/src/arm/vaa_calc_neon.S old mode 100755 new mode 100644 index 9339ec90..6d75deb1 --- a/codec/processing/src/arm/vaa_calc_neon.S +++ b/codec/processing/src/arm/vaa_calc_neon.S @@ -36,29 +36,29 @@ #ifdef APPLE_IOS -.macro ABS_SUB_SUM_16BYTES +.macro ABS_SUB_SUM_16BYTES vld1.32 {q15}, [$0], $2 vld1.32 {q14}, [$1], $2 vabal.u8 $3, d30, d28 vabal.u8 $4, d31, d29 .endm -.macro ABS_SUB_SUM_8x16BYTES +.macro ABS_SUB_SUM_8x16BYTES vld1.32 {q15}, [$0], $2 vld1.32 {q14}, [$1], $2 vabdl.u8 $3, d30, d28 vabdl.u8 $4, d31, d29 - + + ABS_SUB_SUM_16BYTES $0, $1, $2, $3, $4 ABS_SUB_SUM_16BYTES $0, $1, $2, $3, $4 ABS_SUB_SUM_16BYTES $0, $1, $2, $3, $4 ABS_SUB_SUM_16BYTES $0, $1, $2, $3, $4 ABS_SUB_SUM_16BYTES $0, $1, $2, $3, $4 - ABS_SUB_SUM_16BYTES $0, $1, $2, $3, $4 ABS_SUB_SUM_16BYTES $0, $1, $2, $3, $4 ABS_SUB_SUM_16BYTES $0, $1, $2, $3, $4 .endm -.macro SAD_8X16BITS +.macro SAD_8X16BITS vadd.u16 d31, $0, $1 vpaddl.u16 d31, d31 vpaddl.u32 $2, d31 @@ -73,17 +73,17 @@ vabal.u8 \arg4, d31, d29 .endm -.macro ABS_SUB_SUM_8x16BYTES arg0, arg1, arg2, arg3, arg4 +.macro ABS_SUB_SUM_8x16BYTES arg0, arg1, arg2, arg3, arg4 vld1.32 {q15}, [\arg0], \arg2 vld1.32 {q14}, [\arg1], \arg2 vabdl.u8 \arg3, d30, d28 vabdl.u8 \arg4, d31, d29 - + + ABS_SUB_SUM_16BYTES \arg0, \arg1, \arg2, \arg3, \arg4 ABS_SUB_SUM_16BYTES \arg0, \arg1, \arg2, \arg3, \arg4 ABS_SUB_SUM_16BYTES \arg0, \arg1, \arg2, \arg3, \arg4 ABS_SUB_SUM_16BYTES \arg0, \arg1, \arg2, \arg3, \arg4 ABS_SUB_SUM_16BYTES \arg0, \arg1, \arg2, \arg3, \arg4 - ABS_SUB_SUM_16BYTES \arg0, \arg1, \arg2, \arg3, \arg4 ABS_SUB_SUM_16BYTES \arg0, \arg1, \arg2, \arg3, \arg4 ABS_SUB_SUM_16BYTES \arg0, \arg1, \arg2, \arg3, \arg4 .endm @@ -96,67 +96,67 @@ #endif -WELS_ASM_FUNC_BEGIN vaa_calc_sad_neon +WELS_ASM_FUNC_BEGIN VAACalcSad_neon + + stmdb sp!, {r4-r8} - stmdb sp!, {r4-r8} - ldr r4, [sp, #20] //load pic_stride ldr r5, [sp, #28] 
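Note: WelsSampleSad8x8_neon above (also registered for scene-change detection later in this patch) is a plain 8x8 sum of absolute differences; the loop widens with vabdl/vabal and reduces once at the end. Scalar equivalent:

    #include <cstdint>
    // 8x8 SAD over two strided blocks.
    static int32_t Sad8x8Sketch (const uint8_t* pSrc, int32_t iSrcStride,
                                 const uint8_t* pRef, int32_t iRefStride) {
      int32_t iSad = 0;
      for (int y = 0; y < 8; ++y)
        for (int x = 0; x < 8; ++x) {
          const int d = pSrc[y * iSrcStride + x] - pRef[y * iRefStride + x];
          iSad += d < 0 ? -d : d;
        }
      return iSad;
    }
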
//load psad8x8 - + //Initial the Q4 register for save the "psadframe" vmov.s64 q4, #0 - + //Get the jump distance to use on loop codes lsl r8, r4, #4 sub r7, r8, #16 //R7 keep the 16*pic_stride-16 sub r8, r2 //R8 keep the 16*pic_stride-pic_width - + vaa_calc_sad_loop0: //R6 keep the pic_width mov r6, r2 - -vaa_calc_sad_loop1: + +vaa_calc_sad_loop1: //Process the 16x16 bytes ABS_SUB_SUM_8x16BYTES r0, r1, r4, q0, q1 ABS_SUB_SUM_8x16BYTES r0, r1, r4, q2, q3 - + //Do the SAD SAD_8X16BITS d0, d1, d0 SAD_8X16BITS d2, d3, d1 SAD_8X16BITS d4, d5, d2 - SAD_8X16BITS d6, d7, d3 - + SAD_8X16BITS d6, d7, d3 + //Write to "psad8x8" buffer - vst4.32 {d0[0],d1[0],d2[0],d3[0]}, [r5]! - + vst4.32 {d0[0],d1[0],d2[0],d3[0]}, [r5]! + //Adjust the input address sub r0, r7 sub r1, r7 - + subs r6, #16 - - //Save to calculate "psadframe" + + //Save to calculate "psadframe" vadd.u32 q0, q1 vadd.u32 q4, q0 - + bne vaa_calc_sad_loop1 - + //Adjust the input address add r0, r8 add r1, r8 - + subs r3, #16 - bne vaa_calc_sad_loop0 - + bne vaa_calc_sad_loop0 + ldr r6, [sp, #24] //load psadframe vadd.u32 d8, d9 vst1.32 {d8[0]}, [r6] - + ldmia sp!, {r4-r8} - + WELS_ASM_FUNC_END @@ -164,12 +164,12 @@ WELS_ASM_FUNC_END .macro SAD_SD_MAD_16BYTES vld1.32 {q0}, [$0], $2 vld1.32 {q1}, [$1], $2 - + vpadal.u8 $3, q0 vpadal.u8 $4, q1 - - vabd.u8 q0, q0, q1 - vmax.u8 $5, q0 + + vabd.u8 q0, q0, q1 + vmax.u8 $5, q0 vpadal.u8 $6, q0 .endm @@ -177,13 +177,13 @@ WELS_ASM_FUNC_END vld1.32 {q0}, [$0], $2 vld1.32 {q1}, [$1], $2 - vpaddl.u8 q2, q0 + vpaddl.u8 q2, q0 vpaddl.u8 q3, q1 - - vabd.u8 $3, q0, q1 + + vabd.u8 $3, q0, q1 vpaddl.u8 $4, $3 //abs_diff - + SAD_SD_MAD_16BYTES $0,$1,$2,q2,q3,$3,$4 SAD_SD_MAD_16BYTES $0,$1,$2,q2,q3,$3,$4 SAD_SD_MAD_16BYTES $0,$1,$2,q2,q3,$3,$4 @@ -191,7 +191,7 @@ WELS_ASM_FUNC_END SAD_SD_MAD_16BYTES $0,$1,$2,q2,q3,$3,$4 SAD_SD_MAD_16BYTES $0,$1,$2,q2,q3,$3,$4 SAD_SD_MAD_16BYTES $0,$1,$2,q2,q3,$3,$4 - + vsub.u16 $5, q2, q3 .endm @@ -203,18 +203,18 @@ WELS_ASM_FUNC_END vpaddl.u16 $3, $3 vpaddl.u32 $3, $3 vpaddl.s16 $4, $4 - vpaddl.s32 $4, $4 + vpaddl.s32 $4, $4 .endm #else -.macro SAD_SD_MAD_16BYTES arg0, arg1, arg2, arg3, arg4, arg5, arg6 +.macro SAD_SD_MAD_16BYTES arg0, arg1, arg2, arg3, arg4, arg5, arg6 vld1.32 {q0}, [\arg0], \arg2 vld1.32 {q1}, [\arg1], \arg2 - + vpadal.u8 \arg3, q0 vpadal.u8 \arg4, q1 - - vabd.u8 q0, q0, q1 - vmax.u8 \arg5, q0 + + vabd.u8 q0, q0, q1 + vmax.u8 \arg5, q0 vpadal.u8 \arg6, q0 .endm @@ -222,13 +222,13 @@ WELS_ASM_FUNC_END vld1.32 {q0}, [\arg0], \arg2 vld1.32 {q1}, [\arg1], \arg2 - vpaddl.u8 q2, q0 + vpaddl.u8 q2, q0 vpaddl.u8 q3, q1 - - vabd.u8 \arg3, q0, q1 + + vabd.u8 \arg3, q0, q1 vpaddl.u8 \arg4, \arg3 //abs_diff - + SAD_SD_MAD_16BYTES \arg0,\arg1,\arg2,q2,q3,\arg3,\arg4 SAD_SD_MAD_16BYTES \arg0,\arg1,\arg2,q2,q3,\arg3,\arg4 SAD_SD_MAD_16BYTES \arg0,\arg1,\arg2,q2,q3,\arg3,\arg4 @@ -236,7 +236,7 @@ WELS_ASM_FUNC_END SAD_SD_MAD_16BYTES \arg0,\arg1,\arg2,q2,q3,\arg3,\arg4 SAD_SD_MAD_16BYTES \arg0,\arg1,\arg2,q2,q3,\arg3,\arg4 SAD_SD_MAD_16BYTES \arg0,\arg1,\arg2,q2,q3,\arg3,\arg4 - + vsub.u16 \arg5, q2, q3 .endm @@ -248,69 +248,69 @@ WELS_ASM_FUNC_END vpaddl.u16 \arg3, \arg3 vpaddl.u32 \arg3, \arg3 vpaddl.s16 \arg4, \arg4 - vpaddl.s32 \arg4, \arg4 + vpaddl.s32 \arg4, \arg4 .endm #endif -WELS_ASM_FUNC_BEGIN vaa_calc_sad_bgd_neon +WELS_ASM_FUNC_BEGIN VAACalcSadBgd_neon stmdb sp!, {r4-r10} - + ldr r4, [sp, #28] //load pic_stride ldr r5, [sp, #36] //load psad8x8 ldr r6, [sp, #40] //load psd8x8 ldr r7, [sp, #44] //load pmad8x8 - + //Initial the Q4 register for save the "psadframe" 
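Note: VAACalcSad_neon above walks the frame in 16x16 macroblocks and, per MB, emits four 8x8 SADs (the vst4.32 store) while accumulating the frame total in q4. A scalar sketch of the output contract; the argument order follows the VAACalcSadFunc typedef and the quadrant order is inferred from the d0..d3 store:

    #include <cstdint>
    // Per 16x16 MB: four 8x8 SADs (TL, TR, BL, BR) plus the whole-frame SAD.
    static void VAACalcSadSketch (const uint8_t* pCur, const uint8_t* pRef,
                                  int32_t iPicWidth, int32_t iPicHeight,
                                  int32_t iPicStride,
                                  int32_t* pFrameSad, int32_t* pSad8x8) {
      *pFrameSad = 0;
      for (int32_t my = 0; my < iPicHeight; my += 16)
        for (int32_t mx = 0; mx < iPicWidth; mx += 16)
          for (int b = 0; b < 4; ++b) {
            const int32_t bx = mx + (b & 1) * 8, by = my + (b >> 1) * 8;
            int32_t iSad = 0;
            for (int y = 0; y < 8; ++y)
              for (int x = 0; x < 8; ++x) {
                const int d = pCur[(by + y) * iPicStride + bx + x]
                            - pRef[(by + y) * iPicStride + bx + x];
                iSad += d < 0 ? -d : d;
              }
            *pSad8x8++ = iSad;
            *pFrameSad += iSad;
          }
    }
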
vmov.s64 q15, #0 - + //Get the jump distance to use on loop codes lsl r10, r4, #4 sub r9, r10, #16 //R9 keep the 16*pic_stride-16 sub r10, r2 //R10 keep the 16*pic_stride-pic_width - + vaa_calc_sad_bgd_loop0: //R6 keep the pic_width mov r8, r2 - -vaa_calc_sad_bgd_loop1: + +vaa_calc_sad_bgd_loop1: //Process the 16x16 bytes pmad psad psd SAD_SD_MAD_8x16BYTES r0, r1, r4, q13, q11, q9 SAD_SD_MAD_8x16BYTES r0, r1, r4, q14, q12, q10 - + SAD_SD_MAD_CALC d26, d27, d16, q11, q9 - SAD_SD_MAD_CALC d28, d29, d17, q12, q10 + SAD_SD_MAD_CALC d28, d29, d17, q12, q10 //Write to "psad8x8" buffer - vst4.32 {d22[0],d23[0],d24[0],d25[0]}, [r5]! + vst4.32 {d22[0],d23[0],d24[0],d25[0]}, [r5]! //Adjust the input address sub r0, r9 sub r1, r9 //Write to "psd8x8" buffer - vst4.32 {d18[0],d19[0],d20[0],d21[0]}, [r6]! + vst4.32 {d18[0],d19[0],d20[0],d21[0]}, [r6]! subs r8, #16 //Write to "pmad8x8" buffer - vst2.16 {d16[0],d17[0]}, [r7]! - //Save to calculate "psadframe" + vst2.16 {d16[0],d17[0]}, [r7]! + //Save to calculate "psadframe" vadd.u32 q11, q12 vadd.u32 q15, q11 - + bne vaa_calc_sad_bgd_loop1 - + //Adjust the input address add r0, r10 add r1, r10 - + subs r3, #16 - bne vaa_calc_sad_bgd_loop0 - + bne vaa_calc_sad_bgd_loop0 + ldr r8, [sp, #32] //load psadframe vadd.u32 d30, d31 - vst1.32 {d30[0]}, [r8] + vst1.32 {d30[0]}, [r8] ldmia sp!, {r4-r10} - + WELS_ASM_FUNC_END @@ -318,7 +318,7 @@ WELS_ASM_FUNC_END .macro SSD_MUL_SUM_16BYTES_RESET vmull.u8 $3, $0, $0 vpaddl.u16 $2, $3 - + vmull.u8 $3, $1, $1 vpadal.u16 $2, $3 .endm @@ -326,71 +326,71 @@ WELS_ASM_FUNC_END .macro SSD_MUL_SUM_16BYTES vmull.u8 $3, $0, $0 vpadal.u16 $2, $3 - + vmull.u8 $3, $1, $1 vpadal.u16 $2, $3 .endm .macro SAD_SSD_BGD_16 vld1.8 {q0}, [$0], $2 //load cur_row - + vpadal.u8 q3, q0 //add cur_row together vpadal.u8 q4, q1 //add ref_row together - + vabd.u8 q2, q0, q1 //abs_diff - + vmax.u8 q5, q2 //l_mad for 16 bytes reset for every 8x16 - + vpadal.u8 $3, q2 //l_sad for 16 bytes reset for every 8x16 - + SSD_MUL_SUM_16BYTES d4,d5, q8, q11 //q8 for l_sqiff reset for every 16x16 - + vld1.8 {q1}, [$1], $2 //load ref_row vpadal.u8 q9, q0 //q9 for l_sum reset for every 16x16 - + SSD_MUL_SUM_16BYTES d0,d1, q10, q11 //q10 for lsqsum reset for every 16x16 .endm //the last row of a 16x16 block .macro SAD_SSD_BGD_16_end vld1.8 {q0}, [$0], $1 //load cur_row - + vpadal.u8 q3, q0 //add cur_row together vpadal.u8 q4, q1 //add ref_row together - + vabd.u8 q2, q0, q1 //abs_diff - + vmax.u8 q5, q2 //l_mad for 16 bytes reset for every 8x16 - + vpadal.u8 $2, q2 //l_sad for 16 bytes reset for every 8x16 - + SSD_MUL_SUM_16BYTES d4,d5, q8, q11 //q8 for l_sqiff reset for every 16x16 - + vpadal.u8 q9, q0 //q9 for l_sum reset for every 16x16 - + SSD_MUL_SUM_16BYTES d0,d1, q10, q11 //q10 for lsqsum reset for every 16x16 .endm //for the begin of a 8x16 block, use some instructions to reset the register .macro SAD_SSD_BGD_16_RESET_8x8 vld1.8 {q0}, [$0], $2 //load cur_row - + vpaddl.u8 q3, q0 //add cur_row together vpaddl.u8 q4, q1 //add ref_row together - + vabd.u8 q2, q0, q1 //abs_diff - + vmov q5,q2 //calculate max and avoid reset to zero, l_mad for 16 bytes reset for every 8x16 - + vpaddl.u8 $3, q2 //l_sad for 16 bytes reset for every 8x16 - - + + SSD_MUL_SUM_16BYTES d4,d5, q8, q11 //q8 for l_sqiff reset for every 16x16 - + vld1.8 {q1}, [$1], $2 //load ref_row - + vpadal.u8 q9, q0 //q9 for l_sum reset for every 16x16 - + SSD_MUL_SUM_16BYTES d0,d1, q10, q11 //q10 for lsqsum reset for every 16x16 .endm @@ -398,18 +398,18 @@ WELS_ASM_FUNC_END .macro 
SAD_SSD_BGD_16_RESET_16x16 vld1.8 {q0}, [$0], $2 //load cur_row vld1.8 {q1}, [$1], $2 //load ref_row - + vpaddl.u8 q3, q0 //add cur_row together vpaddl.u8 q4, q1 //add ref_row together - + vabd.u8 q2, q0, q1 //abs_diff - + vmov q5,q2 //calculate max and avoid reset to zero, l_mad for 16 bytes reset for every 8x16 - + vpaddl.u8 $3, q2 //l_sad for 16 bytes reset for every 8x16 - + SSD_MUL_SUM_16BYTES_RESET d4,d5,q8, q11 //q8 for l_sqiff reset for every 16x16 - + vld1.8 {q1}, [$1], $2 //load ref_row vpaddl.u8 q9, q0 //q9 for l_sum reset for every 16x16 @@ -419,24 +419,24 @@ WELS_ASM_FUNC_END //for each 8x16 block .macro SAD_SSD_BGD_CALC_8x16 - + vpmax.u8 d10, d10, d11 //4 numbers vpmax.u8 d10, d10, d10 //2 numbers vpmax.u8 d10, d10, d10 //1 number1 - + vmov $0, d10 //d26 d27 keeps the l_mad - + //p_sd8x8 fix me - vpaddl.u16 q3, q3 + vpaddl.u16 q3, q3 vpaddl.u16 q4, q4 - + vsub.i32 $1, q3, q4 vpaddl.u32 $1, $1 - + //psad8x8 vpaddl.u16 $2, $2 vpaddl.u32 $2, $2 - + //psadframe vadd.i32 q12, $2 .endm @@ -451,9 +451,9 @@ WELS_ASM_FUNC_END SAD_SSD_BGD_16 $0, $1, $2, q6 SAD_SSD_BGD_16 $0, $1, $2, q6 SAD_SSD_BGD_16 $0, $1, $2, q6 - + SAD_SSD_BGD_CALC_8x16 d26, q14, q6 - + //for another 8x16 SAD_SSD_BGD_16_RESET_8x8 $0, $1, $2, q7 SAD_SSD_BGD_16 $0, $1, $2, q7 @@ -463,20 +463,20 @@ WELS_ASM_FUNC_END SAD_SSD_BGD_16 $0, $1, $2, q7 SAD_SSD_BGD_16 $0, $1, $2, q7 SAD_SSD_BGD_16_end $0, $2, q7 - + SAD_SSD_BGD_CALC_8x16 d27, q15, q7 .endm -.macro SSD_SAD_SD_MAD_PADDL +.macro SSD_SAD_SD_MAD_PADDL vpaddl.s16 $0, $0 - vpaddl.s32 $0, $0 - vadd.i32 $1, $1, $2 + vpaddl.s32 $0, $0 + vadd.i32 $1, $1, $2 .endm #else .macro SSD_MUL_SUM_16BYTES_RESET arg0, arg1, arg2, arg3 vmull.u8 \arg3, \arg0, \arg0 vpaddl.u16 \arg2, \arg3 - + vmull.u8 \arg3, \arg1, \arg1 vpadal.u16 \arg2, \arg3 .endm @@ -484,71 +484,71 @@ WELS_ASM_FUNC_END .macro SSD_MUL_SUM_16BYTES arg0, arg1, arg2, arg3 vmull.u8 \arg3, \arg0, \arg0 vpadal.u16 \arg2, \arg3 - + vmull.u8 \arg3, \arg1, \arg1 vpadal.u16 \arg2, \arg3 .endm .macro SAD_SSD_BGD_16 arg0, arg1, arg2, arg3 vld1.8 {q0}, [\arg0], \arg2 //load cur_row - + vpadal.u8 q3, q0 //add cur_row together vpadal.u8 q4, q1 //add ref_row together - + vabd.u8 q2, q0, q1 //abs_diff - + vmax.u8 q5, q2 //l_mad for 16 bytes reset for every 8x16 - + vpadal.u8 \arg3, q2 //l_sad for 16 bytes reset for every 8x16 - + SSD_MUL_SUM_16BYTES d4,d5, q8, q11 //q8 for l_sqiff reset for every 16x16 - + vld1.8 {q1}, [\arg1], \arg2 //load ref_row vpadal.u8 q9, q0 //q9 for l_sum reset for every 16x16 - + SSD_MUL_SUM_16BYTES d0,d1, q10, q11 //q10 for lsqsum reset for every 16x16 .endm //the last row of a 16x16 block .macro SAD_SSD_BGD_16_end arg0, arg1, arg2 vld1.8 {q0}, [\arg0], \arg1 //load cur_row - + vpadal.u8 q3, q0 //add cur_row together vpadal.u8 q4, q1 //add ref_row together - + vabd.u8 q2, q0, q1 //abs_diff - + vmax.u8 q5, q2 //l_mad for 16 bytes reset for every 8x16 - + vpadal.u8 \arg2, q2 //l_sad for 16 bytes reset for every 8x16 - + SSD_MUL_SUM_16BYTES d4,d5, q8, q11 //q8 for l_sqiff reset for every 16x16 - + vpadal.u8 q9, q0 //q9 for l_sum reset for every 16x16 - + SSD_MUL_SUM_16BYTES d0,d1, q10, q11 //q10 for lsqsum reset for every 16x16 .endm //for the begin of a 8x16 block, use some instructions to reset the register .macro SAD_SSD_BGD_16_RESET_8x8 arg0, arg1, arg2, arg3 vld1.8 {q0}, [\arg0], \arg2 //load cur_row - + vpaddl.u8 q3, q0 //add cur_row together vpaddl.u8 q4, q1 //add ref_row together - + vabd.u8 q2, q0, q1 //abs_diff - + vmov q5,q2 //calculate max and avoid reset to zero, l_mad for 16 bytes reset for 
every 8x16 - + vpaddl.u8 \arg3, q2 //l_sad for 16 bytes reset for every 8x16 - - + + SSD_MUL_SUM_16BYTES d4,d5, q8, q11 //q8 for l_sqiff reset for every 16x16 - + vld1.8 {q1}, [\arg1], \arg2 //load ref_row - + vpadal.u8 q9, q0 //q9 for l_sum reset for every 16x16 - + SSD_MUL_SUM_16BYTES d0,d1, q10, q11 //q10 for lsqsum reset for every 16x16 .endm @@ -556,18 +556,18 @@ WELS_ASM_FUNC_END .macro SAD_SSD_BGD_16_RESET_16x16 arg0, arg1, arg2, arg3 vld1.8 {q0}, [\arg0], \arg2 //load cur_row vld1.8 {q1}, [\arg1], \arg2 //load ref_row - + vpaddl.u8 q3, q0 //add cur_row together vpaddl.u8 q4, q1 //add ref_row together - + vabd.u8 q2, q0, q1 //abs_diff - + vmov q5,q2 //calculate max and avoid reset to zero, l_mad for 16 bytes reset for every 8x16 - + vpaddl.u8 \arg3, q2 //l_sad for 16 bytes reset for every 8x16 - + SSD_MUL_SUM_16BYTES_RESET d4,d5,q8, q11 //q8 for l_sqiff reset for every 16x16 - + vld1.8 {q1}, [\arg1], \arg2 //load ref_row vpaddl.u8 q9, q0 //q9 for l_sum reset for every 16x16 @@ -577,24 +577,24 @@ WELS_ASM_FUNC_END //for each 8x16 block .macro SAD_SSD_BGD_CALC_8x16 arg0, arg1, arg2 - + vpmax.u8 d10, d10, d11 //4 numbers vpmax.u8 d10, d10, d10 //2 numbers vpmax.u8 d10, d10, d10 //1 number1 - + vmov \arg0, d10 //d26 d27 keeps the l_mad - + //p_sd8x8 - vpaddl.u16 q3, q3 + vpaddl.u16 q3, q3 vpaddl.u16 q4, q4 - + vsub.i32 \arg1, q3, q4 vpaddl.u32 \arg1, \arg1 - + //psad8x8 vpaddl.u16 \arg2, \arg2 vpaddl.u32 \arg2, \arg2 - + //psadframe vadd.i32 q12, \arg2 .endm @@ -609,9 +609,9 @@ WELS_ASM_FUNC_END SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q6 SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q6 SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q6 - + SAD_SSD_BGD_CALC_8x16 d26, q14, q6 - + //for another 8x16 SAD_SSD_BGD_16_RESET_8x8 \arg0, \arg1, \arg2, q7 SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q7 @@ -621,160 +621,160 @@ WELS_ASM_FUNC_END SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q7 SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q7 SAD_SSD_BGD_16_end \arg0, \arg2, q7 - + SAD_SSD_BGD_CALC_8x16 d27, q15, q7 .endm -.macro SSD_SAD_SD_MAD_PADDL arg0, arg1, arg2 +.macro SSD_SAD_SD_MAD_PADDL arg0, arg1, arg2 vpaddl.s16 \arg0, \arg0 - vpaddl.s32 \arg0, \arg0 - vadd.i32 \arg1, \arg1, \arg2 + vpaddl.s32 \arg0, \arg0 + vadd.i32 \arg1, \arg1, \arg2 .endm #endif -WELS_ASM_FUNC_BEGIN vaa_calc_sad_ssd_bgd_neon +WELS_ASM_FUNC_BEGIN VAACalcSadSsdBgd_neon stmdb sp!, {r0-r12, r14} - + ldr r4, [sp, #56] //r4 keeps the pic_stride - + sub r5, r4, #1 lsl r5, r5, #4 //r5 keeps the little step - + lsl r6, r4, #4 sub r6, r2, r6 //r6 keeps the big step - - + + ldr r8, [sp, #64]//psad8x8 ldr r9, [sp, #68]//psum16x16 ldr r10, [sp, #72]//psqsum16x16 ldr r11, [sp, #76]//psqdiff16x16 ldr r12, [sp, #80]//p_sd8x8 ldr r14, [sp, #84]//p_mad8x8 - + vmov.i8 q12, #0 - + vaa_calc_sad_ssd_bgd_height_loop: mov r7, r2 vaa_calc_sad_ssd_bgd_width_loop: - + //l_sd q14&q15, l_mad q13, l_sad q6 & q7, l_sqdiff q8, l_sum q9, l_sqsum q10 SAD_SSD_BGD_16x16 r0,r1,r4 - + //psad8x8 vst4.32 {d12[0], d13[0], d14[0], d15[0]}, [r8]! - + sub r0, r0, r5 //jump to next 16x16 sub r1, r1, r5 //jump to next 16x16 - + //p_sd8x8 vst4.32 {d28[0], d29[0],d30[0], d31[0]}, [r12]! //p_mad8x8 vst2.16 {d26[0], d27[0]}, [r14]! - + //psqdiff16x16 - vpaddl.s32 q8, q8 + vpaddl.s32 q8, q8 vadd.i32 d16, d16, d17 - + vst1.32 {d16[0]}, [r11]! //psqdiff16x16 - + //psum16x16 SSD_SAD_SD_MAD_PADDL q9, d18, d19 vst1.32 {d18[0]}, [r9]! //psum16x16 //psqsum16x16 - vpaddl.s32 q10, q10 - vadd.i32 d20, d20, d21 + vpaddl.s32 q10, q10 + vadd.i32 d20, d20, d21 vst1.32 {d20[0]}, [r10]! 
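Note: per 8x8 block, the BGD variants track three quantities: the SAD, the signed sum difference sd = Σcur − Σref (the vsub in SAD_SD_MAD_CALC), and the maximum absolute difference (the vmax chain). A scalar sketch of one block, with pointer roles assumed from the cur/ref comments:

    #include <cstdint>
    // One 8x8 block of the BGD statistics: sad, sd = sum(cur) - sum(ref),
    // mad = max |cur - ref|. Buffer layouts follow the vst4/vst2 stores.
    static void Bgd8x8StatsSketch (const uint8_t* pCur, const uint8_t* pRef,
                                   int32_t iStride, int32_t* pSad, int32_t* pSd,
                                   uint8_t* pMad) {
      int32_t iSad = 0, iSumCur = 0, iSumRef = 0;
      uint8_t uiMad = 0;
      for (int y = 0; y < 8; ++y)
        for (int x = 0; x < 8; ++x) {
          const uint8_t c = pCur[y * iStride + x], r = pRef[y * iStride + x];
          const uint8_t ad = (uint8_t) (c > r ? c - r : r - c);
          iSad += ad;
          if (ad > uiMad) uiMad = ad;
          iSumCur += c;
          iSumRef += r;
        }
      *pSad = iSad;
      *pSd  = iSumCur - iSumRef;
      *pMad = uiMad;
    }
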
//psqsum16x16 - + subs r7, #16 - + bne vaa_calc_sad_ssd_bgd_width_loop - + sub r0, r0, r6 //jump to next 16 x width sub r1, r1, r6 //jump to next 16 x width - + subs r3, #16 bne vaa_calc_sad_ssd_bgd_height_loop - + //psadframe ldr r7, [sp, #60]//psadframe - + vadd.i32 d24, d24, d25 vst1.32 {d24[0]}, [r7] - + ldmia sp!, {r0-r12, r14} - + WELS_ASM_FUNC_END #ifdef APPLE_IOS .macro SAD_VAR_16 vld1.8 {q0}, [$0], $2 //load cur_row - + vpadal.u8 q3, q0 //add cur_row together vpadal.u8 q4, q1 //add ref_row together - + vabd.u8 q2, q0, q1 //abs_diff - + vpadal.u8 $3, q2 //l_sad for 16 bytes reset for every 8x16 - + vld1.8 {q1}, [$1], $2 - + vpadal.u8 q9, q0 //q9 for l_sum reset for every 16x16 - + SSD_MUL_SUM_16BYTES d0,d1, q10, q11 //q10 for lsqsum reset for every 16x16 .endm .macro SAD_VAR_16_END vld1.8 {q0}, [$0], $1 //load cur_row - + vpadal.u8 q3, q0 //add cur_row together vpadal.u8 q4, q1 //add ref_row together - + vabd.u8 q2, q0, q1 //abs_diff - + vpadal.u8 $2, q2 //l_sad for 16 bytes reset for every 8x16 - + vpadal.u8 q9, q0 //q9 for l_sum reset for every 16x16 - + SSD_MUL_SUM_16BYTES d0,d1, q10, q11 //q10 for lsqsum reset for every 16x16 .endm .macro SAD_VAR_16_RESET_16x16 vld1.8 {q0}, [$0], $2 //load cur_row vld1.8 {q1}, [$1], $2 - + vpaddl.u8 q3, q0 //add cur_row together vpaddl.u8 q4, q1 //add ref_row together - + vabd.u8 q2, q0, q1 //abs_diff - + vpaddl.u8 $3, q2 //l_sad for 16 bytes reset for every 8x16 - + vld1.8 {q1}, [$1], $2 - + vpaddl.u8 q9, q0 //q9 for l_sum reset for every 16x16 - + SSD_MUL_SUM_16BYTES_RESET d0,d1, q10, q11 .endm .macro SAD_VAR_16_RESET_8x8 vld1.8 {q0}, [$0], $2 //load cur_row - + vpaddl.u8 q3, q0 //add cur_row together vpaddl.u8 q4, q1 //add ref_row together - + vabd.u8 q2, q0, q1 //abs_diff - + vpaddl.u8 $3, q2 //l_sad for 16 bytes reset for every 8x16 - + vld1.8 {q1}, [$1], $2 - + vpadal.u8 q9, q0 //q9 for l_sum reset for every 16x16 - + SSD_MUL_SUM_16BYTES d0,d1, q10, q11 //q10 for lsqsum reset for every 16x16 .endm @@ -788,7 +788,7 @@ WELS_ASM_FUNC_END SAD_VAR_16 $0, $1, $2, q6 SAD_VAR_16 $0, $1, $2, q6 SAD_VAR_16 $0, $1, $2, q6 - + vpaddl.u16 q6, q6 vpaddl.u32 q6, q6 vadd.i32 q12, q6 @@ -802,42 +802,42 @@ WELS_ASM_FUNC_END SAD_VAR_16 $0, $1, $2, q7 SAD_VAR_16 $0, $1, $2, q7 SAD_VAR_16_END $0, $2, q7 - + vpaddl.u16 q7, q7 vpaddl.u32 q7, q7 - + vadd.i32 q12, q7 .endm #else .macro SAD_VAR_16 arg0, arg1, arg2, arg3 vld1.8 {q0}, [\arg0], \arg2 //load cur_row - + vpadal.u8 q3, q0 //add cur_row together vpadal.u8 q4, q1 //add ref_row together - + vabd.u8 q2, q0, q1 //abs_diff - + vpadal.u8 \arg3, q2 //l_sad for 16 bytes reset for every 8x16 - + vld1.8 {q1}, [\arg1], \arg2 - + vpadal.u8 q9, q0 //q9 for l_sum reset for every 16x16 - + SSD_MUL_SUM_16BYTES d0,d1, q10, q11 //q10 for lsqsum reset for every 16x16 .endm .macro SAD_VAR_16_END arg0, arg1, arg2 vld1.8 {q0}, [\arg0], \arg1 //load cur_row - + vpadal.u8 q3, q0 //add cur_row together vpadal.u8 q4, q1 //add ref_row together - + vabd.u8 q2, q0, q1 //abs_diff - + vpadal.u8 \arg2, q2 //l_sad for 16 bytes reset for every 8x16 - + vpadal.u8 q9, q0 //q9 for l_sum reset for every 16x16 - + SSD_MUL_SUM_16BYTES d0,d1, q10, q11 //q10 for lsqsum reset for every 16x16 .endm @@ -845,35 +845,35 @@ WELS_ASM_FUNC_END .macro SAD_VAR_16_RESET_16x16 arg0, arg1, arg2, arg3 vld1.8 {q0}, [\arg0], \arg2 //load cur_row vld1.8 {q1}, [\arg1], \arg2 - + vpaddl.u8 q3, q0 //add cur_row together vpaddl.u8 q4, q1 //add ref_row together - + vabd.u8 q2, q0, q1 //abs_diff - + vpaddl.u8 \arg3, q2 //l_sad for 16 bytes reset for every 8x16 - + 
vld1.8 {q1}, [\arg1], \arg2 - + vpaddl.u8 q9, q0 //q9 for l_sum reset for every 16x16 - + SSD_MUL_SUM_16BYTES_RESET d0,d1, q10, q11 .endm .macro SAD_VAR_16_RESET_8x8 arg0, arg1, arg2, arg3 vld1.8 {q0}, [\arg0], \arg2 //load cur_row - + vpaddl.u8 q3, q0 //add cur_row together vpaddl.u8 q4, q1 //add ref_row together - + vabd.u8 q2, q0, q1 //abs_diff - + vpaddl.u8 \arg3, q2 //l_sad for 16 bytes reset for every 8x16 - + vld1.8 {q1}, [\arg1], \arg2 - + vpadal.u8 q9, q0 //q9 for l_sum reset for every 16x16 - + SSD_MUL_SUM_16BYTES d0,d1, q10, q11 //q10 for lsqsum reset for every 16x16 .endm @@ -887,7 +887,7 @@ WELS_ASM_FUNC_END SAD_VAR_16 \arg0, \arg1, \arg2, q6 SAD_VAR_16 \arg0, \arg1, \arg2, q6 SAD_VAR_16 \arg0, \arg1, \arg2, q6 - + vpaddl.u16 q6, q6 vpaddl.u32 q6, q6 vadd.i32 q12, q6 @@ -901,26 +901,26 @@ WELS_ASM_FUNC_END SAD_VAR_16 \arg0, \arg1, \arg2, q7 SAD_VAR_16 \arg0, \arg1, \arg2, q7 SAD_VAR_16_END \arg0, \arg2, q7 - + vpaddl.u16 q7, q7 vpaddl.u32 q7, q7 - + vadd.i32 q12, q7 .endm #endif -WELS_ASM_FUNC_BEGIN vaa_calc_sad_var_neon +WELS_ASM_FUNC_BEGIN VAACalcSadVar_neon stmdb sp!, {r4-r11} - + ldr r4, [sp, #32] //r4 keeps the pic_stride - + sub r5, r4, #1 lsl r5, r5, #4 //r5 keeps the little step - + lsl r6, r4, #4 sub r6, r2, r6 //r6 keeps the big step - + ldr r7, [sp, #36] //psadframe ldr r8, [sp, #40] //psad8x8 ldr r9, [sp, #44] //psum16x16 @@ -936,25 +936,25 @@ vaa_calc_sad_var_width_loop: SAD_VAR_16x16 r0,r1,r4 //psad8x8 vst4.32 {d12[0], d13[0], d14[0], d15[0]}, [r8]! - + sub r0, r0, r5 //jump to next 16x16 sub r1, r1, r5 //jump to next 16x16 - + //psum16x16 SSD_SAD_SD_MAD_PADDL q9, d18, d19 vst1.32 {d18[0]}, [r9]! //psum16x16 - + //psqsum16x16 - vpaddl.s32 q10, q10 + vpaddl.s32 q10, q10 subs r11, #16 - vadd.i32 d20, d20, d21 + vadd.i32 d20, d20, d21 vst1.32 {d20[0]}, [r10]! 
//psqsum16x16 - + bne vaa_calc_sad_var_width_loop - + sub r0, r0, r6 //jump to next 16 x width sub r1, r1, r6 //jump to next 16 x width - + subs r3, #16 bne vaa_calc_sad_var_height_loop @@ -968,25 +968,25 @@ WELS_ASM_FUNC_END #ifdef APPLE_IOS .macro SAD_SSD_16 SAD_VAR_16 $0, $1, $2, $3 - + SSD_MUL_SUM_16BYTES d4,d5,q8, q11 .endm .macro SAD_SSD_16_END SAD_VAR_16_END $0, $1, $2, $3 - + SSD_MUL_SUM_16BYTES d4,d5,q8, q11 //q8 for l_sqiff reset for every 16x16 .endm .macro SAD_SSD_16_RESET_16x16 SAD_VAR_16_RESET_16x16 $0, $1, $2, $3 - + SSD_MUL_SUM_16BYTES_RESET d4,d5,q8, q11 //q8 for l_sqiff reset for every 16x16 .endm .macro SAD_SSD_16_RESET_8x8 SAD_VAR_16_RESET_8x8 $0, $1, $2, $3 - + SSD_MUL_SUM_16BYTES d4,d5,q8, q11 //q8 for l_sqiff reset for every 16x16 .endm @@ -1000,7 +1000,7 @@ WELS_ASM_FUNC_END SAD_SSD_16 $0, $1, $2, q6 SAD_SSD_16 $0, $1, $2, q6 SAD_SSD_16 $0, $1, $2, q6 - + vpaddl.u16 q6, q6 vpaddl.u32 q6, q6 vadd.i32 q12, q6 @@ -1014,34 +1014,34 @@ WELS_ASM_FUNC_END SAD_SSD_16 $0, $1, $2, q7 SAD_SSD_16 $0, $1, $2, q7 SAD_SSD_16_END $0, $2, q7 - + vpaddl.u16 q7, q7 vpaddl.u32 q7, q7 - + vadd.i32 q12, q7 .endm #else .macro SAD_SSD_16 arg0, arg1, arg2, arg3 SAD_VAR_16 \arg0, \arg1, \arg2, \arg3 - + SSD_MUL_SUM_16BYTES d4,d5,q8, q11 .endm .macro SAD_SSD_16_END arg0, arg1, arg2, arg3 SAD_VAR_16_END \arg0, \arg1, \arg2, \arg3 - + SSD_MUL_SUM_16BYTES d4,d5,q8, q11 //q8 for l_sqiff reset for every 16x16 .endm .macro SAD_SSD_16_RESET_16x16 arg0, arg1, arg2, arg3 SAD_VAR_16_RESET_16x16 \arg0, \arg1, \arg2, \arg3 - + SSD_MUL_SUM_16BYTES_RESET d4,d5,q8, q11 //q8 for l_sqiff reset for every 16x16 .endm .macro SAD_SSD_16_RESET_8x8 arg0, arg1, arg2, arg3 SAD_VAR_16_RESET_8x8 \arg0, \arg1, \arg2, \arg3 - + SSD_MUL_SUM_16BYTES d4,d5,q8, q11 //q8 for l_sqiff reset for every 16x16 .endm @@ -1055,7 +1055,7 @@ WELS_ASM_FUNC_END SAD_SSD_16 \arg0, \arg1, \arg2, q6 SAD_SSD_16 \arg0, \arg1, \arg2, q6 SAD_SSD_16 \arg0, \arg1, \arg2, q6 - + vpaddl.u16 q6, q6 vpaddl.u32 q6, q6 vadd.i32 q12, q6 @@ -1069,26 +1069,26 @@ WELS_ASM_FUNC_END SAD_SSD_16 \arg0, \arg1, \arg2, q7 SAD_SSD_16 \arg0, \arg1, \arg2, q7 SAD_SSD_16_END \arg0, \arg2, q7 - + vpaddl.u16 q7, q7 vpaddl.u32 q7, q7 - + vadd.i32 q12, q7 .endm #endif -WELS_ASM_FUNC_BEGIN vaa_calc_sad_ssd_neon +WELS_ASM_FUNC_BEGIN VAACalcSadSsd_neon stmdb sp!, {r4-r12} ldr r4, [sp, #36] //r4 keeps the pic_stride - + sub r5, r4, #1 lsl r5, r5, #4 //r5 keeps the little step - + lsl r6, r4, #4 sub r6, r2, r6 //r6 keeps the big step - + ldr r7, [sp, #40] //psadframe ldr r8, [sp, #44] //psad8x8 ldr r9, [sp, #48] //psum16x16 @@ -1105,32 +1105,32 @@ vaa_calc_sad_ssd_width_loop: SAD_SSD_16x16 r0,r1,r4 //psad8x8 vst4.32 {d12[0], d13[0], d14[0], d15[0]}, [r8]! - + sub r0, r0, r5 //jump to next 16x16 sub r1, r1, r5 //jump to next 16x16 - + //psum16x16 vpaddl.s16 q9, q9 - vpaddl.s32 q9, q9 + vpaddl.s32 q9, q9 vadd.i32 d18, d18, d19 vst1.32 {d18[0]}, [r9]! //psum16x16 //psqsum16x16 - vpaddl.s32 q10, q10 - vadd.i32 d20, d20, d21 + vpaddl.s32 q10, q10 + vadd.i32 d20, d20, d21 vst1.32 {d20[0]}, [r10]! //psqsum16x16 - + //psqdiff16x16 - vpaddl.s32 q8, q8 + vpaddl.s32 q8, q8 vadd.i32 d16, d16, d17 subs r12, #16 vst1.32 {d16[0]}, [r11]! 
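Note: on top of the 8x8 SADs, the Ssd variants reduce three per-16x16 quantities — psum16x16 (Σcur, accumulated in q9), psqsum16x16 (Σcur², q10) and, where the function emits it, psqdiff16x16 (Σ(cur−ref)², q8). Scalar sketch of one macroblock:

    #include <cstdint>
    // Per 16x16 MB: pixel sum, squared-pixel sum, squared-difference sum.
    static void Ssd16x16StatsSketch (const uint8_t* pCur, const uint8_t* pRef,
                                     int32_t iStride, int32_t* pSum,
                                     int32_t* pSqSum, int32_t* pSqDiff) {
      int32_t iSum = 0, iSqSum = 0, iSqDiff = 0;
      for (int y = 0; y < 16; ++y)
        for (int x = 0; x < 16; ++x) {
          const int c = pCur[y * iStride + x];
          const int d = c - pRef[y * iStride + x];
          iSum    += c;
          iSqSum  += c * c;
          iSqDiff += d * d;
        }
      *pSum    = iSum;
      *pSqSum  = iSqSum;
      *pSqDiff = iSqDiff;
    }
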
//psqdiff16x16 - + bne vaa_calc_sad_ssd_width_loop - + sub r0, r0, r6 //jump to next 16 x width sub r1, r1, r6 //jump to next 16 x width - + subs r3, #16 bne vaa_calc_sad_ssd_height_loop @@ -1140,4 +1140,4 @@ vaa_calc_sad_ssd_width_loop: ldmia sp!, {r4-r12} WELS_ASM_FUNC_END -#endif \ No newline at end of file +#endif diff --git a/codec/processing/src/downsample/downsample.cpp b/codec/processing/src/downsample/downsample.cpp index b92d3057..f58dafec 100644 --- a/codec/processing/src/downsample/downsample.cpp +++ b/codec/processing/src/downsample/downsample.cpp @@ -75,6 +75,16 @@ void CDownsampling::InitDownsampleFuncs (SDownsampleFuncs& sDownsampleFunc, int } #endif//X86_ASM +#if defined(HAVE_NEON) + if (iCpuFlag & WELS_CPU_NEON) { + sDownsampleFunc.pfHalfAverage[0] = DyadicBilinearDownsamplerWidthx32_neon; + sDownsampleFunc.pfHalfAverage[1] = DyadicBilinearDownsampler_neon; + sDownsampleFunc.pfHalfAverage[2] = DyadicBilinearDownsampler_neon; + sDownsampleFunc.pfHalfAverage[3] = DyadicBilinearDownsampler_neon; + sDownsampleFunc.pfGeneralRatioChroma = GeneralBilinearAccurateDownsamplerWrap_neon; + sDownsampleFunc.pfGeneralRatioLuma = GeneralBilinearAccurateDownsamplerWrap_neon; + } +#endif } EResult CDownsampling::Process (int32_t iType, SPixMap* pSrcPixMap, SPixMap* pDstPixMap) { diff --git a/codec/processing/src/downsample/downsample.h b/codec/processing/src/downsample/downsample.h index 3570b605..09e9bf50 100644 --- a/codec/processing/src/downsample/downsample.h +++ b/codec/processing/src/downsample/downsample.h @@ -103,7 +103,20 @@ void GeneralBilinearAccurateDownsampler_sse2 (uint8_t* pDst, const int32_t kiDst WELSVP_EXTERN_C_END #endif +#ifdef HAVE_NEON +WELSVP_EXTERN_C_BEGIN +// iSrcWidth no limitation +HalveDownsampleFunc DyadicBilinearDownsampler_neon; +// iSrcWidth = x32 pixels +HalveDownsampleFunc DyadicBilinearDownsamplerWidthx32_neon; +GeneralDownsampleFunc GeneralBilinearAccurateDownsamplerWrap_neon; + +void GeneralBilinearAccurateDownsampler_neon( uint8_t* pDst, const int32_t kiDstStride, const int32_t kiDstWidth, const int32_t kiDstHeight, + uint8_t* pSrc, const int32_t kiSrcStride, const uint32_t kuiScaleX, const uint32_t kuiScaleY); + +WELSVP_EXTERN_C_END +#endif class CDownsampling : public IStrategy { diff --git a/codec/processing/src/downsample/downsamplefuncs.cpp b/codec/processing/src/downsample/downsamplefuncs.cpp index 252e9a47..f45848c3 100644 --- a/codec/processing/src/downsample/downsamplefuncs.cpp +++ b/codec/processing/src/downsample/downsamplefuncs.cpp @@ -229,4 +229,14 @@ void GeneralBilinearAccurateDownsampler_c (uint8_t* pDst, const int32_t kiDstStr //} #endif //X86_ASM +#ifdef HAVE_NEON +void GeneralBilinearAccurateDownsamplerWrap_neon(uint8_t* pDst, const int32_t kiDstStride, const int32_t kiDstWidth, const int32_t kiDstHeight, + uint8_t* pSrc, const int32_t kiSrcStride, const int32_t kiSrcWidth, const int32_t kiSrcHeight) { + const int32_t kiScaleBit = 15; + const uint32_t kuiScale = (1 << kiScaleBit); + uint32_t uiScalex = (uint32_t)((float)kiSrcWidth / (float)kiDstWidth * kuiScale); + uint32_t uiScaley = (uint32_t)((float)kiSrcHeight / (float)kiDstHeight * kuiScale); + GeneralBilinearAccurateDownsampler_neon(pDst, kiDstStride, kiDstWidth, kiDstHeight, pSrc, kiSrcStride, uiScalex, uiScaley); +} +#endif WELSVP_NAMESPACE_END diff --git a/codec/processing/src/scenechangedetection/SceneChangeDetection.cpp b/codec/processing/src/scenechangedetection/SceneChangeDetection.cpp index 8182a10f..442cf4cc 100644 --- 
a/codec/processing/src/scenechangedetection/SceneChangeDetection.cpp +++ b/codec/processing/src/scenechangedetection/SceneChangeDetection.cpp @@ -130,6 +130,12 @@ void CSceneChangeDetection::InitSadFuncs (SadFuncPtr& pfSad, int32_t iCpuFlag) pfSad = WelsSampleSad8x8_sse21; } #endif + +#ifdef HAVE_NEON + if (iCpuFlag & WELS_CPU_NEON) { + pfSad = WelsSampleSad8x8_neon; + } +#endif } diff --git a/codec/processing/src/scenechangedetection/SceneChangeDetectionCommon.h b/codec/processing/src/scenechangedetection/SceneChangeDetectionCommon.h index ad875146..7cf47dbd 100644 --- a/codec/processing/src/scenechangedetection/SceneChangeDetectionCommon.h +++ b/codec/processing/src/scenechangedetection/SceneChangeDetectionCommon.h @@ -60,6 +60,12 @@ SadFunc WelsSampleSad8x8_sse21; WELSVP_EXTERN_C_END #endif +#ifdef HAVE_NEON +WELSVP_EXTERN_C_BEGIN +SadFunc WelsSampleSad8x8_neon; +WELSVP_EXTERN_C_END +#endif + WELSVP_NAMESPACE_END #endif diff --git a/codec/processing/src/vaacalc/vaacalculation.cpp b/codec/processing/src/vaacalc/vaacalculation.cpp index cbf7a5ff..8f6b84fe 100644 --- a/codec/processing/src/vaacalc/vaacalculation.cpp +++ b/codec/processing/src/vaacalc/vaacalculation.cpp @@ -65,6 +65,15 @@ void CVAACalculation::InitVaaFuncs (SVaaFuncs& sVaaFuncs, int32_t iCpuFlag) { sVaaFuncs.pfVAACalcSadVar = VAACalcSadVar_sse2; } #endif//X86_ASM +#ifdef HAVE_NEON + if ((iCpuFlag & WELS_CPU_NEON) == WELS_CPU_NEON) { + sVaaFuncs.pfVAACalcSad = VAACalcSad_neon; + sVaaFuncs.pfVAACalcSadBgd = VAACalcSadBgd_neon; + sVaaFuncs.pfVAACalcSadSsd = VAACalcSadSsd_neon; + sVaaFuncs.pfVAACalcSadSsdBgd = VAACalcSadSsdBgd_neon; + sVaaFuncs.pfVAACalcSadVar = VAACalcSadVar_neon; + } +#endif//X86_ASM } EResult CVAACalculation::Process (int32_t iType, SPixMap* pSrcPixMap, SPixMap* pRefPixMap) { diff --git a/codec/processing/src/vaacalc/vaacalculation.h b/codec/processing/src/vaacalc/vaacalculation.h index ec0ee74e..449dbcf0 100644 --- a/codec/processing/src/vaacalc/vaacalculation.h +++ b/codec/processing/src/vaacalc/vaacalculation.h @@ -103,6 +103,16 @@ VAACalcSadSsdFunc VAACalcSadSsd_sse2; WELSVP_EXTERN_C_END #endif +#ifdef HAVE_NEON +WELSVP_EXTERN_C_BEGIN +VAACalcSadBgdFunc VAACalcSadBgd_neon; +VAACalcSadSsdBgdFunc VAACalcSadSsdBgd_neon; +VAACalcSadFunc VAACalcSad_neon; +VAACalcSadVarFunc VAACalcSadVar_neon; +VAACalcSadSsdFunc VAACalcSadSsd_neon; +WELSVP_EXTERN_C_END +#endif + class CVAACalculation : public IStrategy { public: CVAACalculation (int32_t iCpuFlag);
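Note: all of the .cpp hunks in this patch follow the same dispatch pattern: the portable _c routine is installed as the default and the NEON routine overrides it when WELS_CPU_NEON is reported at runtime. A condensed sketch of the pattern, relying on the declarations from vaacalculation.h above; the _c default shown here is assumed to be assigned earlier in the real InitVaaFuncs:

    // Runtime CPU dispatch as used by InitVaaFuncs/InitSadFuncs/InitDownsampleFuncs.
    static void InitVaaFuncsSketch (SVaaFuncs& sVaaFuncs, int32_t iCpuFlag) {
      sVaaFuncs.pfVAACalcSad = VAACalcSad_c;   // portable default (assumed name)
    #ifdef HAVE_NEON
      if (iCpuFlag & WELS_CPU_NEON)
        sVaaFuncs.pfVAACalcSad = VAACalcSad_neon;
    #endif
    }
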