Merge "Optimize sad_64width_x4d_msa function"
This commit is contained in:
commit
8ff5af773a
@ -909,27 +909,42 @@
|
|||||||
sum_m; \
|
sum_m; \
|
||||||
})
|
})
|
||||||
|
|
||||||
/* Description : Horizontal addition of 8 unsigned halfword elements
|
/* Description : Horizontal addition of 4 unsigned word elements
|
||||||
Arguments : Inputs - in (unsigned halfword vector)
|
Arguments : Input - in (unsigned word vector)
|
||||||
Outputs - sum_m (u32 sum)
|
Output - sum_m (u32 sum)
|
||||||
Return Type - unsigned word
|
Return Type - unsigned word (GP)
|
||||||
Details : 8 unsigned halfword elements of input vector are added
|
Details : 4 unsigned word elements of 'in' vector are added together and
|
||||||
together and the resulting integer sum is returned
|
the resulting integer sum is returned
|
||||||
*/
|
*/
|
||||||
#define HADD_UH_U32(in) \
|
#define HADD_UW_U32(in) \
|
||||||
({ \
|
({ \
|
||||||
v4u32 res_m; \
|
|
||||||
v2u64 res0_m, res1_m; \
|
v2u64 res0_m, res1_m; \
|
||||||
uint32_t sum_m; \
|
uint32_t sum_m; \
|
||||||
\
|
\
|
||||||
res_m = __msa_hadd_u_w((v8u16)in, (v8u16)in); \
|
res0_m = __msa_hadd_u_d((v4u32)in, (v4u32)in); \
|
||||||
res0_m = __msa_hadd_u_d(res_m, res_m); \
|
|
||||||
res1_m = (v2u64)__msa_splati_d((v2i64)res0_m, 1); \
|
res1_m = (v2u64)__msa_splati_d((v2i64)res0_m, 1); \
|
||||||
res0_m = res0_m + res1_m; \
|
res0_m += res1_m; \
|
||||||
sum_m = __msa_copy_u_w((v4i32)res0_m, 0); \
|
sum_m = __msa_copy_u_w((v4i32)res0_m, 0); \
|
||||||
sum_m; \
|
sum_m; \
|
||||||
})
|
})
|
||||||
|
|
||||||
|
/* Description : Horizontal addition of 8 unsigned halfword elements
|
||||||
|
Arguments : Input - in (unsigned halfword vector)
|
||||||
|
Output - sum_m (u32 sum)
|
||||||
|
Return Type - unsigned word
|
||||||
|
Details : 8 unsigned halfword elements of 'in' vector are added
|
||||||
|
together and the resulting integer sum is returned
|
||||||
|
*/
|
||||||
|
#define HADD_UH_U32(in) \
|
||||||
|
({ \
|
||||||
|
v4u32 res_m; \
|
||||||
|
uint32_t sum_m; \
|
||||||
|
\
|
||||||
|
res_m = __msa_hadd_u_w((v8u16)in, (v8u16)in); \
|
||||||
|
sum_m = HADD_UW_U32(res_m); \
|
||||||
|
sum_m; \
|
||||||
|
})
|
||||||
|
|
||||||
/* Description : Horizontal addition of unsigned byte vector elements
|
/* Description : Horizontal addition of unsigned byte vector elements
|
||||||
Arguments : Inputs - in0, in1
|
Arguments : Inputs - in0, in1
|
||||||
Outputs - out0, out1
|
Outputs - out0, out1
|
||||||
|
@ -1030,6 +1030,7 @@ static void sad_64width_x4d_msa(const uint8_t *src, int32_t src_stride,
|
|||||||
v8u16 sad2_1 = { 0 };
|
v8u16 sad2_1 = { 0 };
|
||||||
v8u16 sad3_0 = { 0 };
|
v8u16 sad3_0 = { 0 };
|
||||||
v8u16 sad3_1 = { 0 };
|
v8u16 sad3_1 = { 0 };
|
||||||
|
v4u32 sad;
|
||||||
|
|
||||||
ref0_ptr = aref_ptr[0];
|
ref0_ptr = aref_ptr[0];
|
||||||
ref1_ptr = aref_ptr[1];
|
ref1_ptr = aref_ptr[1];
|
||||||
@ -1061,14 +1062,21 @@ static void sad_64width_x4d_msa(const uint8_t *src, int32_t src_stride,
|
|||||||
sad3_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
|
sad3_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
|
||||||
}
|
}
|
||||||
|
|
||||||
sad_array[0] = HADD_UH_U32(sad0_0);
|
sad = __msa_hadd_u_w(sad0_0, sad0_0);
|
||||||
sad_array[0] += HADD_UH_U32(sad0_1);
|
sad += __msa_hadd_u_w(sad0_1, sad0_1);
|
||||||
sad_array[1] = HADD_UH_U32(sad1_0);
|
sad_array[0] = HADD_UW_U32(sad);
|
||||||
sad_array[1] += HADD_UH_U32(sad1_1);
|
|
||||||
sad_array[2] = HADD_UH_U32(sad2_0);
|
sad = __msa_hadd_u_w(sad1_0, sad1_0);
|
||||||
sad_array[2] += HADD_UH_U32(sad2_1);
|
sad += __msa_hadd_u_w(sad1_1, sad1_1);
|
||||||
sad_array[3] = HADD_UH_U32(sad3_0);
|
sad_array[1] = HADD_UW_U32(sad);
|
||||||
sad_array[3] += HADD_UH_U32(sad3_1);
|
|
||||||
|
sad = __msa_hadd_u_w(sad2_0, sad2_0);
|
||||||
|
sad += __msa_hadd_u_w(sad2_1, sad2_1);
|
||||||
|
sad_array[2] = HADD_UW_U32(sad);
|
||||||
|
|
||||||
|
sad = __msa_hadd_u_w(sad3_0, sad3_0);
|
||||||
|
sad += __msa_hadd_u_w(sad3_1, sad3_1);
|
||||||
|
sad_array[3] = HADD_UW_U32(sad);
|
||||||
}
|
}
|
||||||
|
|
||||||
static uint32_t avgsad_4width_msa(const uint8_t *src_ptr, int32_t src_stride,
|
static uint32_t avgsad_4width_msa(const uint8_t *src_ptr, int32_t src_stride,
|
||||||
|
Loading…
x
Reference in New Issue
Block a user