From b7310e2affd82ce75eee4385fca36275d048f457 Mon Sep 17 00:00:00 2001
From: Kaustubh Raste
Date: Mon, 10 Oct 2016 16:15:06 +0530
Subject: [PATCH] Optimize sad_64width_x4d_msa function

Reduced HADD_UH_U32 macro calls

Change-Id: Ie089b9a443de516646b46e8f72156aa826ca8cfa
---
Notes (text between the "---" line above and the first diff header is
dropped by git am and is not part of the commit message):

 * sad_64width_x4d_msa() used to reduce each of its eight v8u16 accumulators
   to a scalar with HADD_UH_U32.  It now widens each accumulator to v4u32
   with __msa_hadd_u_w, adds the two vectors belonging to one reference, and
   reduces once per reference with the new HADD_UW_U32: four scalar
   reductions instead of eight.
 * HADD_UH_U32 is kept and is now a thin wrapper around HADD_UW_U32.
 * Review: HADD_UW_U32 expands to "(v4u32)in" with the argument
   unparenthesized, as the removed HADD_UH_U32 body already did with
   "(v8u16)in".  Pass a plain vector variable, not an expression.
 * This copy was re-flowed from text whose line breaks and column alignment
   had been lost.  Hunk line counts match the hunk headers, but intra-line
   whitespace (e.g. the continuation-backslash columns) is a reconstruction;
   if context does not match, apply with "git apply --ignore-whitespace".
 * The From: header carries no e-mail address in this copy, which git am
   may reject; git apply is unaffected.

 vpx_dsp/mips/macros_msa.h | 37 ++++++++++++++++++++++++++-----------
 vpx_dsp/mips/sad_msa.c    | 24 ++++++++++++++++--------
 2 files changed, 42 insertions(+), 19 deletions(-)

diff --git a/vpx_dsp/mips/macros_msa.h b/vpx_dsp/mips/macros_msa.h
index f498fbe9d..8538f7f2b 100644
--- a/vpx_dsp/mips/macros_msa.h
+++ b/vpx_dsp/mips/macros_msa.h
@@ -909,27 +909,42 @@
     sum_m;                                            \
   })
 
-/* Description : Horizontal addition of 8 unsigned halfword elements
-   Arguments   : Inputs  - in    (unsigned halfword vector)
-                 Outputs - sum_m (u32 sum)
-                 Return Type - unsigned word
-   Details     : 8 unsigned halfword elements of input vector are added
-                 together and the resulting integer sum is returned
+/* Description : Horizontal addition of 4 unsigned word elements
+   Arguments   : Input  - in       (unsigned word vector)
+                 Output - sum_m    (u32 sum)
+                 Return Type - unsigned word (GP)
+   Details     : 4 unsigned word elements of 'in' vector are added together and
+                 the resulting integer sum is returned
 */
-#define HADD_UH_U32(in)                               \
+#define HADD_UW_U32(in)                               \
   ({                                                  \
-    v4u32 res_m;                                      \
     v2u64 res0_m, res1_m;                             \
     uint32_t sum_m;                                   \
                                                       \
-    res_m = __msa_hadd_u_w((v8u16)in, (v8u16)in);     \
-    res0_m = __msa_hadd_u_d(res_m, res_m);            \
+    res0_m = __msa_hadd_u_d((v4u32)in, (v4u32)in);    \
     res1_m = (v2u64)__msa_splati_d((v2i64)res0_m, 1); \
-    res0_m = res0_m + res1_m;                         \
+    res0_m += res1_m;                                 \
     sum_m = __msa_copy_u_w((v4i32)res0_m, 0);         \
     sum_m;                                            \
   })
 
+/* Description : Horizontal addition of 8 unsigned halfword elements
+   Arguments   : Input  - in       (unsigned halfword vector)
+                 Output - sum_m    (u32 sum)
+                 Return Type - unsigned word
+   Details     : 8 unsigned halfword elements of 'in' vector are added
+                 together and the resulting integer sum is returned
+*/
+#define HADD_UH_U32(in)                           \
+  ({                                              \
+    v4u32 res_m;                                  \
+    uint32_t sum_m;                               \
+                                                  \
+    res_m = __msa_hadd_u_w((v8u16)in, (v8u16)in); \
+    sum_m = HADD_UW_U32(res_m);                   \
+    sum_m;                                        \
+  })
+
 /* Description : Horizontal addition of unsigned byte vector elements
    Arguments   : Inputs  - in0, in1
                  Outputs - out0, out1
diff --git a/vpx_dsp/mips/sad_msa.c b/vpx_dsp/mips/sad_msa.c
index 6455814e1..e295123ac 100644
--- a/vpx_dsp/mips/sad_msa.c
+++ b/vpx_dsp/mips/sad_msa.c
@@ -1030,6 +1030,7 @@ static void sad_64width_x4d_msa(const uint8_t *src, int32_t src_stride,
   v8u16 sad2_1 = { 0 };
   v8u16 sad3_0 = { 0 };
   v8u16 sad3_1 = { 0 };
+  v4u32 sad;
 
   ref0_ptr = aref_ptr[0];
   ref1_ptr = aref_ptr[1];
@@ -1061,14 +1062,21 @@ static void sad_64width_x4d_msa(const uint8_t *src, int32_t src_stride,
     sad3_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
   }
 
-  sad_array[0] = HADD_UH_U32(sad0_0);
-  sad_array[0] += HADD_UH_U32(sad0_1);
-  sad_array[1] = HADD_UH_U32(sad1_0);
-  sad_array[1] += HADD_UH_U32(sad1_1);
-  sad_array[2] = HADD_UH_U32(sad2_0);
-  sad_array[2] += HADD_UH_U32(sad2_1);
-  sad_array[3] = HADD_UH_U32(sad3_0);
-  sad_array[3] += HADD_UH_U32(sad3_1);
+  sad = __msa_hadd_u_w(sad0_0, sad0_0);
+  sad += __msa_hadd_u_w(sad0_1, sad0_1);
+  sad_array[0] = HADD_UW_U32(sad);
+
+  sad = __msa_hadd_u_w(sad1_0, sad1_0);
+  sad += __msa_hadd_u_w(sad1_1, sad1_1);
+  sad_array[1] = HADD_UW_U32(sad);
+
+  sad = __msa_hadd_u_w(sad2_0, sad2_0);
+  sad += __msa_hadd_u_w(sad2_1, sad2_1);
+  sad_array[2] = HADD_UW_U32(sad);
+
+  sad = __msa_hadd_u_w(sad3_0, sad3_0);
+  sad += __msa_hadd_u_w(sad3_1, sad3_1);
+  sad_array[3] = HADD_UW_U32(sad);
 }
 
 static uint32_t avgsad_4width_msa(const uint8_t *src_ptr, int32_t src_stride,