From f2ad523461088526f9a8c1c028e1b006976ecd87 Mon Sep 17 00:00:00 2001 From: Shiyou Yin <yinshiyou-hf@loongson.cn> Date: Thu, 14 Dec 2017 16:29:58 +0800 Subject: [PATCH] vp8: [loongson] optimize sixtap predict v2. 1. Delete unnecessary zero setting process. 2. Optimize the method of calculating SSE in vpx_varianceWxH. Change-Id: I8bab801416e7f4958c28c6d080e3cf785a50f82b --- vp8/common/mips/mmi/sixtap_filter_mmi.c | 122 ++++++++++++++---------- 1 file changed, 70 insertions(+), 52 deletions(-) diff --git a/vp8/common/mips/mmi/sixtap_filter_mmi.c b/vp8/common/mips/mmi/sixtap_filter_mmi.c index 1b41a4296..77d665d45 100644 --- a/vp8/common/mips/mmi/sixtap_filter_mmi.c +++ b/vp8/common/mips/mmi/sixtap_filter_mmi.c @@ -86,6 +86,7 @@ static INLINE void vp8_filter_block1d_h6_mmi(unsigned char *src_ptr, register double ftmp8 asm("$f18"); register double ftmp9 asm("$f20"); register double ftmp10 asm("$f22"); + register double ftmp11 asm("$f24"); #else register double fzero asm("$f0"); register double ftmp0 asm("$f1"); @@ -99,6 +100,7 @@ static INLINE void vp8_filter_block1d_h6_mmi(unsigned char *src_ptr, register double ftmp8 asm("$f9"); register double ftmp9 asm("$f10"); register double ftmp10 asm("$f11"); + register double ftmp11 asm("$f12"); #endif // _MIPS_SIM == _ABIO32 __asm__ volatile ( @@ -112,11 +114,13 @@ static INLINE void vp8_filter_block1d_h6_mmi(unsigned char *src_ptr, "li %[tmp0], 0x07 \n\t" "mtc1 %[tmp0], %[ftmp7] \n\t" "li %[tmp0], 0x08 \n\t" - "mtc1 %[tmp0], %[ftmp10] \n\t" + "mtc1 %[tmp0], %[ftmp11] \n\t" "1: \n\t" "gsldlc1 %[ftmp9], 0x05(%[src_ptr]) \n\t" - "gsldrc1 %[ftmp9], -0x02(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp9], -0x02(%[src_ptr]) \n\t" + "gsldlc1 %[ftmp10], 0x06(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp10], -0x01(%[src_ptr]) \n\t" "punpcklbh %[ftmp6], %[ftmp9], %[fzero] \n\t" "pmullh %[ftmp8], %[ftmp6], %[ftmp0] \n\t" @@ -125,24 +129,21 @@ static INLINE void vp8_filter_block1d_h6_mmi(unsigned char *src_ptr, "pmullh %[ftmp6], %[ftmp6], %[ftmp4] \n\t" 
"paddsh %[ftmp8], %[ftmp8], %[ftmp6] \n\t" - "gsldlc1 %[ftmp9], 0x06(%[src_ptr]) \n\t" - "gsldrc1 %[ftmp9], -0x01(%[src_ptr]) \n\t" - - "punpcklbh %[ftmp6], %[ftmp9], %[fzero] \n\t" + "punpcklbh %[ftmp6], %[ftmp10], %[fzero] \n\t" "pmullh %[ftmp6], %[ftmp6], %[ftmp1] \n\t" "paddsh %[ftmp8], %[ftmp8], %[ftmp6] \n\t" - "punpckhbh %[ftmp6], %[ftmp9], %[fzero] \n\t" + "punpckhbh %[ftmp6], %[ftmp10], %[fzero] \n\t" "pmullh %[ftmp6], %[ftmp6], %[ftmp5] \n\t" "paddsh %[ftmp8], %[ftmp8], %[ftmp6] \n\t" - "dsrl %[ftmp9], %[ftmp9], %[ftmp10] \n\t" - "punpcklbh %[ftmp6], %[ftmp9], %[fzero] \n\t" + "dsrl %[ftmp10], %[ftmp10], %[ftmp11] \n\t" + "punpcklbh %[ftmp6], %[ftmp10], %[fzero] \n\t" "pmullh %[ftmp6], %[ftmp6], %[ftmp2] \n\t" "paddsh %[ftmp8], %[ftmp8], %[ftmp6] \n\t" - "dsrl %[ftmp9], %[ftmp9], %[ftmp10] \n\t" - "punpcklbh %[ftmp6], %[ftmp9], %[fzero] \n\t" + "dsrl %[ftmp10], %[ftmp10], %[ftmp11] \n\t" + "punpcklbh %[ftmp6], %[ftmp10], %[fzero] \n\t" "pmullh %[ftmp6], %[ftmp6], %[ftmp3] \n\t" "paddsh %[ftmp8], %[ftmp8], %[ftmp6] \n\t" @@ -163,8 +164,9 @@ static INLINE void vp8_filter_block1d_h6_mmi(unsigned char *src_ptr, [ftmp5]"=&f"(ftmp5), [ftmp6]"=&f"(ftmp6), [ftmp7]"=&f"(ftmp7), [ftmp8]"=&f"(ftmp8), [ftmp9]"=&f"(ftmp9), [ftmp10]"=&f"(ftmp10), - [tmp0]"=&r"(tmp[0]), [src_ptr]"+&r"(src_ptr), - [output_ptr]"+&r"(output_ptr), [output_height]"+&r"(output_height) + [ftmp11]"=&f"(ftmp11), [tmp0]"=&r"(tmp[0]), + [output_ptr]"+&r"(output_ptr), [output_height]"+&r"(output_height), + [src_ptr]"+&r"(src_ptr) : [src_pixels_per_line]"r"((mips_reg)src_pixels_per_line), [vp8_filter]"r"(vp8_filter), [output_width]"r"(output_width), [ff_ph_40]"f"(ff_ph_40) @@ -190,6 +192,11 @@ static INLINE void vp8_filter_block1dc_v6_mmi( register double ftmp6 asm("$f14"); register double ftmp7 asm("$f16"); register double ftmp8 asm("$f18"); + register double ftmp9 asm("$f20"); + register double ftmp10 asm("$f22"); + register double ftmp11 asm("$f24"); + register double ftmp12 asm("$f26"); + 
register double ftmp13 asm("$f28"); #else register double fzero asm("$f0"); register double ftmp0 asm("$f1"); @@ -201,6 +208,11 @@ static INLINE void vp8_filter_block1dc_v6_mmi( register double ftmp6 asm("$f7"); register double ftmp7 asm("$f8"); register double ftmp8 asm("$f9"); + register double ftmp9 asm("$f10"); + register double ftmp10 asm("$f11"); + register double ftmp11 asm("$f12"); + register double ftmp12 asm("$f13"); + register double ftmp13 asm("$f14"); #endif // _MIPS_SIM == _ABIO32 __asm__ volatile ( @@ -210,52 +222,56 @@ static INLINE void vp8_filter_block1dc_v6_mmi( "ldc1 %[ftmp3], 0x30(%[vp8_filter]) \n\t" "ldc1 %[ftmp4], 0x40(%[vp8_filter]) \n\t" "ldc1 %[ftmp5], 0x50(%[vp8_filter]) \n\t" - MMI_SUBU(%[src_ptr], %[src_ptr], %[pixels_per_line_x2]) "xor %[fzero], %[fzero], %[fzero] \n\t" "li %[tmp0], 0x07 \n\t" - "mtc1 %[tmp0], %[ftmp7] \n\t" + "mtc1 %[tmp0], %[ftmp13] \n\t" + /* In order to make full use of memory load delay slot, + * Operation of memory loading and calculating has been rearranged. 
+ */ "1: \n\t" "gsldlc1 %[ftmp6], 0x07(%[src_ptr]) \n\t" "gsldrc1 %[ftmp6], 0x00(%[src_ptr]) \n\t" - "pmullh %[ftmp8], %[ftmp6], %[ftmp0] \n\t" - MMI_ADDU(%[addr0], %[src_ptr], %[pixels_per_line]) - "gsldlc1 %[ftmp6], 0x07(%[addr0]) \n\t" - "gsldrc1 %[ftmp6], 0x00(%[addr0]) \n\t" - "pmullh %[ftmp6], %[ftmp6], %[ftmp1] \n\t" - "paddsh %[ftmp8], %[ftmp8], %[ftmp6] \n\t" - + "gsldlc1 %[ftmp7], 0x07(%[addr0]) \n\t" + "gsldrc1 %[ftmp7], 0x00(%[addr0]) \n\t" MMI_ADDU(%[addr0], %[src_ptr], %[pixels_per_line_x2]) - "gsldlc1 %[ftmp6], 0x07(%[addr0]) \n\t" - "gsldrc1 %[ftmp6], 0x00(%[addr0]) \n\t" - "pmullh %[ftmp6], %[ftmp6], %[ftmp2] \n\t" - "paddsh %[ftmp8], %[ftmp8], %[ftmp6] \n\t" + "gsldlc1 %[ftmp8], 0x07(%[addr0]) \n\t" + "gsldrc1 %[ftmp8], 0x00(%[addr0]) \n\t" MMI_ADDU(%[addr0], %[src_ptr], %[pixels_per_line_x4]) - "gsldlc1 %[ftmp6], 0x07(%[addr0]) \n\t" - "gsldrc1 %[ftmp6], 0x00(%[addr0]) \n\t" - "pmullh %[ftmp6], %[ftmp6], %[ftmp4] \n\t" - "paddsh %[ftmp8], %[ftmp8], %[ftmp6] \n\t" - + "gsldlc1 %[ftmp9], 0x07(%[addr0]) \n\t" + "gsldrc1 %[ftmp9], 0x00(%[addr0]) \n\t" MMI_ADDU(%[src_ptr], %[src_ptr], %[pixels_per_line]) MMI_ADDU(%[addr0], %[src_ptr], %[pixels_per_line_x2]) - "gsldlc1 %[ftmp6], 0x07(%[addr0]) \n\t" - "gsldrc1 %[ftmp6], 0x00(%[addr0]) \n\t" - "pmullh %[ftmp6], %[ftmp6], %[ftmp3] \n\t" - "paddsh %[ftmp8], %[ftmp8], %[ftmp6] \n\t" - + "gsldlc1 %[ftmp10], 0x07(%[addr0]) \n\t" + "gsldrc1 %[ftmp10], 0x00(%[addr0]) \n\t" MMI_ADDU(%[addr0], %[src_ptr], %[pixels_per_line_x4]) - "gsldlc1 %[ftmp6], 0x07(%[addr0]) \n\t" - "gsldrc1 %[ftmp6], 0x00(%[addr0]) \n\t" - "pmullh %[ftmp6], %[ftmp6], %[ftmp5] \n\t" - "paddsh %[ftmp8], %[ftmp8], %[ftmp6] \n\t" + "gsldlc1 %[ftmp11], 0x07(%[addr0]) \n\t" + "gsldrc1 %[ftmp11], 0x00(%[addr0]) \n\t" - "paddsh %[ftmp8], %[ftmp8], %[ff_ph_40] \n\t" - "psrah %[ftmp8], %[ftmp8], %[ftmp7] \n\t" - "packushb %[ftmp8], %[ftmp8], %[fzero] \n\t" - "gsswlc1 %[ftmp8], 0x03(%[output_ptr]) \n\t" - "gsswrc1 %[ftmp8], 0x00(%[output_ptr]) \n\t" 
+ "pmullh %[ftmp12], %[ftmp6], %[ftmp0] \n\t" + + "pmullh %[ftmp7], %[ftmp7], %[ftmp1] \n\t" + "paddsh %[ftmp12], %[ftmp12], %[ftmp7] \n\t" + + "pmullh %[ftmp8], %[ftmp8], %[ftmp2] \n\t" + "paddsh %[ftmp12], %[ftmp12], %[ftmp8] \n\t" + + "pmullh %[ftmp9], %[ftmp9], %[ftmp4] \n\t" + "paddsh %[ftmp12], %[ftmp12], %[ftmp9] \n\t" + + "pmullh %[ftmp10], %[ftmp10], %[ftmp3] \n\t" + "paddsh %[ftmp12], %[ftmp12], %[ftmp10] \n\t" + + "pmullh %[ftmp11], %[ftmp11], %[ftmp5] \n\t" + "paddsh %[ftmp12], %[ftmp12], %[ftmp11] \n\t" + + "paddsh %[ftmp12], %[ftmp12], %[ff_ph_40] \n\t" + "psrah %[ftmp12], %[ftmp12], %[ftmp13] \n\t" + "packushb %[ftmp12], %[ftmp12], %[fzero] \n\t" + "gsswlc1 %[ftmp12], 0x03(%[output_ptr]) \n\t" + "gsswrc1 %[ftmp12], 0x00(%[output_ptr]) \n\t" MMI_ADDIU(%[output_height], %[output_height], -0x01) MMI_ADDU(%[output_ptr], %[output_ptr], %[output_pitch]) @@ -265,9 +281,11 @@ static INLINE void vp8_filter_block1dc_v6_mmi( [ftmp3]"=&f"(ftmp3), [ftmp4]"=&f"(ftmp4), [ftmp5]"=&f"(ftmp5), [ftmp6]"=&f"(ftmp6), [ftmp7]"=&f"(ftmp7), [ftmp8]"=&f"(ftmp8), - [tmp0]"=&r"(tmp[0]), [addr0]"=&r"(addr[0]), - [src_ptr]"+&r"(src_ptr), [output_ptr]"+&r"(output_ptr), - [output_height]"+&r"(output_height) + [ftmp9]"=&f"(ftmp9), [ftmp10]"=&f"(ftmp10), + [ftmp11]"=&f"(ftmp11), [ftmp12]"=&f"(ftmp12), + [ftmp13]"=&f"(ftmp13), [tmp0]"=&r"(tmp[0]), + [addr0]"=&r"(addr[0]), [src_ptr]"+&r"(src_ptr), + [output_ptr]"+&r"(output_ptr), [output_height]"+&r"(output_height) : [pixels_per_line]"r"((mips_reg)pixels_per_line), [pixels_per_line_x2]"r"((mips_reg)(pixels_per_line<<1)), [pixels_per_line_x4]"r"((mips_reg)(pixels_per_line<<2)), @@ -301,6 +319,7 @@ static INLINE void vp8_filter_block1d_h6_filter0_mmi( "1: \n\t" "gsldlc1 %[ftmp0], 0x07(%[src_ptr]) \n\t" "gsldrc1 %[ftmp0], 0x00(%[src_ptr]) \n\t" + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixels_per_line]) "punpcklbh %[ftmp1], %[ftmp0], %[fzero] \n\t" "gssdlc1 %[ftmp1], 0x07(%[output_ptr]) \n\t" @@ -308,7 +327,6 @@ static INLINE void 
vp8_filter_block1d_h6_filter0_mmi( "addiu %[output_height], %[output_height], -0x01 \n\t" MMI_ADDU(%[output_ptr], %[output_ptr], %[output_width]) - MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixels_per_line]) "bnez %[output_height], 1b \n\t" : [fzero]"=&f"(fzero), [ftmp0]"=&f"(ftmp0), [ftmp1]"=&f"(ftmp1), [src_ptr]"+&r"(src_ptr), @@ -338,12 +356,12 @@ static INLINE void vp8_filter_block1dc_v6_filter0_mmi( "1: \n\t" "gsldlc1 %[ftmp0], 0x07(%[src_ptr]) \n\t" "gsldrc1 %[ftmp0], 0x00(%[src_ptr]) \n\t" + MMI_ADDU(%[src_ptr], %[src_ptr], %[pixels_per_line]) + MMI_ADDIU(%[output_height], %[output_height], -0x01) "packushb %[ftmp1], %[ftmp0], %[fzero] \n\t" "gsswlc1 %[ftmp1], 0x03(%[output_ptr]) \n\t" "gsswrc1 %[ftmp1], 0x00(%[output_ptr]) \n\t" - MMI_ADDU(%[src_ptr], %[src_ptr], %[pixels_per_line]) - MMI_ADDIU(%[output_height], %[output_height], -0x01) MMI_ADDU(%[output_ptr], %[output_ptr], %[output_pitch]) "bnez %[output_height], 1b \n\t" : [fzero]"=&f"(fzero), [ftmp0]"=&f"(ftmp0), @@ -386,7 +404,7 @@ static INLINE void vp8_filter_block1dc_v6_filter0_mmi( } \ } else { \ for (i = 0; i < loop; ++i) { \ - vp8_filter_block1dc_v6_mmi(FData2 + n * 2 + i * 4, dst_ptr + i * 4, m, \ + vp8_filter_block1dc_v6_mmi(FData2 + i * 4, dst_ptr + i * 4, m, \ dst_pitch, n * 2, VFilter); \ } \ } \