From 9966cc8d1269c8ac105b08d9332727bdb5c110bb Mon Sep 17 00:00:00 2001 From: Shiyou Yin Date: Wed, 29 Nov 2017 11:58:38 +0800 Subject: [PATCH 1/2] vp8: [loongson] optimize vp8_short_fdct4x4_mmi v2. Optimize the calculate process of a,b,c,d. Change-Id: I81717e47bc988ace1412d478513e7dd3cb6b0cc9 --- vp8/encoder/mips/mmi/dct_mmi.c | 135 ++++++++++++++++----------------- 1 file changed, 67 insertions(+), 68 deletions(-) diff --git a/vp8/encoder/mips/mmi/dct_mmi.c b/vp8/encoder/mips/mmi/dct_mmi.c index 7e45a1278..1f60a692d 100644 --- a/vp8/encoder/mips/mmi/dct_mmi.c +++ b/vp8/encoder/mips/mmi/dct_mmi.c @@ -13,9 +13,13 @@ #include "vpx_ports/asmdefs_mmi.h" /* clang-format off */ +/* TRANSPOSE_4H: transpose 4x4 matrix. + Input: ftmp1,ftmp2,ftmp3,ftmp4 + Output: ftmp1,ftmp2,ftmp3,ftmp4 + Note: ftmp0 always be 0, ftmp5~9 used for temporary value. + */ #define TRANSPOSE_4H \ MMI_LI(%[tmp0], 0x93) \ - "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" \ "mtc1 %[tmp0], %[ftmp10] \n\t" \ "punpcklhw %[ftmp5], %[ftmp1], %[ftmp0] \n\t" \ "punpcklhw %[ftmp9], %[ftmp2], %[ftmp0] \n\t" \ @@ -40,8 +44,8 @@ /* clang-format on */ void vp8_short_fdct4x4_mmi(int16_t *input, int16_t *output, int pitch) { - int pitch_half = pitch / 2; uint64_t tmp[1]; + int16_t *ip = input; #if _MIPS_SIM == _ABIO32 register double ftmp0 asm("$f0"); @@ -81,52 +85,51 @@ void vp8_short_fdct4x4_mmi(int16_t *input, int16_t *output, int pitch) { DECLARE_ALIGNED(8, const uint64_t, ff_pw_7500) = { 0x00001d4c00001d4cULL }; DECLARE_ALIGNED(8, const uint64_t, ff_ph_op1) = { 0x14e808a914e808a9ULL }; DECLARE_ALIGNED(8, const uint64_t, ff_ph_op3) = { 0xeb1808a9eb1808a9ULL }; - - DECLARE_ALIGNED(16, int, a[4]); - DECLARE_ALIGNED(16, int, b[4]); - DECLARE_ALIGNED(16, int, c[4]); - DECLARE_ALIGNED(16, int, d[4]); - - // stage1 - a[0] = (input[0] + input[3]) * 8; - a[1] = (input[0 + pitch_half] + input[3 + pitch_half]) * 8; - a[2] = (input[0 + 2 * pitch_half] + input[3 + 2 * pitch_half]) * 8; - a[3] = (input[0 + 3 * pitch_half] + input[3 + 3 * pitch_half]) * 8; - - b[0] = (input[1] + input[2]) * 8; - b[1] = (input[1 + pitch_half] + input[2 + pitch_half]) * 8; - b[2] = (input[1 + 2 * pitch_half] + input[2 + 2 * pitch_half]) * 8; - b[3] = (input[1 + 3 * pitch_half] + input[2 + 3 * pitch_half]) * 8; - - c[0] = (input[1] - input[2]) * 8; - c[1] = (input[1 + pitch_half] - input[2 + pitch_half]) * 8; - c[2] = (input[1 + 2 * pitch_half] - input[2 + 2 * pitch_half]) * 8; - c[3] = (input[1 + 3 * pitch_half] - input[2 + 3 * pitch_half]) * 8; - - d[0] = (input[0] - input[3]) * 8; - d[1] = (input[0 + pitch_half] - input[3 + pitch_half]) * 8; - d[2] = (input[0 + 2 * pitch_half] - input[3 + 2 * pitch_half]) * 8; - d[3] = (input[0 + 3 * pitch_half] - input[3 + 3 * pitch_half]) * 8; + DECLARE_ALIGNED(8, const uint64_t, ff_pw_5352) = { 0x000014e8000014e8ULL }; + DECLARE_ALIGNED(8, const uint64_t, ff_pw_2217) = { 0x000008a9000008a9ULL }; + DECLARE_ALIGNED(8, const uint64_t, ff_ph_8) = { 0x0008000800080008ULL }; __asm__ volatile ( - "gslqc1 %[ftmp2], %[ftmp1], 0x00(%[a]) \n\t" - "gslqc1 %[ftmp4], %[ftmp3], 0x00(%[b]) \n\t" - "gslqc1 %[ftmp6], %[ftmp5], 0x00(%[c]) \n\t" - "gslqc1 %[ftmp8], %[ftmp7], 0x00(%[d]) \n\t" + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "gsldlc1 %[ftmp1], 0x07(%[ip]) \n\t" + "gsldrc1 %[ftmp1], 0x00(%[ip]) \n\t" + MMI_ADDU(%[ip], %[ip], %[pitch]) + "gsldlc1 %[ftmp2], 0x07(%[ip]) \n\t" + "gsldrc1 %[ftmp2], 0x00(%[ip]) \n\t" + MMI_ADDU(%[ip], %[ip], %[pitch]) + "gsldlc1 %[ftmp3], 0x07(%[ip]) \n\t" + "gsldrc1 %[ftmp3], 0x00(%[ip]) \n\t" + MMI_ADDU(%[ip], %[ip], 
%[pitch]) + "gsldlc1 %[ftmp4], 0x07(%[ip]) \n\t" + "gsldrc1 %[ftmp4], 0x00(%[ip]) \n\t" + MMI_ADDU(%[ip], %[ip], %[pitch]) + TRANSPOSE_4H - "paddw %[ftmp9], %[ftmp1], %[ftmp3] \n\t" - "paddw %[ftmp10], %[ftmp2], %[ftmp4] \n\t" - "psubw %[ftmp11], %[ftmp1], %[ftmp3] \n\t" - "psubw %[ftmp12], %[ftmp2], %[ftmp4] \n\t" - "packsswh %[ftmp1], %[ftmp9], %[ftmp10] \n\t" - "packsswh %[ftmp3], %[ftmp11], %[ftmp12] \n\t" - "packsswh %[ftmp2], %[ftmp5], %[ftmp6] \n\t" - "packsswh %[ftmp4], %[ftmp7], %[ftmp8] \n\t" + "ldc1 %[ftmp11], %[ff_ph_8] \n\t" + // f1 + f4 + "paddh %[ftmp5], %[ftmp1], %[ftmp4] \n\t" + // a1 + "pmullh %[ftmp5], %[ftmp5], %[ftmp11] \n\t" + // f2 + f3 + "paddh %[ftmp6], %[ftmp2], %[ftmp3] \n\t" + // b1 + "pmullh %[ftmp6], %[ftmp6], %[ftmp11] \n\t" + // f2 - f3 + "psubh %[ftmp7], %[ftmp2], %[ftmp3] \n\t" + // c1 + "pmullh %[ftmp7], %[ftmp7], %[ftmp11] \n\t" + // f1 - f4 + "psubh %[ftmp8], %[ftmp1], %[ftmp4] \n\t" + // d1 + "pmullh %[ftmp8], %[ftmp8], %[ftmp11] \n\t" + // op[0] = a1 + b1 + "paddh %[ftmp1], %[ftmp5], %[ftmp6] \n\t" + // op[2] = a1 - b1 + "psubh %[ftmp3], %[ftmp5], %[ftmp6] \n\t" + + // op[1] = (c1 * 2217 + d1 * 5352 + 14500) >> 12 MMI_LI(%[tmp0], 0x0c) - "mov.d %[ftmp7], %[ftmp2] \n\t" - "mov.d %[ftmp8], %[ftmp4] \n\t" "mtc1 %[tmp0], %[ftmp11] \n\t" - "ldc1 %[ftmp12], %[ff_pw_14500] \n\t" "punpcklhw %[ftmp9], %[ftmp7], %[ftmp8] \n\t" "pmaddhw %[ftmp5], %[ftmp9], %[ff_ph_op1] \n\t" @@ -138,6 +141,7 @@ void vp8_short_fdct4x4_mmi(int16_t *input, int16_t *output, int pitch) { "psraw %[ftmp6], %[ftmp6], %[ftmp11] \n\t" "packsswh %[ftmp2], %[ftmp5], %[ftmp6] \n\t" + // op[3] = (d1 * 2217 - c1 * 5352 + 7500) >> 12 "ldc1 %[ftmp12], %[ff_pw_7500] \n\t" "punpcklhw %[ftmp9], %[ftmp8], %[ftmp7] \n\t" "pmaddhw %[ftmp5], %[ftmp9], %[ff_ph_op3] \n\t" @@ -150,7 +154,6 @@ void vp8_short_fdct4x4_mmi(int16_t *input, int16_t *output, int pitch) { "packsswh %[ftmp4], %[ftmp5], %[ftmp6] \n\t" TRANSPOSE_4H - "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" "paddh %[ftmp5], %[ftmp1], %[ftmp4] \n\t" "paddh %[ftmp6], %[ftmp2], %[ftmp3] \n\t" "psubh %[ftmp7], %[ftmp2], %[ftmp3] \n\t" @@ -163,17 +166,16 @@ void vp8_short_fdct4x4_mmi(int16_t *input, int16_t *output, int pitch) { "paddh %[ftmp1], %[ftmp5], %[ftmp6] \n\t" "psubh %[ftmp2], %[ftmp5], %[ftmp6] \n\t" "ldc1 %[ftmp9], %[ff_ph_07] \n\t" - MMI_LI(%[tmp0], 0x04) "paddh %[ftmp1], %[ftmp1], %[ftmp9] \n\t" "paddh %[ftmp2], %[ftmp2], %[ftmp9] \n\t" + MMI_LI(%[tmp0], 0x04) "mtc1 %[tmp0], %[ftmp9] \n\t" "psrah %[ftmp1], %[ftmp1], %[ftmp9] \n\t" "psrah %[ftmp2], %[ftmp2], %[ftmp9] \n\t" MMI_LI(%[tmp0], 0x10) - "ldc1 %[ftmp12], %[ff_pw_12000] \n\t" "mtc1 %[tmp0], %[ftmp9] \n\t" - + "ldc1 %[ftmp12], %[ff_pw_12000] \n\t" "punpcklhw %[ftmp5], %[ftmp7], %[ftmp8] \n\t" "pmaddhw %[ftmp10], %[ftmp5], %[ff_ph_op1] \n\t" "punpckhhw %[ftmp5], %[ftmp7], %[ftmp8] \n\t" @@ -196,31 +198,28 @@ void vp8_short_fdct4x4_mmi(int16_t *input, int16_t *output, int pitch) { "psraw %[ftmp11], %[ftmp11], %[ftmp9] \n\t" "packsswh %[ftmp4], %[ftmp10], %[ftmp11] \n\t" + "gssdlc1 %[ftmp1], 0x07(%[output]) \n\t" + "gssdrc1 %[ftmp1], 0x00(%[output]) \n\t" + "gssdlc1 %[ftmp3], 0x0f(%[output]) \n\t" + "gssdrc1 %[ftmp3], 0x08(%[output]) \n\t" + "gssdlc1 %[ftmp2], 0x17(%[output]) \n\t" + "gssdrc1 %[ftmp2], 0x10(%[output]) \n\t" + "gssdlc1 %[ftmp4], 0x1f(%[output]) \n\t" + "gssdrc1 %[ftmp4], 0x18(%[output]) \n\t" + : [ftmp0] "=&f"(ftmp0), [ftmp1] "=&f"(ftmp1), [ftmp2] "=&f"(ftmp2), [ftmp3] "=&f"(ftmp3), [ftmp4] "=&f"(ftmp4), [ftmp5] "=&f"(ftmp5), [ftmp6] "=&f"(ftmp6), [ftmp7] "=&f"(ftmp7), 
[ftmp8] "=&f"(ftmp8), [ftmp9] "=&f"(ftmp9), [ftmp10] "=&f"(ftmp10), [ftmp11] "=&f"(ftmp11), - [ftmp12] "=&f"(ftmp12), [tmp0] "=&r"(tmp[0]) - : [ff_ph_01] "m"(ff_ph_01), [ff_ph_07] "m"(ff_ph_07), [a] "r"(a), - [b] "r"(b), [c] "r"(c), [d] "r"(d), [ff_ph_op1] "f"(ff_ph_op1), - [ff_ph_op3] "f"(ff_ph_op3), [ff_pw_14500] "m"(ff_pw_14500), - [ff_pw_7500] "m"(ff_pw_7500), [ff_pw_12000] "m"(ff_pw_12000), - [ff_pw_51000] "m"(ff_pw_51000) + [ftmp12] "=&f"(ftmp12), [tmp0] "=&r"(tmp[0]), [ip]"+&r"(ip) + : [ff_ph_01] "m"(ff_ph_01), [ff_ph_07] "m"(ff_ph_07), + [ff_ph_op1] "f"(ff_ph_op1), [ff_ph_op3] "f"(ff_ph_op3), + [ff_pw_14500] "m"(ff_pw_14500), [ff_pw_7500] "m"(ff_pw_7500), + [ff_pw_12000] "m"(ff_pw_12000), [ff_pw_51000] "m"(ff_pw_51000), + [ff_pw_5352]"m"(ff_pw_5352), [ff_pw_2217]"m"(ff_pw_2217), + [ff_ph_8]"m"(ff_ph_8), [pitch]"r"(pitch), [output] "r"(output) + : "memory" ); - - __asm__ volatile( - "gssdlc1 %[ftmp1], 0x07(%[output]) \n\t" - "gssdrc1 %[ftmp1], 0x00(%[output]) \n\t" - "gssdlc1 %[ftmp3], 0x0f(%[output]) \n\t" - "gssdrc1 %[ftmp3], 0x08(%[output]) \n\t" - "gssdlc1 %[ftmp2], 0x17(%[output]) \n\t" - "gssdrc1 %[ftmp2], 0x10(%[output]) \n\t" - "gssdlc1 %[ftmp4], 0x1f(%[output]) \n\t" - "gssdrc1 %[ftmp4], 0x18(%[output]) \n\t" - : - : [ftmp1] "f"(ftmp1), [ftmp2] "f"(ftmp2), [ftmp3] "f"(ftmp3), - [ftmp4] "f"(ftmp4), [output] "r"(output) - : "memory"); } void vp8_short_fdct8x4_mmi(int16_t *input, int16_t *output, int pitch) { @@ -277,7 +276,7 @@ void vp8_short_walsh4x4_mmi(int16_t *input, int16_t *output, int pitch) { "psubh %[ftmp4], %[ftmp5], %[ftmp6] \n\t" "pcmpeqh %[ftmp6], %[ftmp5], %[ftmp0] \n\t" - "paddh %[ftmp6], %[ftmp6], %[ff_ph_01] \n\t" + "paddh %[ftmp6], %[ftmp6], %[ff_ph_01] \n\t" "paddh %[ftmp1], %[ftmp1], %[ftmp6] \n\t" TRANSPOSE_4H From d49bf26b1c076d167cc9488faaa7d766bd5120a3 Mon Sep 17 00:00:00 2001 From: Shiyou Yin Date: Wed, 29 Nov 2017 16:59:22 +0800 Subject: [PATCH 2/2] vp8: [loongson] optimize regular quantize v2. 1. Optimize the memset with mmi. 2. Optimize macro REGULAR_SELECT_EOB. 
Change-Id: Icd9c866b0e6aef08874b2f123e9b0e09919445ff --- vp8/encoder/mips/mmi/vp8_quantize_mmi.c | 77 ++++++++++++++++--------- 1 file changed, 51 insertions(+), 26 deletions(-) diff --git a/vp8/encoder/mips/mmi/vp8_quantize_mmi.c b/vp8/encoder/mips/mmi/vp8_quantize_mmi.c index 22b12bbab..3ccb196ff 100644 --- a/vp8/encoder/mips/mmi/vp8_quantize_mmi.c +++ b/vp8/encoder/mips/mmi/vp8_quantize_mmi.c @@ -18,13 +18,14 @@ z = coeff_ptr[rc]; \ sz = (z >> 31); \ x = (z ^ sz) - sz; \ - if (x >= (zbin_ptr[rc] + *(zbin_boost_ptr++) + zbin_oq_value)) { \ + zbin = zbin_ptr[rc] + *(zbin_boost_ptr++) + zbin_oq_value; \ + if (x >= zbin) { \ x += round_ptr[rc]; \ y = ((((x * quant_ptr[rc]) >> 16) + x) * quant_shift_ptr[rc]) >> 16; \ - x = (y ^ sz) - sz; \ - qcoeff_ptr[rc] = x; \ - dqcoeff_ptr[rc] = x * dequant_ptr[rc]; \ if (y) { \ + x = (y ^ sz) - sz; \ + qcoeff_ptr[rc] = x; \ + dqcoeff_ptr[rc] = x * dequant_ptr[rc]; \ eob = i; \ zbin_boost_ptr = b->zrun_zbin_boost; \ } \ @@ -198,8 +199,8 @@ void vp8_fast_quantize_b_mmi(BLOCK *b, BLOCKD *d) { } void vp8_regular_quantize_b_mmi(BLOCK *b, BLOCKD *d) { - int eob; - int x, y, z, sz; + int eob = 0; + int x, y, z, sz, zbin; const int16_t *zbin_boost_ptr = b->zrun_zbin_boost; const int16_t *coeff_ptr = b->coeff; const int16_t *zbin_ptr = b->zbin; @@ -210,28 +211,52 @@ void vp8_regular_quantize_b_mmi(BLOCK *b, BLOCKD *d) { int16_t *dqcoeff_ptr = d->dqcoeff; const int16_t *dequant_ptr = d->dequant; const int16_t zbin_oq_value = b->zbin_extra; + register double ftmp0 asm("$f0"); - memset(qcoeff_ptr, 0, 32); - memset(dqcoeff_ptr, 0, 32); + // memset(qcoeff_ptr, 0, 32); + // memset(dqcoeff_ptr, 0, 32); + /* clang-format off */ + __asm__ volatile ( + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "gssdlc1 %[ftmp0], 0x07(%[qcoeff_ptr]) \n\t" + "gssdrc1 %[ftmp0], 0x00(%[qcoeff_ptr]) \n\t" + "gssdlc1 %[ftmp0], 0x0f(%[qcoeff_ptr]) \n\t" + "gssdrc1 %[ftmp0], 0x08(%[qcoeff_ptr]) \n\t" + "gssdlc1 %[ftmp0], 0x17(%[qcoeff_ptr]) \n\t" + "gssdrc1 %[ftmp0], 0x10(%[qcoeff_ptr]) \n\t" + "gssdlc1 %[ftmp0], 0x1f(%[qcoeff_ptr]) \n\t" + "gssdrc1 %[ftmp0], 0x18(%[qcoeff_ptr]) \n\t" - eob = -1; + "gssdlc1 %[ftmp0], 0x07(%[dqcoeff_ptr]) \n\t" + "gssdrc1 %[ftmp0], 0x00(%[dqcoeff_ptr]) \n\t" + "gssdlc1 %[ftmp0], 0x0f(%[dqcoeff_ptr]) \n\t" + "gssdrc1 %[ftmp0], 0x08(%[dqcoeff_ptr]) \n\t" + "gssdlc1 %[ftmp0], 0x17(%[dqcoeff_ptr]) \n\t" + "gssdrc1 %[ftmp0], 0x10(%[dqcoeff_ptr]) \n\t" + "gssdlc1 %[ftmp0], 0x1f(%[dqcoeff_ptr]) \n\t" + "gssdrc1 %[ftmp0], 0x18(%[dqcoeff_ptr]) \n\t" + : [ftmp0]"=&f"(ftmp0) + : [qcoeff_ptr]"r"(qcoeff_ptr), [dqcoeff_ptr]"r"(dqcoeff_ptr) + : "memory" + ); + /* clang-format on */ - REGULAR_SELECT_EOB(0, 0); - REGULAR_SELECT_EOB(1, 1); - REGULAR_SELECT_EOB(2, 4); - REGULAR_SELECT_EOB(3, 8); - REGULAR_SELECT_EOB(4, 5); - REGULAR_SELECT_EOB(5, 2); - REGULAR_SELECT_EOB(6, 3); - REGULAR_SELECT_EOB(7, 6); - REGULAR_SELECT_EOB(8, 9); - REGULAR_SELECT_EOB(9, 12); - REGULAR_SELECT_EOB(10, 13); - REGULAR_SELECT_EOB(11, 10); - REGULAR_SELECT_EOB(12, 7); - REGULAR_SELECT_EOB(13, 11); - REGULAR_SELECT_EOB(14, 14); - REGULAR_SELECT_EOB(15, 15); + REGULAR_SELECT_EOB(1, 0); + REGULAR_SELECT_EOB(2, 1); + REGULAR_SELECT_EOB(3, 4); + REGULAR_SELECT_EOB(4, 8); + REGULAR_SELECT_EOB(5, 5); + REGULAR_SELECT_EOB(6, 2); + REGULAR_SELECT_EOB(7, 3); + REGULAR_SELECT_EOB(8, 6); + REGULAR_SELECT_EOB(9, 9); + REGULAR_SELECT_EOB(10, 12); + REGULAR_SELECT_EOB(11, 13); + REGULAR_SELECT_EOB(12, 10); + REGULAR_SELECT_EOB(13, 7); + REGULAR_SELECT_EOB(14, 11); + REGULAR_SELECT_EOB(15, 14); + 
REGULAR_SELECT_EOB(16, 15); - *d->eob = (char)(eob + 1); + *d->eob = (char)eob; }
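
Note on the eob handling in patch 2: the original code initialized eob to -1, ran REGULAR_SELECT_EOB with i = 0..15, and stored eob + 1; the patched code initializes eob to 0, runs i = 1..16, and stores eob directly, which yields the same value in every case. Moving the qcoeff/dqcoeff stores inside the if (y) test is also safe because both buffers are cleared (now via the MMI zero stores instead of memset) before the scan, so zero-quantized positions no longer need an explicit store.

For cross-checking the constants and shifts used in patch 1 (ff_ph_8, ff_pw_14500, ff_pw_7500, ff_ph_07, ff_ph_01, ff_pw_12000, ff_pw_51000, and the 12/4/16-bit shift amounts), the transform the MMI code implements is the scalar VP8 4x4 forward DCT, i.e. vp8_short_fdct4x4_c in vp8/encoder/dct.c. The condensed C sketch below is reference material only and is not part of either patch; the helper name short_fdct4x4_ref is illustrative.

  #include <stdint.h>

  /* Reference-only sketch of the 4x4 forward DCT that the MMI code
     vectorizes; pitch is in bytes, as in the MMI version. */
  void short_fdct4x4_ref(const int16_t *input, int16_t *output, int pitch) {
    int i, a1, b1, c1, d1;
    const int16_t *ip = input;
    int16_t *op = output;

    /* Pass 1: rows. a1/b1/c1/d1 match the ff_ph_8-scaled values in the asm. */
    for (i = 0; i < 4; ++i) {
      a1 = (ip[0] + ip[3]) * 8;
      b1 = (ip[1] + ip[2]) * 8;
      c1 = (ip[1] - ip[2]) * 8;
      d1 = (ip[0] - ip[3]) * 8;

      op[0] = a1 + b1;
      op[2] = a1 - b1;
      op[1] = (c1 * 2217 + d1 * 5352 + 14500) >> 12;  /* ff_ph_op1, ff_pw_14500 */
      op[3] = (d1 * 2217 - c1 * 5352 + 7500) >> 12;   /* ff_ph_op3, ff_pw_7500  */

      ip += pitch / 2;  /* pitch is in bytes, ip points to int16_t */
      op += 4;
    }

    /* Pass 2: columns of the intermediate result, in place. */
    op = output;
    for (i = 0; i < 4; ++i) {
      a1 = op[0] + op[12];
      b1 = op[4] + op[8];
      c1 = op[4] - op[8];
      d1 = op[0] - op[12];

      op[0] = (a1 + b1 + 7) >> 4;                                  /* ff_ph_07 */
      op[8] = (a1 - b1 + 7) >> 4;
      op[4] = ((c1 * 2217 + d1 * 5352 + 12000) >> 16) + (d1 != 0); /* ff_ph_01 */
      op[12] = (d1 * 2217 - c1 * 5352 + 51000) >> 16;

      op++;
    }
  }

The (d1 != 0) term corresponds to the pcmpeqh/ff_ph_01 sequence in the asm: pcmpeqh produces -1 where d1 == 0 and 0 elsewhere, and adding ff_ph_01 turns that into the 0/1 correction added to op[4].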