diff --git a/test/quantize_test.cc b/test/quantize_test.cc index 46e463dfc..1cfda5749 100644 --- a/test/quantize_test.cc +++ b/test/quantize_test.cc @@ -192,4 +192,12 @@ INSTANTIATE_TEST_CASE_P(NEON, QuantizeTest, ::testing::Values(make_tuple(&vp8_fast_quantize_b_neon, &vp8_fast_quantize_b_c))); #endif // HAVE_NEON + +#if HAVE_MSA +INSTANTIATE_TEST_CASE_P( + MSA, QuantizeTest, + ::testing::Values( + make_tuple(&vp8_fast_quantize_b_msa, &vp8_fast_quantize_b_c), + make_tuple(&vp8_regular_quantize_b_msa, &vp8_regular_quantize_b_c))); +#endif // HAVE_MSA } // namespace diff --git a/vp8/common/mips/msa/vp8_macros_msa.h b/vp8/common/mips/msa/vp8_macros_msa.h index b533cc696..0ed94cd43 100644 --- a/vp8/common/mips/msa/vp8_macros_msa.h +++ b/vp8/common/mips/msa/vp8_macros_msa.h @@ -553,6 +553,20 @@ } #define VSHF_B3_SB(...) VSHF_B3(v16i8, __VA_ARGS__) +/* Description : Shuffle halfword vector elements as per mask vector + Arguments : Inputs - in0, in1, in2, in3, mask0, mask1 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : halfword elements from 'in0' & 'in1' are copied selectively to + 'out0' as per control vector 'mask0' +*/ +#define VSHF_H2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1) \ +{ \ + out0 = (RTYPE)__msa_vshf_h((v8i16)mask0, (v8i16)in1, (v8i16)in0); \ + out1 = (RTYPE)__msa_vshf_h((v8i16)mask1, (v8i16)in3, (v8i16)in2); \ +} +#define VSHF_H2_SH(...) VSHF_H2(v8i16, __VA_ARGS__) + /* Description : Dot product of byte vector elements Arguments : Inputs - mult0, mult1, cnst0, cnst1 Outputs - out0, out1 @@ -604,6 +618,31 @@ } #define DOTP_SB4_SH(...) DOTP_SB4(v8i16, __VA_ARGS__) +/* Description : Dot product of halfword vector elements + Arguments : Inputs - mult0, mult1, cnst0, cnst1 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Signed halfword elements from 'mult0' are multiplied with + signed halfword elements from 'cnst0' producing a result + twice the size of input i.e. signed word. + The multiplication result of adjacent odd-even elements + are added together and written to the 'out0' vector +*/ +#define DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \ +{ \ + out0 = (RTYPE)__msa_dotp_s_w((v8i16)mult0, (v8i16)cnst0); \ + out1 = (RTYPE)__msa_dotp_s_w((v8i16)mult1, (v8i16)cnst1); \ +} + +#define DOTP_SH4(RTYPE, mult0, mult1, mult2, mult3, \ + cnst0, cnst1, cnst2, cnst3, \ + out0, out1, out2, out3) \ +{ \ + DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \ + DOTP_SH2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \ +} +#define DOTP_SH4_SW(...) DOTP_SH4(v4i32, __VA_ARGS__) + /* Description : Dot product & addition of byte vector elements Arguments : Inputs - mult0, mult1, cnst0, cnst1 Outputs - out0, out1 @@ -1309,6 +1348,18 @@ ADD2(in4, in5, in6, in7, out2, out3); \ } +/* Description : Subtraction of 2 pairs of vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Details : Each element in 'in1' is subtracted from 'in0' and result is + written to 'out0'. 
+*/ +#define SUB2(in0, in1, in2, in3, out0, out1) \ +{ \ + out0 = in0 - in1; \ + out1 = in2 - in3; \ +} + /* Description : Sign extend halfword elements from right half of the vector Arguments : Input - in (halfword vector) Output - out (sign extended word vector) diff --git a/vp8/common/rtcd_defs.pl b/vp8/common/rtcd_defs.pl index ad7429a54..3cf4f9801 100644 --- a/vp8/common/rtcd_defs.pl +++ b/vp8/common/rtcd_defs.pl @@ -286,10 +286,10 @@ $vp8_short_walsh4x4_media=vp8_short_walsh4x4_armv6; # Quantizer # add_proto qw/void vp8_regular_quantize_b/, "struct block *, struct blockd *"; -specialize qw/vp8_regular_quantize_b sse2 sse4_1/; +specialize qw/vp8_regular_quantize_b sse2 sse4_1 msa/; add_proto qw/void vp8_fast_quantize_b/, "struct block *, struct blockd *"; -specialize qw/vp8_fast_quantize_b sse2 ssse3 neon/; +specialize qw/vp8_fast_quantize_b sse2 ssse3 neon msa/; # # Block subtraction diff --git a/vp8/encoder/mips/msa/quantize_msa.c b/vp8/encoder/mips/msa/quantize_msa.c new file mode 100644 index 000000000..0f97646b5 --- /dev/null +++ b/vp8/encoder/mips/msa/quantize_msa.c @@ -0,0 +1,246 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vp8_rtcd.h" +#include "vp8/common/mips/msa/vp8_macros_msa.h" +#include "vp8/encoder/block.h" + +static int8_t fast_quantize_b_msa(int16_t *coeff_ptr, int16_t *zbin, + int16_t *round, int16_t *quant, + int16_t *de_quant, int16_t *q_coeff, + int16_t *dq_coeff) +{ + int32_t cnt, eob; + v16i8 inv_zig_zag = { 0, 1, 5, 6, 2, 4, 7, 12, + 3, 8, 11, 13, 9, 10, 14, 15 }; + v8i16 round0, round1; + v8i16 sign_z0, sign_z1; + v8i16 q_coeff0, q_coeff1; + v8i16 x0, x1, de_quant0, de_quant1; + v8i16 coeff0, coeff1, z0, z1; + v8i16 quant0, quant1, quant2, quant3; + v8i16 zero = { 0 }; + v8i16 inv_zig_zag0, inv_zig_zag1; + v8i16 zigzag_mask0 = { 0, 1, 4, 8, 5, 2, 3, 6 }; + v8i16 zigzag_mask1 = { 9, 12, 13, 10, 7, 11, 14, 15 }; + v8i16 temp0_h, temp1_h, temp2_h, temp3_h; + v4i32 temp0_w, temp1_w, temp2_w, temp3_w; + + ILVRL_B2_SH(zero, inv_zig_zag, inv_zig_zag0, inv_zig_zag1); + eob = -1; + LD_SH2(coeff_ptr, 8, coeff0, coeff1); + VSHF_H2_SH(coeff0, coeff1, coeff0, coeff1, zigzag_mask0, zigzag_mask1, + z0, z1); + LD_SH2(round, 8, coeff0, coeff1); + VSHF_H2_SH(coeff0, coeff1, coeff0, coeff1, zigzag_mask0, zigzag_mask1, + round0, round1); + LD_SH2(quant, 8, coeff0, coeff1); + VSHF_H2_SH(coeff0, coeff1, coeff0, coeff1, zigzag_mask0, zigzag_mask1, + quant0, quant2); + sign_z0 = z0 >> 15; + sign_z1 = z1 >> 15; + x0 = __msa_add_a_h(z0, zero); + x1 = __msa_add_a_h(z1, zero); + ILVL_H2_SH(quant0, quant0, quant2, quant2, quant1, quant3); + ILVR_H2_SH(quant0, quant0, quant2, quant2, quant0, quant2); + ILVL_H2_SH(round0, x0, round1, x1, temp1_h, temp3_h); + ILVR_H2_SH(round0, x0, round1, x1, temp0_h, temp2_h); + DOTP_SH4_SW(temp0_h, temp1_h, temp2_h, temp3_h, quant0, quant1, quant2, + quant3, temp0_w, temp1_w, temp2_w, temp3_w); + SRA_4V(temp0_w, temp1_w, temp2_w, temp3_w, 16); + PCKEV_H2_SH(temp1_w, temp0_w, temp3_w, temp2_w, x0, x1); + x0 = x0 ^ sign_z0; + x1 = x1 ^ sign_z1; + SUB2(x0, sign_z0, x1, sign_z1, x0, x1); + VSHF_H2_SH(x0, x1, x0, x1, inv_zig_zag0, inv_zig_zag1, q_coeff0, q_coeff1); + 
ST_SH2(q_coeff0, q_coeff1, q_coeff, 8); + LD_SH2(de_quant, 8, de_quant0, de_quant1); + q_coeff0 *= de_quant0; + q_coeff1 *= de_quant1; + ST_SH2(q_coeff0, q_coeff1, dq_coeff, 8); + + for (cnt = 0; cnt < 16; ++cnt) + { + if ((cnt <= 7) && (x1[7 - cnt] != 0)) + { + eob = (15 - cnt); + break; + } + + if ((cnt > 7) && (x0[7 - (cnt - 8)] != 0)) + { + eob = (7 - (cnt - 8)); + break; + } + } + + return (int8_t)(eob + 1); +} + +static int8_t exact_regular_quantize_b_msa(int16_t *zbin_boost, + int16_t *coeff_ptr, + int16_t *zbin, + int16_t *round, + int16_t *quant, + int16_t *quant_shift, + int16_t *de_quant, + int16_t zbin_oq_in, + int16_t *q_coeff, + int16_t *dq_coeff) +{ + int32_t cnt, eob; + int16_t *boost_temp = zbin_boost; + v16i8 inv_zig_zag = { 0, 1, 5, 6, 2, 4, 7, 12, + 3, 8, 11, 13, 9, 10, 14, 15 }; + v8i16 round0, round1; + v8i16 sign_z0, sign_z1; + v8i16 q_coeff0, q_coeff1; + v8i16 z_bin0, z_bin1, zbin_o_q; + v8i16 x0, x1, sign_x0, sign_x1, de_quant0, de_quant1; + v8i16 coeff0, coeff1, z0, z1; + v8i16 quant0, quant1, quant2, quant3; + v8i16 zero = { 0 }; + v8i16 inv_zig_zag0, inv_zig_zag1; + v8i16 zigzag_mask0 = { 0, 1, 4, 8, 5, 2, 3, 6 }; + v8i16 zigzag_mask1 = { 9, 12, 13, 10, 7, 11, 14, 15 }; + v8i16 temp0_h, temp1_h, temp2_h, temp3_h; + v4i32 temp0_w, temp1_w, temp2_w, temp3_w; + + ILVRL_B2_SH(zero, inv_zig_zag, inv_zig_zag0, inv_zig_zag1); + zbin_o_q = __msa_fill_h(zbin_oq_in); + eob = -1; + LD_SH2(coeff_ptr, 8, coeff0, coeff1); + VSHF_H2_SH(coeff0, coeff1, coeff0, coeff1, zigzag_mask0, zigzag_mask1, + z0, z1); + LD_SH2(round, 8, coeff0, coeff1); + VSHF_H2_SH(coeff0, coeff1, coeff0, coeff1, zigzag_mask0, zigzag_mask1, + round0, round1); + LD_SH2(quant, 8, coeff0, coeff1); + VSHF_H2_SH(coeff0, coeff1, coeff0, coeff1, zigzag_mask0, zigzag_mask1, + quant0, quant2); + LD_SH2(zbin, 8, coeff0, coeff1); + VSHF_H2_SH(coeff0, coeff1, coeff0, coeff1, zigzag_mask0, zigzag_mask1, + z_bin0, z_bin1); + sign_z0 = z0 >> 15; + sign_z1 = z1 >> 15; + x0 = __msa_add_a_h(z0, zero); + x1 = __msa_add_a_h(z1, zero); + SUB2(x0, z_bin0, x1, z_bin1, z_bin0, z_bin1); + SUB2(z_bin0, zbin_o_q, z_bin1, zbin_o_q, z_bin0, z_bin1); + ILVL_H2_SH(quant0, quant0, quant2, quant2, quant1, quant3); + ILVR_H2_SH(quant0, quant0, quant2, quant2, quant0, quant2); + ILVL_H2_SH(round0, x0, round1, x1, temp1_h, temp3_h); + ILVR_H2_SH(round0, x0, round1, x1, temp0_h, temp2_h); + DOTP_SH4_SW(temp0_h, temp1_h, temp2_h, temp3_h, quant0, quant1, quant2, + quant3, temp0_w, temp1_w, temp2_w, temp3_w); + SRA_4V(temp0_w, temp1_w, temp2_w, temp3_w, 16); + PCKEV_H2_SH(temp1_w, temp0_w, temp3_w, temp2_w, temp0_h, temp2_h); + LD_SH2(quant_shift, 8, coeff0, coeff1); + VSHF_H2_SH(coeff0, coeff1, coeff0, coeff1, zigzag_mask0, zigzag_mask1, + quant0, quant2); + ILVL_H2_SH(quant0, quant0, quant2, quant2, quant1, quant3); + ILVR_H2_SH(quant0, quant0, quant2, quant2, quant0, quant2); + ADD2(x0, round0, x1, round1, x0, x1); + ILVL_H2_SH(temp0_h, x0, temp2_h, x1, temp1_h, temp3_h); + ILVR_H2_SH(temp0_h, x0, temp2_h, x1, temp0_h, temp2_h); + DOTP_SH4_SW(temp0_h, temp1_h, temp2_h, temp3_h, quant0, quant1, quant2, + quant3, temp0_w, temp1_w, temp2_w, temp3_w); + SRA_4V(temp0_w, temp1_w, temp2_w, temp3_w, 16); + PCKEV_H2_SH(temp1_w, temp0_w, temp3_w, temp2_w, x0, x1); + sign_x0 = x0 ^ sign_z0; + sign_x1 = x1 ^ sign_z1; + SUB2(sign_x0, sign_z0, sign_x1, sign_z1, sign_x0, sign_x1); + for (cnt = 0; cnt < 16; ++cnt) + { + if (cnt <= 7) + { + if (boost_temp[0] <= z_bin0[cnt]) + { + if (x0[cnt]) + { + eob = cnt; + boost_temp = zbin_boost; + } + else + { + 
boost_temp++; + } + } + else + { + sign_x0[cnt] = 0; + boost_temp++; + } + } + else + { + if (boost_temp[0] <= z_bin1[cnt - 8]) + { + if (x1[cnt - 8]) + { + eob = cnt; + boost_temp = zbin_boost; + } + else + { + boost_temp++; + } + } + else + { + sign_x1[cnt - 8] = 0; + boost_temp++; + } + } + } + + VSHF_H2_SH(sign_x0, sign_x1, sign_x0, sign_x1, inv_zig_zag0, inv_zig_zag1, + q_coeff0, q_coeff1); + ST_SH2(q_coeff0, q_coeff1, q_coeff, 8); + LD_SH2(de_quant, 8, de_quant0, de_quant1); + MUL2(de_quant0, q_coeff0, de_quant1, q_coeff1, de_quant0, de_quant1); + ST_SH2(de_quant0, de_quant1, dq_coeff, 8); + + return (int8_t)(eob + 1); +} + +void vp8_fast_quantize_b_msa(BLOCK *b, BLOCKD *d) +{ + int16_t *coeff_ptr = b->coeff; + int16_t *zbin_ptr = b->zbin; + int16_t *round_ptr = b->round; + int16_t *quant_ptr = b->quant_fast; + int16_t *qcoeff_ptr = d->qcoeff; + int16_t *dqcoeff_ptr = d->dqcoeff; + int16_t *dequant_ptr = d->dequant; + + *d->eob = fast_quantize_b_msa(coeff_ptr, zbin_ptr, round_ptr, quant_ptr, + dequant_ptr, qcoeff_ptr, dqcoeff_ptr); +} + +void vp8_regular_quantize_b_msa(BLOCK *b, BLOCKD *d) +{ + int16_t *zbin_boost_ptr = b->zrun_zbin_boost; + int16_t *coeff_ptr = b->coeff; + int16_t *zbin_ptr = b->zbin; + int16_t *round_ptr = b->round; + int16_t *quant_ptr = b->quant; + int16_t *quant_shift_ptr = b->quant_shift; + int16_t *qcoeff_ptr = d->qcoeff; + int16_t *dqcoeff_ptr = d->dqcoeff; + int16_t *dequant_ptr = d->dequant; + int16_t zbin_oq_value = b->zbin_extra; + + *d->eob = exact_regular_quantize_b_msa(zbin_boost_ptr, coeff_ptr, + zbin_ptr, round_ptr, + quant_ptr, quant_shift_ptr, + dequant_ptr, zbin_oq_value, + qcoeff_ptr, dqcoeff_ptr); +} diff --git a/vp8/vp8cx.mk b/vp8/vp8cx.mk index 851155619..4be902b29 100644 --- a/vp8/vp8cx.mk +++ b/vp8/vp8cx.mk @@ -104,5 +104,6 @@ VP8_CX_SRCS_REMOVE-$(HAVE_SSE2) += encoder/x86/temporal_filter_apply_sse2.asm endif VP8_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/dct_msa.c +VP8_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/quantize_msa.c VP8_CX_SRCS-yes := $(filter-out $(VP8_CX_SRCS_REMOVE-yes),$(VP8_CX_SRCS-yes))
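
For reference, the arithmetic at the core of the MSA fast quantizer is the scalar loop sketched below. This is an illustrative sketch only, not the exact vp8_fast_quantize_b_c source: fast_quantize_b_sketch is a hypothetical name, and zig_zag[] is the standard VP8 4x4 zigzag scan order that the zigzag_mask0/zigzag_mask1 shuffle vectors in quantize_msa.c encode.

    /* Scalar sketch of the fast-quantize arithmetic (illustrative only). */
    #include <stdint.h>

    static const int zig_zag[16] = { 0, 1,  4,  8, 5,  2,  3,  6,
                                     9, 12, 13, 10, 7, 11, 14, 15 };

    static int8_t fast_quantize_b_sketch(const int16_t *coeff,
                                         const int16_t *round,
                                         const int16_t *quant,
                                         const int16_t *dequant,
                                         int16_t *qcoeff, int16_t *dqcoeff)
    {
        int i, eob = -1;

        for (i = 0; i < 16; ++i)
        {
            const int rc = zig_zag[i];      /* VSHF_H2_SH gathers these up front */
            const int z = coeff[rc];
            const int sz = z >> 31;         /* sign mask, like sign_z0/sign_z1 */
            const int x = (z ^ sz) - sz;    /* abs(z), like __msa_add_a_h */
            /* 16x16->32 multiply, keep the high half: DOTP_SH4_SW followed by
               SRA_4V(..., 16) and PCKEV_H2_SH in the vector code */
            const int y = ((x + round[rc]) * quant[rc]) >> 16;
            const int q = (y ^ sz) - sz;    /* restore the sign: xor plus SUB2 */

            qcoeff[rc] = (int16_t)q;
            dqcoeff[rc] = (int16_t)(q * dequant[rc]);
            if (y) eob = i;                 /* last nonzero in zigzag order */
        }

        return (int8_t)(eob + 1);
    }

The MSA implementation performs the same steps on whole vectors: coefficients, round and quant values are gathered into zigzag order with VSHF_H2_SH, the multiply and high-half extraction are done with DOTP_SH4_SW, SRA_4V and PCKEV_H2_SH, signs are restored with the xor/SUB2 pair, and the results are scattered back through the inverse-zigzag shuffle before a scan from the highest lane downward locates the end-of-block position. The regular quantizer follows the same pattern but adds the zbin/zbin_boost gating and a second quant_shift multiply stage on top of it.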