From 435308e02944d1b01c75f593d10569b0c23cb297 Mon Sep 17 00:00:00 2001 From: Parag Salasakar Date: Tue, 5 Jul 2016 20:00:43 +0530 Subject: [PATCH] Add MSA optimized encoder transform functions We add the following MSA optimized encoder transform functions: - ITransform - FTransform - FTransformWHT Change-Id: Ia6b17556aba5aff2d7a88208905fb45293d080a8 --- Android.mk | 1 + Makefile.vc | 1 + build.gradle | 1 + makefile.unix | 1 + src/dsp/Makefile.am | 14 +++- src/dsp/enc.c | 6 ++ src/dsp/enc_msa.c | 183 ++++++++++++++++++++++++++++++++++++++++++++ src/dsp/msa_macro.h | 115 ++++++++++++++++++++++++++++ 8 files changed, 318 insertions(+), 4 deletions(-) create mode 100644 src/dsp/enc_msa.c diff --git a/Android.mk b/Android.mk index 8aaa7544..f935512d 100644 --- a/Android.mk +++ b/Android.mk @@ -85,6 +85,7 @@ dsp_enc_srcs := \ src/dsp/enc_avx2.c \ src/dsp/enc_mips32.c \ src/dsp/enc_mips_dsp_r2.c \ + src/dsp/enc_msa.c \ src/dsp/enc_neon.$(NEON) \ src/dsp/enc_sse2.c \ src/dsp/enc_sse41.c \ diff --git a/Makefile.vc b/Makefile.vc index 502d4898..499ea583 100644 --- a/Makefile.vc +++ b/Makefile.vc @@ -233,6 +233,7 @@ DSP_ENC_OBJS = \ $(DIROBJ)\dsp\enc_avx2.obj \ $(DIROBJ)\dsp\enc_mips32.obj \ $(DIROBJ)\dsp\enc_mips_dsp_r2.obj \ + $(DIROBJ)\dsp\enc_msa.obj \ $(DIROBJ)\dsp\enc_neon.obj \ $(DIROBJ)\dsp\enc_sse2.obj \ $(DIROBJ)\dsp\enc_sse41.obj \ diff --git a/build.gradle b/build.gradle index 0358e033..3c0ff2b5 100644 --- a/build.gradle +++ b/build.gradle @@ -165,6 +165,7 @@ model { include "enc_avx2.c" include "enc_mips32.c" include "enc_mips_dsp_r2.c" + include "enc_msa.c" include "enc_neon.$NEON" include "enc_sse2.c" include "enc_sse41.c" diff --git a/makefile.unix b/makefile.unix index abb7649a..015d0b85 100644 --- a/makefile.unix +++ b/makefile.unix @@ -177,6 +177,7 @@ DSP_ENC_OBJS = \ src/dsp/enc_avx2.o \ src/dsp/enc_mips32.o \ src/dsp/enc_mips_dsp_r2.o \ + src/dsp/enc_msa.o \ src/dsp/enc_neon.o \ src/dsp/enc_sse2.o \ src/dsp/enc_sse41.o \ diff --git a/src/dsp/Makefile.am b/src/dsp/Makefile.am index b4152f5a..c85b8d88 100644 --- a/src/dsp/Makefile.am +++ b/src/dsp/Makefile.am @@ -2,7 +2,7 @@ noinst_LTLIBRARIES = libwebpdsp.la libwebpdsp_avx2.la noinst_LTLIBRARIES += libwebpdsp_sse2.la libwebpdspdecode_sse2.la noinst_LTLIBRARIES += libwebpdsp_sse41.la libwebpdspdecode_sse41.la noinst_LTLIBRARIES += libwebpdsp_neon.la libwebpdspdecode_neon.la -noinst_LTLIBRARIES += libwebpdspdecode_msa.la +noinst_LTLIBRARIES += libwebpdsp_msa.la libwebpdspdecode_msa.la if BUILD_LIBWEBPDECODER noinst_LTLIBRARIES += libwebpdspdecode.la @@ -86,8 +86,8 @@ libwebpdspdecode_msa_la_SOURCES += dec_msa.c libwebpdspdecode_msa_la_SOURCES += filters_msa.c libwebpdspdecode_msa_la_SOURCES += lossless_msa.c libwebpdspdecode_msa_la_SOURCES += msa_macro.h -libwebpdspdecode_msa_la_CPPFLAGS = $(libwebpdsp_la_CPPFLAGS) -libwebpdspdecode_msa_la_CFLAGS = $(AM_CFLAGS) +libwebpdspdecode_msa_la_CPPFLAGS = $(libwebpdsp_msa_la_CPPFLAGS) +libwebpdspdecode_msa_la_CFLAGS = $(libwebpdsp_msa_la_CFLAGS) libwebpdsp_sse2_la_SOURCES = libwebpdsp_sse2_la_SOURCES += argb_sse2.c @@ -112,6 +112,12 @@ libwebpdsp_neon_la_CPPFLAGS = $(libwebpdsp_la_CPPFLAGS) libwebpdsp_neon_la_CFLAGS = $(AM_CFLAGS) $(NEON_FLAGS) libwebpdsp_neon_la_LIBADD = libwebpdspdecode_neon.la +libwebpdsp_msa_la_SOURCES = +libwebpdsp_msa_la_SOURCES += enc_msa.c +libwebpdsp_msa_la_CPPFLAGS = $(libwebpdsp_la_CPPFLAGS) +libwebpdsp_msa_la_CFLAGS = $(AM_CFLAGS) +libwebpdsp_msa_la_LIBADD = libwebpdspdecode_msa.la + libwebpdsp_la_SOURCES = $(COMMON_SOURCES) $(ENC_SOURCES) noinst_HEADERS = @@ -126,7 +132,7 @@ libwebpdsp_la_LIBADD = libwebpdsp_la_LIBADD += libwebpdsp_avx2.la libwebpdsp_sse2.la libwebpdsp_la_LIBADD += libwebpdsp_sse41.la libwebpdsp_la_LIBADD += libwebpdsp_neon.la -libwebpdsp_la_LIBADD += libwebpdspdecode_msa.la +libwebpdsp_la_LIBADD += libwebpdsp_msa.la if BUILD_LIBWEBPDECODER libwebpdspdecode_la_SOURCES = $(COMMON_SOURCES) diff --git a/src/dsp/enc.c b/src/dsp/enc.c index f639f557..9c9a88b0 100644 --- a/src/dsp/enc.c +++ b/src/dsp/enc.c @@ -795,6 +795,7 @@ extern void VP8EncDspInitAVX2(void); extern void VP8EncDspInitNEON(void); extern void VP8EncDspInitMIPS32(void); extern void VP8EncDspInitMIPSdspR2(void); +extern void VP8EncDspInitMSA(void); static volatile VP8CPUInfo enc_last_cpuinfo_used = (VP8CPUInfo)&enc_last_cpuinfo_used; @@ -857,6 +858,11 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInit(void) { if (VP8GetCPUInfo(kMIPSdspR2)) { VP8EncDspInitMIPSdspR2(); } +#endif +#if defined(WEBP_USE_MSA) + if (VP8GetCPUInfo(kMSA)) { + VP8EncDspInitMSA(); + } #endif } enc_last_cpuinfo_used = VP8GetCPUInfo; diff --git a/src/dsp/enc_msa.c b/src/dsp/enc_msa.c new file mode 100644 index 00000000..e4344600 --- /dev/null +++ b/src/dsp/enc_msa.c @@ -0,0 +1,183 @@ +// Copyright 2016 Google Inc. All Rights Reserved. +// +// Use of this source code is governed by a BSD-style license +// that can be found in the COPYING file in the root of the source +// tree. An additional intellectual property rights grant can be found +// in the file PATENTS. All contributing project authors may +// be found in the AUTHORS file in the root of the source tree. +// ----------------------------------------------------------------------------- +// +// MSA version of encoder dsp functions. +// +// Author: Prashant Patil (prashant.patil@imgtec.com) + +#include "./dsp.h" + +#if defined(WEBP_USE_MSA) + +#include "./msa_macro.h" + +//------------------------------------------------------------------------------ +// Transforms + +#define IDCT_1D_W(in0, in1, in2, in3, out0, out1, out2, out3) do { \ + v4i32 a1_m, b1_m, c1_m, d1_m; \ + const v4i32 cospi8sqrt2minus1 = __msa_fill_w(20091); \ + const v4i32 sinpi8sqrt2 = __msa_fill_w(35468); \ + v4i32 c_tmp1_m = in1 * sinpi8sqrt2; \ + v4i32 c_tmp2_m = in3 * cospi8sqrt2minus1; \ + v4i32 d_tmp1_m = in1 * cospi8sqrt2minus1; \ + v4i32 d_tmp2_m = in3 * sinpi8sqrt2; \ + \ + ADDSUB2(in0, in2, a1_m, b1_m); \ + SRAI_W2_SW(c_tmp1_m, c_tmp2_m, 16); \ + c_tmp2_m = c_tmp2_m + in3; \ + c1_m = c_tmp1_m - c_tmp2_m; \ + SRAI_W2_SW(d_tmp1_m, d_tmp2_m, 16); \ + d_tmp1_m = d_tmp1_m + in1; \ + d1_m = d_tmp1_m + d_tmp2_m; \ + BUTTERFLY_4(a1_m, b1_m, c1_m, d1_m, out0, out1, out2, out3); \ +} while (0) + +static WEBP_INLINE void ITransformOne(const uint8_t* ref, const int16_t* in, + uint8_t* dst) { + v8i16 input0, input1; + v4i32 in0, in1, in2, in3, hz0, hz1, hz2, hz3, vt0, vt1, vt2, vt3; + v4i32 res0, res1, res2, res3; + v16i8 dest0, dest1, dest2, dest3; + const v16i8 zero = { 0 }; + + LD_SH2(in, 8, input0, input1); + UNPCK_SH_SW(input0, in0, in1); + UNPCK_SH_SW(input1, in2, in3); + IDCT_1D_W(in0, in1, in2, in3, hz0, hz1, hz2, hz3); + TRANSPOSE4x4_SW_SW(hz0, hz1, hz2, hz3, hz0, hz1, hz2, hz3); + IDCT_1D_W(hz0, hz1, hz2, hz3, vt0, vt1, vt2, vt3); + SRARI_W4_SW(vt0, vt1, vt2, vt3, 3); + TRANSPOSE4x4_SW_SW(vt0, vt1, vt2, vt3, vt0, vt1, vt2, vt3); + LD_SB4(ref, BPS, dest0, dest1, dest2, dest3); + ILVR_B4_SW(zero, dest0, zero, dest1, zero, dest2, zero, dest3, + res0, res1, res2, res3); + ILVR_H4_SW(zero, res0, zero, res1, zero, res2, zero, res3, + res0, res1, res2, res3); + ADD4(res0, vt0, res1, vt1, res2, vt2, res3, vt3, res0, res1, res2, res3); + CLIP_SW4_0_255(res0, res1, res2, res3); + PCKEV_B2_SW(res0, res1, res2, res3, vt0, vt1); + res0 = (v4i32)__msa_pckev_b((v16i8)vt0, (v16i8)vt1); + ST4x4_UB(res0, res0, 3, 2, 1, 0, dst, BPS); +} + +static void ITransform(const uint8_t* ref, const int16_t* in, uint8_t* dst, + int do_two) { + ITransformOne(ref, in, dst); + if (do_two) { + ITransformOne(ref + 4, in + 16, dst + 4); + } +} + +static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) { + uint64_t out0, out1, out2, out3; + uint32_t in0, in1, in2, in3; + v4i32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5; + v8i16 t0, t1, t2, t3; + v16u8 srcl0, srcl1, src0, src1; + const v8i16 mask0 = { 0, 4, 8, 12, 1, 5, 9, 13 }; + const v8i16 mask1 = { 3, 7, 11, 15, 2, 6, 10, 14 }; + const v8i16 mask2 = { 4, 0, 5, 1, 6, 2, 7, 3 }; + const v8i16 mask3 = { 0, 4, 1, 5, 2, 6, 3, 7 }; + const v8i16 cnst0 = { 2217, -5352, 2217, -5352, 2217, -5352, 2217, -5352 }; + const v8i16 cnst1 = { 5352, 2217, 5352, 2217, 5352, 2217, 5352, 2217 }; + + LW4(src, BPS, in0, in1, in2, in3); + INSERT_W4_UB(in0, in1, in2, in3, src0); + LW4(ref, BPS, in0, in1, in2, in3); + INSERT_W4_UB(in0, in1, in2, in3, src1); + ILVRL_B2_UB(src0, src1, srcl0, srcl1); + HSUB_UB2_SH(srcl0, srcl1, t0, t1); + VSHF_H2_SH(t0, t1, t0, t1, mask0, mask1, t2, t3); + ADDSUB2(t2, t3, t0, t1); + t0 = SRLI_H(t0, 3); + VSHF_H2_SH(t0, t0, t1, t1, mask2, mask3, t3, t2); + tmp0 = __msa_hadd_s_w(t3, t3); + tmp2 = __msa_hsub_s_w(t3, t3); + FILL_W2_SW(1812, 937, tmp1, tmp3); + DPADD_SH2_SW(t2, t2, cnst0, cnst1, tmp3, tmp1); + SRAI_W2_SW(tmp1, tmp3, 9); + PCKEV_H2_SH(tmp1, tmp0, tmp3, tmp2, t0, t1); + VSHF_H2_SH(t0, t1, t0, t1, mask0, mask1, t2, t3); + ADDSUB2(t2, t3, t0, t1); + VSHF_H2_SH(t0, t0, t1, t1, mask2, mask3, t3, t2); + tmp0 = __msa_hadd_s_w(t3, t3); + tmp2 = __msa_hsub_s_w(t3, t3); + ADDVI_W2_SW(tmp0, 7, tmp2, 7, tmp0, tmp2); + SRAI_W2_SW(tmp0, tmp2, 4); + FILL_W2_SW(12000, 51000, tmp1, tmp3); + DPADD_SH2_SW(t2, t2, cnst0, cnst1, tmp3, tmp1); + SRAI_W2_SW(tmp1, tmp3, 16); + UNPCK_R_SH_SW(t1, tmp4); + tmp5 = __msa_ceqi_w(tmp4, 0); + tmp4 = (v4i32)__msa_nor_v((v16u8)tmp5, (v16u8)tmp5); + tmp5 = __msa_fill_w(1); + tmp5 = (v4i32)__msa_and_v((v16u8)tmp5, (v16u8)tmp4); + tmp1 += tmp5; + PCKEV_H2_SH(tmp1, tmp0, tmp3, tmp2, t0, t1); + out0 = __msa_copy_s_d((v2i64)t0, 0); + out1 = __msa_copy_s_d((v2i64)t0, 1); + out2 = __msa_copy_s_d((v2i64)t1, 0); + out3 = __msa_copy_s_d((v2i64)t1, 1); + SD4(out0, out1, out2, out3, out, 8); +} + +static void FTransformWHT(const int16_t* in, int16_t* out) { + v8i16 in0 = { 0 }; + v8i16 in1 = { 0 }; + v8i16 tmp0, tmp1, tmp2, tmp3; + v8i16 out0, out1; + const v8i16 mask0 = { 0, 1, 2, 3, 8, 9, 10, 11 }; + const v8i16 mask1 = { 4, 5, 6, 7, 12, 13, 14, 15 }; + const v8i16 mask2 = { 0, 4, 8, 12, 1, 5, 9, 13 }; + const v8i16 mask3 = { 3, 7, 11, 15, 2, 6, 10, 14 }; + + in0 = __msa_insert_h(in0, 0, in[ 0]); + in0 = __msa_insert_h(in0, 1, in[ 64]); + in0 = __msa_insert_h(in0, 2, in[128]); + in0 = __msa_insert_h(in0, 3, in[192]); + in0 = __msa_insert_h(in0, 4, in[ 16]); + in0 = __msa_insert_h(in0, 5, in[ 80]); + in0 = __msa_insert_h(in0, 6, in[144]); + in0 = __msa_insert_h(in0, 7, in[208]); + in1 = __msa_insert_h(in1, 0, in[ 48]); + in1 = __msa_insert_h(in1, 1, in[112]); + in1 = __msa_insert_h(in1, 2, in[176]); + in1 = __msa_insert_h(in1, 3, in[240]); + in1 = __msa_insert_h(in1, 4, in[ 32]); + in1 = __msa_insert_h(in1, 5, in[ 96]); + in1 = __msa_insert_h(in1, 6, in[160]); + in1 = __msa_insert_h(in1, 7, in[224]); + ADDSUB2(in0, in1, tmp0, tmp1); + VSHF_H2_SH(tmp0, tmp1, tmp0, tmp1, mask0, mask1, tmp2, tmp3); + ADDSUB2(tmp2, tmp3, tmp0, tmp1); + VSHF_H2_SH(tmp0, tmp1, tmp0, tmp1, mask2, mask3, in0, in1); + ADDSUB2(in0, in1, tmp0, tmp1); + VSHF_H2_SH(tmp0, tmp1, tmp0, tmp1, mask0, mask1, tmp2, tmp3); + ADDSUB2(tmp2, tmp3, out0, out1); + SRAI_H2_SH(out0, out1, 1); + ST_SH2(out0, out1, out, 8); +} + +//------------------------------------------------------------------------------ +// Entry point + +extern void VP8EncDspInitMSA(void); + +WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitMSA(void) { + VP8ITransform = ITransform; + VP8FTransform = FTransform; + VP8FTransformWHT = FTransformWHT; +} + +#else // !WEBP_USE_MSA + +WEBP_DSP_INIT_STUB(VP8EncDspInitMSA) + +#endif // WEBP_USE_MSA diff --git a/src/dsp/msa_macro.h b/src/dsp/msa_macro.h index 328aea21..c5c9d384 100644 --- a/src/dsp/msa_macro.h +++ b/src/dsp/msa_macro.h @@ -23,14 +23,18 @@ #ifdef CLANG_BUILD #define ADDVI_H(a, b) __msa_addvi_h((v8i16)a, b) + #define ADDVI_W(a, b) __msa_addvi_w((v4i32)a, b) #define SRAI_B(a, b) __msa_srai_b((v16i8)a, b) #define SRAI_H(a, b) __msa_srai_h((v8i16)a, b) #define SRAI_W(a, b) __msa_srai_w((v4i32)a, b) + #define SRLI_H(a, b) __msa_srli_h((v8i16)a, b) #else #define ADDVI_H(a, b) (a + b) + #define ADDVI_W(a, b) (a + b) #define SRAI_B(a, b) (a >> b) #define SRAI_H(a, b) (a >> b) #define SRAI_W(a, b) (a >> b) + #define SRLI_H(a, b) (a << b) #endif #define LD_B(RTYPE, psrc) *((RTYPE*)(psrc)) @@ -257,6 +261,18 @@ } while (0) #define ST_UB8(...) ST_B8(v16u8, __VA_ARGS__) +/* Description : Store vectors of 8 halfword elements with stride + * Arguments : Inputs - in0, in1, pdst, stride + * Details : Store 8 halfword elements from 'in0' to (pdst) + * Store 8 halfword elements from 'in1' to (pdst + stride) + */ +#define ST_H2(RTYPE, in0, in1, pdst, stride) do { \ + ST_H(RTYPE, in0, pdst); \ + ST_H(RTYPE, in1, pdst + stride); \ +} while (0) +#define ST_UH2(...) ST_H2(v8u16, __VA_ARGS__) +#define ST_SH2(...) ST_H2(v8i16, __VA_ARGS__) + /* Description : Store 2x4 byte block to destination memory from input vector * Arguments : Inputs - in, stidx, pdst, stride * Details : Index 'stidx' halfword element from 'in' vector is copied to @@ -377,6 +393,22 @@ } while (0) #define DOTP_SB2_SH(...) DOTP_SB2(v8i16, __VA_ARGS__) +/* Description : Dot product & addition of halfword vector elements + * Arguments : Inputs - mult0, mult1, cnst0, cnst1 + * Outputs - out0, out1 + * Return Type - as per RTYPE + * Details : Signed halfword elements from 'mult0' are multiplied with + * signed halfword elements from 'cnst0' producing a result + * twice the size of input i.e. signed word. + * The multiplication result of adjacent odd-even elements + * are added to the 'out0' vector + */ +#define DPADD_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) do { \ + out0 = (RTYPE)__msa_dpadd_s_w((v4i32)out0, (v8i16)mult0, (v8i16)cnst0); \ + out1 = (RTYPE)__msa_dpadd_s_w((v4i32)out1, (v8i16)mult1, (v8i16)cnst1); \ +} while (0) +#define DPADD_SH2_SW(...) DPADD_SH2(v4i32, __VA_ARGS__) + /* Description : Clips all signed halfword elements of input vector * between 0 & 255 * Arguments : Input/output - val @@ -434,6 +466,22 @@ static WEBP_INLINE uint32_t func_hadd_uh_u32(v8u16 in) { } #define HADD_UH_U32(in) func_hadd_uh_u32(in) +/* Description : Horizontal subtraction of unsigned byte vector elements + * Arguments : Inputs - in0, in1 + * Outputs - out0, out1 + * Return Type - as per RTYPE + * Details : Each unsigned odd byte element from 'in0' is subtracted from + * even unsigned byte element from 'in0' (pairwise) and the + * halfword result is written to 'out0' + */ +#define HSUB_UB2(RTYPE, in0, in1, out0, out1) do { \ + out0 = (RTYPE)__msa_hsub_u_h((v16u8)in0, (v16u8)in0); \ + out1 = (RTYPE)__msa_hsub_u_h((v16u8)in1, (v16u8)in1); \ +} while (0) +#define HSUB_UB2_UH(...) HSUB_UB2(v8u16, __VA_ARGS__) +#define HSUB_UB2_SH(...) HSUB_UB2(v8i16, __VA_ARGS__) +#define HSUB_UB2_SW(...) HSUB_UB2(v4i32, __VA_ARGS__) + /* Description : Set element n input vector to GPR value * Arguments : Inputs - in0, in1, in2, in3 * Output - out @@ -745,6 +793,23 @@ static WEBP_INLINE uint32_t func_hadd_uh_u32(v8u16 in) { #define PCKEV_B4_SH(...) PCKEV_B4(v8i16, __VA_ARGS__) #define PCKEV_B4_SW(...) PCKEV_B4(v4i32, __VA_ARGS__) +/* Description : Pack even halfword elements of vector pairs + * Arguments : Inputs - in0, in1, in2, in3 + * Outputs - out0, out1 + * Return Type - as per RTYPE + * Details : Even halfword elements of 'in0' are copied to the left half of + * 'out0' & even halfword elements of 'in1' are copied to the + * right half of 'out0'. + */ +#define PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1) do { \ + out0 = (RTYPE)__msa_pckev_h((v8i16)in0, (v8i16)in1); \ + out1 = (RTYPE)__msa_pckev_h((v8i16)in2, (v8i16)in3); \ +} while (0) +#define PCKEV_H2_UH(...) PCKEV_H2(v8u16, __VA_ARGS__) +#define PCKEV_H2_SH(...) PCKEV_H2(v8i16, __VA_ARGS__) +#define PCKEV_H2_SW(...) PCKEV_H2(v4i32, __VA_ARGS__) +#define PCKEV_H2_UW(...) PCKEV_H2(v4u32, __VA_ARGS__) + /* Description : Arithmetic immediate shift right all elements of word vector * Arguments : Inputs - in0, in1, shift * Outputs - in place operation @@ -814,6 +879,30 @@ static WEBP_INLINE uint32_t func_hadd_uh_u32(v8u16 in) { #define ADDVI_H2_SH(...) ADDVI_H2(v8i16, __VA_ARGS__) #define ADDVI_H2_UH(...) ADDVI_H2(v8u16, __VA_ARGS__) +/* Description : Addition of 2 pairs of word vectors + * Arguments : Inputs - in0, in1, in2, in3 + * Outputs - out0, out1 + * Details : Each element in 'in0' is added to 'in1' and result is written + * to 'out0'. + */ +#define ADDVI_W2(RTYPE, in0, in1, in2, in3, out0, out1) do { \ + out0 = (RTYPE)ADDVI_W(in0, in1); \ + out1 = (RTYPE)ADDVI_W(in2, in3); \ +} while (0) +#define ADDVI_W2_SW(...) ADDVI_W2(v4i32, __VA_ARGS__) + +/* Description : Fill 2 pairs of word vectors with GP registers + * Arguments : Inputs - in0, in1 + * Outputs - out0, out1 + * Details : GP register in0 is replicated in each word element of out0 + * GP register in1 is replicated in each word element of out1 + */ +#define FILL_W2(RTYPE, in0, in1, out0, out1) do { \ + out0 = (RTYPE)__msa_fill_w(in0); \ + out1 = (RTYPE)__msa_fill_w(in1); \ +} while (0) +#define FILL_W2_SW(...) FILL_W2(v4i32, __VA_ARGS__) + /* Description : Addition of 2 pairs of vectors * Arguments : Inputs - in0, in1, in2, in3 * Outputs - out0, out1 @@ -842,6 +931,32 @@ static WEBP_INLINE uint32_t func_hadd_uh_u32(v8u16 in) { out1 = in2 - in3; \ } while (0) +/* Description : Addition - Subtraction of input vectors + * Arguments : Inputs - in0, in1 + * Outputs - out0, out1 + * Details : Each element in 'in1' is added to 'in0' and result is + * written to 'out0'. + * Each element in 'in1' is subtracted from 'in0' and result is + * written to 'out1'. + */ +#define ADDSUB2(in0, in1, out0, out1) do { \ + out0 = in0 + in1; \ + out1 = in0 - in1; \ +} while (0) + +/* Description : Sign extend halfword elements from right half of the vector + * Arguments : Input - in (halfword vector) + * Output - out (sign extended word vector) + * Return Type - signed word + * Details : Sign bit of halfword elements from input vector 'in' is + * extracted and interleaved with same vector 'in0' to generate + * 4 word elements keeping sign intact + */ +#define UNPCK_R_SH_SW(in, out) do { \ + const v8i16 sign_m = __msa_clti_s_h((v8i16)in, 0); \ + out = (v4i32)__msa_ilvr_h(sign_m, (v8i16)in); \ +} while (0) + /* Description : Sign extend halfword elements from input vector and return * the result in pair of vectors * Arguments : Input - in (halfword vector)