Add MSA optimized encoder transform functions
We add the following MSA optimized encoder transform functions: - ITransform - FTransform - FTransformWHT Change-Id: Ia6b17556aba5aff2d7a88208905fb45293d080a8
This commit is contained in:
parent
dce64bfa1b
commit
435308e029
@ -85,6 +85,7 @@ dsp_enc_srcs := \
|
||||
src/dsp/enc_avx2.c \
|
||||
src/dsp/enc_mips32.c \
|
||||
src/dsp/enc_mips_dsp_r2.c \
|
||||
src/dsp/enc_msa.c \
|
||||
src/dsp/enc_neon.$(NEON) \
|
||||
src/dsp/enc_sse2.c \
|
||||
src/dsp/enc_sse41.c \
|
||||
|
@ -233,6 +233,7 @@ DSP_ENC_OBJS = \
|
||||
$(DIROBJ)\dsp\enc_avx2.obj \
|
||||
$(DIROBJ)\dsp\enc_mips32.obj \
|
||||
$(DIROBJ)\dsp\enc_mips_dsp_r2.obj \
|
||||
$(DIROBJ)\dsp\enc_msa.obj \
|
||||
$(DIROBJ)\dsp\enc_neon.obj \
|
||||
$(DIROBJ)\dsp\enc_sse2.obj \
|
||||
$(DIROBJ)\dsp\enc_sse41.obj \
|
||||
|
@ -165,6 +165,7 @@ model {
|
||||
include "enc_avx2.c"
|
||||
include "enc_mips32.c"
|
||||
include "enc_mips_dsp_r2.c"
|
||||
include "enc_msa.c"
|
||||
include "enc_neon.$NEON"
|
||||
include "enc_sse2.c"
|
||||
include "enc_sse41.c"
|
||||
|
@ -177,6 +177,7 @@ DSP_ENC_OBJS = \
|
||||
src/dsp/enc_avx2.o \
|
||||
src/dsp/enc_mips32.o \
|
||||
src/dsp/enc_mips_dsp_r2.o \
|
||||
src/dsp/enc_msa.o \
|
||||
src/dsp/enc_neon.o \
|
||||
src/dsp/enc_sse2.o \
|
||||
src/dsp/enc_sse41.o \
|
||||
|
@ -2,7 +2,7 @@ noinst_LTLIBRARIES = libwebpdsp.la libwebpdsp_avx2.la
|
||||
noinst_LTLIBRARIES += libwebpdsp_sse2.la libwebpdspdecode_sse2.la
|
||||
noinst_LTLIBRARIES += libwebpdsp_sse41.la libwebpdspdecode_sse41.la
|
||||
noinst_LTLIBRARIES += libwebpdsp_neon.la libwebpdspdecode_neon.la
|
||||
noinst_LTLIBRARIES += libwebpdspdecode_msa.la
|
||||
noinst_LTLIBRARIES += libwebpdsp_msa.la libwebpdspdecode_msa.la
|
||||
|
||||
if BUILD_LIBWEBPDECODER
|
||||
noinst_LTLIBRARIES += libwebpdspdecode.la
|
||||
@ -86,8 +86,8 @@ libwebpdspdecode_msa_la_SOURCES += dec_msa.c
|
||||
libwebpdspdecode_msa_la_SOURCES += filters_msa.c
|
||||
libwebpdspdecode_msa_la_SOURCES += lossless_msa.c
|
||||
libwebpdspdecode_msa_la_SOURCES += msa_macro.h
|
||||
libwebpdspdecode_msa_la_CPPFLAGS = $(libwebpdsp_la_CPPFLAGS)
|
||||
libwebpdspdecode_msa_la_CFLAGS = $(AM_CFLAGS)
|
||||
libwebpdspdecode_msa_la_CPPFLAGS = $(libwebpdsp_msa_la_CPPFLAGS)
|
||||
libwebpdspdecode_msa_la_CFLAGS = $(libwebpdsp_msa_la_CFLAGS)
|
||||
|
||||
libwebpdsp_sse2_la_SOURCES =
|
||||
libwebpdsp_sse2_la_SOURCES += argb_sse2.c
|
||||
@ -112,6 +112,12 @@ libwebpdsp_neon_la_CPPFLAGS = $(libwebpdsp_la_CPPFLAGS)
|
||||
libwebpdsp_neon_la_CFLAGS = $(AM_CFLAGS) $(NEON_FLAGS)
|
||||
libwebpdsp_neon_la_LIBADD = libwebpdspdecode_neon.la
|
||||
|
||||
libwebpdsp_msa_la_SOURCES =
|
||||
libwebpdsp_msa_la_SOURCES += enc_msa.c
|
||||
libwebpdsp_msa_la_CPPFLAGS = $(libwebpdsp_la_CPPFLAGS)
|
||||
libwebpdsp_msa_la_CFLAGS = $(AM_CFLAGS)
|
||||
libwebpdsp_msa_la_LIBADD = libwebpdspdecode_msa.la
|
||||
|
||||
libwebpdsp_la_SOURCES = $(COMMON_SOURCES) $(ENC_SOURCES)
|
||||
|
||||
noinst_HEADERS =
|
||||
@ -126,7 +132,7 @@ libwebpdsp_la_LIBADD =
|
||||
libwebpdsp_la_LIBADD += libwebpdsp_avx2.la libwebpdsp_sse2.la
|
||||
libwebpdsp_la_LIBADD += libwebpdsp_sse41.la
|
||||
libwebpdsp_la_LIBADD += libwebpdsp_neon.la
|
||||
libwebpdsp_la_LIBADD += libwebpdspdecode_msa.la
|
||||
libwebpdsp_la_LIBADD += libwebpdsp_msa.la
|
||||
|
||||
if BUILD_LIBWEBPDECODER
|
||||
libwebpdspdecode_la_SOURCES = $(COMMON_SOURCES)
|
||||
|
@ -795,6 +795,7 @@ extern void VP8EncDspInitAVX2(void);
|
||||
extern void VP8EncDspInitNEON(void);
|
||||
extern void VP8EncDspInitMIPS32(void);
|
||||
extern void VP8EncDspInitMIPSdspR2(void);
|
||||
extern void VP8EncDspInitMSA(void);
|
||||
|
||||
static volatile VP8CPUInfo enc_last_cpuinfo_used =
|
||||
(VP8CPUInfo)&enc_last_cpuinfo_used;
|
||||
@ -857,6 +858,11 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInit(void) {
|
||||
if (VP8GetCPUInfo(kMIPSdspR2)) {
|
||||
VP8EncDspInitMIPSdspR2();
|
||||
}
|
||||
#endif
|
||||
#if defined(WEBP_USE_MSA)
|
||||
if (VP8GetCPUInfo(kMSA)) {
|
||||
VP8EncDspInitMSA();
|
||||
}
|
||||
#endif
|
||||
}
|
||||
enc_last_cpuinfo_used = VP8GetCPUInfo;
|
||||
|
183
src/dsp/enc_msa.c
Normal file
183
src/dsp/enc_msa.c
Normal file
@ -0,0 +1,183 @@
|
||||
// Copyright 2016 Google Inc. All Rights Reserved.
|
||||
//
|
||||
// Use of this source code is governed by a BSD-style license
|
||||
// that can be found in the COPYING file in the root of the source
|
||||
// tree. An additional intellectual property rights grant can be found
|
||||
// in the file PATENTS. All contributing project authors may
|
||||
// be found in the AUTHORS file in the root of the source tree.
|
||||
// -----------------------------------------------------------------------------
|
||||
//
|
||||
// MSA version of encoder dsp functions.
|
||||
//
|
||||
// Author: Prashant Patil (prashant.patil@imgtec.com)
|
||||
|
||||
#include "./dsp.h"
|
||||
|
||||
#if defined(WEBP_USE_MSA)
|
||||
|
||||
#include "./msa_macro.h"
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Transforms
|
||||
|
||||
// One-dimensional pass of the 4x4 inverse transform, applied lane-wise to four
// vectors of 32-bit words. 20091 and 35468 are used as 16-bit fixed-point
// multipliers (each product is arithmetically shifted right by 16). For the
// 20091 terms the unscaled input is added back after the shift, which is what
// the "minus 1" in the original constant name accounts for.
#define IDCT_1D_W(in0, in1, in2, in3, out0, out1, out2, out3) do {       \
  const v4i32 k_sin_m = __msa_fill_w(35468);                             \
  const v4i32 k_cos_m1_m = __msa_fill_w(20091);                          \
  v4i32 sum_m, diff_m, odd_c_m, odd_d_m;                                 \
  v4i32 in1_sin_m = in1 * k_sin_m;                                       \
  v4i32 in3_cos_m = in3 * k_cos_m1_m;                                    \
  v4i32 in1_cos_m = in1 * k_cos_m1_m;                                    \
  v4i32 in3_sin_m = in3 * k_sin_m;                                       \
                                                                         \
  ADDSUB2(in0, in2, sum_m, diff_m);                                      \
  SRAI_W2_SW(in1_sin_m, in3_cos_m, 16);                                  \
  SRAI_W2_SW(in1_cos_m, in3_sin_m, 16);                                  \
  in3_cos_m = in3_cos_m + in3;                                           \
  in1_cos_m = in1_cos_m + in1;                                           \
  odd_c_m = in1_sin_m - in3_cos_m;                                       \
  odd_d_m = in1_cos_m + in3_sin_m;                                       \
  BUTTERFLY_4(sum_m, diff_m, odd_c_m, odd_d_m, out0, out1, out2, out3);  \
} while (0)
|
||||
|
||||
static WEBP_INLINE void ITransformOne(const uint8_t* ref, const int16_t* in,
|
||||
uint8_t* dst) {
|
||||
v8i16 input0, input1;
|
||||
v4i32 in0, in1, in2, in3, hz0, hz1, hz2, hz3, vt0, vt1, vt2, vt3;
|
||||
v4i32 res0, res1, res2, res3;
|
||||
v16i8 dest0, dest1, dest2, dest3;
|
||||
const v16i8 zero = { 0 };
|
||||
|
||||
LD_SH2(in, 8, input0, input1);
|
||||
UNPCK_SH_SW(input0, in0, in1);
|
||||
UNPCK_SH_SW(input1, in2, in3);
|
||||
IDCT_1D_W(in0, in1, in2, in3, hz0, hz1, hz2, hz3);
|
||||
TRANSPOSE4x4_SW_SW(hz0, hz1, hz2, hz3, hz0, hz1, hz2, hz3);
|
||||
IDCT_1D_W(hz0, hz1, hz2, hz3, vt0, vt1, vt2, vt3);
|
||||
SRARI_W4_SW(vt0, vt1, vt2, vt3, 3);
|
||||
TRANSPOSE4x4_SW_SW(vt0, vt1, vt2, vt3, vt0, vt1, vt2, vt3);
|
||||
LD_SB4(ref, BPS, dest0, dest1, dest2, dest3);
|
||||
ILVR_B4_SW(zero, dest0, zero, dest1, zero, dest2, zero, dest3,
|
||||
res0, res1, res2, res3);
|
||||
ILVR_H4_SW(zero, res0, zero, res1, zero, res2, zero, res3,
|
||||
res0, res1, res2, res3);
|
||||
ADD4(res0, vt0, res1, vt1, res2, vt2, res3, vt3, res0, res1, res2, res3);
|
||||
CLIP_SW4_0_255(res0, res1, res2, res3);
|
||||
PCKEV_B2_SW(res0, res1, res2, res3, vt0, vt1);
|
||||
res0 = (v4i32)__msa_pckev_b((v16i8)vt0, (v16i8)vt1);
|
||||
ST4x4_UB(res0, res0, 3, 2, 1, 0, dst, BPS);
|
||||
}
|
||||
|
||||
// Inverse-transforms the 4x4 block at (ref, in, dst). When 'do_two' is
// non-zero, also processes a second block located 4 bytes further in 'ref'
// and 'dst' and 16 coefficients further in 'in'.
static void ITransform(const uint8_t* ref, const int16_t* in, uint8_t* dst,
                       int do_two) {
  ITransformOne(ref, in, dst);
  if (!do_two) return;
  ITransformOne(ref + 4, in + 16, dst + 4);
}
|
||||
|
||||
static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) {
|
||||
uint64_t out0, out1, out2, out3;
|
||||
uint32_t in0, in1, in2, in3;
|
||||
v4i32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
|
||||
v8i16 t0, t1, t2, t3;
|
||||
v16u8 srcl0, srcl1, src0, src1;
|
||||
const v8i16 mask0 = { 0, 4, 8, 12, 1, 5, 9, 13 };
|
||||
const v8i16 mask1 = { 3, 7, 11, 15, 2, 6, 10, 14 };
|
||||
const v8i16 mask2 = { 4, 0, 5, 1, 6, 2, 7, 3 };
|
||||
const v8i16 mask3 = { 0, 4, 1, 5, 2, 6, 3, 7 };
|
||||
const v8i16 cnst0 = { 2217, -5352, 2217, -5352, 2217, -5352, 2217, -5352 };
|
||||
const v8i16 cnst1 = { 5352, 2217, 5352, 2217, 5352, 2217, 5352, 2217 };
|
||||
|
||||
LW4(src, BPS, in0, in1, in2, in3);
|
||||
INSERT_W4_UB(in0, in1, in2, in3, src0);
|
||||
LW4(ref, BPS, in0, in1, in2, in3);
|
||||
INSERT_W4_UB(in0, in1, in2, in3, src1);
|
||||
ILVRL_B2_UB(src0, src1, srcl0, srcl1);
|
||||
HSUB_UB2_SH(srcl0, srcl1, t0, t1);
|
||||
VSHF_H2_SH(t0, t1, t0, t1, mask0, mask1, t2, t3);
|
||||
ADDSUB2(t2, t3, t0, t1);
|
||||
t0 = SRLI_H(t0, 3);
|
||||
VSHF_H2_SH(t0, t0, t1, t1, mask2, mask3, t3, t2);
|
||||
tmp0 = __msa_hadd_s_w(t3, t3);
|
||||
tmp2 = __msa_hsub_s_w(t3, t3);
|
||||
FILL_W2_SW(1812, 937, tmp1, tmp3);
|
||||
DPADD_SH2_SW(t2, t2, cnst0, cnst1, tmp3, tmp1);
|
||||
SRAI_W2_SW(tmp1, tmp3, 9);
|
||||
PCKEV_H2_SH(tmp1, tmp0, tmp3, tmp2, t0, t1);
|
||||
VSHF_H2_SH(t0, t1, t0, t1, mask0, mask1, t2, t3);
|
||||
ADDSUB2(t2, t3, t0, t1);
|
||||
VSHF_H2_SH(t0, t0, t1, t1, mask2, mask3, t3, t2);
|
||||
tmp0 = __msa_hadd_s_w(t3, t3);
|
||||
tmp2 = __msa_hsub_s_w(t3, t3);
|
||||
ADDVI_W2_SW(tmp0, 7, tmp2, 7, tmp0, tmp2);
|
||||
SRAI_W2_SW(tmp0, tmp2, 4);
|
||||
FILL_W2_SW(12000, 51000, tmp1, tmp3);
|
||||
DPADD_SH2_SW(t2, t2, cnst0, cnst1, tmp3, tmp1);
|
||||
SRAI_W2_SW(tmp1, tmp3, 16);
|
||||
UNPCK_R_SH_SW(t1, tmp4);
|
||||
tmp5 = __msa_ceqi_w(tmp4, 0);
|
||||
tmp4 = (v4i32)__msa_nor_v((v16u8)tmp5, (v16u8)tmp5);
|
||||
tmp5 = __msa_fill_w(1);
|
||||
tmp5 = (v4i32)__msa_and_v((v16u8)tmp5, (v16u8)tmp4);
|
||||
tmp1 += tmp5;
|
||||
PCKEV_H2_SH(tmp1, tmp0, tmp3, tmp2, t0, t1);
|
||||
out0 = __msa_copy_s_d((v2i64)t0, 0);
|
||||
out1 = __msa_copy_s_d((v2i64)t0, 1);
|
||||
out2 = __msa_copy_s_d((v2i64)t1, 0);
|
||||
out3 = __msa_copy_s_d((v2i64)t1, 1);
|
||||
SD4(out0, out1, out2, out3, out, 8);
|
||||
}
|
||||
|
||||
static void FTransformWHT(const int16_t* in, int16_t* out) {
|
||||
v8i16 in0 = { 0 };
|
||||
v8i16 in1 = { 0 };
|
||||
v8i16 tmp0, tmp1, tmp2, tmp3;
|
||||
v8i16 out0, out1;
|
||||
const v8i16 mask0 = { 0, 1, 2, 3, 8, 9, 10, 11 };
|
||||
const v8i16 mask1 = { 4, 5, 6, 7, 12, 13, 14, 15 };
|
||||
const v8i16 mask2 = { 0, 4, 8, 12, 1, 5, 9, 13 };
|
||||
const v8i16 mask3 = { 3, 7, 11, 15, 2, 6, 10, 14 };
|
||||
|
||||
in0 = __msa_insert_h(in0, 0, in[ 0]);
|
||||
in0 = __msa_insert_h(in0, 1, in[ 64]);
|
||||
in0 = __msa_insert_h(in0, 2, in[128]);
|
||||
in0 = __msa_insert_h(in0, 3, in[192]);
|
||||
in0 = __msa_insert_h(in0, 4, in[ 16]);
|
||||
in0 = __msa_insert_h(in0, 5, in[ 80]);
|
||||
in0 = __msa_insert_h(in0, 6, in[144]);
|
||||
in0 = __msa_insert_h(in0, 7, in[208]);
|
||||
in1 = __msa_insert_h(in1, 0, in[ 48]);
|
||||
in1 = __msa_insert_h(in1, 1, in[112]);
|
||||
in1 = __msa_insert_h(in1, 2, in[176]);
|
||||
in1 = __msa_insert_h(in1, 3, in[240]);
|
||||
in1 = __msa_insert_h(in1, 4, in[ 32]);
|
||||
in1 = __msa_insert_h(in1, 5, in[ 96]);
|
||||
in1 = __msa_insert_h(in1, 6, in[160]);
|
||||
in1 = __msa_insert_h(in1, 7, in[224]);
|
||||
ADDSUB2(in0, in1, tmp0, tmp1);
|
||||
VSHF_H2_SH(tmp0, tmp1, tmp0, tmp1, mask0, mask1, tmp2, tmp3);
|
||||
ADDSUB2(tmp2, tmp3, tmp0, tmp1);
|
||||
VSHF_H2_SH(tmp0, tmp1, tmp0, tmp1, mask2, mask3, in0, in1);
|
||||
ADDSUB2(in0, in1, tmp0, tmp1);
|
||||
VSHF_H2_SH(tmp0, tmp1, tmp0, tmp1, mask0, mask1, tmp2, tmp3);
|
||||
ADDSUB2(tmp2, tmp3, out0, out1);
|
||||
SRAI_H2_SH(out0, out1, 1);
|
||||
ST_SH2(out0, out1, out, 8);
|
||||
}
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Entry point
|
||||
|
||||
extern void VP8EncDspInitMSA(void);
|
||||
|
||||
WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitMSA(void) {
|
||||
VP8ITransform = ITransform;
|
||||
VP8FTransform = FTransform;
|
||||
VP8FTransformWHT = FTransformWHT;
|
||||
}
|
||||
|
||||
#else // !WEBP_USE_MSA
|
||||
|
||||
WEBP_DSP_INIT_STUB(VP8EncDspInitMSA)
|
||||
|
||||
#endif // WEBP_USE_MSA
|
@ -23,14 +23,18 @@
|
||||
|
||||
/* Immediate-operand helpers. CLANG_BUILD routes them through the MSA
 * intrinsics; otherwise the equivalent vector operators are used.
 *
 * NOTE: despite its name, SRLI_H performs a LEFT shift. The operator form has
 * always been 'a << b' and the encoder's forward transform relies on that to
 * scale by 8, so the intrinsic form uses __msa_slli_h to make both builds
 * agree (it previously used __msa_srli_h, a logical right shift).
 */
#ifdef CLANG_BUILD
  #define ADDVI_H(a, b)  __msa_addvi_h((v8i16)(a), (b))
  #define ADDVI_W(a, b)  __msa_addvi_w((v4i32)(a), (b))
  #define SRAI_B(a, b)  __msa_srai_b((v16i8)(a), (b))
  #define SRAI_H(a, b)  __msa_srai_h((v8i16)(a), (b))
  #define SRAI_W(a, b)  __msa_srai_w((v4i32)(a), (b))
  #define SRLI_H(a, b)  __msa_slli_h((v8i16)(a), (b))
#else
  #define ADDVI_H(a, b)  ((a) + (b))
  #define ADDVI_W(a, b)  ((a) + (b))
  #define SRAI_B(a, b)  ((a) >> (b))
  #define SRAI_H(a, b)  ((a) >> (b))
  #define SRAI_W(a, b)  ((a) >> (b))
  #define SRLI_H(a, b)  ((a) << (b))
#endif
|
||||
|
||||
#define LD_B(RTYPE, psrc) *((RTYPE*)(psrc))
|
||||
@ -257,6 +261,18 @@
|
||||
} while (0)
|
||||
#define ST_UB8(...) ST_B8(v16u8, __VA_ARGS__)
|
||||
|
||||
/* Description : Store two vectors of 8 halfword elements with stride
 * Arguments   : Inputs - in0, in1, pdst, stride
 * Details     : Store 8 halfword elements from 'in0' to (pdst)
 *               Store 8 halfword elements from 'in1' to (pdst + stride)
 *               'stride' is applied with pointer arithmetic on 'pdst', so it
 *               counts elements of pdst's pointee type, not bytes.
 */
#define ST_H2(RTYPE, in0, in1, pdst, stride) do {  \
  ST_H(RTYPE, in0, pdst);                          \
  ST_H(RTYPE, in1, (pdst) + (stride));             \
} while (0)
#define ST_UH2(...) ST_H2(v8u16, __VA_ARGS__)
#define ST_SH2(...) ST_H2(v8i16, __VA_ARGS__)
|
||||
|
||||
/* Description : Store 2x4 byte block to destination memory from input vector
|
||||
* Arguments : Inputs - in, stidx, pdst, stride
|
||||
* Details : Index 'stidx' halfword element from 'in' vector is copied to
|
||||
@ -377,6 +393,22 @@
|
||||
} while (0)
|
||||
#define DOTP_SB2_SH(...) DOTP_SB2(v8i16, __VA_ARGS__)
|
||||
|
||||
/* Description : Dot product & addition of halfword vector elements
 * Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
 *               In/Outs - out0, out1 (used as accumulators)
 *               Return Type - as per RTYPE
 * Details     : Signed halfword elements from 'mult0' are multiplied with
 *               signed halfword elements from 'cnst0' producing a result
 *               twice the size of input i.e. signed word.
 *               The multiplication results of adjacent odd-even elements
 *               are added to the 'out0' vector.
 *               'mult1', 'cnst1' and 'out1' are processed the same way.
 */
#define DPADD_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) do {  \
  out0 = (RTYPE)__msa_dpadd_s_w((v4i32)(out0), (v8i16)(mult0),         \
                                (v8i16)(cnst0));                       \
  out1 = (RTYPE)__msa_dpadd_s_w((v4i32)(out1), (v8i16)(mult1),         \
                                (v8i16)(cnst1));                       \
} while (0)
#define DPADD_SH2_SW(...) DPADD_SH2(v4i32, __VA_ARGS__)
|
||||
|
||||
/* Description : Clips all signed halfword elements of input vector
|
||||
* between 0 & 255
|
||||
* Arguments : Input/output - val
|
||||
@ -434,6 +466,22 @@ static WEBP_INLINE uint32_t func_hadd_uh_u32(v8u16 in) {
|
||||
}
|
||||
#define HADD_UH_U32(in) func_hadd_uh_u32(in)
|
||||
|
||||
/* Description : Horizontal subtraction of unsigned byte vector elements
 * Arguments   : Inputs  - in0, in1
 *               Outputs - out0, out1
 *               Return Type - as per RTYPE
 * Details     : For each pair of adjacent bytes in 'in0', the even-indexed
 *               byte is subtracted from the odd-indexed byte (odd - even),
 *               both taken as unsigned, and the halfword result is written
 *               to 'out0'.
 *               'in1' is processed the same way into 'out1'.
 */
#define HSUB_UB2(RTYPE, in0, in1, out0, out1) do {            \
  out0 = (RTYPE)__msa_hsub_u_h((v16u8)(in0), (v16u8)(in0));   \
  out1 = (RTYPE)__msa_hsub_u_h((v16u8)(in1), (v16u8)(in1));   \
} while (0)
#define HSUB_UB2_UH(...) HSUB_UB2(v8u16, __VA_ARGS__)
#define HSUB_UB2_SH(...) HSUB_UB2(v8i16, __VA_ARGS__)
#define HSUB_UB2_SW(...) HSUB_UB2(v4i32, __VA_ARGS__)
|
||||
|
||||
/* Description : Set element n input vector to GPR value
|
||||
* Arguments : Inputs - in0, in1, in2, in3
|
||||
* Output - out
|
||||
@ -745,6 +793,23 @@ static WEBP_INLINE uint32_t func_hadd_uh_u32(v8u16 in) {
|
||||
#define PCKEV_B4_SH(...) PCKEV_B4(v8i16, __VA_ARGS__)
|
||||
#define PCKEV_B4_SW(...) PCKEV_B4(v4i32, __VA_ARGS__)
|
||||
|
||||
/* Description : Pack even halfword elements of vector pairs
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1
 *               Return Type - as per RTYPE
 * Details     : Even halfword elements of 'in0' are copied to the left half of
 *               'out0' & even halfword elements of 'in1' are copied to the
 *               right half of 'out0'.
 *               'out1' is built the same way from 'in2' and 'in3'.
 */
#define PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1) do {  \
  out0 = (RTYPE)__msa_pckev_h((v8i16)(in0), (v8i16)(in1));    \
  out1 = (RTYPE)__msa_pckev_h((v8i16)(in2), (v8i16)(in3));    \
} while (0)
#define PCKEV_H2_UH(...) PCKEV_H2(v8u16, __VA_ARGS__)
#define PCKEV_H2_SH(...) PCKEV_H2(v8i16, __VA_ARGS__)
#define PCKEV_H2_SW(...) PCKEV_H2(v4i32, __VA_ARGS__)
#define PCKEV_H2_UW(...) PCKEV_H2(v4u32, __VA_ARGS__)
|
||||
|
||||
/* Description : Arithmetic immediate shift right all elements of word vector
|
||||
* Arguments : Inputs - in0, in1, shift
|
||||
* Outputs - in place operation
|
||||
@ -814,6 +879,30 @@ static WEBP_INLINE uint32_t func_hadd_uh_u32(v8u16 in) {
|
||||
#define ADDVI_H2_SH(...) ADDVI_H2(v8i16, __VA_ARGS__)
|
||||
#define ADDVI_H2_UH(...) ADDVI_H2(v8u16, __VA_ARGS__)
|
||||
|
||||
/* Description : Add a value to each of 2 word vectors
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1
 *               Return Type - as per RTYPE
 * Details     : 'in1' is added to each element of 'in0' and the result is
 *               written to 'out0'; 'in3' is added to each element of 'in2'
 *               and the result is written to 'out1'.
 *               'in1' and 'in3' are forwarded to ADDVI_W as its second
 *               operand, which is the immediate of __msa_addvi_w when
 *               CLANG_BUILD is defined.
 */
|
||||
#define ADDVI_W2(RTYPE, in0, in1, in2, in3, out0, out1) do { \
|
||||
out0 = (RTYPE)ADDVI_W(in0, in1); \
|
||||
out1 = (RTYPE)ADDVI_W(in2, in3); \
|
||||
} while (0)
|
||||
#define ADDVI_W2_SW(...) ADDVI_W2(v4i32, __VA_ARGS__)
|
||||
|
||||
/* Description : Fill 2 word vectors, each from a GP register value
 * Arguments   : Inputs  - in0, in1
 *               Outputs - out0, out1
 *               Return Type - as per RTYPE
 * Details     : GP register in0 is replicated in each word element of out0
 *               GP register in1 is replicated in each word element of out1
 */
|
||||
#define FILL_W2(RTYPE, in0, in1, out0, out1) do { \
|
||||
out0 = (RTYPE)__msa_fill_w(in0); \
|
||||
out1 = (RTYPE)__msa_fill_w(in1); \
|
||||
} while (0)
|
||||
#define FILL_W2_SW(...) FILL_W2(v4i32, __VA_ARGS__)
|
||||
|
||||
/* Description : Addition of 2 pairs of vectors
|
||||
* Arguments : Inputs - in0, in1, in2, in3
|
||||
* Outputs - out0, out1
|
||||
@ -842,6 +931,32 @@ static WEBP_INLINE uint32_t func_hadd_uh_u32(v8u16 in) {
|
||||
out1 = in2 - in3; \
|
||||
} while (0)
|
||||
|
||||
/* Description : Addition - Subtraction of input vectors
 * Arguments   : Inputs  - in0, in1
 *               Outputs - out0, out1
 * Details     : Each element in 'in1' is added to 'in0' and result is
 *               written to 'out0'.
 *               Each element in 'in1' is subtracted from 'in0' and result is
 *               written to 'out1'.
 *               'out0' is assigned before 'in0' and 'in1' are read again for
 *               the subtraction, so 'out0' must not alias either input.
 */
#define ADDSUB2(in0, in1, out0, out1) do {  \
  out0 = (in0) + (in1);                     \
  out1 = (in0) - (in1);                     \
} while (0)
|
||||
|
||||
/* Description : Sign extend halfword elements from right half of the vector
 * Arguments   : Input  - in    (halfword vector)
 *               Output - out   (sign extended word vector)
 *               Return Type - signed word
 * Details     : Sign bit of halfword elements from input vector 'in' is
 *               extracted and interleaved with same vector 'in' to generate
 *               4 word elements keeping sign intact
 */
#define UNPCK_R_SH_SW(in, out) do {                     \
  const v8i16 sign_m = __msa_clti_s_h((v8i16)(in), 0);  \
  out = (v4i32)__msa_ilvr_h(sign_m, (v8i16)(in));       \
} while (0)
|
||||
|
||||
/* Description : Sign extend halfword elements from input vector and return
|
||||
* the result in pair of vectors
|
||||
* Arguments : Input - in (halfword vector)
|
||||
|
Loading…
Reference in New Issue
Block a user