Add MSA optimized rescaling functions
We add the following MSA optimized rescaling functions: - RescalerExportRowExpand - RescalerExportRowShrink Change-Id: Ic1c76065423b02617db94cf0c22bb564219b36e6
This commit is contained in:
parent
cb19dbc1a4
commit
9ac74f922e
@ -65,6 +65,7 @@ dsp_dec_srcs := \
|
||||
src/dsp/rescaler.c \
|
||||
src/dsp/rescaler_mips32.c \
|
||||
src/dsp/rescaler_mips_dsp_r2.c \
|
||||
src/dsp/rescaler_msa.c \
|
||||
src/dsp/rescaler_neon.$(NEON) \
|
||||
src/dsp/rescaler_sse2.c \
|
||||
src/dsp/upsampling.c \
|
||||
|
@ -210,6 +210,7 @@ DSP_DEC_OBJS = \
|
||||
$(DIROBJ)\dsp\rescaler.obj \
|
||||
$(DIROBJ)\dsp\rescaler_mips32.obj \
|
||||
$(DIROBJ)\dsp\rescaler_mips_dsp_r2.obj \
|
||||
$(DIROBJ)\dsp\rescaler_msa.obj \
|
||||
$(DIROBJ)\dsp\rescaler_neon.obj \
|
||||
$(DIROBJ)\dsp\rescaler_sse2.obj \
|
||||
$(DIROBJ)\dsp\upsampling.obj \
|
||||
|
@ -135,6 +135,7 @@ model {
|
||||
include "rescaler.c"
|
||||
include "rescaler_mips32.c"
|
||||
include "rescaler_mips_dsp_r2.c"
|
||||
include "rescaler_msa.c"
|
||||
include "rescaler_neon.$NEON"
|
||||
include "rescaler_sse2.c"
|
||||
include "upsampling.c"
|
||||
|
@ -154,6 +154,7 @@ DSP_DEC_OBJS = \
|
||||
src/dsp/rescaler.o \
|
||||
src/dsp/rescaler_mips32.o \
|
||||
src/dsp/rescaler_mips_dsp_r2.o \
|
||||
src/dsp/rescaler_msa.o \
|
||||
src/dsp/rescaler_neon.o \
|
||||
src/dsp/rescaler_sse2.o \
|
||||
src/dsp/upsampling.o \
|
||||
|
@ -85,6 +85,7 @@ libwebpdspdecode_msa_la_SOURCES =
|
||||
libwebpdspdecode_msa_la_SOURCES += dec_msa.c
|
||||
libwebpdspdecode_msa_la_SOURCES += filters_msa.c
|
||||
libwebpdspdecode_msa_la_SOURCES += lossless_msa.c
|
||||
libwebpdspdecode_msa_la_SOURCES += rescaler_msa.c
|
||||
libwebpdspdecode_msa_la_SOURCES += msa_macro.h
|
||||
libwebpdspdecode_msa_la_CPPFLAGS = $(libwebpdsp_msa_la_CPPFLAGS)
|
||||
libwebpdspdecode_msa_la_CFLAGS = $(libwebpdsp_msa_la_CFLAGS)
|
||||
|
@ -150,7 +150,7 @@
|
||||
out3 = LW(ptmp); \
|
||||
} while (0)
|
||||
|
||||
/* Description : Store 4 words with stride
|
||||
/* Description : Store words with stride
|
||||
* Arguments : Inputs - in0, in1, in2, in3, pdst, stride
|
||||
* Details : Store word from 'in0' to (pdst)
|
||||
* Store word from 'in1' to (pdst + stride)
|
||||
@ -168,6 +168,22 @@
|
||||
SW(in3, ptmp); \
|
||||
} while (0)
|
||||
|
||||
/* Description : Store 3 words with stride
 * Arguments   : Inputs - in0, in1, in2, pdst, stride
 * Details     : Store word from 'in0' to (pdst)
 *               Store word from 'in1' to (pdst + stride)
 *               Store word from 'in2' to (pdst + 2 * stride)
 *               'stride' is counted in bytes: the destination is stepped
 *               through a uint8_t pointer.
 *               'pdst' and 'stride' are parenthesized so that expression
 *               arguments (e.g. 'buf + 1') are fully evaluated before the
 *               cast / increment is applied.
 */
#define SW3(in0, in1, in2, pdst, stride) do {  \
  uint8_t* ptmp = (uint8_t*)(pdst);            \
  SW(in0, ptmp);                               \
  ptmp += (stride);                            \
  SW(in1, ptmp);                               \
  ptmp += (stride);                            \
  SW(in2, ptmp);                               \
} while (0)
|
||||
|
||||
/* Description : Store 2 words with stride
 * Arguments   : Inputs - in0, in1, pdst, stride
 * Details     : Store word from 'in0' to (pdst)
 *               Store word from 'in1' to (pdst + stride)
 *               'stride' is counted in bytes: the destination is stepped
 *               through a uint8_t pointer.
 *               'pdst' and 'stride' are parenthesized so that expression
 *               arguments are fully evaluated before the cast / increment.
 */
#define SW2(in0, in1, pdst, stride) do {  \
  uint8_t* ptmp = (uint8_t*)(pdst);       \
  SW(in0, ptmp);                          \
  ptmp += (stride);                       \
  SW(in1, ptmp);                          \
} while (0)
|
||||
|
||||
/* Description : Store 4 double words with stride
|
||||
* Arguments : Inputs - in0, in1, in2, in3, pdst, stride
|
||||
* Details : Store double word from 'in0' to (pdst)
|
||||
@ -237,9 +253,11 @@
|
||||
|
||||
/* Description : Load vectors with 4 word elements with stride
|
||||
* Arguments : Inputs - psrc, stride
|
||||
* Outputs - out0, out1
|
||||
* Details : Load 4 word elements in 'out0' from (psrc)
|
||||
* Load 4 word elements in 'out1' from (psrc + stride)
|
||||
* Outputs - out0, out1, out2, out3
|
||||
* Details : Load 4 word elements in 'out0' from (psrc + 0 * stride)
|
||||
* Load 4 word elements in 'out1' from (psrc + 1 * stride)
|
||||
* Load 4 word elements in 'out2' from (psrc + 2 * stride)
|
||||
* Load 4 word elements in 'out3' from (psrc + 3 * stride)
|
||||
*/
|
||||
#define LD_W2(RTYPE, psrc, stride, out0, out1) do { \
|
||||
out0 = LD_W(RTYPE, psrc); \
|
||||
@ -248,6 +266,13 @@
|
||||
#define LD_UW2(...) LD_W2(v4u32, __VA_ARGS__)
|
||||
#define LD_SW2(...) LD_W2(v4i32, __VA_ARGS__)
|
||||
|
||||
/* Load three vectors of 4 word elements with stride:
 *   'out0' from (psrc + 0 * stride)
 *   'out1' from (psrc + 1 * stride)
 *   'out2' from (psrc + 2 * stride)
 * The stride is applied through pointer arithmetic on 'psrc'.
 * 'psrc' and 'stride' are parenthesized so that expression arguments keep
 * their intended precedence inside '(psrc) + 2 * (stride)'.
 */
#define LD_W3(RTYPE, psrc, stride, out0, out1, out2) do {  \
  LD_W2(RTYPE, (psrc), (stride), out0, out1);              \
  out2 = LD_W(RTYPE, (psrc) + 2 * (stride));               \
} while (0)
#define LD_UW3(...) LD_W3(v4u32, __VA_ARGS__)
#define LD_SW3(...) LD_W3(v4i32, __VA_ARGS__)
|
||||
|
||||
#define LD_W4(RTYPE, psrc, stride, out0, out1, out2, out3) do { \
|
||||
LD_W2(RTYPE, psrc, stride, out0, out1); \
|
||||
LD_W2(RTYPE, psrc + 2 * stride, stride, out2, out3); \
|
||||
@ -281,6 +306,34 @@
|
||||
} while (0)
|
||||
#define ST_UB8(...) ST_B8(v16u8, __VA_ARGS__)
|
||||
|
||||
/* Description : Store vectors of 4 word elements with stride
 * Arguments   : Inputs - in0, in1, in2, in3, pdst, stride
 * Details     : Store 4 word elements from 'in0' to (pdst + 0 * stride)
 *               Store 4 word elements from 'in1' to (pdst + 1 * stride)
 *               Store 4 word elements from 'in2' to (pdst + 2 * stride)
 *               Store 4 word elements from 'in3' to (pdst + 3 * stride)
 *               ST_W2 stores 'in0' and 'in1' only; ST_W3 / ST_W4 extend it.
 *               The stride is applied through pointer arithmetic on 'pdst'.
 *               'pdst' and 'stride' are parenthesized so that expression
 *               arguments keep their intended precedence.
 */
#define ST_W2(RTYPE, in0, in1, pdst, stride) do {  \
  ST_W(RTYPE, in0, (pdst));                        \
  ST_W(RTYPE, in1, (pdst) + (stride));             \
} while (0)
#define ST_UW2(...) ST_W2(v4u32, __VA_ARGS__)
#define ST_SW2(...) ST_W2(v4i32, __VA_ARGS__)
|
||||
|
||||
/* Store three vectors of 4 word elements with stride:
 *   'in0' to (pdst + 0 * stride)
 *   'in1' to (pdst + 1 * stride)
 *   'in2' to (pdst + 2 * stride)
 * 'pdst' and 'stride' are parenthesized so that expression arguments keep
 * their intended precedence inside '(pdst) + 2 * (stride)'.
 */
#define ST_W3(RTYPE, in0, in1, in2, pdst, stride) do {  \
  ST_W2(RTYPE, in0, in1, (pdst), (stride));             \
  ST_W(RTYPE, in2, (pdst) + 2 * (stride));              \
} while (0)
#define ST_UW3(...) ST_W3(v4u32, __VA_ARGS__)
#define ST_SW3(...) ST_W3(v4i32, __VA_ARGS__)
|
||||
|
||||
/* Store four vectors of 4 word elements with stride:
 *   'in0' to (pdst + 0 * stride)
 *   'in1' to (pdst + 1 * stride)
 *   'in2' to (pdst + 2 * stride)
 *   'in3' to (pdst + 3 * stride)
 * 'pdst' and 'stride' are parenthesized so that expression arguments keep
 * their intended precedence inside '(pdst) + 2 * (stride)'.
 */
#define ST_W4(RTYPE, in0, in1, in2, in3, pdst, stride) do {     \
  ST_W2(RTYPE, in0, in1, (pdst), (stride));                     \
  ST_W2(RTYPE, in2, in3, (pdst) + 2 * (stride), (stride));      \
} while (0)
#define ST_UW4(...) ST_W4(v4u32, __VA_ARGS__)
#define ST_SW4(...) ST_W4(v4i32, __VA_ARGS__)
|
||||
|
||||
/* Description : Store vectors of 8 halfword elements with stride
|
||||
* Arguments : Inputs - in0, in1, pdst, stride
|
||||
* Details : Store 8 halfword elements from 'in0' to (pdst)
|
||||
@ -429,6 +482,22 @@
|
||||
} while (0)
|
||||
#define DOTP_SH2_SW(...) DOTP_SH2(v4i32, __VA_ARGS__)
|
||||
|
||||
/* Description : Dot product of unsigned word vector elements
 * Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
 *               Outputs - out0, out1
 *               Return Type - as per RTYPE
 * Details     : Unsigned word elements from 'mult0' are multiplied with
 *               unsigned word elements from 'cnst0' producing a result
 *               twice the size of input i.e. unsigned double word.
 *               The multiplication result of adjacent odd-even elements
 *               are added together and written to the 'out0' vector.
 *               'out1' is computed the same way from 'mult1' and 'cnst1'.
 *               Inputs are parenthesized before the vector cast so that
 *               expression arguments are cast as a whole.
 */
#define DOTP_UW2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) do {  \
  out0 = (RTYPE)__msa_dotp_u_d((v4u32)(mult0), (v4u32)(cnst0));       \
  out1 = (RTYPE)__msa_dotp_u_d((v4u32)(mult1), (v4u32)(cnst1));       \
} while (0)
#define DOTP_UW2_UD(...) DOTP_UW2(v2u64, __VA_ARGS__)
|
||||
|
||||
/* Description : Dot product & addition of halfword vector elements
|
||||
* Arguments : Inputs - mult0, mult1, cnst0, cnst1
|
||||
* Outputs - out0, out1
|
||||
@ -868,6 +937,7 @@ static WEBP_INLINE uint32_t func_hadd_uh_u32(v8u16 in) {
|
||||
#define ILVRL_W2_UB(...) ILVRL_W2(v16u8, __VA_ARGS__)
|
||||
#define ILVRL_W2_SH(...) ILVRL_W2(v8i16, __VA_ARGS__)
|
||||
#define ILVRL_W2_SW(...) ILVRL_W2(v4i32, __VA_ARGS__)
|
||||
#define ILVRL_W2_UW(...) ILVRL_W2(v4u32, __VA_ARGS__)
|
||||
|
||||
/* Description : Pack even byte elements of vector pairs
|
||||
* Arguments : Inputs - in0, in1, in2, in3
|
||||
@ -913,6 +983,23 @@ static WEBP_INLINE uint32_t func_hadd_uh_u32(v8u16 in) {
|
||||
#define PCKEV_H2_SW(...) PCKEV_H2(v4i32, __VA_ARGS__)
|
||||
#define PCKEV_H2_UW(...) PCKEV_H2(v4u32, __VA_ARGS__)
|
||||
|
||||
/* Description : Pack even word elements of vector pairs
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1
 *               Return Type - as per RTYPE
 * Details     : Even word elements of 'in0' are copied to the left half of
 *               'out0' & even word elements of 'in1' are copied to the
 *               right half of 'out0'.
 *               'out1' is built the same way from 'in2' and 'in3'.
 *               Inputs are parenthesized before the vector cast so that
 *               expression arguments are cast as a whole.
 */
#define PCKEV_W2(RTYPE, in0, in1, in2, in3, out0, out1) do {  \
  out0 = (RTYPE)__msa_pckev_w((v4i32)(in0), (v4i32)(in1));    \
  out1 = (RTYPE)__msa_pckev_w((v4i32)(in2), (v4i32)(in3));    \
} while (0)
#define PCKEV_W2_UH(...) PCKEV_W2(v8u16, __VA_ARGS__)
#define PCKEV_W2_SH(...) PCKEV_W2(v8i16, __VA_ARGS__)
#define PCKEV_W2_SW(...) PCKEV_W2(v4i32, __VA_ARGS__)
#define PCKEV_W2_UW(...) PCKEV_W2(v4u32, __VA_ARGS__)
|
||||
|
||||
/* Description : Arithmetic immediate shift right all elements of word vector
|
||||
* Arguments : Inputs - in0, in1, shift
|
||||
* Outputs - in place operation
|
||||
@ -969,6 +1056,31 @@ static WEBP_INLINE uint32_t func_hadd_uh_u32(v8u16 in) {
|
||||
#define SRARI_W4_UW(...) SRARI_W4(v4u32, __VA_ARGS__)
|
||||
#define SRARI_W4_SW(...) SRARI_W4(v4i32, __VA_ARGS__)
|
||||
|
||||
/* Description : Shift right arithmetic rounded double words
 * Arguments   : Inputs  - in0, in1, shift
 *               Outputs - in place operation
 *               Return Type - as per RTYPE
 * Details     : Each element of vector 'in0' is shifted right arithmetically by
 *               the number of bits in the corresponding element in the vector
 *               'shift'. The last discarded bit is added to shifted value for
 *               rounding and the result is written in-place.
 *               'in1' is processed the same way.
 *               'shift' is a vector.
 *               Inputs are parenthesized before the vector cast so that
 *               expression arguments are cast as a whole.
 */
#define SRAR_D2(RTYPE, in0, in1, shift) do {                \
  in0 = (RTYPE)__msa_srar_d((v2i64)(in0), (v2i64)(shift));  \
  in1 = (RTYPE)__msa_srar_d((v2i64)(in1), (v2i64)(shift));  \
} while (0)
#define SRAR_D2_SW(...) SRAR_D2(v4i32, __VA_ARGS__)
#define SRAR_D2_SD(...) SRAR_D2(v2i64, __VA_ARGS__)
#define SRAR_D2_UD(...) SRAR_D2(v2u64, __VA_ARGS__)
|
||||
|
||||
/* Rounded arithmetic right shift of four double-word vectors, in place.
 * Each of in0..in3 goes through __msa_srar_d with the vector 'shift' and is
 * written back cast to RTYPE (the same operation SRAR_D2 applies to a pair).
 */
#define SRAR_D4(RTYPE, in0, in1, in2, in3, shift) do {  \
  in0 = (RTYPE)__msa_srar_d((v2i64)in0, (v2i64)shift);  \
  in1 = (RTYPE)__msa_srar_d((v2i64)in1, (v2i64)shift);  \
  in2 = (RTYPE)__msa_srar_d((v2i64)in2, (v2i64)shift);  \
  in3 = (RTYPE)__msa_srar_d((v2i64)in3, (v2i64)shift);  \
} while (0)
#define SRAR_D4_SD(...) SRAR_D4(v2i64, __VA_ARGS__)
#define SRAR_D4_UD(...) SRAR_D4(v2u64, __VA_ARGS__)
|
||||
|
||||
/* Description : Addition of 2 pairs of half-word vectors
|
||||
* Arguments : Inputs - in0, in1, in2, in3
|
||||
* Outputs - out0, out1
|
||||
@ -1034,6 +1146,20 @@ static WEBP_INLINE uint32_t func_hadd_uh_u32(v8u16 in) {
|
||||
out1 = in2 - in3; \
|
||||
} while (0)
|
||||
|
||||
/* Subtraction of 3 pairs of inputs:
 *   out0 = in0 - in1, out1 = in2 - in3, out2 = in4 - in5.
 * Operands are parenthesized so that expression arguments (e.g. 'a + b')
 * are subtracted as a whole.
 */
#define SUB3(in0, in1, in2, in3, in4, in5, out0, out1, out2) do {  \
  out0 = (in0) - (in1);                                            \
  out1 = (in2) - (in3);                                            \
  out2 = (in4) - (in5);                                            \
} while (0)
|
||||
|
||||
/* Subtraction of 4 pairs of inputs:
 *   out0 = in0 - in1, out1 = in2 - in3, out2 = in4 - in5, out3 = in6 - in7.
 * Operands are parenthesized so that expression arguments (e.g. 'a + b')
 * are subtracted as a whole.
 */
#define SUB4(in0, in1, in2, in3, in4, in5, in6, in7,  \
             out0, out1, out2, out3) do {             \
  out0 = (in0) - (in1);                               \
  out1 = (in2) - (in3);                               \
  out2 = (in4) - (in5);                               \
  out3 = (in6) - (in7);                               \
} while (0)
|
||||
|
||||
/* Description : Addition - Subtraction of input vectors
|
||||
* Arguments : Inputs - in0, in1
|
||||
* Outputs - out0, out1
|
||||
|
@ -199,6 +199,7 @@ WebPRescalerExportRowFunc WebPRescalerExportRowShrink;
|
||||
extern void WebPRescalerDspInitSSE2(void);
|
||||
extern void WebPRescalerDspInitMIPS32(void);
|
||||
extern void WebPRescalerDspInitMIPSdspR2(void);
|
||||
extern void WebPRescalerDspInitMSA(void);
|
||||
extern void WebPRescalerDspInitNEON(void);
|
||||
|
||||
static volatile VP8CPUInfo rescaler_last_cpuinfo_used =
|
||||
@ -232,6 +233,11 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPRescalerDspInit(void) {
|
||||
if (VP8GetCPUInfo(kMIPSdspR2)) {
|
||||
WebPRescalerDspInitMIPSdspR2();
|
||||
}
|
||||
#endif
|
||||
#if defined(WEBP_USE_MSA)
|
||||
if (VP8GetCPUInfo(kMSA)) {
|
||||
WebPRescalerDspInitMSA();
|
||||
}
|
||||
#endif
|
||||
}
|
||||
rescaler_last_cpuinfo_used = VP8GetCPUInfo;
|
||||
|
444
src/dsp/rescaler_msa.c
Normal file
444
src/dsp/rescaler_msa.c
Normal file
@ -0,0 +1,444 @@
|
||||
// Copyright 2016 Google Inc. All Rights Reserved.
|
||||
//
|
||||
// Use of this source code is governed by a BSD-style license
|
||||
// that can be found in the COPYING file in the root of the source
|
||||
// tree. An additional intellectual property rights grant can be found
|
||||
// in the file PATENTS. All contributing project authors may
|
||||
// be found in the AUTHORS file in the root of the source tree.
|
||||
// -----------------------------------------------------------------------------
|
||||
//
|
||||
// MSA version of rescaling functions
|
||||
//
|
||||
// Author: Prashant Patil (prashant.patil@imgtec.com)
|
||||
|
||||
#include "./dsp.h"
|
||||
|
||||
#if defined(WEBP_USE_MSA)
|
||||
|
||||
#include <assert.h>
|
||||
|
||||
#include "../utils/rescaler.h"
|
||||
#include "./msa_macro.h"
|
||||
|
||||
#define ROUNDER (WEBP_RESCALER_ONE >> 1)
|
||||
#define MULT_FIX(x, y) (((uint64_t)(x) * (y) + ROUNDER) >> WEBP_RESCALER_RFIX)
|
||||
|
||||
// Vector form of MULT_FIX() for 16 samples held in the word vectors in0..in3.
// Every word is interleaved with a zero word, so the pairwise unsigned dot
// product with 'scale' yields one 64-bit product per sample. The products are
// shifted right by 'shift' with rounding and then narrowed by successive
// even-byte packs into the single v16u8 'dst'.
// Requires a zero vector named 'zero' in the enclosing scope.
#define CALC_MULT_FIX_16(in0, in1, in2, in3, scale, shift, dst) do {  \
  v4u32 w0_m, w1_m, w2_m, w3_m;                                       \
  v2u64 p0_m, p1_m, p2_m, p3_m;                                       \
  v16u8 b0_m, b1_m, b2_m, b3_m, b4_m, b5_m;                           \
  /* samples from in0 / in1 */                                        \
  ILVRL_W2_UW(zero, in0, w0_m, w1_m);                                 \
  ILVRL_W2_UW(zero, in1, w2_m, w3_m);                                 \
  DOTP_UW2_UD(w0_m, w1_m, scale, scale, p0_m, p1_m);                  \
  DOTP_UW2_UD(w2_m, w3_m, scale, scale, p2_m, p3_m);                  \
  SRAR_D4_UD(p0_m, p1_m, p2_m, p3_m, shift);                          \
  PCKEV_B2_UB(p1_m, p0_m, p3_m, p2_m, b0_m, b1_m);                    \
  /* samples from in2 / in3 */                                        \
  ILVRL_W2_UW(zero, in2, w0_m, w1_m);                                 \
  ILVRL_W2_UW(zero, in3, w2_m, w3_m);                                 \
  DOTP_UW2_UD(w0_m, w1_m, scale, scale, p0_m, p1_m);                  \
  DOTP_UW2_UD(w2_m, w3_m, scale, scale, p2_m, p3_m);                  \
  SRAR_D4_UD(p0_m, p1_m, p2_m, p3_m, shift);                          \
  PCKEV_B2_UB(p1_m, p0_m, p3_m, p2_m, b2_m, b3_m);                    \
  /* final narrowing to bytes */                                      \
  PCKEV_B2_UB(b1_m, b0_m, b3_m, b2_m, b4_m, b5_m);                    \
  dst = (v16u8)__msa_pckev_b((v16i8)b5_m, (v16i8)b4_m);               \
} while (0)
|
||||
|
||||
// Vector form of MULT_FIX() for the 4 samples in the word vector 'in0'.
// Same widening multiply and rounding shift as CALC_MULT_FIX_16; the results
// are narrowed with three even-byte packs and word 0 of the packed vector is
// copied into the scalar 'dst'.
// Requires a zero vector named 'zero' in the enclosing scope.
#define CALC_MULT_FIX_4(in0, scale, shift, dst) do {        \
  v4u32 w0_m, w1_m;                                         \
  v2u64 p0_m, p1_m;                                         \
  v16i8 pk0_m, pk1_m, pk2_m;                                \
  ILVRL_W2_UW(zero, in0, w0_m, w1_m);                       \
  DOTP_UW2_UD(w0_m, w1_m, scale, scale, p0_m, p1_m);        \
  SRAR_D2_UD(p0_m, p1_m, shift);                            \
  pk0_m = __msa_pckev_b((v16i8)p1_m, (v16i8)p0_m);          \
  pk1_m = __msa_pckev_b(pk0_m, pk0_m);                      \
  pk2_m = __msa_pckev_b(pk1_m, pk1_m);                      \
  dst = __msa_copy_s_w((v4i32)pk2_m, 0);                    \
} while (0)
|
||||
|
||||
// Vector form of MULT_FIX() for 16 samples that keeps word precision:
// in0..in3 are multiplied by 'fyscale' (64-bit intermediate), shifted right by
// 'shift' with rounding, and the even words of the results are packed into
// the word vectors dst0..dst3.
// Requires a zero vector named 'zero' in the enclosing scope.
#define CALC_MULT_FIX1_16(in0, in1, in2, in3, fyscale, shift,  \
                          dst0, dst1, dst2, dst3) do {         \
  v4u32 w0_m, w1_m, w2_m, w3_m;                                \
  v2u64 p0_m, p1_m, p2_m, p3_m;                                \
  /* in0 / in1 -> dst0 / dst1 */                               \
  ILVRL_W2_UW(zero, in0, w0_m, w1_m);                          \
  ILVRL_W2_UW(zero, in1, w2_m, w3_m);                          \
  DOTP_UW2_UD(w0_m, w1_m, fyscale, fyscale, p0_m, p1_m);       \
  DOTP_UW2_UD(w2_m, w3_m, fyscale, fyscale, p2_m, p3_m);       \
  SRAR_D4_UD(p0_m, p1_m, p2_m, p3_m, shift);                   \
  PCKEV_W2_UW(p1_m, p0_m, p3_m, p2_m, dst0, dst1);             \
  /* in2 / in3 -> dst2 / dst3 */                               \
  ILVRL_W2_UW(zero, in2, w0_m, w1_m);                          \
  ILVRL_W2_UW(zero, in3, w2_m, w3_m);                          \
  DOTP_UW2_UD(w0_m, w1_m, fyscale, fyscale, p0_m, p1_m);       \
  DOTP_UW2_UD(w2_m, w3_m, fyscale, fyscale, p2_m, p3_m);       \
  SRAR_D4_UD(p0_m, p1_m, p2_m, p3_m, shift);                   \
  PCKEV_W2_UW(p1_m, p0_m, p3_m, p2_m, dst2, dst3);             \
} while (0)
|
||||
|
||||
// Vector form of MULT_FIX() for the 4 samples in 'in0', keeping word
// precision: multiply by 'scale' (64-bit intermediate), shift right by 'shift'
// with rounding, then pack the even words of the results into the v4u32 'dst'.
// Requires a zero vector named 'zero' in the enclosing scope.
#define CALC_MULT_FIX1_4(in0, scale, shift, dst) do {      \
  v4u32 w0_m, w1_m;                                        \
  v2u64 p0_m, p1_m;                                        \
  ILVRL_W2_UW(zero, in0, w0_m, w1_m);                      \
  DOTP_UW2_UD(w0_m, w1_m, scale, scale, p0_m, p1_m);       \
  SRAR_D2_UD(p0_m, p1_m, shift);                           \
  dst = (v4u32)__msa_pckev_w((v4i32)p1_m, (v4i32)p0_m);    \
} while (0)
|
||||
|
||||
// Two-stage fixed-point blend for 8 samples.
// Stage 1: words of in0 are interleaved with words of in2 (and in1 with in3);
//          the pairwise dot product with 'mult' (callers pass interleaved A/B
//          weights) is shifted right by 'shift' with rounding.
// Stage 2: the stage-1 results are multiplied by 'scale' through the same dot
//          product, shifted again with rounding, and the even bytes are packed
//          into dst0 / dst1.
#define CALC_MULT_FIX2_16(in0, in1, in2, in3, mult, scale, shift,  \
                          dst0, dst1) do {                         \
  v4u32 w0_m, w1_m, w2_m, w3_m;                                    \
  v2u64 p0_m, p1_m, p2_m, p3_m;                                    \
  /* stage 1: weighted sum of the two rows */                      \
  ILVRL_W2_UW(in0, in2, w0_m, w1_m);                               \
  ILVRL_W2_UW(in1, in3, w2_m, w3_m);                               \
  DOTP_UW2_UD(w0_m, w1_m, mult, mult, p0_m, p1_m);                 \
  DOTP_UW2_UD(w2_m, w3_m, mult, mult, p2_m, p3_m);                 \
  SRAR_D4_UD(p0_m, p1_m, p2_m, p3_m, shift);                       \
  /* stage 2: apply 'scale' in place */                            \
  DOTP_UW2_UD(p0_m, p1_m, scale, scale, p0_m, p1_m);               \
  DOTP_UW2_UD(p2_m, p3_m, scale, scale, p2_m, p3_m);               \
  SRAR_D4_UD(p0_m, p1_m, p2_m, p3_m, shift);                       \
  PCKEV_B2_UB(p1_m, p0_m, p3_m, p2_m, dst0, dst1);                 \
} while (0)
|
||||
|
||||
// Two-stage fixed-point blend for 4 samples: words of in0 and in1 are
// interleaved and dot-multiplied with 'mult', shifted right by 'shift' with
// rounding, multiplied by 'scale', shifted again with rounding, and finally
// narrowed by three even-byte packs; word 0 of the packed vector is copied
// into the scalar 'dst'.
#define CALC_MULT_FIX2_4(in0, in1, mult, scale, shift, dst) do {  \
  v4u32 w0_m, w1_m;                                               \
  v2u64 p0_m, p1_m;                                               \
  v16i8 pk0_m, pk1_m, pk2_m;                                      \
  ILVRL_W2_UW(in0, in1, w0_m, w1_m);                              \
  DOTP_UW2_UD(w0_m, w1_m, mult, mult, p0_m, p1_m);                \
  SRAR_D2_UD(p0_m, p1_m, shift);                                  \
  DOTP_UW2_UD(p0_m, p1_m, scale, scale, p0_m, p1_m);              \
  SRAR_D2_UD(p0_m, p1_m, shift);                                  \
  pk0_m = __msa_pckev_b((v16i8)p1_m, (v16i8)p0_m);                \
  pk1_m = __msa_pckev_b(pk0_m, pk0_m);                            \
  pk2_m = __msa_pckev_b(pk1_m, pk1_m);                            \
  dst = __msa_copy_s_w((v4i32)pk2_m, 0);                          \
} while (0)
|
||||
|
||||
static WEBP_INLINE void ExportRowExpand_0(const uint32_t* frow, uint8_t* dst,
|
||||
int length,
|
||||
WebPRescaler* const wrk) {
|
||||
const v4u32 scale = (v4u32)__msa_fill_w(wrk->fy_scale);
|
||||
const v4u32 shift = (v4u32)__msa_fill_w(WEBP_RESCALER_RFIX);
|
||||
const v4i32 zero = { 0 };
|
||||
|
||||
while (length >= 16) {
|
||||
v4u32 src0, src1, src2, src3;
|
||||
v16u8 out;
|
||||
LD_UW4(frow, 4, src0, src1, src2, src3);
|
||||
CALC_MULT_FIX_16(src0, src1, src2, src3, scale, shift, out);
|
||||
ST_UB(out, dst);
|
||||
length -= 16;
|
||||
frow += 16;
|
||||
dst += 16;
|
||||
}
|
||||
if (length > 0) {
|
||||
int x_out;
|
||||
if (length >= 12) {
|
||||
uint32_t val0_m, val1_m, val2_m;
|
||||
v4u32 src0, src1, src2;
|
||||
LD_UW3(frow, 4, src0, src1, src2);
|
||||
CALC_MULT_FIX_4(src0, scale, shift, val0_m);
|
||||
CALC_MULT_FIX_4(src1, scale, shift, val1_m);
|
||||
CALC_MULT_FIX_4(src2, scale, shift, val2_m);
|
||||
SW3(val0_m, val1_m, val2_m, dst, 4);
|
||||
length -= 12;
|
||||
frow += 12;
|
||||
dst += 12;
|
||||
} else if (length >= 8) {
|
||||
uint32_t val0_m, val1_m;
|
||||
v4u32 src0, src1;
|
||||
LD_UW2(frow, 4, src0, src1);
|
||||
CALC_MULT_FIX_4(src0, scale, shift, val0_m);
|
||||
CALC_MULT_FIX_4(src1, scale, shift, val1_m);
|
||||
SW2(val0_m, val1_m, dst, 4);
|
||||
length -= 8;
|
||||
frow += 8;
|
||||
dst += 8;
|
||||
} else if (length >= 4) {
|
||||
uint32_t val0_m;
|
||||
const v4u32 src0 = LD_UW(frow);
|
||||
CALC_MULT_FIX_4(src0, scale, shift, val0_m);
|
||||
SW(val0_m, dst);
|
||||
length -= 4;
|
||||
frow += 4;
|
||||
dst += 4;
|
||||
}
|
||||
for (x_out = 0; x_out < length; ++x_out) {
|
||||
const uint32_t J = frow[x_out];
|
||||
const int v = (int)MULT_FIX(J, wrk->fy_scale);
|
||||
assert(v >= 0 && v <= 255);
|
||||
dst[x_out] = v;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static WEBP_INLINE void ExportRowExpand_1(const uint32_t* frow, uint32_t* irow,
|
||||
uint8_t* dst, int length,
|
||||
WebPRescaler* const wrk) {
|
||||
const uint32_t B = WEBP_RESCALER_FRAC(-wrk->y_accum, wrk->y_sub);
|
||||
const uint32_t A = (uint32_t)(WEBP_RESCALER_ONE - B);
|
||||
const v4i32 B1 = __msa_fill_w(B);
|
||||
const v4i32 A1 = __msa_fill_w(A);
|
||||
const v4i32 AB = __msa_ilvr_w(A1, B1);
|
||||
const v4u32 scale = (v4u32)__msa_fill_w(wrk->fy_scale);
|
||||
const v4u32 shift = (v4u32)__msa_fill_w(WEBP_RESCALER_RFIX);
|
||||
|
||||
while (length >= 16) {
|
||||
v4u32 frow0, frow1, frow2, frow3, irow0, irow1, irow2, irow3;
|
||||
v16u8 t0, t1, t2, t3, t4, t5;
|
||||
LD_UW4(frow, 4, frow0, frow1, frow2, frow3);
|
||||
LD_UW4(irow, 4, irow0, irow1, irow2, irow3);
|
||||
CALC_MULT_FIX2_16(frow0, frow1, irow0, irow1, AB, scale, shift, t0, t1);
|
||||
CALC_MULT_FIX2_16(frow2, frow3, irow2, irow3, AB, scale, shift, t2, t3);
|
||||
PCKEV_B2_UB(t1, t0, t3, t2, t4, t5);
|
||||
t0 = (v16u8)__msa_pckev_b((v16i8)t5, (v16i8)t4);
|
||||
ST_UB(t0, dst);
|
||||
frow += 16;
|
||||
irow += 16;
|
||||
dst += 16;
|
||||
length -= 16;
|
||||
}
|
||||
if (length > 0) {
|
||||
int x_out;
|
||||
if (length >= 12) {
|
||||
uint32_t val0_m, val1_m, val2_m;
|
||||
v4u32 frow0, frow1, frow2, irow0, irow1, irow2;
|
||||
LD_UW3(frow, 4, frow0, frow1, frow2);
|
||||
LD_UW3(irow, 4, irow0, irow1, irow2);
|
||||
CALC_MULT_FIX2_4(frow0, irow0, AB, scale, shift, val0_m);
|
||||
CALC_MULT_FIX2_4(frow1, irow1, AB, scale, shift, val1_m);
|
||||
CALC_MULT_FIX2_4(frow2, irow2, AB, scale, shift, val2_m);
|
||||
SW3(val0_m, val1_m, val2_m, dst, 4);
|
||||
frow += 12;
|
||||
irow += 12;
|
||||
dst += 12;
|
||||
length -= 12;
|
||||
} else if (length >= 8) {
|
||||
uint32_t val0_m, val1_m;
|
||||
v4u32 frow0, frow1, irow0, irow1;
|
||||
LD_UW2(frow, 4, frow0, frow1);
|
||||
LD_UW2(irow, 4, irow0, irow1);
|
||||
CALC_MULT_FIX2_4(frow0, irow0, AB, scale, shift, val0_m);
|
||||
CALC_MULT_FIX2_4(frow1, irow1, AB, scale, shift, val1_m);
|
||||
SW2(val0_m, val1_m, dst, 4);
|
||||
frow += 4;
|
||||
irow += 4;
|
||||
dst += 4;
|
||||
length -= 4;
|
||||
} else if (length >= 4) {
|
||||
uint32_t val0_m;
|
||||
const v4u32 frow0 = LD_UW(frow + 0);
|
||||
const v4u32 irow0 = LD_UW(irow + 0);
|
||||
CALC_MULT_FIX2_4(frow0, irow0, AB, scale, shift, val0_m);
|
||||
SW(val0_m, dst);
|
||||
frow += 4;
|
||||
irow += 4;
|
||||
dst += 4;
|
||||
length -= 4;
|
||||
}
|
||||
for (x_out = 0; x_out < length; ++x_out) {
|
||||
const uint64_t I = (uint64_t)A * frow[x_out]
|
||||
+ (uint64_t)B * irow[x_out];
|
||||
const uint32_t J = (uint32_t)((I + ROUNDER) >> WEBP_RESCALER_RFIX);
|
||||
const int v = (int)MULT_FIX(J, wrk->fy_scale);
|
||||
assert(v >= 0 && v <= 255);
|
||||
dst[x_out] = v;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Row export for the vertically-expanding case. Writes
// dst_width * num_channels samples to wrk->dst, picking the single-row path
// when y_accum is zero and the two-row blend otherwise.
static void RescalerExportRowExpand(WebPRescaler* const wrk) {
  const int num_samples = wrk->dst_width * wrk->num_channels;
  const rescaler_t* const cur_row = wrk->frow;
  rescaler_t* const acc_row = wrk->irow;
  uint8_t* const out = wrk->dst;
  assert(!WebPRescalerOutputDone(wrk));
  assert(wrk->y_accum <= 0);
  assert(wrk->y_expand);
  assert(wrk->y_sub != 0);
  if (wrk->y_accum != 0) {
    ExportRowExpand_1(cur_row, acc_row, out, num_samples, wrk);
  } else {
    ExportRowExpand_0(cur_row, out, num_samples, wrk);
  }
}
|
||||
|
||||
static WEBP_INLINE void ExportRowShrink_0(const uint32_t* frow, uint32_t* irow,
|
||||
uint8_t* dst, int length,
|
||||
const uint32_t yscale,
|
||||
WebPRescaler* const wrk) {
|
||||
const v4u32 y_scale = (v4u32)__msa_fill_w(yscale);
|
||||
const v4u32 fxyscale = (v4u32)__msa_fill_w(wrk->fxy_scale);
|
||||
const v4u32 shiftval = (v4u32)__msa_fill_w(WEBP_RESCALER_RFIX);
|
||||
const v4i32 zero = { 0 };
|
||||
|
||||
while (length >= 16) {
|
||||
v4u32 src0, src1, src2, src3, frac0, frac1, frac2, frac3;
|
||||
v16u8 out;
|
||||
LD_UW4(frow, 4, src0, src1, src2, src3);
|
||||
CALC_MULT_FIX1_16(src0, src1, src2, src3, y_scale, shiftval,
|
||||
frac0, frac1, frac2, frac3);
|
||||
LD_UW4(irow, 4, src0, src1, src2, src3);
|
||||
SUB4(src0, frac0, src1, frac1, src2, frac2, src3, frac3,
|
||||
src0, src1, src2, src3);
|
||||
CALC_MULT_FIX_16(src0, src1, src2, src3, fxyscale, shiftval, out);
|
||||
ST_UB(out, dst);
|
||||
ST_UW4(frac0, frac1, frac2, frac3, irow, 4);
|
||||
frow += 16;
|
||||
irow += 16;
|
||||
dst += 16;
|
||||
length -= 16;
|
||||
}
|
||||
if (length > 0) {
|
||||
int x_out;
|
||||
if (length >= 12) {
|
||||
uint32_t val0_m, val1_m, val2_m;
|
||||
v4u32 src0, src1, src2, frac0, frac1, frac2;
|
||||
LD_UW3(frow, 4, src0, src1, src2);
|
||||
CALC_MULT_FIX1_4(src0, y_scale, shiftval, frac0);
|
||||
CALC_MULT_FIX1_4(src1, y_scale, shiftval, frac1);
|
||||
CALC_MULT_FIX1_4(src2, y_scale, shiftval, frac2);
|
||||
LD_UW3(irow, 4, src0, src1, src2);
|
||||
SUB3(src0, frac0, src1, frac1, src2, frac2, src0, src1, src2);
|
||||
CALC_MULT_FIX_4(src0, fxyscale, shiftval, val0_m);
|
||||
CALC_MULT_FIX_4(src1, fxyscale, shiftval, val1_m);
|
||||
CALC_MULT_FIX_4(src2, fxyscale, shiftval, val2_m);
|
||||
SW3(val0_m, val1_m, val2_m, dst, 4);
|
||||
ST_UW3(frac0, frac1, frac2, irow, 4);
|
||||
frow += 12;
|
||||
irow += 12;
|
||||
dst += 12;
|
||||
length -= 12;
|
||||
} else if (length >= 8) {
|
||||
uint32_t val0_m, val1_m;
|
||||
v4u32 src0, src1, frac0, frac1;
|
||||
LD_UW2(frow, 4, src0, src1);
|
||||
CALC_MULT_FIX1_4(src0, y_scale, shiftval, frac0);
|
||||
CALC_MULT_FIX1_4(src1, y_scale, shiftval, frac1);
|
||||
LD_UW2(irow, 4, src0, src1);
|
||||
SUB2(src0, frac0, src1, frac1, src0, src1);
|
||||
CALC_MULT_FIX_4(src0, fxyscale, shiftval, val0_m);
|
||||
CALC_MULT_FIX_4(src1, fxyscale, shiftval, val1_m);
|
||||
SW2(val0_m, val1_m, dst, 4);
|
||||
ST_UW2(frac0, frac1, irow, 4);
|
||||
frow += 8;
|
||||
irow += 8;
|
||||
dst += 8;
|
||||
length -= 8;
|
||||
} else if (length >= 4) {
|
||||
uint32_t val0_m;
|
||||
v4u32 frac0;
|
||||
v4u32 src0 = LD_UW(frow);
|
||||
CALC_MULT_FIX1_4(src0, y_scale, shiftval, frac0);
|
||||
src0 = LD_UW(irow);
|
||||
src0 = src0 - frac0;
|
||||
CALC_MULT_FIX_4(src0, fxyscale, shiftval, val0_m);
|
||||
SW(val0_m, dst);
|
||||
ST_UW(frac0, irow);
|
||||
frow += 4;
|
||||
irow += 4;
|
||||
dst += 4;
|
||||
length -= 4;
|
||||
}
|
||||
for (x_out = 0; x_out < length; ++x_out) {
|
||||
const uint32_t frac = (uint32_t)MULT_FIX(frow[x_out], yscale);
|
||||
const int v = (int)MULT_FIX(irow[x_out] - frac, wrk->fxy_scale);
|
||||
assert(v >= 0 && v <= 255);
|
||||
dst[x_out] = v;
|
||||
irow[x_out] = frac;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static WEBP_INLINE void ExportRowShrink_1(uint32_t* irow, uint8_t* dst,
|
||||
int length,
|
||||
WebPRescaler* const wrk) {
|
||||
const v4u32 scale = (v4u32)__msa_fill_w(wrk->fxy_scale);
|
||||
const v4u32 shift = (v4u32)__msa_fill_w(WEBP_RESCALER_RFIX);
|
||||
const v4i32 zero = { 0 };
|
||||
|
||||
while (length >= 16) {
|
||||
v4u32 src0, src1, src2, src3;
|
||||
v16u8 dst0;
|
||||
LD_UW4(irow, 4, src0, src1, src2, src3);
|
||||
CALC_MULT_FIX_16(src0, src1, src2, src3, scale, shift, dst0);
|
||||
ST_UB(dst0, dst);
|
||||
ST_SW4(zero, zero, zero, zero, irow, 4);
|
||||
length -= 16;
|
||||
irow += 16;
|
||||
dst += 16;
|
||||
}
|
||||
if (length > 0) {
|
||||
int x_out;
|
||||
if (length >= 12) {
|
||||
uint32_t val0_m, val1_m, val2_m;
|
||||
v4u32 src0, src1, src2;
|
||||
LD_UW3(irow, 4, src0, src1, src2);
|
||||
CALC_MULT_FIX_4(src0, scale, shift, val0_m);
|
||||
CALC_MULT_FIX_4(src1, scale, shift, val1_m);
|
||||
CALC_MULT_FIX_4(src2, scale, shift, val2_m);
|
||||
SW3(val0_m, val1_m, val2_m, dst, 4);
|
||||
ST_SW3(zero, zero, zero, irow, 4);
|
||||
length -= 12;
|
||||
irow += 12;
|
||||
dst += 12;
|
||||
} else if (length >= 8) {
|
||||
uint32_t val0_m, val1_m;
|
||||
v4u32 src0, src1;
|
||||
LD_UW2(irow, 4, src0, src1);
|
||||
CALC_MULT_FIX_4(src0, scale, shift, val0_m);
|
||||
CALC_MULT_FIX_4(src1, scale, shift, val1_m);
|
||||
SW2(val0_m, val1_m, dst, 4);
|
||||
ST_SW2(zero, zero, irow, 4);
|
||||
length -= 8;
|
||||
irow += 8;
|
||||
dst += 8;
|
||||
} else if (length >= 4) {
|
||||
uint32_t val0_m;
|
||||
const v4u32 src0 = LD_UW(irow + 0);
|
||||
CALC_MULT_FIX_4(src0, scale, shift, val0_m);
|
||||
SW(val0_m, dst);
|
||||
ST_SW(zero, irow);
|
||||
length -= 4;
|
||||
irow += 4;
|
||||
dst += 4;
|
||||
}
|
||||
for (x_out = 0; x_out < length; ++x_out) {
|
||||
const int v = (int)MULT_FIX(irow[x_out], wrk->fxy_scale);
|
||||
assert(v >= 0 && v <= 255);
|
||||
dst[x_out] = v;
|
||||
irow[x_out] = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Row export for the vertically-shrinking case. Writes
// dst_width * num_channels samples to wrk->dst. yscale is
// fy_scale * (-y_accum); a zero yscale selects the path that only rescales
// and clears irow, any other value selects the path that also uses frow.
static void RescalerExportRowShrink(WebPRescaler* const wrk) {
  const int num_samples = wrk->dst_width * wrk->num_channels;
  const rescaler_t* const cur_row = wrk->frow;
  rescaler_t* const acc_row = wrk->irow;
  uint8_t* const out = wrk->dst;
  const uint32_t yscale = wrk->fy_scale * (-wrk->y_accum);
  assert(!WebPRescalerOutputDone(wrk));
  assert(wrk->y_accum <= 0);
  assert(!wrk->y_expand);
  if (yscale == 0) {
    ExportRowShrink_1(acc_row, out, num_samples, wrk);
  } else {
    ExportRowShrink_0(cur_row, acc_row, out, num_samples, yscale, wrk);
  }
}
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Entry point
|
||||
|
||||
extern void WebPRescalerDspInitMSA(void);
|
||||
|
||||
WEBP_TSAN_IGNORE_FUNCTION void WebPRescalerDspInitMSA(void) {
|
||||
WebPRescalerExportRowExpand = RescalerExportRowExpand;
|
||||
WebPRescalerExportRowShrink = RescalerExportRowShrink;
|
||||
}
|
||||
|
||||
#else // !WEBP_USE_MSA
|
||||
|
||||
WEBP_DSP_INIT_STUB(WebPRescalerDspInitMSA)
|
||||
|
||||
#endif // WEBP_USE_MSA
|
Loading…
Reference in New Issue
Block a user