Add MSA-optimized simple edge filtering functions:
1. SimpleVFilter16  2. SimpleHFilter16  3. SimpleVFilter16i  4. SimpleHFilter16i
Change-Id: Ib330e01960623aeeed1bdb5bc8155cc6657556f9
This commit is contained in:
parent
1ebf193c2c
commit
a6621bacf3
@ -307,6 +307,35 @@ static void TransformAC3(const int16_t* in, uint8_t* dst) {
|
||||
ST6x1_UB(in0, start_in0_idx + 3, in1, start_in1_idx + 3, ptmp1, 4); \
|
||||
} while (0)
|
||||
|
||||
/* Description : VP8 "simple" in-place loop filter for one edge.
 * Arguments   : Inputs  - p1_in, p0_in, q0_in, q1_in (v16u8 pixel vectors),
 *                         mask (per-byte on/off mask from LPF_SIMPLE_MASK)
 *               Outputs - p0_in, q0_in updated in place; p1_in/q1_in are
 *                         read-only (the simple filter only touches the two
 *                         pixels adjacent to the edge).
 */
#define LPF_SIMPLE_FILT(p1_in, p0_in, q0_in, q1_in, mask) do {        \
    v16i8 p1_m, p0_m, q0_m, q1_m, filt, filt1, filt2;                 \
    const v16i8 cnst4b = __msa_ldi_b(4);                              \
    const v16i8 cnst3b = __msa_ldi_b(3);                              \
                                                                      \
    /* Move pixels into the signed domain by flipping the sign bit. */ \
    FLIP_SIGN4(p1_in, p0_in, q0_in, q1_in, p1_m, p0_m, q0_m, q1_m);   \
    filt = __msa_subs_s_b(p1_m, q1_m);                                \
    /* NOTE(review): FILT_VAL is defined elsewhere; presumably it     \
     * accumulates 3*(q0-p0) into filt and applies 'mask' — confirm. */ \
    FILT_VAL(q0_m, p0_m, mask, filt);                                 \
    /* filt1 = (filt + 4) >> 3 adjusts q0; filt2 = (filt + 3) >> 3    \
     * adjusts p0 (standard VP8 rounding split). */                   \
    filt1 = __msa_adds_s_b(filt, cnst4b);                             \
    filt1 = SRAI_B(filt1, 3);                                         \
    filt2 = __msa_adds_s_b(filt, cnst3b);                             \
    filt2 = SRAI_B(filt2, 3);                                         \
    q0_m = __msa_subs_s_b(q0_m, filt1);                               \
    p0_m = __msa_adds_s_b(p0_m, filt2);                               \
    /* Flip the sign bit back to return to unsigned pixels. */        \
    q0_in = __msa_xori_b((v16u8)q0_m, 0x80);                          \
    p0_in = __msa_xori_b((v16u8)p0_m, 0x80);                          \
} while (0)
|
||||
|
||||
/* Description : Computes the per-byte activation mask for the simple filter:
 *               a lane is filtered iff 2*|p0-q0| + |p1-q1|/2 <= b_limit
 *               (saturating byte arithmetic).
 * Arguments   : Inputs  - p1, p0, q0, q1, b_limit (v16u8)
 *               Output  - mask: all-ones in lanes to be filtered, zero elsewhere.
 */
#define LPF_SIMPLE_MASK(p1, p0, q0, q1, b_limit, mask) do {           \
    v16u8 p1_a_sub_q1, p0_a_sub_q0;                                   \
                                                                      \
    p0_a_sub_q0 = __msa_asub_u_b(p0, q0);                             \
    p1_a_sub_q1 = __msa_asub_u_b(p1, q1);                             \
    p1_a_sub_q1 = (v16u8)__msa_srli_b((v16i8)p1_a_sub_q1, 1);         \
    /* 2*|p0-q0| via saturating self-add. */                          \
    p0_a_sub_q0 = __msa_adds_u_b(p0_a_sub_q0, p0_a_sub_q0);           \
    mask = __msa_adds_u_b(p0_a_sub_q0, p1_a_sub_q1);                  \
    /* Vector compare yields all-ones / all-zeros per byte lane. */   \
    mask = (mask <= b_limit);                                         \
} while (0)
|
||||
|
||||
static void VFilter16(uint8_t *src, int stride,
|
||||
int b_limit_in, int limit_in, int thresh_in) {
|
||||
uint8_t *ptemp = src - 4 * stride;
|
||||
@ -592,6 +621,55 @@ static void HFilter8i(uint8_t* src_u, uint8_t* src_v, int stride,
|
||||
ST4x4_UB(tmp5, tmp5, 0, 1, 2, 3, src_v, stride);
|
||||
}
|
||||
|
||||
// Simple in-loop filter across a horizontal edge of 16 pixels
// (vertical filtering). Only the two rows adjacent to the edge
// (p0 at src - stride, q0 at src) are modified.
static void SimpleVFilter16(uint8_t* src, int stride, int b_limit_in) {
  v16u8 p1, p0, q1, q0, mask;
  const v16u8 b_limit = (v16u8)__msa_fill_b(b_limit_in);

  // Load the two rows above (p1, p0) and below (q0, q1) the edge.
  LD_UB4(src - 2 * stride, stride, p1, p0, q0, q1);
  LPF_SIMPLE_MASK(p1, p0, q0, q1, b_limit, mask);
  LPF_SIMPLE_FILT(p1, p0, q0, q1, mask);
  // The simple filter only changes p0/q0; store them back.
  ST_UB2(p0, q0, src - stride, stride);
}
|
||||
|
||||
// Simple in-loop filter across a vertical edge of 16 pixels
// (horizontal filtering). Loads a 16x4 block straddling the edge,
// transposes it so each of p1/p0/q0/q1 holds one column, filters,
// then transposes p0/q0 back and stores them as 2-byte writes.
static void SimpleHFilter16(uint8_t* src, int stride, int b_limit_in) {
  v16u8 p1, p0, q1, q0, mask, row0, row1, row2, row3, row4, row5, row6, row7;
  v16u8 row8, row9, row10, row11, row12, row13, row14, row15;
  v8i16 tmp0, tmp1;
  const v16u8 b_limit = (v16u8)__msa_fill_b(b_limit_in);
  uint8_t* ptemp_src = src - 2;  // start 2 columns left of the edge

  // Load 16 rows covering columns [src-2, src+2) and transpose 16x4 -> 4x16.
  LD_UB8(ptemp_src, stride, row0, row1, row2, row3, row4, row5, row6, row7);
  LD_UB8(ptemp_src + 8 * stride, stride,
         row8, row9, row10, row11, row12, row13, row14, row15);
  TRANSPOSE16x4_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7,
                      row8, row9, row10, row11, row12, row13, row14, row15,
                      p1, p0, q0, q1);
  LPF_SIMPLE_MASK(p1, p0, q0, q1, b_limit, mask);
  LPF_SIMPLE_FILT(p1, p0, q0, q1, mask);
  // Re-interleave the filtered p0/q0 pairs: tmp1 holds rows 0-7,
  // tmp0 holds rows 8-15, as 16-bit (p0,q0) pairs.
  ILVRL_B2_SH(q0, p0, tmp1, tmp0);
  ptemp_src += 1;  // p0 column is at src - 1
  ST2x4_UB(tmp1, 0, ptemp_src, stride);
  ptemp_src += 4 * stride;
  ST2x4_UB(tmp1, 4, ptemp_src, stride);
  ptemp_src += 4 * stride;
  ST2x4_UB(tmp0, 0, ptemp_src, stride);
  ptemp_src += 4 * stride;
  ST2x4_UB(tmp0, 4, ptemp_src, stride);
  ptemp_src += 4 * stride;  // NOTE(review): dead increment — no further stores
}
|
||||
|
||||
// Simple filtering of the three inner horizontal edges of a 16x16
// macroblock (at rows 4, 8 and 12).
static void SimpleVFilter16i(uint8_t* src_y, int stride, int b_limit_in) {
  int k;
  for (k = 1; k <= 3; ++k) {
    SimpleVFilter16(src_y + 4 * k * stride, stride, b_limit_in);
  }
}
|
||||
|
||||
// Simple filtering of the three inner vertical edges of a 16x16
// macroblock (at columns 4, 8 and 12).
static void SimpleHFilter16i(uint8_t* src_y, int stride, int b_limit_in) {
  int k;
  for (k = 4; k <= 12; k += 4) {
    SimpleHFilter16(src_y + k, stride, b_limit_in);
  }
}
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Entry point
|
||||
|
||||
@ -611,6 +689,10 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8DspInitMSA(void) {
|
||||
VP8HFilter8 = HFilter8;
|
||||
VP8VFilter8i = VFilter8i;
|
||||
VP8HFilter8i = HFilter8i;
|
||||
VP8SimpleVFilter16 = SimpleVFilter16;
|
||||
VP8SimpleHFilter16 = SimpleHFilter16;
|
||||
VP8SimpleVFilter16i = SimpleVFilter16i;
|
||||
VP8SimpleHFilter16i = SimpleHFilter16i;
|
||||
}
|
||||
|
||||
#else // !WEBP_USE_MSA
|
||||
|
@ -243,6 +243,32 @@
|
||||
#define ST_UB4(...) ST_B4(v16u8, __VA_ARGS__)
|
||||
#define ST_SB4(...) ST_B4(v16i8, __VA_ARGS__)
|
||||
|
||||
/* Description : Store 2x4 byte block to destination memory from input vector
 * Arguments   : Inputs - in, stidx, pdst, stride
 * Details     : Index 'stidx' halfword element from 'in' vector is copied to
 *               the GP register and stored to (pdst)
 *               Index 'stidx+1' halfword element from 'in' vector is copied to
 *               the GP register and stored to (pdst + stride)
 *               Index 'stidx+2' halfword element from 'in' vector is copied to
 *               the GP register and stored to (pdst + 2 * stride)
 *               Index 'stidx+3' halfword element from 'in' vector is copied to
 *               the GP register and stored to (pdst + 3 * stride)
 * Note        : Wrapped in do { } while (0) so that 'ST2x4_UB(...);' is a
 *               single statement and is safe inside unbraced if/else
 *               (a bare { } block plus the call-site ';' is not).
 */
#define ST2x4_UB(in, stidx, pdst, stride) do {                 \
  uint8_t* pblk_2x4_m = (uint8_t*)(pdst);                      \
  const uint16_t out0_m = __msa_copy_s_h((v8i16)in, stidx);    \
  const uint16_t out1_m = __msa_copy_s_h((v8i16)in, stidx + 1); \
  const uint16_t out2_m = __msa_copy_s_h((v8i16)in, stidx + 2); \
  const uint16_t out3_m = __msa_copy_s_h((v8i16)in, stidx + 3); \
  SH(out0_m, pblk_2x4_m);                                      \
  pblk_2x4_m += stride;                                        \
  SH(out1_m, pblk_2x4_m);                                      \
  pblk_2x4_m += stride;                                        \
  SH(out2_m, pblk_2x4_m);                                      \
  pblk_2x4_m += stride;                                        \
  SH(out3_m, pblk_2x4_m);                                      \
} while (0)
|
||||
|
||||
/* Description : Store 4x4 byte block to destination memory from input vector
|
||||
* Arguments : Inputs - in0, in1, pdst, stride
|
||||
* Details : 'Idx0' word element from input vector 'in0' is copied to the
|
||||
@ -418,6 +444,22 @@
|
||||
#define ILVOD_H2_SH(...) ILVOD_H2(v8i16, __VA_ARGS__)
|
||||
#define ILVOD_H2_SW(...) ILVOD_H2(v4i32, __VA_ARGS__)
|
||||
|
||||
/* Description : Interleave even word elements from vectors
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1
 *               Return Type - as per RTYPE
 * Details     : Even word elements of 'in0' and 'in1' are interleaved
 *               and written to 'out0'
 *               Even word elements of 'in2' and 'in3' are interleaved
 *               and written to 'out1'
 */
#define ILVEV_W2(RTYPE, in0, in1, in2, in3, out0, out1) {  \
  out0 = (RTYPE)__msa_ilvev_w((v4i32)in1, (v4i32)in0);     \
  out1 = (RTYPE)__msa_ilvev_w((v4i32)in3, (v4i32)in2);     \
}
#define ILVEV_W2_UB(...) ILVEV_W2(v16u8, __VA_ARGS__)
#define ILVEV_W2_SB(...) ILVEV_W2(v16i8, __VA_ARGS__)
#define ILVEV_W2_UH(...) ILVEV_W2(v8u16, __VA_ARGS__)
#define ILVEV_W2_SD(...) ILVEV_W2(v2i64, __VA_ARGS__)
|
||||
|
||||
/* Description : Interleave even-odd word elements from vectors
|
||||
* Arguments : Inputs - in0, in1, in2, in3
|
||||
* Outputs - out0, out1
|
||||
@ -436,6 +478,24 @@
|
||||
#define ILVEVOD_W2_SH(...) ILVEVOD_W2(v8i16, __VA_ARGS__)
|
||||
#define ILVEVOD_W2_SW(...) ILVEVOD_W2(v4i32, __VA_ARGS__)
|
||||
|
||||
/* Description : Interleave even-odd half-word elements from vectors
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1
 *               Return Type - as per RTYPE
 * Details     : Even half-word elements of 'in0' and 'in1' are interleaved
 *               and written to 'out0'
 *               Odd half-word elements of 'in2' and 'in3' are interleaved
 *               and written to 'out1'
 */
#define ILVEVOD_H2(RTYPE, in0, in1, in2, in3, out0, out1) {  \
  out0 = (RTYPE)__msa_ilvev_h((v8i16)in1, (v8i16)in0);       \
  out1 = (RTYPE)__msa_ilvod_h((v8i16)in3, (v8i16)in2);       \
}
#define ILVEVOD_H2_UB(...) ILVEVOD_H2(v16u8, __VA_ARGS__)
#define ILVEVOD_H2_UH(...) ILVEVOD_H2(v8u16, __VA_ARGS__)
#define ILVEVOD_H2_SH(...) ILVEVOD_H2(v8i16, __VA_ARGS__)
#define ILVEVOD_H2_SW(...) ILVEVOD_H2(v4i32, __VA_ARGS__)
|
||||
|
||||
/* Description : Interleave even double word elements from vectors
|
||||
* Arguments : Inputs - in0, in1, in2, in3
|
||||
* Outputs - out0, out1
|
||||
@ -450,6 +510,7 @@
|
||||
#define ILVEV_D2_UB(...) ILVEV_D2(v16u8, __VA_ARGS__)
|
||||
#define ILVEV_D2_SB(...) ILVEV_D2(v16i8, __VA_ARGS__)
|
||||
#define ILVEV_D2_SW(...) ILVEV_D2(v4i32, __VA_ARGS__)
|
||||
#define ILVEV_D2_SD(...) ILVEV_D2(v2i64, __VA_ARGS__)
|
||||
|
||||
/* Description : Interleave left half of byte elements from vectors
|
||||
* Arguments : Inputs - in0, in1, in2, in3
|
||||
@ -708,6 +769,28 @@
|
||||
out3 = in0 - in3; \
|
||||
}
|
||||
|
||||
/* Description : Transpose 16x4 block into 4x16 with byte elements in vectors
 * Arguments   : Inputs - in0, in1, in2, in3, in4, in5, in6, in7,
 *               in8, in9, in10, in11, in12, in13, in14, in15
 *               Outputs - out0, out1, out2, out3
 *               Return Type - unsigned byte
 */
#define TRANSPOSE16x4_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7,   \
                            in8, in9, in10, in11, in12, in13, in14, in15, \
                            out0, out1, out2, out3) {                 \
  v2i64 tmp0_m, tmp1_m, tmp2_m, tmp3_m, tmp4_m, tmp5_m;              \
  /* Stage 1: gather the even 32-bit words of rows {0,4,8,12} and     \
   * {1,5,9,13}, then pair them as 64-bit lanes. */                   \
  ILVEV_W2_SD(in0, in4, in8, in12, tmp2_m, tmp3_m);                   \
  ILVEV_W2_SD(in1, in5, in9, in13, tmp0_m, tmp1_m);                   \
  ILVEV_D2_UB(tmp2_m, tmp3_m, tmp0_m, tmp1_m, out1, out3);            \
  /* Same gathering for rows {2,6,10,14} and {3,7,11,15}. */          \
  ILVEV_W2_SD(in2, in6, in10, in14, tmp4_m, tmp5_m);                  \
  ILVEV_W2_SD(in3, in7, in11, in15, tmp0_m, tmp1_m);                  \
  ILVEV_D2_SD(tmp4_m, tmp5_m, tmp0_m, tmp1_m, tmp2_m, tmp3_m);        \
  /* Stage 2: byte/half-word interleaves separate the four columns;   \
   * even bytes produce out0/out2, odd bytes produce out1/out3.       \
   * NOTE(review): out1/out3 are used as scratch before their final   \
   * values are written — the exact interleave order is significant.  \
   */                                                                 \
  ILVEV_B2_SD(out1, out3, tmp2_m, tmp3_m, tmp0_m, tmp1_m);            \
  ILVEVOD_H2_UB(tmp0_m, tmp1_m, tmp0_m, tmp1_m, out0, out2);          \
  ILVOD_B2_SD(out1, out3, tmp2_m, tmp3_m, tmp0_m, tmp1_m);            \
  ILVEVOD_H2_UB(tmp0_m, tmp1_m, tmp0_m, tmp1_m, out1, out3);          \
}
|
||||
|
||||
/* Description : Transpose 16x8 block into 8x16 with byte elements in vectors
|
||||
* Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7,
|
||||
* in8, in9, in10, in11, in12, in13, in14, in15
|
||||
|
Loading…
Reference in New Issue
Block a user