Add MSA-optimized simple edge filtering functions

1. SimpleVFilter16
2. SimpleHFilter16
3. SimpleVFilter16i
4. SimpleHFilter16i

Change-Id: Ib330e01960623aeeed1bdb5bc8155cc6657556f9
This commit is contained in:
Parag Salasakar 2016-06-23 12:16:50 +05:30
parent 1ebf193c2c
commit a6621bacf3
2 changed files with 165 additions and 0 deletions

View File

@ -307,6 +307,35 @@ static void TransformAC3(const int16_t* in, uint8_t* dst) {
ST6x1_UB(in0, start_in0_idx + 3, in1, start_in1_idx + 3, ptmp1, 4); \
} while (0)
/* Description : Simple loop-filter arithmetic applied to one edge.
 * Arguments   : Inputs  - p1_in, p0_in, q0_in, q1_in (v16u8 pixel rows),
 *                         mask (per-lane filter on/off selector)
 *               Outputs - p0_in, q0_in updated in place
 * Details     : Pixels are mapped to the signed range by flipping the sign
 *               bit (FLIP_SIGN4). The filter value starts as the saturating
 *               difference subs(p1, q1), is refined by FILT_VAL under
 *               'mask', then q0 -= (filt + 4) >> 3 and p0 += (filt + 3) >> 3
 *               (all saturating), before mapping back to unsigned with
 *               xor 0x80.
 */
#define LPF_SIMPLE_FILT(p1_in, p0_in, q0_in, q1_in, mask) do { \
v16i8 p1_m, p0_m, q0_m, q1_m, filt, filt1, filt2; \
const v16i8 cnst4b = __msa_ldi_b(4); \
const v16i8 cnst3b = __msa_ldi_b(3); \
\
FLIP_SIGN4(p1_in, p0_in, q0_in, q1_in, p1_m, p0_m, q0_m, q1_m); \
filt = __msa_subs_s_b(p1_m, q1_m); \
FILT_VAL(q0_m, p0_m, mask, filt); \
filt1 = __msa_adds_s_b(filt, cnst4b); \
filt1 = SRAI_B(filt1, 3); \
filt2 = __msa_adds_s_b(filt, cnst3b); \
filt2 = SRAI_B(filt2, 3); \
q0_m = __msa_subs_s_b(q0_m, filt1); \
p0_m = __msa_adds_s_b(p0_m, filt2); \
q0_in = __msa_xori_b((v16u8)q0_m, 0x80); \
p0_in = __msa_xori_b((v16u8)p0_m, 0x80); \
} while (0)
/* Description : Build the per-lane on/off mask for the simple loop filter.
 * Arguments   : Inputs  - p1, p0, q0, q1 (v16u8 pixel rows),
 *                         b_limit (splatted edge-limit threshold)
 *               Output  - mask (all-ones in lanes where
 *                         2 * |p0 - q0| + |p1 - q1| / 2 <= b_limit)
 */
#define LPF_SIMPLE_MASK(p1, p0, q0, q1, b_limit, mask) do { \
v16u8 p1_a_sub_q1, p0_a_sub_q0; \
\
p0_a_sub_q0 = __msa_asub_u_b(p0, q0); \
p1_a_sub_q1 = __msa_asub_u_b(p1, q1); \
p1_a_sub_q1 = (v16u8)__msa_srli_b((v16i8)p1_a_sub_q1, 1); \
p0_a_sub_q0 = __msa_adds_u_b(p0_a_sub_q0, p0_a_sub_q0); \
mask = __msa_adds_u_b(p0_a_sub_q0, p1_a_sub_q1); \
mask = (mask <= b_limit); \
} while (0)
static void VFilter16(uint8_t *src, int stride,
int b_limit_in, int limit_in, int thresh_in) {
uint8_t *ptemp = src - 4 * stride;
@ -592,6 +621,55 @@ static void HFilter8i(uint8_t* src_u, uint8_t* src_v, int stride,
ST4x4_UB(tmp5, tmp5, 0, 1, 2, 3, src_v, stride);
}
// Simple in-loop filter across a horizontal edge, 16 pixels wide.
// Reads the two rows above and the two rows below the edge, filters the
// edge-adjacent rows p0/q0 and writes them back.
static void SimpleVFilter16(uint8_t* src, int stride, int b_limit_in) {
  uint8_t* const ptop = src - 2 * stride;  // row p1, two rows above the edge
  const v16u8 b_limit_vec = (v16u8)__msa_fill_b(b_limit_in);
  v16u8 p1, p0, q0, q1, edge_mask;

  LD_UB4(ptop, stride, p1, p0, q0, q1);
  LPF_SIMPLE_MASK(p1, p0, q0, q1, b_limit_vec, edge_mask);
  LPF_SIMPLE_FILT(p1, p0, q0, q1, edge_mask);
  ST_UB2(p0, q0, src - stride, stride);
}
// Simple in-loop filter across a vertical edge, 16 rows tall.
// Loads 16 rows of 4 pixels straddling the edge (columns src-2 .. src+1),
// transposes them into the p1/p0/q0/q1 vectors, filters p0/q0 and stores
// the two filtered columns back at src-1 and src.
static void SimpleHFilter16(uint8_t* src, int stride, int b_limit_in) {
  v16u8 p1, p0, q1, q0, mask, row0, row1, row2, row3, row4, row5, row6, row7;
  v16u8 row8, row9, row10, row11, row12, row13, row14, row15;
  v8i16 tmp0, tmp1;
  const v16u8 b_limit = (v16u8)__msa_fill_b(b_limit_in);
  uint8_t* ptemp_src = src - 2;

  LD_UB8(ptemp_src, stride, row0, row1, row2, row3, row4, row5, row6, row7);
  LD_UB8(ptemp_src + 8 * stride, stride,
         row8, row9, row10, row11, row12, row13, row14, row15);
  TRANSPOSE16x4_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7,
                      row8, row9, row10, row11, row12, row13, row14, row15,
                      p1, p0, q0, q1);
  LPF_SIMPLE_MASK(p1, p0, q0, q1, b_limit, mask);
  LPF_SIMPLE_FILT(p1, p0, q0, q1, mask);
  // Interleave the filtered p0/q0 columns into byte pairs, then write them
  // back two bytes per row, four rows per store macro.
  ILVRL_B2_SH(q0, p0, tmp1, tmp0);
  ptemp_src += 1;  // writes go to columns src-1 and src
  ST2x4_UB(tmp1, 0, ptemp_src, stride);
  ptemp_src += 4 * stride;
  ST2x4_UB(tmp1, 4, ptemp_src, stride);
  ptemp_src += 4 * stride;
  ST2x4_UB(tmp0, 0, ptemp_src, stride);
  ptemp_src += 4 * stride;
  ST2x4_UB(tmp0, 4, ptemp_src, stride);
  // NOTE: the dead trailing 'ptemp_src += 4 * stride;' was removed — the
  // local goes out of scope immediately and the value was never read.
}
// Simple filter for the three inner horizontal edges of a 16x16 luma
// macroblock (at rows 4, 8 and 12).
static void SimpleVFilter16i(uint8_t* src_y, int stride, int b_limit_in) {
  int row;
  for (row = 4; row <= 12; row += 4) {
    SimpleVFilter16(src_y + row * stride, stride, b_limit_in);
  }
}
// Simple filter for the three inner vertical edges of a 16x16 luma
// macroblock (at columns 4, 8 and 12).
static void SimpleHFilter16i(uint8_t* src_y, int stride, int b_limit_in) {
  int col;
  for (col = 4; col <= 12; col += 4) {
    SimpleHFilter16(src_y + col, stride, b_limit_in);
  }
}
//------------------------------------------------------------------------------
// Entry point
@ -611,6 +689,10 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8DspInitMSA(void) {
VP8HFilter8 = HFilter8;
VP8VFilter8i = VFilter8i;
VP8HFilter8i = HFilter8i;
VP8SimpleVFilter16 = SimpleVFilter16;
VP8SimpleHFilter16 = SimpleHFilter16;
VP8SimpleVFilter16i = SimpleVFilter16i;
VP8SimpleHFilter16i = SimpleHFilter16i;
}
#else // !WEBP_USE_MSA

View File

@ -243,6 +243,32 @@
#define ST_UB4(...) ST_B4(v16u8, __VA_ARGS__)
#define ST_SB4(...) ST_B4(v16i8, __VA_ARGS__)
/* Description : Store four consecutive halfword (2-byte) pieces of a vector
 *               to four rows of destination memory.
 * Arguments   : Inputs - in, stidx, pdst, stride
 * Details     : Halfword elements stidx, stidx + 1, stidx + 2 and stidx + 3
 *               of vector 'in' are copied to GP registers and stored to
 *               pdst, pdst + stride, pdst + 2 * stride and
 *               pdst + 3 * stride respectively.
 */
#define ST2x4_UB(in, stidx, pdst, stride) {                   \
  uint8_t* pout_m = (uint8_t*)pdst;                           \
  const uint16_t h0_m = __msa_copy_s_h((v8i16)in, stidx);     \
  const uint16_t h1_m = __msa_copy_s_h((v8i16)in, stidx + 1); \
  const uint16_t h2_m = __msa_copy_s_h((v8i16)in, stidx + 2); \
  const uint16_t h3_m = __msa_copy_s_h((v8i16)in, stidx + 3); \
  SH(h0_m, pout_m);                                           \
  pout_m += stride;                                           \
  SH(h1_m, pout_m);                                           \
  pout_m += stride;                                           \
  SH(h2_m, pout_m);                                           \
  pout_m += stride;                                           \
  SH(h3_m, pout_m);                                           \
}
/* Description : Store 4x4 byte block to destination memory from input vector
* Arguments : Inputs - in0, in1, pdst, stride
* Details : 'Idx0' word element from input vector 'in0' is copied to the
@ -418,6 +444,22 @@
#define ILVOD_H2_SH(...) ILVOD_H2(v8i16, __VA_ARGS__)
#define ILVOD_H2_SW(...) ILVOD_H2(v4i32, __VA_ARGS__)
/* Description : Interleave even word elements from vectors
 * Arguments : Inputs - in0, in1, in2, in3
 * Outputs - out0, out1
 * Return Type - as per RTYPE
 * Details : Even word elements of 'in0' and 'in1' are interleaved
 * and written to 'out0'
 * Even word elements of 'in2' and 'in3' are interleaved
 * and written to 'out1'
 */
#define ILVEV_W2(RTYPE, in0, in1, in2, in3, out0, out1) { \
out0 = (RTYPE)__msa_ilvev_w((v4i32)in1, (v4i32)in0); \
out1 = (RTYPE)__msa_ilvev_w((v4i32)in3, (v4i32)in2); \
}
#define ILVEV_W2_UB(...) ILVEV_W2(v16u8, __VA_ARGS__)
#define ILVEV_W2_SB(...) ILVEV_W2(v16i8, __VA_ARGS__)
#define ILVEV_W2_UH(...) ILVEV_W2(v8u16, __VA_ARGS__)
#define ILVEV_W2_SD(...) ILVEV_W2(v2i64, __VA_ARGS__)
/* Description : Interleave even-odd word elements from vectors
* Arguments : Inputs - in0, in1, in2, in3
* Outputs - out0, out1
@ -436,6 +478,24 @@
#define ILVEVOD_W2_SH(...) ILVEVOD_W2(v8i16, __VA_ARGS__)
#define ILVEVOD_W2_SW(...) ILVEVOD_W2(v4i32, __VA_ARGS__)
/* Description : Interleave even-odd half-word elements from vectors
 * Arguments : Inputs - in0, in1, in2, in3
 * Outputs - out0, out1
 * Return Type - as per RTYPE
 * Details : Even half-word elements of 'in0' and 'in1' are interleaved
 * and written to 'out0' (via the ilvev.h instruction)
 * Odd half-word elements of 'in2' and 'in3' are interleaved
 * and written to 'out1' (via the ilvod.h instruction)
 */
#define ILVEVOD_H2(RTYPE, in0, in1, in2, in3, out0, out1) { \
out0 = (RTYPE)__msa_ilvev_h((v8i16)in1, (v8i16)in0); \
out1 = (RTYPE)__msa_ilvod_h((v8i16)in3, (v8i16)in2); \
}
#define ILVEVOD_H2_UB(...) ILVEVOD_H2(v16u8, __VA_ARGS__)
#define ILVEVOD_H2_UH(...) ILVEVOD_H2(v8u16, __VA_ARGS__)
#define ILVEVOD_H2_SH(...) ILVEVOD_H2(v8i16, __VA_ARGS__)
#define ILVEVOD_H2_SW(...) ILVEVOD_H2(v4i32, __VA_ARGS__)
/* Description : Interleave even double word elements from vectors
* Arguments : Inputs - in0, in1, in2, in3
* Outputs - out0, out1
@ -450,6 +510,7 @@
#define ILVEV_D2_UB(...) ILVEV_D2(v16u8, __VA_ARGS__)
#define ILVEV_D2_SB(...) ILVEV_D2(v16i8, __VA_ARGS__)
#define ILVEV_D2_SW(...) ILVEV_D2(v4i32, __VA_ARGS__)
#define ILVEV_D2_SD(...) ILVEV_D2(v2i64, __VA_ARGS__)
/* Description : Interleave left half of byte elements from vectors
* Arguments : Inputs - in0, in1, in2, in3
@ -708,6 +769,28 @@
out3 = in0 - in3; \
}
/* Description : Transpose 16x4 block into 4x16 with byte elements in vectors
 * Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7,
 * in8, in9, in10, in11, in12, in13, in14, in15
 * Outputs - out0, out1, out2, out3
 * Return Type - unsigned byte
 * Details : Input rows are first combined four at a time with even-word
 * and even-doubleword interleaves, then even/odd byte and
 * even/odd half-word interleaves distribute the bytes into the
 * four transposed output vectors. NOTE(review): each input is
 * presumed to carry 4 significant bytes in its low word — only
 * even-word lanes of the inputs are ever selected here.
 */
#define TRANSPOSE16x4_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7, \
in8, in9, in10, in11, in12, in13, in14, in15, \
out0, out1, out2, out3) { \
v2i64 tmp0_m, tmp1_m, tmp2_m, tmp3_m, tmp4_m, tmp5_m; \
ILVEV_W2_SD(in0, in4, in8, in12, tmp2_m, tmp3_m); \
ILVEV_W2_SD(in1, in5, in9, in13, tmp0_m, tmp1_m); \
ILVEV_D2_UB(tmp2_m, tmp3_m, tmp0_m, tmp1_m, out1, out3); \
ILVEV_W2_SD(in2, in6, in10, in14, tmp4_m, tmp5_m); \
ILVEV_W2_SD(in3, in7, in11, in15, tmp0_m, tmp1_m); \
ILVEV_D2_SD(tmp4_m, tmp5_m, tmp0_m, tmp1_m, tmp2_m, tmp3_m); \
ILVEV_B2_SD(out1, out3, tmp2_m, tmp3_m, tmp0_m, tmp1_m); \
ILVEVOD_H2_UB(tmp0_m, tmp1_m, tmp0_m, tmp1_m, out0, out2); \
ILVOD_B2_SD(out1, out3, tmp2_m, tmp3_m, tmp0_m, tmp1_m); \
ILVEVOD_H2_UB(tmp0_m, tmp1_m, tmp0_m, tmp1_m, out1, out3); \
}
/* Description : Transpose 16x8 block into 8x16 with byte elements in vectors
* Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7,
* in8, in9, in10, in11, in12, in13, in14, in15