mips msa vp8 loop filter optimization

average improvement ~2x-4x

Change-Id: I20c4f900ef95d99b18f9cf4db592cd352c2212eb
This commit is contained in:
Parag Salasakar 2015-07-08 12:41:00 +05:30
parent 892128f6ca
commit 0ea2684c2c
4 changed files with 1279 additions and 8 deletions

View File

@ -0,0 +1,826 @@
/*
* Copyright (c) 2015 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "./vp8_rtcd.h"
#include "vp8/common/loopfilter.h"
#include "vp8/common/mips/msa/vp8_macros_msa.h"
/* Simple-filter decision mask.
 * Produces all-ones lanes where 2 * |p0 - q0| + |p1 - q1| / 2 <= b_limit
 * (the VP8 simple loop-filter edge test) and all-zeros elsewhere.
 * Saturating unsigned adds keep the sum from wrapping. */
#define VP8_SIMPLE_MASK(p1, p0, q0, q1, b_limit, mask) \
{ \
v16u8 p1_a_sub_q1, p0_a_sub_q0; \
\
p0_a_sub_q0 = __msa_asub_u_b(p0, q0); \
p1_a_sub_q1 = __msa_asub_u_b(p1, q1); \
p1_a_sub_q1 = (v16u8)__msa_srli_b((v16i8)p1_a_sub_q1, 1); \
p0_a_sub_q0 = __msa_adds_u_b(p0_a_sub_q0, p0_a_sub_q0); \
mask = __msa_adds_u_b(p0_a_sub_q0, p1_a_sub_q1); \
mask = ((v16u8)mask <= b_limit); \
}
#define VP8_LPF_FILTER4_4W(p1_in_out, p0_in_out, q0_in_out, q1_in_out, \
mask_in, hev_in) \
{ \
v16i8 p1_m, p0_m, q0_m, q1_m, q0_sub_p0, filt_sign; \
v16i8 filt, filt1, filt2, cnst4b, cnst3b; \
v8i16 q0_sub_p0_r, q0_sub_p0_l, filt_l, filt_r, cnst3h; \
\
p1_m = (v16i8)__msa_xori_b(p1_in_out, 0x80); \
p0_m = (v16i8)__msa_xori_b(p0_in_out, 0x80); \
q0_m = (v16i8)__msa_xori_b(q0_in_out, 0x80); \
q1_m = (v16i8)__msa_xori_b(q1_in_out, 0x80); \
\
filt = __msa_subs_s_b(p1_m, q1_m); \
\
filt = filt & (v16i8)hev_in; \
\
q0_sub_p0 = q0_m - p0_m; \
filt_sign = __msa_clti_s_b(filt, 0); \
\
cnst3h = __msa_ldi_h(3); \
q0_sub_p0_r = (v8i16)__msa_ilvr_b(q0_sub_p0, q0_sub_p0); \
q0_sub_p0_r = __msa_dotp_s_h((v16i8)q0_sub_p0_r, (v16i8)cnst3h); \
filt_r = (v8i16)__msa_ilvr_b(filt_sign, filt); \
filt_r += q0_sub_p0_r; \
filt_r = __msa_sat_s_h(filt_r, 7); \
\
q0_sub_p0_l = (v8i16)__msa_ilvl_b(q0_sub_p0, q0_sub_p0); \
q0_sub_p0_l = __msa_dotp_s_h((v16i8)q0_sub_p0_l, (v16i8)cnst3h); \
filt_l = (v8i16)__msa_ilvl_b(filt_sign, filt); \
filt_l += q0_sub_p0_l; \
filt_l = __msa_sat_s_h(filt_l, 7); \
\
filt = __msa_pckev_b((v16i8)filt_l, (v16i8)filt_r); \
filt = filt & (v16i8)mask_in; \
\
cnst4b = __msa_ldi_b(4); \
filt1 = __msa_adds_s_b(filt, cnst4b); \
filt1 >>= 3; \
\
cnst3b = __msa_ldi_b(3); \
filt2 = __msa_adds_s_b(filt, cnst3b); \
filt2 >>= 3; \
\
q0_m = __msa_subs_s_b(q0_m, filt1); \
q0_in_out = __msa_xori_b((v16u8)q0_m, 0x80); \
p0_m = __msa_adds_s_b(p0_m, filt2); \
p0_in_out = __msa_xori_b((v16u8)p0_m, 0x80); \
\
filt = __msa_srari_b(filt1, 1); \
hev_in = __msa_xori_b((v16u8)hev_in, 0xff); \
filt = filt & (v16i8)hev_in; \
\
q1_m = __msa_subs_s_b(q1_m, filt); \
q1_in_out = __msa_xori_b((v16u8)q1_m, 0x80); \
p1_m = __msa_adds_s_b(p1_m, filt); \
p1_in_out = __msa_xori_b((v16u8)p1_m, 0x80); \
}
#define VP8_SIMPLE_FILT(p1_in, p0_in, q0_in, q1_in, mask) \
{ \
v16i8 p1_m, p0_m, q0_m, q1_m, q0_sub_p0, q0_sub_p0_sign; \
v16i8 filt, filt1, filt2, cnst4b, cnst3b, filt_sign; \
v8i16 q0_sub_p0_r, q0_sub_p0_l, filt_l, filt_r, cnst3h; \
\
p1_m = (v16i8)__msa_xori_b(p1_in, 0x80); \
p0_m = (v16i8)__msa_xori_b(p0_in, 0x80); \
q0_m = (v16i8)__msa_xori_b(q0_in, 0x80); \
q1_m = (v16i8)__msa_xori_b(q1_in, 0x80); \
\
filt = __msa_subs_s_b(p1_m, q1_m); \
\
q0_sub_p0 = q0_m - p0_m; \
filt_sign = __msa_clti_s_b(filt, 0); \
\
cnst3h = __msa_ldi_h(3); \
q0_sub_p0_sign = __msa_clti_s_b(q0_sub_p0, 0); \
q0_sub_p0_r = (v8i16)__msa_ilvr_b(q0_sub_p0_sign, q0_sub_p0); \
q0_sub_p0_r *= cnst3h; \
filt_r = (v8i16)__msa_ilvr_b(filt_sign, filt); \
filt_r += q0_sub_p0_r; \
filt_r = __msa_sat_s_h(filt_r, 7); \
\
q0_sub_p0_l = (v8i16)__msa_ilvl_b(q0_sub_p0_sign, q0_sub_p0); \
q0_sub_p0_l *= cnst3h; \
filt_l = (v8i16)__msa_ilvl_b(filt_sign, filt); \
filt_l += q0_sub_p0_l; \
filt_l = __msa_sat_s_h(filt_l, 7); \
\
filt = __msa_pckev_b((v16i8)filt_l, (v16i8)filt_r); \
filt = filt & (v16i8)(mask); \
\
cnst4b = __msa_ldi_b(4); \
filt1 = __msa_adds_s_b(filt, cnst4b); \
filt1 >>= 3; \
\
cnst3b = __msa_ldi_b(3); \
filt2 = __msa_adds_s_b(filt, cnst3b); \
filt2 >>= 3; \
\
q0_m = __msa_subs_s_b(q0_m, filt1); \
p0_m = __msa_adds_s_b(p0_m, filt2); \
q0_in = __msa_xori_b((v16u8)q0_m, 0x80); \
p0_in = __msa_xori_b((v16u8)p0_m, 0x80); \
}
#define VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev) \
{ \
v16i8 p2_m, p1_m, p0_m, q2_m, q1_m, q0_m; \
v16i8 filt, q0_sub_p0, cnst4b, cnst3b; \
v16i8 u, filt1, filt2, filt_sign, q0_sub_p0_sign; \
v8i16 q0_sub_p0_r, q0_sub_p0_l, filt_r, u_r, u_l, filt_l; \
v8i16 cnst3h, cnst27h, cnst18h, cnst63h; \
\
cnst3h = __msa_ldi_h(3); \
\
p2_m = (v16i8)__msa_xori_b(p2, 0x80); \
p1_m = (v16i8)__msa_xori_b(p1, 0x80); \
p0_m = (v16i8)__msa_xori_b(p0, 0x80); \
q0_m = (v16i8)__msa_xori_b(q0, 0x80); \
q1_m = (v16i8)__msa_xori_b(q1, 0x80); \
q2_m = (v16i8)__msa_xori_b(q2, 0x80); \
\
filt = __msa_subs_s_b(p1_m, q1_m); \
q0_sub_p0 = q0_m - p0_m; \
q0_sub_p0_sign = __msa_clti_s_b(q0_sub_p0, 0); \
filt_sign = __msa_clti_s_b(filt, 0); \
\
q0_sub_p0_r = (v8i16)__msa_ilvr_b(q0_sub_p0_sign, q0_sub_p0); \
q0_sub_p0_r *= cnst3h; \
filt_r = (v8i16)__msa_ilvr_b(filt_sign, filt); \
filt_r = filt_r + q0_sub_p0_r; \
filt_r = __msa_sat_s_h(filt_r, 7); \
\
q0_sub_p0_l = (v8i16)__msa_ilvl_b(q0_sub_p0_sign, q0_sub_p0); \
q0_sub_p0_l *= cnst3h; \
filt_l = (v8i16)__msa_ilvl_b(filt_sign, filt); \
filt_l = filt_l + q0_sub_p0_l; \
filt_l = __msa_sat_s_h(filt_l, 7); \
\
filt = __msa_pckev_b((v16i8)filt_l, (v16i8)filt_r); \
filt = filt & (v16i8)mask; \
filt2 = filt & (v16i8)hev; \
\
hev = __msa_xori_b(hev, 0xff); \
filt = filt & (v16i8)hev; \
cnst4b = __msa_ldi_b(4); \
filt1 = __msa_adds_s_b(filt2, cnst4b); \
filt1 >>= 3; \
cnst3b = __msa_ldi_b(3); \
filt2 = __msa_adds_s_b(filt2, cnst3b); \
filt2 >>= 3; \
q0_m = __msa_subs_s_b(q0_m, filt1); \
p0_m = __msa_adds_s_b(p0_m, filt2); \
\
filt_sign = __msa_clti_s_b(filt, 0); \
ILVRL_B2_SH(filt_sign, filt, filt_r, filt_l); \
\
cnst27h = __msa_ldi_h(27); \
cnst63h = __msa_ldi_h(63); \
\
u_r = filt_r * cnst27h; \
u_r += cnst63h; \
u_r >>= 7; \
u_r = __msa_sat_s_h(u_r, 7); \
u_l = filt_l * cnst27h; \
u_l += cnst63h; \
u_l >>= 7; \
u_l = __msa_sat_s_h(u_l, 7); \
u = __msa_pckev_b((v16i8)u_l, (v16i8)u_r); \
q0_m = __msa_subs_s_b(q0_m, u); \
q0 = __msa_xori_b((v16u8)q0_m, 0x80); \
p0_m = __msa_adds_s_b(p0_m, u); \
p0 = __msa_xori_b((v16u8)p0_m, 0x80); \
cnst18h = __msa_ldi_h(18); \
u_r = filt_r * cnst18h; \
u_r += cnst63h; \
u_r >>= 7; \
u_r = __msa_sat_s_h(u_r, 7); \
\
u_l = filt_l * cnst18h; \
u_l += cnst63h; \
u_l >>= 7; \
u_l = __msa_sat_s_h(u_l, 7); \
u = __msa_pckev_b((v16i8)u_l, (v16i8)u_r); \
q1_m = __msa_subs_s_b(q1_m, u); \
q1 = __msa_xori_b((v16u8)q1_m, 0x80); \
p1_m = __msa_adds_s_b(p1_m, u); \
p1 = __msa_xori_b((v16u8)p1_m, 0x80); \
u_r = filt_r << 3; \
u_r += filt_r + cnst63h; \
u_r >>= 7; \
u_r = __msa_sat_s_h(u_r, 7); \
\
u_l = filt_l << 3; \
u_l += filt_l + cnst63h; \
u_l >>= 7; \
u_l = __msa_sat_s_h(u_l, 7); \
u = __msa_pckev_b((v16i8)u_l, (v16i8)u_r); \
q2_m = __msa_subs_s_b(q2_m, u); \
q2 = __msa_xori_b((v16u8)q2_m, 0x80); \
p2_m = __msa_adds_s_b(p2_m, u); \
p2 = __msa_xori_b((v16u8)p2_m, 0x80); \
}
#define LPF_MASK_HEV(p3_in, p2_in, p1_in, p0_in, \
q0_in, q1_in, q2_in, q3_in, \
limit_in, b_limit_in, thresh_in, \
hev_out, mask_out, flat_out) \
{ \
v16u8 p3_asub_p2_m, p2_asub_p1_m, p1_asub_p0_m, q1_asub_q0_m; \
v16u8 p1_asub_q1_m, p0_asub_q0_m, q3_asub_q2_m, q2_asub_q1_m; \
\
p3_asub_p2_m = __msa_asub_u_b((p3_in), (p2_in)); \
p2_asub_p1_m = __msa_asub_u_b((p2_in), (p1_in)); \
p1_asub_p0_m = __msa_asub_u_b((p1_in), (p0_in)); \
q1_asub_q0_m = __msa_asub_u_b((q1_in), (q0_in)); \
q2_asub_q1_m = __msa_asub_u_b((q2_in), (q1_in)); \
q3_asub_q2_m = __msa_asub_u_b((q3_in), (q2_in)); \
p0_asub_q0_m = __msa_asub_u_b((p0_in), (q0_in)); \
p1_asub_q1_m = __msa_asub_u_b((p1_in), (q1_in)); \
flat_out = __msa_max_u_b(p1_asub_p0_m, q1_asub_q0_m); \
hev_out = (thresh_in) < (v16u8)flat_out; \
p0_asub_q0_m = __msa_adds_u_b(p0_asub_q0_m, p0_asub_q0_m); \
p1_asub_q1_m >>= 1; \
p0_asub_q0_m = __msa_adds_u_b(p0_asub_q0_m, p1_asub_q1_m); \
mask_out = (b_limit_in) < p0_asub_q0_m; \
mask_out = __msa_max_u_b(flat_out, mask_out); \
p3_asub_p2_m = __msa_max_u_b(p3_asub_p2_m, p2_asub_p1_m); \
mask_out = __msa_max_u_b(p3_asub_p2_m, mask_out); \
q2_asub_q1_m = __msa_max_u_b(q2_asub_q1_m, q3_asub_q2_m); \
mask_out = __msa_max_u_b(q2_asub_q1_m, mask_out); \
mask_out = (limit_in) < (v16u8)mask_out; \
mask_out = __msa_xori_b(mask_out, 0xff); \
}
/* Store 6 bytes of one row: word element in0[in0_idx] (4 bytes) to pdst,
 * then halfword element in1[in1_idx] (2 bytes) to pdst + stride.
 * Callers pass stride == 4 so the two pieces land contiguously. */
#define VP8_ST6x1_UB(in0, in0_idx, in1, in1_idx, pdst, stride) \
{ \
uint16_t tmp0_h; \
uint32_t tmp0_w; \
\
tmp0_w = __msa_copy_u_w((v4i32)in0, in0_idx); \
tmp0_h = __msa_copy_u_h((v8i16)in1, in1_idx); \
SW(tmp0_w, pdst); \
SH(tmp0_h, pdst + stride); \
}
/* Normal 4-tap loop filter on a horizontal edge for two adjacent
 * 8-pixel-wide blocks at once. Block 0 occupies the low 8 byte lanes,
 * block 1 the high 8 lanes; each has its own b_limit/limit/thresh.
 * Rows p1..q1 around the edge are filtered in place. */
static void loop_filter_horizontal_4_dual_msa(uint8_t *src, int32_t pitch,
                                              const uint8_t *b_limit0_ptr,
                                              const uint8_t *limit0_ptr,
                                              const uint8_t *thresh0_ptr,
                                              const uint8_t *b_limit1_ptr,
                                              const uint8_t *limit1_ptr,
                                              const uint8_t *thresh1_ptr)
{
    v16u8 p3, p2, p1, p0, q0, q1, q2, q3;
    v16u8 mask, hev, flat;
    v16u8 thresh, b_limit, limit, splat;

    /* Splat each scalar threshold and pack the two blocks' values into
       one vector: low 8 bytes for block 0, high 8 bytes for block 1. */
    thresh = (v16u8)__msa_fill_b(*thresh0_ptr);
    splat = (v16u8)__msa_fill_b(*thresh1_ptr);
    thresh = (v16u8)__msa_ilvr_d((v2i64)splat, (v2i64)thresh);

    b_limit = (v16u8)__msa_fill_b(*b_limit0_ptr);
    splat = (v16u8)__msa_fill_b(*b_limit1_ptr);
    b_limit = (v16u8)__msa_ilvr_d((v2i64)splat, (v2i64)b_limit);

    limit = (v16u8)__msa_fill_b(*limit0_ptr);
    splat = (v16u8)__msa_fill_b(*limit1_ptr);
    limit = (v16u8)__msa_ilvr_d((v2i64)splat, (v2i64)limit);

    /* Load the eight rows straddling the edge. */
    LD_UB8(src - 4 * pitch, pitch, p3, p2, p1, p0, q0, q1, q2, q3);

    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
                 hev, mask, flat);
    VP8_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev);

    /* Only p1..q1 are modified by the 4-tap filter. */
    ST_UB4(p1, p0, q0, q1, src - 2 * pitch, pitch);
}
/* Normal 4-tap loop filter on a vertical edge for two vertically
 * adjacent 8-row blocks (16 rows total), each with its own
 * b_limit/limit/thresh (block 0 in the low 8 lanes after transpose).
 * Loads a 16x8 band around the edge, transposes it so columns become
 * vectors, filters p1..q1 and transposes the 4 changed columns back. */
static void loop_filter_vertical_4_dual_msa(uint8_t *src, int32_t pitch,
const uint8_t *b_limit0_ptr,
const uint8_t *limit0_ptr,
const uint8_t *thresh0_ptr,
const uint8_t *b_limit1_ptr,
const uint8_t *limit1_ptr,
const uint8_t *thresh1_ptr)
{
v16u8 mask, hev, flat;
v16u8 thresh0, b_limit0, limit0, thresh1, b_limit1, limit1;
v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
v16u8 row8, row9, row10, row11, row12, row13, row14, row15;
v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
/* gather 16 rows of the 8-wide band and transpose to p3..q3 columns */
LD_UB8(src - 4, pitch, row0, row1, row2, row3, row4, row5, row6, row7);
LD_UB8(src - 4 + (8 * pitch), pitch,
row8, row9, row10, row11, row12, row13, row14, row15);
TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7,
row8, row9, row10, row11, row12, row13, row14, row15,
p3, p2, p1, p0, q0, q1, q2, q3);
/* pack the two blocks' thresholds: low half block 0, high half block 1 */
thresh0 = (v16u8)__msa_fill_b(*thresh0_ptr);
thresh1 = (v16u8)__msa_fill_b(*thresh1_ptr);
thresh0 = (v16u8)__msa_ilvr_d((v2i64)thresh1, (v2i64)thresh0);
b_limit0 = (v16u8)__msa_fill_b(*b_limit0_ptr);
b_limit1 = (v16u8)__msa_fill_b(*b_limit1_ptr);
b_limit0 = (v16u8)__msa_ilvr_d((v2i64)b_limit1, (v2i64)b_limit0);
limit0 = (v16u8)__msa_fill_b(*limit0_ptr);
limit1 = (v16u8)__msa_fill_b(*limit1_ptr);
limit0 = (v16u8)__msa_ilvr_d((v2i64)limit1, (v2i64)limit0);
LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0,
hev, mask, flat);
VP8_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev);
/* transpose the 4 filtered columns back and store 4 bytes per row */
ILVR_B2_SH(p0, p1, q1, q0, tmp0, tmp1);
ILVRL_H2_SH(tmp1, tmp0, tmp2, tmp3);
ILVL_B2_SH(p0, p1, q1, q0, tmp0, tmp1);
ILVRL_H2_SH(tmp1, tmp0, tmp4, tmp5);
src -= 2;
ST4x8_UB(tmp2, tmp3, src, pitch);
src += (8 * pitch);
ST4x8_UB(tmp4, tmp5, src, pitch);
}
/* Macroblock-edge (6-tap) loop filter on a horizontal luma edge.
 * Filters the 16-pixel-wide edge at `src`, updating rows p2..q2. */
static void mbloop_filter_horizontal_edge_y_msa(uint8_t *src, int32_t pitch,
                                                const uint8_t b_limit_in,
                                                const uint8_t limit_in,
                                                const uint8_t thresh_in)
{
    v16u8 p3, p2, p1, p0, q0, q1, q2, q3;
    v16u8 mask, hev, flat;
    v16u8 b_limit = (v16u8)__msa_fill_b(b_limit_in);
    v16u8 limit = (v16u8)__msa_fill_b(limit_in);
    v16u8 thresh = (v16u8)__msa_fill_b(thresh_in);

    /* Load the four rows on each side of the edge. */
    LD_UB8(src - 4 * pitch, pitch, p3, p2, p1, p0, q0, q1, q2, q3);
    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
                 hev, mask, flat);
    VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev);

    /* Write back the six modified rows: p2..q0 then q1, q2. */
    ST_UB4(p2, p1, p0, q0, src - 3 * pitch, pitch);
    ST_UB2(q1, q2, src + pitch, pitch);
}
/* Macroblock-edge (6-tap) loop filter on a horizontal chroma edge.
 * The 8-pixel-wide U and V edges are packed into one 16-lane vector
 * (U in the low 8 byte lanes, V in the high 8) and filtered together;
 * the two halves are then scattered back with 64-bit stores. */
static void mbloop_filter_horizontal_edge_uv_msa(uint8_t *src_u, uint8_t *src_v,
int32_t pitch,
const uint8_t b_limit_in,
const uint8_t limit_in,
const uint8_t thresh_in)
{
uint8_t *temp_src;
uint64_t p2_d, p1_d, p0_d, q0_d, q1_d, q2_d;
v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
v16u8 mask, hev, flat, thresh, limit, b_limit;
v16u8 p3_u, p2_u, p1_u, p0_u, q3_u, q2_u, q1_u, q0_u;
v16u8 p3_v, p2_v, p1_v, p0_v, q3_v, q2_v, q1_v, q0_v;
b_limit = (v16u8)__msa_fill_b(b_limit_in);
limit = (v16u8)__msa_fill_b(limit_in);
thresh = (v16u8)__msa_fill_b(thresh_in);
temp_src = src_u - (pitch << 2);
LD_UB8(temp_src, pitch, p3_u, p2_u, p1_u, p0_u, q0_u, q1_u, q2_u, q3_u);
temp_src = src_v - (pitch << 2);
LD_UB8(temp_src, pitch, p3_v, p2_v, p1_v, p0_v, q0_v, q1_v, q2_v, q3_v);
/* pack U (right/low half) and V (left/high half) into single vectors */
ILVR_D4_UB(p3_v, p3_u, p2_v, p2_u, p1_v, p1_u, p0_v, p0_u, p3, p2, p1, p0);
ILVR_D4_UB(q0_v, q0_u, q1_v, q1_u, q2_v, q2_u, q3_v, q3_u, q0, q1, q2, q3);
LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
hev, mask, flat);
VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev);
/* low doublewords hold the U plane results */
p2_d = __msa_copy_u_d((v2i64)p2, 0);
p1_d = __msa_copy_u_d((v2i64)p1, 0);
p0_d = __msa_copy_u_d((v2i64)p0, 0);
q0_d = __msa_copy_u_d((v2i64)q0, 0);
q1_d = __msa_copy_u_d((v2i64)q1, 0);
q2_d = __msa_copy_u_d((v2i64)q2, 0);
src_u -= (pitch * 3);
SD4(p2_d, p1_d, p0_d, q0_d, src_u, pitch);
src_u += 4 * pitch;
SD(q1_d, src_u);
src_u += pitch;
SD(q2_d, src_u);
/* high doublewords hold the V plane results */
p2_d = __msa_copy_u_d((v2i64)p2, 1);
p1_d = __msa_copy_u_d((v2i64)p1, 1);
p0_d = __msa_copy_u_d((v2i64)p0, 1);
q0_d = __msa_copy_u_d((v2i64)q0, 1);
q1_d = __msa_copy_u_d((v2i64)q1, 1);
q2_d = __msa_copy_u_d((v2i64)q2, 1);
src_v -= (pitch * 3);
SD4(p2_d, p1_d, p0_d, q0_d, src_v, pitch);
src_v += 4 * pitch;
SD(q1_d, src_v);
src_v += pitch;
SD(q2_d, src_v);
}
/* Macroblock-edge (6-tap) loop filter on a vertical luma edge.
 * Loads a 16x8 band around the column edge, transposes it, filters
 * p2..q2 and stores 6 bytes per row via VP8_ST6x1_UB (word element
 * from the p2..q0 transpose plus halfword from the q1/q2 interleave).
 * The store indices are immediates, hence the unrolled sequence. */
static void mbloop_filter_vertical_edge_y_msa(uint8_t *src, int32_t pitch,
const uint8_t b_limit_in,
const uint8_t limit_in,
const uint8_t thresh_in)
{
uint8_t *temp_src;
v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
v16u8 mask, hev, flat, thresh, limit, b_limit;
v16u8 row0, row1, row2, row3, row4, row5, row6, row7, row8;
v16u8 row9, row10, row11, row12, row13, row14, row15;
v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
b_limit = (v16u8)__msa_fill_b(b_limit_in);
limit = (v16u8)__msa_fill_b(limit_in);
thresh = (v16u8)__msa_fill_b(thresh_in);
temp_src = src - 4;
LD_UB8(temp_src, pitch, row0, row1, row2, row3, row4, row5, row6, row7);
temp_src += (8 * pitch);
LD_UB8(temp_src, pitch,
row8, row9, row10, row11, row12, row13, row14, row15);
TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7,
row8, row9, row10, row11, row12, row13, row14, row15,
p3, p2, p1, p0, q0, q1, q2, q3);
LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
hev, mask, flat);
VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev);
/* transpose p2..q0 into 4-byte row groups (tmp3/tmp4/tmp6/tmp7) and
   q1/q2 into 2-byte row pairs (tmp2/tmp5) for the 6-byte row stores */
ILVR_B2_SH(p1, p2, q0, p0, tmp0, tmp1);
ILVRL_H2_SH(tmp1, tmp0, tmp3, tmp4);
ILVL_B2_SH(p1, p2, q0, p0, tmp0, tmp1);
ILVRL_H2_SH(tmp1, tmp0, tmp6, tmp7);
ILVRL_B2_SH(q2, q1, tmp2, tmp5);
/* 16 rows, 6 bytes each, starting 3 columns left of the edge */
temp_src = src - 3;
VP8_ST6x1_UB(tmp3, 0, tmp2, 0, temp_src, 4);
temp_src += pitch;
VP8_ST6x1_UB(tmp3, 1, tmp2, 1, temp_src, 4);
temp_src += pitch;
VP8_ST6x1_UB(tmp3, 2, tmp2, 2, temp_src, 4);
temp_src += pitch;
VP8_ST6x1_UB(tmp3, 3, tmp2, 3, temp_src, 4);
temp_src += pitch;
VP8_ST6x1_UB(tmp4, 0, tmp2, 4, temp_src, 4);
temp_src += pitch;
VP8_ST6x1_UB(tmp4, 1, tmp2, 5, temp_src, 4);
temp_src += pitch;
VP8_ST6x1_UB(tmp4, 2, tmp2, 6, temp_src, 4);
temp_src += pitch;
VP8_ST6x1_UB(tmp4, 3, tmp2, 7, temp_src, 4);
temp_src += pitch;
VP8_ST6x1_UB(tmp6, 0, tmp5, 0, temp_src, 4);
temp_src += pitch;
VP8_ST6x1_UB(tmp6, 1, tmp5, 1, temp_src, 4);
temp_src += pitch;
VP8_ST6x1_UB(tmp6, 2, tmp5, 2, temp_src, 4);
temp_src += pitch;
VP8_ST6x1_UB(tmp6, 3, tmp5, 3, temp_src, 4);
temp_src += pitch;
VP8_ST6x1_UB(tmp7, 0, tmp5, 4, temp_src, 4);
temp_src += pitch;
VP8_ST6x1_UB(tmp7, 1, tmp5, 5, temp_src, 4);
temp_src += pitch;
VP8_ST6x1_UB(tmp7, 2, tmp5, 6, temp_src, 4);
temp_src += pitch;
VP8_ST6x1_UB(tmp7, 3, tmp5, 7, temp_src, 4);
}
/* Macroblock-edge (6-tap) loop filter on a vertical chroma edge.
 * The 8 U rows and 8 V rows are loaded together, transposed so each
 * column becomes a 16-lane vector (U rows in the low lanes, V rows in
 * the high lanes), filtered once, and written back 6 bytes per row. */
static void mbloop_filter_vertical_edge_uv_msa(uint8_t *src_u, uint8_t *src_v,
int32_t pitch,
const uint8_t b_limit_in,
const uint8_t limit_in,
const uint8_t thresh_in)
{
v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
v16u8 mask, hev, flat, thresh, limit, b_limit;
v16u8 row0, row1, row2, row3, row4, row5, row6, row7, row8;
v16u8 row9, row10, row11, row12, row13, row14, row15;
v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
b_limit = (v16u8)__msa_fill_b(b_limit_in);
limit = (v16u8)__msa_fill_b(limit_in);
thresh = (v16u8)__msa_fill_b(thresh_in);
LD_UB8(src_u - 4, pitch, row0, row1, row2, row3, row4, row5, row6, row7);
LD_UB8(src_v - 4, pitch,
row8, row9, row10, row11, row12, row13, row14, row15);
TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7,
row8, row9, row10, row11, row12, row13, row14, row15,
p3, p2, p1, p0, q0, q1, q2, q3);
LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
hev, mask, flat);
VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev);
/* transpose the six filtered columns back into per-row byte groups */
ILVR_B2_SH(p1, p2, q0, p0, tmp0, tmp1);
ILVRL_H2_SH(tmp1, tmp0, tmp3, tmp4);
ILVL_B2_SH(p1, p2, q0, p0, tmp0, tmp1);
ILVRL_H2_SH(tmp1, tmp0, tmp6, tmp7);
ILVRL_B2_SH(q2, q1, tmp2, tmp5);
/* U rows come from the low halves (tmp3/tmp4, tmp2 idx 0..7) */
src_u -= 3;
VP8_ST6x1_UB(tmp3, 0, tmp2, 0, src_u, 4);
src_u += pitch;
VP8_ST6x1_UB(tmp3, 1, tmp2, 1, src_u, 4);
src_u += pitch;
VP8_ST6x1_UB(tmp3, 2, tmp2, 2, src_u, 4);
src_u += pitch;
VP8_ST6x1_UB(tmp3, 3, tmp2, 3, src_u, 4);
src_u += pitch;
VP8_ST6x1_UB(tmp4, 0, tmp2, 4, src_u, 4);
src_u += pitch;
VP8_ST6x1_UB(tmp4, 1, tmp2, 5, src_u, 4);
src_u += pitch;
VP8_ST6x1_UB(tmp4, 2, tmp2, 6, src_u, 4);
src_u += pitch;
VP8_ST6x1_UB(tmp4, 3, tmp2, 7, src_u, 4);
/* V rows come from the high halves (tmp6/tmp7, tmp5) */
src_v -= 3;
VP8_ST6x1_UB(tmp6, 0, tmp5, 0, src_v, 4);
src_v += pitch;
VP8_ST6x1_UB(tmp6, 1, tmp5, 1, src_v, 4);
src_v += pitch;
VP8_ST6x1_UB(tmp6, 2, tmp5, 2, src_v, 4);
src_v += pitch;
VP8_ST6x1_UB(tmp6, 3, tmp5, 3, src_v, 4);
src_v += pitch;
VP8_ST6x1_UB(tmp7, 0, tmp5, 4, src_v, 4);
src_v += pitch;
VP8_ST6x1_UB(tmp7, 1, tmp5, 5, src_v, 4);
src_v += pitch;
VP8_ST6x1_UB(tmp7, 2, tmp5, 6, src_v, 4);
src_v += pitch;
VP8_ST6x1_UB(tmp7, 3, tmp5, 7, src_v, 4);
}
/* Simple loop filter on a horizontal edge (16 pixels wide).
 * Only the two rows adjacent to the edge (p0/q0) are modified. */
void vp8_loop_filter_simple_horizontal_edge_msa(uint8_t *src, int32_t pitch,
                                                const uint8_t *b_limit_ptr)
{
    v16u8 p1, p0, q0, q1, mask;
    v16u8 b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);

    /* Two rows on each side of the edge. */
    LD_UB4(src - 2 * pitch, pitch, p1, p0, q0, q1);
    VP8_SIMPLE_MASK(p1, p0, q0, q1, b_limit, mask);
    VP8_SIMPLE_FILT(p1, p0, q0, q1, mask);
    ST_UB2(p0, q0, src - pitch, pitch);
}
/* Simple loop filter on a vertical edge: filters the 16-row, 4-column
 * band centred on `src` (columns src[-2..1]). Only the two columns
 * adjacent to the edge (p0/q0) are modified.
 * Fix: the original ended with a dead `src += 4 * pitch;` after the
 * last store; that unused pointer update has been removed. */
void vp8_loop_filter_simple_vertical_edge_msa(uint8_t *src, int32_t pitch,
                                              const uint8_t *b_limit_ptr)
{
    uint8_t *temp_src;
    v16u8 p1, p0, q1, q0;
    v16u8 mask, b_limit;
    v16u8 row0, row1, row2, row3, row4, row5, row6, row7, row8;
    v16u8 row9, row10, row11, row12, row13, row14, row15;
    v8i16 tmp0, tmp1;

    b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
    /* Gather the 16 rows of the 4-wide band and transpose them so the
       four columns become the vectors p1, p0, q0, q1. */
    temp_src = src - 2;
    LD_UB8(temp_src, pitch, row0, row1, row2, row3, row4, row5, row6, row7);
    temp_src += (8 * pitch);
    LD_UB8(temp_src, pitch,
           row8, row9, row10, row11, row12, row13, row14, row15);
    TRANSPOSE16x4_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7,
                        row8, row9, row10, row11, row12, row13, row14, row15,
                        p1, p0, q0, q1);
    VP8_SIMPLE_MASK(p1, p0, q0, q1, b_limit, mask);
    VP8_SIMPLE_FILT(p1, p0, q0, q1, mask);
    /* Re-interleave the filtered p0/q0 columns into byte pairs and
       store two bytes per row, four rows per ST2x4_UB call. */
    ILVRL_B2_SH(q0, p0, tmp1, tmp0);
    src -= 1;
    ST2x4_UB(tmp1, 0, src, pitch);
    src += 4 * pitch;
    ST2x4_UB(tmp1, 4, src, pitch);
    src += 4 * pitch;
    ST2x4_UB(tmp0, 0, src, pitch);
    src += 4 * pitch;
    ST2x4_UB(tmp0, 4, src, pitch);
}
/* Normal 4-tap loop filter on a horizontal chroma edge.
 * The 8-wide U and V edges are packed into one 16-lane vector (U low,
 * V high), filtered together, and scattered back with 64-bit stores.
 * Note: after the loads, src_u/src_v point at the q1 row, so the
 * SD4(..., -pitch) below walks upward q1, q0, p0, p1. */
static void loop_filter_horizontal_edge_uv_msa(uint8_t *src_u, uint8_t *src_v,
int32_t pitch,
const uint8_t b_limit_in,
const uint8_t limit_in,
const uint8_t thresh_in)
{
uint64_t p1_d, p0_d, q0_d, q1_d;
v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
v16u8 mask, hev, flat, thresh, limit, b_limit;
v16u8 p3_u, p2_u, p1_u, p0_u, q3_u, q2_u, q1_u, q0_u;
v16u8 p3_v, p2_v, p1_v, p0_v, q3_v, q2_v, q1_v, q0_v;
thresh = (v16u8)__msa_fill_b(thresh_in);
limit = (v16u8)__msa_fill_b(limit_in);
b_limit = (v16u8)__msa_fill_b(b_limit_in);
src_u = src_u - (pitch << 2);
LD_UB8(src_u, pitch, p3_u, p2_u, p1_u, p0_u, q0_u, q1_u, q2_u, q3_u);
src_u += (5 * pitch);
src_v = src_v - (pitch << 2);
LD_UB8(src_v, pitch, p3_v, p2_v, p1_v, p0_v, q0_v, q1_v, q2_v, q3_v);
src_v += (5 * pitch);
/* right 8 element of p3 are u pixel and
left 8 element of p3 are v pixel */
ILVR_D4_UB(p3_v, p3_u, p2_v, p2_u, p1_v, p1_u, p0_v, p0_u, p3, p2, p1, p0);
ILVR_D4_UB(q0_v, q0_u, q1_v, q1_u, q2_v, q2_u, q3_v, q3_u, q0, q1, q2, q3);
LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
hev, mask, flat);
VP8_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev);
/* low doublewords: U plane results, stored bottom-up */
p1_d = __msa_copy_u_d((v2i64)p1, 0);
p0_d = __msa_copy_u_d((v2i64)p0, 0);
q0_d = __msa_copy_u_d((v2i64)q0, 0);
q1_d = __msa_copy_u_d((v2i64)q1, 0);
SD4(q1_d, q0_d, p0_d, p1_d, src_u, (- pitch));
/* high doublewords: V plane results */
p1_d = __msa_copy_u_d((v2i64)p1, 1);
p0_d = __msa_copy_u_d((v2i64)p0, 1);
q0_d = __msa_copy_u_d((v2i64)q0, 1);
q1_d = __msa_copy_u_d((v2i64)q1, 1);
SD4(q1_d, q0_d, p0_d, p1_d, src_v, (- pitch));
}
/* Normal 4-tap loop filter on a vertical chroma edge.
 * The 8 U rows and 8 V rows are transposed together (U in the low
 * lanes, V in the high lanes), filtered once, and the four changed
 * columns are transposed back and stored 4 bytes per row. */
static void loop_filter_vertical_edge_uv_msa(uint8_t *src_u, uint8_t *src_v,
int32_t pitch,
const uint8_t b_limit_in,
const uint8_t limit_in,
const uint8_t thresh_in)
{
uint8_t *temp_src_u, *temp_src_v;
v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
v16u8 mask, hev, flat, thresh, limit, b_limit;
v16u8 row0, row1, row2, row3, row4, row5, row6, row7, row8;
v16u8 row9, row10, row11, row12, row13, row14, row15;
v4i32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
thresh = (v16u8)__msa_fill_b(thresh_in);
limit = (v16u8)__msa_fill_b(limit_in);
b_limit = (v16u8)__msa_fill_b(b_limit_in);
LD_UB8(src_u - 4, pitch, row0, row1, row2, row3, row4, row5, row6, row7);
LD_UB8(src_v - 4, pitch,
row8, row9, row10, row11, row12, row13, row14, row15);
TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7,
row8, row9, row10, row11, row12, row13, row14, row15,
p3, p2, p1, p0, q0, q1, q2, q3);
LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
hev, mask, flat);
VP8_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev);
/* transpose p1..q1 back: tmp2/tmp3 hold the U rows, tmp4/tmp5 the V */
ILVR_B2_SW(p0, p1, q1, q0, tmp0, tmp1);
ILVRL_H2_SW(tmp1, tmp0, tmp2, tmp3);
tmp0 = (v4i32)__msa_ilvl_b((v16i8)p0, (v16i8)p1);
tmp1 = (v4i32)__msa_ilvl_b((v16i8)q1, (v16i8)q0);
ILVRL_H2_SW(tmp1, tmp0, tmp4, tmp5);
temp_src_u = src_u - 2;
ST4x4_UB(tmp2, tmp2, 0, 1, 2, 3, temp_src_u, pitch);
temp_src_u += 4 * pitch;
ST4x4_UB(tmp3, tmp3, 0, 1, 2, 3, temp_src_u, pitch);
temp_src_v = src_v - 2;
ST4x4_UB(tmp4, tmp4, 0, 1, 2, 3, temp_src_v, pitch);
temp_src_v += 4 * pitch;
ST4x4_UB(tmp5, tmp5, 0, 1, 2, 3, temp_src_v, pitch);
}
/* Filter the horizontal macroblock (outer) edges of one macroblock:
 * luma always, chroma only when a U plane pointer is given. */
void vp8_loop_filter_mbh_msa(uint8_t *src_y, uint8_t *src_u,
                             uint8_t *src_v, int32_t pitch_y,
                             int32_t pitch_u_v,
                             loop_filter_info *lpf_info_ptr)
{
    const uint8_t b_limit = *lpf_info_ptr->mblim;
    const uint8_t limit = *lpf_info_ptr->lim;
    const uint8_t thresh = *lpf_info_ptr->hev_thr;

    mbloop_filter_horizontal_edge_y_msa(src_y, pitch_y,
                                        b_limit, limit, thresh);
    if (src_u != NULL)
    {
        mbloop_filter_horizontal_edge_uv_msa(src_u, src_v, pitch_u_v,
                                             b_limit, limit, thresh);
    }
}
/* Filter the vertical macroblock (outer) edges of one macroblock:
 * luma always, chroma only when a U plane pointer is given. */
void vp8_loop_filter_mbv_msa(uint8_t *src_y, uint8_t *src_u,
                             uint8_t *src_v, int32_t pitch_y,
                             int32_t pitch_u_v,
                             loop_filter_info *lpf_info_ptr)
{
    const uint8_t b_limit = *lpf_info_ptr->mblim;
    const uint8_t limit = *lpf_info_ptr->lim;
    const uint8_t thresh = *lpf_info_ptr->hev_thr;

    mbloop_filter_vertical_edge_y_msa(src_y, pitch_y,
                                      b_limit, limit, thresh);
    if (src_u != NULL)
    {
        mbloop_filter_vertical_edge_uv_msa(src_u, src_v, pitch_u_v,
                                           b_limit, limit, thresh);
    }
}
/* Filter the three inner horizontal block edges of one macroblock
 * (luma rows 4, 8, 12; chroma row 4 when a U plane is given).
 * Each luma call filters two side-by-side 8-wide blocks with the
 * same limits, so both parameter triples are identical. */
void vp8_loop_filter_bh_msa(uint8_t *src_y, uint8_t *src_u,
                            uint8_t *src_v, int32_t pitch_y,
                            int32_t pitch_u_v,
                            loop_filter_info *lpf_info_ptr)
{
    const uint8_t *blim = lpf_info_ptr->blim;
    const uint8_t *lim = lpf_info_ptr->lim;
    const uint8_t *hev_thr = lpf_info_ptr->hev_thr;
    int32_t i;

    for (i = 1; i < 4; i++)
    {
        loop_filter_horizontal_4_dual_msa(src_y + 4 * i * pitch_y, pitch_y,
                                          blim, lim, hev_thr,
                                          blim, lim, hev_thr);
    }
    if (src_u != NULL)
    {
        loop_filter_horizontal_edge_uv_msa(src_u + (4 * pitch_u_v),
                                           src_v + (4 * pitch_u_v),
                                           pitch_u_v,
                                           *blim, *lim, *hev_thr);
    }
}
/* Filter the three inner vertical block edges of one macroblock
 * (luma columns 4, 8, 12; chroma column 4 when a U plane is given).
 * Each luma call filters two stacked 8-row blocks with the same
 * limits, so both parameter triples are identical. */
void vp8_loop_filter_bv_msa(uint8_t *src_y, uint8_t *src_u,
                            uint8_t *src_v, int32_t pitch_y,
                            int32_t pitch_u_v,
                            loop_filter_info *lpf_info_ptr)
{
    const uint8_t *blim = lpf_info_ptr->blim;
    const uint8_t *lim = lpf_info_ptr->lim;
    const uint8_t *hev_thr = lpf_info_ptr->hev_thr;
    int32_t i;

    for (i = 1; i < 4; i++)
    {
        loop_filter_vertical_4_dual_msa(src_y + 4 * i, pitch_y,
                                        blim, lim, hev_thr,
                                        blim, lim, hev_thr);
    }
    if (src_u != NULL)
    {
        loop_filter_vertical_edge_uv_msa(src_u + 4, src_v + 4, pitch_u_v,
                                         *blim, *lim, *hev_thr);
    }
}
/* Simple-filter the three inner horizontal luma edges (rows 4, 8, 12). */
void vp8_loop_filter_bhs_msa(uint8_t *src_y, int32_t pitch_y,
                             const uint8_t *b_limit_ptr)
{
    int32_t i;

    for (i = 1; i < 4; i++)
    {
        vp8_loop_filter_simple_horizontal_edge_msa(src_y + (4 * i * pitch_y),
                                                   pitch_y, b_limit_ptr);
    }
}
/* Simple-filter the three inner vertical luma edges (columns 4, 8, 12). */
void vp8_loop_filter_bvs_msa(uint8_t *src_y, int32_t pitch_y,
                             const uint8_t *b_limit_ptr)
{
    int32_t i;

    for (i = 1; i < 4; i++)
    {
        vp8_loop_filter_simple_vertical_edge_msa(src_y + 4 * i,
                                                 pitch_y, b_limit_ptr);
    }
}

View File

@ -32,6 +32,210 @@
#define ST_UH(...) ST_H(v8u16, __VA_ARGS__)
#define ST_SH(...) ST_H(v8i16, __VA_ARGS__)
/* Scalar load/store helpers for possibly unaligned addresses.
 * On MIPS r6 the plain lw/ld/sh/sw/sd instructions are used; pre-r6
 * the unaligned forms (ulw/uld/ush/usw) are used instead, and a
 * 64-bit store on a 32-bit pre-r6 target is split into two SW. */
#if (__mips_isa_rev >= 6)
/* LW(psrc): load a 32-bit word from psrc. */
#define LW(psrc) \
({ \
const uint8_t *psrc_m = (const uint8_t *)(psrc); \
uint32_t val_m; \
\
asm volatile ( \
"lw %[val_m], %[psrc_m] \n\t" \
\
: [val_m] "=r" (val_m) \
: [psrc_m] "m" (*psrc_m) \
); \
\
val_m; \
})
#if (__mips == 64)
/* LD(psrc): load a 64-bit doubleword with a single ld (64-bit ISA). */
#define LD(psrc) \
({ \
const uint8_t *psrc_m = (const uint8_t *)(psrc); \
uint64_t val_m = 0; \
\
asm volatile ( \
"ld %[val_m], %[psrc_m] \n\t" \
\
: [val_m] "=r" (val_m) \
: [psrc_m] "m" (*psrc_m) \
); \
\
val_m; \
})
#else // !(__mips == 64)
/* LD(psrc): assemble a 64-bit value from two 32-bit loads
 * (little-endian: low word first). */
#define LD(psrc) \
({ \
const uint8_t *psrc_m = (const uint8_t *)(psrc); \
uint32_t val0_m, val1_m; \
uint64_t val_m = 0; \
\
val0_m = LW(psrc_m); \
val1_m = LW(psrc_m + 4); \
\
val_m = (uint64_t)(val1_m); \
val_m = (uint64_t)((val_m << 32) & 0xFFFFFFFF00000000); \
val_m = (uint64_t)(val_m | (uint64_t)val0_m); \
\
val_m; \
})
#endif // (__mips == 64)
/* SH(val, pdst): store a 16-bit halfword. */
#define SH(val, pdst) \
{ \
uint8_t *pdst_m = (uint8_t *)(pdst); \
const uint16_t val_m = (val); \
\
asm volatile ( \
"sh %[val_m], %[pdst_m] \n\t" \
\
: [pdst_m] "=m" (*pdst_m) \
: [val_m] "r" (val_m) \
); \
}
/* SW(val, pdst): store a 32-bit word. */
#define SW(val, pdst) \
{ \
uint8_t *pdst_m = (uint8_t *)(pdst); \
const uint32_t val_m = (val); \
\
asm volatile ( \
"sw %[val_m], %[pdst_m] \n\t" \
\
: [pdst_m] "=m" (*pdst_m) \
: [val_m] "r" (val_m) \
); \
}
/* SD(val, pdst): store a 64-bit doubleword. */
#define SD(val, pdst) \
{ \
uint8_t *pdst_m = (uint8_t *)(pdst); \
const uint64_t val_m = (val); \
\
asm volatile ( \
"sd %[val_m], %[pdst_m] \n\t" \
\
: [pdst_m] "=m" (*pdst_m) \
: [val_m] "r" (val_m) \
); \
}
#else // !(__mips_isa_rev >= 6)
/* Pre-r6: unaligned-access instruction variants. */
#define LW(psrc) \
({ \
const uint8_t *psrc_m = (const uint8_t *)(psrc); \
uint32_t val_m; \
\
asm volatile ( \
"ulw %[val_m], %[psrc_m] \n\t" \
\
: [val_m] "=r" (val_m) \
: [psrc_m] "m" (*psrc_m) \
); \
\
val_m; \
})
#if (__mips == 64)
#define LD(psrc) \
({ \
const uint8_t *psrc_m = (const uint8_t *)(psrc); \
uint64_t val_m = 0; \
\
asm volatile ( \
"uld %[val_m], %[psrc_m] \n\t" \
\
: [val_m] "=r" (val_m) \
: [psrc_m] "m" (*psrc_m) \
); \
\
val_m; \
})
#else // !(__mips == 64)
/* 32-bit pre-r6: build the doubleword from two unaligned word loads. */
#define LD(psrc) \
({ \
const uint8_t *psrc_m1 = (const uint8_t *)(psrc); \
uint32_t val0_m, val1_m; \
uint64_t val_m = 0; \
\
val0_m = LW(psrc_m1); \
val1_m = LW(psrc_m1 + 4); \
\
val_m = (uint64_t)(val1_m); \
val_m = (uint64_t)((val_m << 32) & 0xFFFFFFFF00000000); \
val_m = (uint64_t)(val_m | (uint64_t)val0_m); \
\
val_m; \
})
#endif // (__mips == 64)
#define SH(val, pdst) \
{ \
uint8_t *pdst_m = (uint8_t *)(pdst); \
const uint16_t val_m = (val); \
\
asm volatile ( \
"ush %[val_m], %[pdst_m] \n\t" \
\
: [pdst_m] "=m" (*pdst_m) \
: [val_m] "r" (val_m) \
); \
}
#define SW(val, pdst) \
{ \
uint8_t *pdst_m = (uint8_t *)(pdst); \
const uint32_t val_m = (val); \
\
asm volatile ( \
"usw %[val_m], %[pdst_m] \n\t" \
\
: [pdst_m] "=m" (*pdst_m) \
: [val_m] "r" (val_m) \
); \
}
/* No usd instruction: split the 64-bit store into two 32-bit SW
 * (little-endian: low word first). */
#define SD(val, pdst) \
{ \
uint8_t *pdst_m1 = (uint8_t *)(pdst); \
uint32_t val0_m, val1_m; \
\
val0_m = (uint32_t)((val) & 0x00000000FFFFFFFF); \
val1_m = (uint32_t)(((val) >> 32) & 0x00000000FFFFFFFF); \
\
SW(val0_m, pdst_m1); \
SW(val1_m, pdst_m1 + 4); \
}
#endif // (__mips_isa_rev >= 6)
/* Description : Store 4 words with stride
Arguments : Inputs - in0, in1, in2, in3, pdst, stride
Details : Store word from 'in0' to (pdst)
Store word from 'in1' to (pdst + stride)
Store word from 'in2' to (pdst + 2 * stride)
Store word from 'in3' to (pdst + 3 * stride)
Note : Uses the unaligned-safe SW helper, so pdst need not
be word aligned.
*/
#define SW4(in0, in1, in2, in3, pdst, stride) \
{ \
SW(in0, (pdst)); \
SW(in1, (pdst) + stride); \
SW(in2, (pdst) + 2 * stride); \
SW(in3, (pdst) + 3 * stride); \
}
/* Description : Store 4 double words with stride
Arguments : Inputs - in0, in1, in2, in3, pdst, stride
Details : Store double word from 'in0' to (pdst)
Store double word from 'in1' to (pdst + stride)
Store double word from 'in2' to (pdst + 2 * stride)
Store double word from 'in3' to (pdst + 3 * stride)
Note : Uses the unaligned-safe SD helper; stride may be
negative to store rows bottom-up.
*/
#define SD4(in0, in1, in2, in3, pdst, stride) \
{ \
SD(in0, (pdst)); \
SD(in1, (pdst) + stride); \
SD(in2, (pdst) + 2 * stride); \
SD(in3, (pdst) + 3 * stride); \
}
/* Description : Load vectors with 16 byte elements with stride
Arguments : Inputs - psrc, stride
Outputs - out0, out1
@ -55,6 +259,14 @@
#define LD_UB4(...) LD_B4(v16u8, __VA_ARGS__)
#define LD_SB4(...) LD_B4(v16i8, __VA_ARGS__)
/* Load 8 vectors of 16 byte elements from (psrc) with stride,
 * into out0..out7 (two LD_B4 groups). */
#define LD_B8(RTYPE, psrc, stride, \
out0, out1, out2, out3, out4, out5, out6, out7) \
{ \
LD_B4(RTYPE, (psrc), stride, out0, out1, out2, out3); \
LD_B4(RTYPE, (psrc) + 4 * stride, stride, out4, out5, out6, out7); \
}
#define LD_UB8(...) LD_B8(v16u8, __VA_ARGS__)
/* Description : Load vectors with 8 halfword elements with stride
Arguments : Inputs - psrc, stride
Outputs - out0, out1
@ -85,6 +297,7 @@
ST_B(RTYPE, in0, (pdst)); \
ST_B(RTYPE, in1, (pdst) + stride); \
}
#define ST_UB2(...) ST_B2(v16u8, __VA_ARGS__)
#define ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride) \
{ \
@ -106,6 +319,64 @@
}
#define ST_SH2(...) ST_H2(v8i16, __VA_ARGS__)
/* Description : Store 2x4 byte block to destination memory from input vector
Arguments : Inputs - in, stidx, pdst, stride
Details : Index 'stidx' halfword element from 'in' vector is copied to
the GP register and stored to (pdst)
Index 'stidx+1' halfword element from 'in' vector is copied to
the GP register and stored to (pdst + stride)
Index 'stidx+2' halfword element from 'in' vector is copied to
the GP register and stored to (pdst + 2 * stride)
Index 'stidx+3' halfword element from 'in' vector is copied to
the GP register and stored to (pdst + 3 * stride)
Note : Each store writes 2 bytes via the unaligned-safe SH.
*/
#define ST2x4_UB(in, stidx, pdst, stride) \
{ \
uint16_t out0_m, out1_m, out2_m, out3_m; \
uint8_t *pblk_2x4_m = (uint8_t *)(pdst); \
\
out0_m = __msa_copy_u_h((v8i16)in, (stidx)); \
out1_m = __msa_copy_u_h((v8i16)in, (stidx + 1)); \
out2_m = __msa_copy_u_h((v8i16)in, (stidx + 2)); \
out3_m = __msa_copy_u_h((v8i16)in, (stidx + 3)); \
\
SH(out0_m, pblk_2x4_m); \
SH(out1_m, pblk_2x4_m + stride); \
SH(out2_m, pblk_2x4_m + 2 * stride); \
SH(out3_m, pblk_2x4_m + 3 * stride); \
}
/* Description : Store 4x4 byte block to destination memory from input vector
   Arguments   : Inputs - in0, in1, idx0, idx1, idx2, idx3, pdst, stride
   Details     : 'idx0' word element from input vector 'in0' is copied to the
                 GP register and stored to (pdst)
                 'idx1' word element from input vector 'in0' is copied to the
                 GP register and stored to (pdst + stride)
                 'idx2' word element from input vector 'in1' is copied to the
                 GP register and stored to (pdst + 2 * stride)
                 'idx3' word element from input vector 'in1' is copied to the
                 GP register and stored to (pdst + 3 * stride)
*/
#define ST4x4_UB(in0, in1, idx0, idx1, idx2, idx3, pdst, stride)  \
{                                                                 \
    uint32_t out0_m, out1_m, out2_m, out3_m;                      \
    uint8_t *pblk_4x4_m = (uint8_t *)(pdst);                      \
                                                                  \
    out0_m = __msa_copy_u_w((v4i32)in0, idx0);                    \
    out1_m = __msa_copy_u_w((v4i32)in0, idx1);                    \
    out2_m = __msa_copy_u_w((v4i32)in1, idx2);                    \
    out3_m = __msa_copy_u_w((v4i32)in1, idx3);                    \
                                                                  \
    SW4(out0_m, out1_m, out2_m, out3_m, pblk_4x4_m, stride);      \
}
/* Description : Store 4x8 byte block to destination memory from input
                 vectors
   Arguments   : Inputs - in0, in1, pdst, stride
   Details     : Word elements 0..3 of 'in0' are stored to the first 4 rows
                 of (pdst); word elements 0..3 of 'in1' to the next 4 rows,
                 each row 'stride' bytes apart
*/
#define ST4x8_UB(in0, in1, pdst, stride)                             \
{                                                                    \
    uint8_t *pblk_4x8 = (uint8_t *)(pdst);                           \
                                                                     \
    ST4x4_UB(in0, in0, 0, 1, 2, 3, pblk_4x8, stride);                \
    ST4x4_UB(in1, in1, 0, 1, 2, 3, pblk_4x8 + 4 * stride, stride);   \
}
/* Description : Shuffle byte vector elements as per mask vector
Arguments : Inputs - in0, in1, in2, in3, mask0, mask1
Outputs - out0, out1
@ -162,6 +433,76 @@
out_m; \
})
/* Description : Interleave even byte elements from vectors
   Arguments   : Inputs - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even byte elements of 'in0' and 'in1' are interleaved
                 and written to 'out0'; even byte elements of 'in2' and
                 'in3' are interleaved and written to 'out1'
*/
#define ILVEV_B2(RTYPE, in0, in1, in2, in3, out0, out1)   \
{                                                         \
    out0 = (RTYPE)__msa_ilvev_b((v16i8)in1, (v16i8)in0);  \
    out1 = (RTYPE)__msa_ilvev_b((v16i8)in3, (v16i8)in2);  \
}
#define ILVEV_B2_SD(...) ILVEV_B2(v2i64, __VA_ARGS__)
/* Description : Interleave even halfword elements from vectors
   Arguments   : Inputs - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even halfword elements of 'in0' and 'in1' are interleaved
                 and written to 'out0'; even halfword elements of 'in2' and
                 'in3' are interleaved and written to 'out1'
*/
#define ILVEV_H2(RTYPE, in0, in1, in2, in3, out0, out1)   \
{                                                         \
    out0 = (RTYPE)__msa_ilvev_h((v8i16)in1, (v8i16)in0);  \
    out1 = (RTYPE)__msa_ilvev_h((v8i16)in3, (v8i16)in2);  \
}
#define ILVEV_H2_UB(...) ILVEV_H2(v16u8, __VA_ARGS__)
/* Description : Interleave even word elements from vectors
   Arguments   : Inputs - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even word elements of 'in0' and 'in1' are interleaved
                 and written to 'out0'; even word elements of 'in2' and
                 'in3' are interleaved and written to 'out1'
*/
#define ILVEV_W2(RTYPE, in0, in1, in2, in3, out0, out1)   \
{                                                         \
    out0 = (RTYPE)__msa_ilvev_w((v4i32)in1, (v4i32)in0);  \
    out1 = (RTYPE)__msa_ilvev_w((v4i32)in3, (v4i32)in2);  \
}
#define ILVEV_W2_SD(...) ILVEV_W2(v2i64, __VA_ARGS__)
/* Description : Interleave even double word elements from vectors
   Arguments   : Inputs - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even double word elements of 'in0' and 'in1' are
                 interleaved and written to 'out0'; even double word
                 elements of 'in2' and 'in3' are interleaved and written
                 to 'out1'
*/
#define ILVEV_D2(RTYPE, in0, in1, in2, in3, out0, out1)   \
{                                                         \
    out0 = (RTYPE)__msa_ilvev_d((v2i64)in1, (v2i64)in0);  \
    out1 = (RTYPE)__msa_ilvev_d((v2i64)in3, (v2i64)in2);  \
}
#define ILVEV_D2_UB(...) ILVEV_D2(v16u8, __VA_ARGS__)
/* Description : Interleave left half of byte elements from vectors
   Arguments   : Inputs - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Left half of byte elements of 'in0' and 'in1' are
                 interleaved and written to 'out0'; left half of byte
                 elements of 'in2' and 'in3' are interleaved and written
                 to 'out1'
*/
#define ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1)  \
{                                                       \
    out0 = (RTYPE)__msa_ilvl_b((v16i8)in0, (v16i8)in1); \
    out1 = (RTYPE)__msa_ilvl_b((v16i8)in2, (v16i8)in3); \
}
#define ILVL_B2_SH(...) ILVL_B2(v8i16, __VA_ARGS__)
/* Description : Interleave left half of halfword elements from vectors
Arguments : Inputs - in0, in1, in2, in3
Outputs - out0, out1
@ -203,6 +544,8 @@
out0 = (RTYPE)__msa_ilvr_b((v16i8)in0, (v16i8)in1); \
out1 = (RTYPE)__msa_ilvr_b((v16i8)in2, (v16i8)in3); \
}
#define ILVR_B2_SH(...) ILVR_B2(v8i16, __VA_ARGS__)
#define ILVR_B2_SW(...) ILVR_B2(v4i32, __VA_ARGS__)
#define ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
out0, out1, out2, out3) \
@ -260,6 +603,14 @@
#define ILVR_D2_SB(...) ILVR_D2(v16i8, __VA_ARGS__)
#define ILVR_D2_SH(...) ILVR_D2(v8i16, __VA_ARGS__)
/* Description : Interleave right half of double word elements from four
                 vector pairs
   Arguments   : Inputs - in0 .. in7
                 Outputs - out0, out1, out2, out3
                 Return Type - as per RTYPE
   Details     : Two ILVR_D2 invocations produce out0/out1 from in0..in3
                 and out2/out3 from in4..in7
*/
#define ILVR_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                out0, out1, out2, out3)                         \
{                                                               \
    ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1);             \
    ILVR_D2(RTYPE, in4, in5, in6, in7, out2, out3);             \
}
#define ILVR_D4_UB(...) ILVR_D4(v16u8, __VA_ARGS__)
/* Description : Interleave both left and right half of input vectors
Arguments : Inputs - in0, in1
Outputs - out0, out1
@ -267,6 +618,13 @@
Details : Right half of byte elements from 'in0' and 'in1' are
interleaved and written to 'out0'
*/
/* Right halves of 'in0'/'in1' bytes interleaved into out0,
   left halves interleaved into out1 */
#define ILVRL_B2(RTYPE, in0, in1, out0, out1)           \
{                                                       \
    out0 = (RTYPE)__msa_ilvr_b((v16i8)in0, (v16i8)in1); \
    out1 = (RTYPE)__msa_ilvl_b((v16i8)in0, (v16i8)in1); \
}
#define ILVRL_B2_SH(...) ILVRL_B2(v8i16, __VA_ARGS__)
#define ILVRL_H2(RTYPE, in0, in1, out0, out1) \
{ \
out0 = (RTYPE)__msa_ilvr_h((v8i16)in0, (v8i16)in1); \
@ -480,6 +838,88 @@
out3 = in0 - in3; \
}
/* Description : Transpose 16x4 block into 4x16 with byte elements in vectors
Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7,
in8, in9, in10, in11, in12, in13, in14, in15
Outputs - out0, out1, out2, out3
Return Type - unsigned byte
*/
#define TRANSPOSE16x4_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7, \
in8, in9, in10, in11, in12, in13, in14, in15, \
out0, out1, out2, out3) \
{ \
v2i64 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
\
ILVEV_W2_SD(in0, in4, in8, in12, tmp0_m, tmp1_m); \
out1 = (v16u8)__msa_ilvev_d(tmp1_m, tmp0_m); \
\
ILVEV_W2_SD(in1, in5, in9, in13, tmp0_m, tmp1_m); \
out3 = (v16u8)__msa_ilvev_d(tmp1_m, tmp0_m); \
\
ILVEV_W2_SD(in2, in6, in10, in14, tmp0_m, tmp1_m); \
\
tmp2_m = __msa_ilvev_d(tmp1_m, tmp0_m); \
ILVEV_W2_SD(in3, in7, in11, in15, tmp0_m, tmp1_m); \
\
tmp3_m = __msa_ilvev_d(tmp1_m, tmp0_m); \
ILVEV_B2_SD(out1, out3, tmp2_m, tmp3_m, tmp0_m, tmp1_m); \
out0 = (v16u8)__msa_ilvev_h((v8i16)tmp1_m, (v8i16)tmp0_m); \
out2 = (v16u8)__msa_ilvod_h((v8i16)tmp1_m, (v8i16)tmp0_m); \
\
tmp0_m = (v2i64)__msa_ilvod_b((v16i8)out3, (v16i8)out1); \
tmp1_m = (v2i64)__msa_ilvod_b((v16i8)tmp3_m, (v16i8)tmp2_m); \
out1 = (v16u8)__msa_ilvev_h((v8i16)tmp1_m, (v8i16)tmp0_m); \
out3 = (v16u8)__msa_ilvod_h((v8i16)tmp1_m, (v8i16)tmp0_m); \
}
/* Description : Transpose 16x8 block into 8x16 with byte elements in vectors
   Arguments   : Inputs - in0, in1, in2, in3, in4, in5, in6, in7,
                          in8, in9, in10, in11, in12, in13, in14, in15
                 Outputs - out0, out1, out2, out3, out4, out5, out6, out7
                 Return Type - unsigned byte
   Details     : Double-word packs pair the 16 inputs, then successive
                 byte, halfword and word even/odd interleaves complete the
                 transpose. Several outputs (out1..out7) are written early
                 and consumed as scratch before their final assignment.
*/
#define TRANSPOSE16x8_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7,       \
                            in8, in9, in10, in11, in12, in13, in14, in15, \
                            out0, out1, out2, out3, out4, out5, out6, out7) \
{                                                                         \
    v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                 \
    v16u8 tmp4_m, tmp5_m, tmp6_m, tmp7_m;                                 \
                                                                          \
    ILVEV_D2_UB(in0, in8, in1, in9, out7, out6);                          \
    ILVEV_D2_UB(in2, in10, in3, in11, out5, out4);                        \
    ILVEV_D2_UB(in4, in12, in5, in13, out3, out2);                        \
    ILVEV_D2_UB(in6, in14, in7, in15, out1, out0);                        \
                                                                          \
    tmp0_m = (v16u8)__msa_ilvev_b((v16i8)out6, (v16i8)out7);              \
    tmp4_m = (v16u8)__msa_ilvod_b((v16i8)out6, (v16i8)out7);              \
    tmp1_m = (v16u8)__msa_ilvev_b((v16i8)out4, (v16i8)out5);              \
    tmp5_m = (v16u8)__msa_ilvod_b((v16i8)out4, (v16i8)out5);              \
    out5 = (v16u8)__msa_ilvev_b((v16i8)out2, (v16i8)out3);                \
    tmp6_m = (v16u8)__msa_ilvod_b((v16i8)out2, (v16i8)out3);              \
    out7 = (v16u8)__msa_ilvev_b((v16i8)out0, (v16i8)out1);                \
    tmp7_m = (v16u8)__msa_ilvod_b((v16i8)out0, (v16i8)out1);              \
                                                                          \
    ILVEV_H2_UB(tmp0_m, tmp1_m, out5, out7, tmp2_m, tmp3_m);              \
    out0 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m);            \
    out4 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m);            \
                                                                          \
    tmp2_m = (v16u8)__msa_ilvod_h((v8i16)tmp1_m, (v8i16)tmp0_m);          \
    tmp3_m = (v16u8)__msa_ilvod_h((v8i16)out7, (v8i16)out5);              \
    out2 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m);            \
    out6 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m);            \
                                                                          \
    ILVEV_H2_UB(tmp4_m, tmp5_m, tmp6_m, tmp7_m, tmp2_m, tmp3_m);          \
    out1 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m);            \
    out5 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m);            \
                                                                          \
    /* duplicated ilvod_h statements removed: each result was computed    \
       twice with identical operands, wasting two instructions */         \
    tmp2_m = (v16u8)__msa_ilvod_h((v8i16)tmp5_m, (v8i16)tmp4_m);          \
    tmp3_m = (v16u8)__msa_ilvod_h((v8i16)tmp7_m, (v8i16)tmp6_m);          \
    out3 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m);            \
    out7 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m);            \
}
/* Description : Transpose 8x4 block with half word elements in vectors
Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7
Outputs - out0, out1, out2, out3, out4, out5, out6, out7

View File

@ -51,57 +51,61 @@ $vp8_dequant_idct_add_y_block_dspr2=vp8_dequant_idct_add_y_block_dspr2;
# Loopfilter
#
# Each function gets exactly one specialize() call; the stale pre-msa
# duplicates (diff residue) are removed so the msa-enabled lists win.
add_proto qw/void vp8_loop_filter_mbv/, "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi";
specialize qw/vp8_loop_filter_mbv mmx sse2 media neon dspr2 msa/;
$vp8_loop_filter_mbv_media=vp8_loop_filter_mbv_armv6;
$vp8_loop_filter_mbv_dspr2=vp8_loop_filter_mbv_dspr2;

add_proto qw/void vp8_loop_filter_bv/, "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi";
specialize qw/vp8_loop_filter_bv mmx sse2 media neon dspr2 msa/;
$vp8_loop_filter_bv_media=vp8_loop_filter_bv_armv6;
$vp8_loop_filter_bv_dspr2=vp8_loop_filter_bv_dspr2;

add_proto qw/void vp8_loop_filter_mbh/, "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi";
specialize qw/vp8_loop_filter_mbh mmx sse2 media neon dspr2 msa/;
$vp8_loop_filter_mbh_media=vp8_loop_filter_mbh_armv6;
$vp8_loop_filter_mbh_dspr2=vp8_loop_filter_mbh_dspr2;

add_proto qw/void vp8_loop_filter_bh/, "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi";
specialize qw/vp8_loop_filter_bh mmx sse2 media neon dspr2 msa/;
$vp8_loop_filter_bh_media=vp8_loop_filter_bh_armv6;
$vp8_loop_filter_bh_dspr2=vp8_loop_filter_bh_dspr2;

add_proto qw/void vp8_loop_filter_simple_mbv/, "unsigned char *y, int ystride, const unsigned char *blimit";
specialize qw/vp8_loop_filter_simple_mbv mmx sse2 media neon msa/;
$vp8_loop_filter_simple_mbv_c=vp8_loop_filter_simple_vertical_edge_c;
$vp8_loop_filter_simple_mbv_mmx=vp8_loop_filter_simple_vertical_edge_mmx;
$vp8_loop_filter_simple_mbv_sse2=vp8_loop_filter_simple_vertical_edge_sse2;
$vp8_loop_filter_simple_mbv_media=vp8_loop_filter_simple_vertical_edge_armv6;
$vp8_loop_filter_simple_mbv_neon=vp8_loop_filter_mbvs_neon;
$vp8_loop_filter_simple_mbv_msa=vp8_loop_filter_simple_vertical_edge_msa;

add_proto qw/void vp8_loop_filter_simple_mbh/, "unsigned char *y, int ystride, const unsigned char *blimit";
specialize qw/vp8_loop_filter_simple_mbh mmx sse2 media neon msa/;
$vp8_loop_filter_simple_mbh_c=vp8_loop_filter_simple_horizontal_edge_c;
$vp8_loop_filter_simple_mbh_mmx=vp8_loop_filter_simple_horizontal_edge_mmx;
$vp8_loop_filter_simple_mbh_sse2=vp8_loop_filter_simple_horizontal_edge_sse2;
$vp8_loop_filter_simple_mbh_media=vp8_loop_filter_simple_horizontal_edge_armv6;
$vp8_loop_filter_simple_mbh_neon=vp8_loop_filter_mbhs_neon;
$vp8_loop_filter_simple_mbh_msa=vp8_loop_filter_simple_horizontal_edge_msa;

add_proto qw/void vp8_loop_filter_simple_bv/, "unsigned char *y, int ystride, const unsigned char *blimit";
specialize qw/vp8_loop_filter_simple_bv mmx sse2 media neon msa/;
$vp8_loop_filter_simple_bv_c=vp8_loop_filter_bvs_c;
$vp8_loop_filter_simple_bv_mmx=vp8_loop_filter_bvs_mmx;
$vp8_loop_filter_simple_bv_sse2=vp8_loop_filter_bvs_sse2;
$vp8_loop_filter_simple_bv_media=vp8_loop_filter_bvs_armv6;
$vp8_loop_filter_simple_bv_neon=vp8_loop_filter_bvs_neon;
$vp8_loop_filter_simple_bv_msa=vp8_loop_filter_bvs_msa;

add_proto qw/void vp8_loop_filter_simple_bh/, "unsigned char *y, int ystride, const unsigned char *blimit";
specialize qw/vp8_loop_filter_simple_bh mmx sse2 media neon msa/;
$vp8_loop_filter_simple_bh_c=vp8_loop_filter_bhs_c;
$vp8_loop_filter_simple_bh_mmx=vp8_loop_filter_bhs_mmx;
$vp8_loop_filter_simple_bh_sse2=vp8_loop_filter_bhs_sse2;
$vp8_loop_filter_simple_bh_media=vp8_loop_filter_bhs_armv6;
$vp8_loop_filter_simple_bh_neon=vp8_loop_filter_bhs_neon;
$vp8_loop_filter_simple_bh_msa=vp8_loop_filter_bhs_msa;
#
# IDCT

View File

@ -115,6 +115,7 @@ VP8_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/dequantize_dspr2.c
# common (c)
VP8_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/idct_msa.c
VP8_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/loopfilter_filters_msa.c
VP8_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp8_macros_msa.h
# common (c)