Merge "mips msa vp9 temporal filter optimization"
This commit is contained in:
commit
f3a1295cff
@ -1077,7 +1077,7 @@ add_proto qw/int vp9_full_range_search/, "const struct macroblock *x, const stru
|
||||
specialize qw/vp9_full_range_search/;
|
||||
|
||||
add_proto qw/void vp9_temporal_filter_apply/, "uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count";
|
||||
specialize qw/vp9_temporal_filter_apply sse2/;
|
||||
specialize qw/vp9_temporal_filter_apply sse2 msa/;
|
||||
|
||||
if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
|
||||
|
||||
|
289
vp9/encoder/mips/msa/vp9_temporal_filter_msa.c
Normal file
289
vp9/encoder/mips/msa/vp9_temporal_filter_msa.c
Normal file
@ -0,0 +1,289 @@
|
||||
/*
|
||||
* Copyright (c) 2015 The WebM project authors. All Rights Reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#include "./vp9_rtcd.h"
|
||||
#include "vp9/common/mips/msa/vp9_macros_msa.h"
|
||||
|
||||
static void temporal_filter_apply_8size_msa(uint8_t *frm1_ptr,
|
||||
uint32_t stride,
|
||||
uint8_t *frm2_ptr,
|
||||
int32_t filt_sth,
|
||||
int32_t filt_wgt,
|
||||
uint32_t *acc,
|
||||
uint16_t *cnt) {
|
||||
uint32_t row;
|
||||
uint64_t f0, f1, f2, f3;
|
||||
v16i8 frm2, frm1 = { 0 };
|
||||
v16i8 frm4, frm3 = { 0 };
|
||||
v16u8 frm_r, frm_l;
|
||||
v8i16 frm2_r, frm2_l;
|
||||
v8i16 diff0, diff1, mod0_h, mod1_h;
|
||||
v4i32 cnst3, cnst16, filt_wt, strength;
|
||||
v4i32 mod0_w, mod1_w, mod2_w, mod3_w;
|
||||
v4i32 diff0_r, diff0_l, diff1_r, diff1_l;
|
||||
v4i32 frm2_rr, frm2_rl, frm2_lr, frm2_ll;
|
||||
v4i32 acc0, acc1, acc2, acc3;
|
||||
v8i16 cnt0, cnt1;
|
||||
|
||||
filt_wt = __msa_fill_w(filt_wgt);
|
||||
strength = __msa_fill_w(filt_sth);
|
||||
cnst3 = __msa_ldi_w(3);
|
||||
cnst16 = __msa_ldi_w(16);
|
||||
|
||||
for (row = 2; row--;) {
|
||||
LD4(frm1_ptr, stride, f0, f1, f2, f3);
|
||||
frm1_ptr += (4 * stride);
|
||||
|
||||
LD_SB2(frm2_ptr, 16, frm2, frm4);
|
||||
frm2_ptr += 32;
|
||||
|
||||
LD_SW2(acc, 4, acc0, acc1);
|
||||
LD_SW2(acc + 8, 4, acc2, acc3);
|
||||
LD_SH2(cnt, 8, cnt0, cnt1);
|
||||
|
||||
INSERT_D2_SB(f0, f1, frm1);
|
||||
INSERT_D2_SB(f2, f3, frm3);
|
||||
ILVRL_B2_UB(frm1, frm2, frm_r, frm_l);
|
||||
HSUB_UB2_SH(frm_r, frm_l, diff0, diff1);
|
||||
UNPCK_SH_SW(diff0, diff0_r, diff0_l);
|
||||
UNPCK_SH_SW(diff1, diff1_r, diff1_l);
|
||||
MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l,
|
||||
diff1_l, mod0_w, mod1_w, mod2_w, mod3_w);
|
||||
MUL4(mod0_w, cnst3, mod1_w, cnst3, mod2_w, cnst3, mod3_w, cnst3,
|
||||
mod0_w, mod1_w, mod2_w, mod3_w);
|
||||
SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength);
|
||||
|
||||
diff0_r = (mod0_w < cnst16);
|
||||
diff0_l = (mod1_w < cnst16);
|
||||
diff1_r = (mod2_w < cnst16);
|
||||
diff1_l = (mod3_w < cnst16);
|
||||
|
||||
SUB4(cnst16, mod0_w, cnst16, mod1_w, cnst16, mod2_w, cnst16, mod3_w,
|
||||
mod0_w, mod1_w, mod2_w, mod3_w);
|
||||
|
||||
mod0_w = diff0_r & mod0_w;
|
||||
mod1_w = diff0_l & mod1_w;
|
||||
mod2_w = diff1_r & mod2_w;
|
||||
mod3_w = diff1_l & mod3_w;
|
||||
|
||||
MUL4(mod0_w, filt_wt, mod1_w, filt_wt, mod2_w, filt_wt, mod3_w, filt_wt,
|
||||
mod0_w, mod1_w, mod2_w, mod3_w);
|
||||
PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h);
|
||||
ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h);
|
||||
ST_SH2(mod0_h, mod1_h, cnt, 8);
|
||||
cnt += 16;
|
||||
|
||||
UNPCK_UB_SH(frm2, frm2_r, frm2_l);
|
||||
UNPCK_SH_SW(frm2_r, frm2_rr, frm2_rl);
|
||||
UNPCK_SH_SW(frm2_l, frm2_lr, frm2_ll);
|
||||
MUL4(mod0_w, frm2_rr, mod1_w, frm2_rl, mod2_w, frm2_lr, mod3_w, frm2_ll,
|
||||
mod0_w, mod1_w, mod2_w, mod3_w);
|
||||
ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3,
|
||||
mod0_w, mod1_w, mod2_w, mod3_w);
|
||||
|
||||
ST_SW2(mod0_w, mod1_w, acc, 4);
|
||||
acc += 8;
|
||||
ST_SW2(mod2_w, mod3_w, acc, 4);
|
||||
acc += 8;
|
||||
|
||||
LD_SW2(acc, 4, acc0, acc1);
|
||||
LD_SW2(acc + 8, 4, acc2, acc3);
|
||||
LD_SH2(cnt, 8, cnt0, cnt1);
|
||||
|
||||
ILVRL_B2_UB(frm3, frm4, frm_r, frm_l);
|
||||
HSUB_UB2_SH(frm_r, frm_l, diff0, diff1);
|
||||
UNPCK_SH_SW(diff0, diff0_r, diff0_l);
|
||||
UNPCK_SH_SW(diff1, diff1_r, diff1_l);
|
||||
MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l,
|
||||
diff1_l, mod0_w, mod1_w, mod2_w, mod3_w);
|
||||
MUL4(mod0_w, cnst3, mod1_w, cnst3, mod2_w, cnst3, mod3_w, cnst3,
|
||||
mod0_w, mod1_w, mod2_w, mod3_w);
|
||||
SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength);
|
||||
|
||||
diff0_r = (mod0_w < cnst16);
|
||||
diff0_l = (mod1_w < cnst16);
|
||||
diff1_r = (mod2_w < cnst16);
|
||||
diff1_l = (mod3_w < cnst16);
|
||||
|
||||
SUB4(cnst16, mod0_w, cnst16, mod1_w, cnst16, mod2_w, cnst16, mod3_w,
|
||||
mod0_w, mod1_w, mod2_w, mod3_w);
|
||||
|
||||
mod0_w = diff0_r & mod0_w;
|
||||
mod1_w = diff0_l & mod1_w;
|
||||
mod2_w = diff1_r & mod2_w;
|
||||
mod3_w = diff1_l & mod3_w;
|
||||
|
||||
MUL4(mod0_w, filt_wt, mod1_w, filt_wt, mod2_w, filt_wt, mod3_w, filt_wt,
|
||||
mod0_w, mod1_w, mod2_w, mod3_w);
|
||||
PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h);
|
||||
ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h);
|
||||
ST_SH2(mod0_h, mod1_h, cnt, 8);
|
||||
cnt += 16;
|
||||
UNPCK_UB_SH(frm4, frm2_r, frm2_l);
|
||||
UNPCK_SH_SW(frm2_r, frm2_rr, frm2_rl);
|
||||
UNPCK_SH_SW(frm2_l, frm2_lr, frm2_ll);
|
||||
MUL4(mod0_w, frm2_rr, mod1_w, frm2_rl, mod2_w, frm2_lr, mod3_w, frm2_ll,
|
||||
mod0_w, mod1_w, mod2_w, mod3_w);
|
||||
ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3,
|
||||
mod0_w, mod1_w, mod2_w, mod3_w);
|
||||
|
||||
ST_SW2(mod0_w, mod1_w, acc, 4);
|
||||
acc += 8;
|
||||
ST_SW2(mod2_w, mod3_w, acc, 4);
|
||||
acc += 8;
|
||||
}
|
||||
}
|
||||
|
||||
static void temporal_filter_apply_16size_msa(uint8_t *frm1_ptr,
|
||||
uint32_t stride,
|
||||
uint8_t *frm2_ptr,
|
||||
int32_t filt_sth,
|
||||
int32_t filt_wgt,
|
||||
uint32_t *acc,
|
||||
uint16_t *cnt) {
|
||||
uint32_t row;
|
||||
v16i8 frm1, frm2, frm3, frm4;
|
||||
v16u8 frm_r, frm_l;
|
||||
v16i8 zero = { 0 };
|
||||
v8u16 frm2_r, frm2_l;
|
||||
v8i16 diff0, diff1, mod0_h, mod1_h;
|
||||
v4i32 cnst3, cnst16, filt_wt, strength;
|
||||
v4i32 mod0_w, mod1_w, mod2_w, mod3_w;
|
||||
v4i32 diff0_r, diff0_l, diff1_r, diff1_l;
|
||||
v4i32 frm2_rr, frm2_rl, frm2_lr, frm2_ll;
|
||||
v4i32 acc0, acc1, acc2, acc3;
|
||||
v8i16 cnt0, cnt1;
|
||||
|
||||
filt_wt = __msa_fill_w(filt_wgt);
|
||||
strength = __msa_fill_w(filt_sth);
|
||||
cnst3 = __msa_ldi_w(3);
|
||||
cnst16 = __msa_ldi_w(16);
|
||||
|
||||
for (row = 8; row--;) {
|
||||
LD_SB2(frm1_ptr, stride, frm1, frm3);
|
||||
frm1_ptr += stride;
|
||||
|
||||
LD_SB2(frm2_ptr, 16, frm2, frm4);
|
||||
frm2_ptr += 16;
|
||||
|
||||
LD_SW2(acc, 4, acc0, acc1);
|
||||
LD_SW2(acc, 4, acc2, acc3);
|
||||
LD_SH2(cnt, 8, cnt0, cnt1);
|
||||
|
||||
ILVRL_B2_UB(frm1, frm2, frm_r, frm_l);
|
||||
HSUB_UB2_SH(frm_r, frm_l, diff0, diff1);
|
||||
UNPCK_SH_SW(diff0, diff0_r, diff0_l);
|
||||
UNPCK_SH_SW(diff1, diff1_r, diff1_l);
|
||||
MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, diff1_l,
|
||||
mod0_w, mod1_w, mod2_w, mod3_w);
|
||||
MUL4(mod0_w, cnst3, mod1_w, cnst3, mod2_w, cnst3, mod3_w, cnst3,
|
||||
mod0_w, mod1_w, mod2_w, mod3_w);
|
||||
SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength);
|
||||
|
||||
diff0_r = (mod0_w < cnst16);
|
||||
diff0_l = (mod1_w < cnst16);
|
||||
diff1_r = (mod2_w < cnst16);
|
||||
diff1_l = (mod3_w < cnst16);
|
||||
|
||||
SUB4(cnst16, mod0_w, cnst16, mod1_w, cnst16, mod2_w, cnst16, mod3_w,
|
||||
mod0_w, mod1_w, mod2_w, mod3_w);
|
||||
|
||||
mod0_w = diff0_r & mod0_w;
|
||||
mod1_w = diff0_l & mod1_w;
|
||||
mod2_w = diff1_r & mod2_w;
|
||||
mod3_w = diff1_l & mod3_w;
|
||||
|
||||
MUL4(mod0_w, filt_wt, mod1_w, filt_wt, mod2_w, filt_wt, mod3_w, filt_wt,
|
||||
mod0_w, mod1_w, mod2_w, mod3_w);
|
||||
PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h);
|
||||
ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h);
|
||||
ST_SH2(mod0_h, mod1_h, cnt, 8);
|
||||
cnt += 16;
|
||||
|
||||
ILVRL_B2_UH(zero, frm2, frm2_r, frm2_l);
|
||||
UNPCK_SH_SW(frm2_r, frm2_rr, frm2_rl);
|
||||
UNPCK_SH_SW(frm2_l, frm2_lr, frm2_ll);
|
||||
MUL4(mod0_w, frm2_rr, mod1_w, frm2_rl, mod2_w, frm2_lr, mod3_w, frm2_ll,
|
||||
mod0_w, mod1_w, mod2_w, mod3_w);
|
||||
ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3,
|
||||
mod0_w, mod1_w, mod2_w, mod3_w);
|
||||
|
||||
ST_SW2(mod0_w, mod1_w, acc, 4);
|
||||
acc += 8;
|
||||
ST_SW2(mod2_w, mod3_w, acc, 4);
|
||||
acc += 8;
|
||||
|
||||
LD_SW2(acc, 4, acc0, acc1);
|
||||
LD_SW2(acc + 8, 4, acc2, acc3);
|
||||
LD_SH2(cnt, 8, cnt0, cnt1);
|
||||
|
||||
ILVRL_B2_UB(frm3, frm4, frm_r, frm_l);
|
||||
HSUB_UB2_SH(frm_r, frm_l, diff0, diff1);
|
||||
UNPCK_SH_SW(diff0, diff0_r, diff0_l);
|
||||
UNPCK_SH_SW(diff1, diff1_r, diff1_l);
|
||||
MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, diff1_l,
|
||||
mod0_w, mod1_w, mod2_w, mod3_w);
|
||||
MUL4(mod0_w, cnst3, mod1_w, cnst3, mod2_w, cnst3, mod3_w, cnst3,
|
||||
mod0_w, mod1_w, mod2_w, mod3_w);
|
||||
SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength);
|
||||
|
||||
diff0_r = (mod0_w < cnst16);
|
||||
diff0_l = (mod1_w < cnst16);
|
||||
diff1_r = (mod2_w < cnst16);
|
||||
diff1_l = (mod3_w < cnst16);
|
||||
|
||||
SUB4(cnst16, mod0_w, cnst16, mod1_w, cnst16, mod2_w, cnst16, mod3_w,
|
||||
mod0_w, mod1_w, mod2_w, mod3_w);
|
||||
|
||||
mod0_w = diff0_r & mod0_w;
|
||||
mod1_w = diff0_l & mod1_w;
|
||||
mod2_w = diff1_r & mod2_w;
|
||||
mod3_w = diff1_l & mod3_w;
|
||||
|
||||
MUL4(mod0_w, filt_wt, mod1_w, filt_wt, mod2_w, filt_wt, mod3_w, filt_wt,
|
||||
mod0_w, mod1_w, mod2_w, mod3_w);
|
||||
PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h);
|
||||
ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h);
|
||||
ST_SH2(mod0_h, mod1_h, cnt, 8);
|
||||
cnt += 16;
|
||||
|
||||
ILVRL_B2_UH(zero, frm4, frm2_r, frm2_l);
|
||||
UNPCK_SH_SW(frm2_r, frm2_rr, frm2_rl);
|
||||
UNPCK_SH_SW(frm2_l, frm2_lr, frm2_ll);
|
||||
MUL4(mod0_w, frm2_rr, mod1_w, frm2_rl, mod2_w, frm2_lr, mod3_w, frm2_ll,
|
||||
mod0_w, mod1_w, mod2_w, mod3_w);
|
||||
ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3,
|
||||
mod0_w, mod1_w, mod2_w, mod3_w);
|
||||
ST_SW2(mod0_w, mod1_w, acc, 4);
|
||||
acc += 8;
|
||||
ST_SW2(mod2_w, mod3_w, acc, 4);
|
||||
acc += 8;
|
||||
|
||||
frm1_ptr += stride;
|
||||
frm2_ptr += 16;
|
||||
}
|
||||
}
|
||||
|
||||
void vp9_temporal_filter_apply_msa(uint8_t *frame1_ptr, uint32_t stride,
|
||||
uint8_t *frame2_ptr, uint32_t blk_w,
|
||||
uint32_t blk_h, int32_t strength,
|
||||
int32_t filt_wgt, uint32_t *accu,
|
||||
uint16_t *cnt) {
|
||||
if (8 == (blk_w * blk_h)) {
|
||||
temporal_filter_apply_8size_msa(frame1_ptr, stride, frame2_ptr,
|
||||
strength, filt_wgt, accu, cnt);
|
||||
} else if (16 == (blk_w * blk_h)) {
|
||||
temporal_filter_apply_16size_msa(frame1_ptr, stride, frame2_ptr,
|
||||
strength, filt_wgt, accu, cnt);
|
||||
} else {
|
||||
vp9_temporal_filter_apply_c(frame1_ptr, stride, frame2_ptr, blk_w, blk_h,
|
||||
strength, filt_wgt, accu, cnt);
|
||||
}
|
||||
}
|
@ -160,5 +160,6 @@ VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct16x16_msa.c
|
||||
VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct32x32_msa.c
|
||||
VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct_msa.h
|
||||
VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_subtract_msa.c
|
||||
VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_temporal_filter_msa.c
|
||||
|
||||
VP9_CX_SRCS-yes := $(filter-out $(VP9_CX_SRCS_REMOVE-yes),$(VP9_CX_SRCS-yes))
|
||||
|
Loading…
Reference in New Issue
Block a user