diff --git a/test/temporal_filter_test.cc b/test/temporal_filter_test.cc
index 7804d34d8..8615ba45a 100644
--- a/test/temporal_filter_test.cc
+++ b/test/temporal_filter_test.cc
@@ -141,53 +141,60 @@ TEST_P(TemporalFilterTest, SizeCombinations) {
       count_chk.CopyFrom(count_ref);
       reference_filter(a, b, width, height, filter_strength, filter_weight,
                        &accum_ref, &count_ref);
-      filter_func_(a.TopLeftPixel(), a.stride(), b.TopLeftPixel(), width,
-                   height, filter_strength, filter_weight,
-                   accum_chk.TopLeftPixel(), count_chk.TopLeftPixel());
+      ASM_REGISTER_STATE_CHECK(
+          filter_func_(a.TopLeftPixel(), a.stride(), b.TopLeftPixel(), width,
+                       height, filter_strength, filter_weight,
+                       accum_chk.TopLeftPixel(), count_chk.TopLeftPixel()));
       EXPECT_TRUE(accum_chk.CheckValues(accum_ref));
       EXPECT_TRUE(count_chk.CheckValues(count_ref));
       if (HasFailure()) {
         printf("Width: %d Height: %d\n", width, height);
         count_chk.PrintDifference(count_ref);
         accum_chk.PrintDifference(accum_ref);
-        ASSERT_TRUE(false);
+        return;
       }
     }
   }
 }
 
 TEST_P(TemporalFilterTest, CompareReferenceRandom) {
-  const int width = 16;
-  const int height = 16;
-  Buffer<uint8_t> a = Buffer<uint8_t>(width, height, 8);
-  // The second buffer must not have any border.
-  Buffer<uint8_t> b = Buffer<uint8_t>(width, height, 0);
-  Buffer<unsigned int> accum_ref = Buffer<unsigned int>(width, height, 0);
-  Buffer<unsigned int> accum_chk = Buffer<unsigned int>(width, height, 0);
-  Buffer<uint16_t> count_ref = Buffer<uint16_t>(width, height, 0);
-  Buffer<uint16_t> count_chk = Buffer<uint16_t>(width, height, 0);
+  for (int width = 8; width <= 16; width += 8) {
+    for (int height = 8; height <= 16; height += 8) {
+      Buffer<uint8_t> a = Buffer<uint8_t>(width, height, 8);
+      // The second buffer must not have any border.
+      Buffer<uint8_t> b = Buffer<uint8_t>(width, height, 0);
+      Buffer<unsigned int> accum_ref = Buffer<unsigned int>(width, height, 0);
+      Buffer<unsigned int> accum_chk = Buffer<unsigned int>(width, height, 0);
+      Buffer<uint16_t> count_ref = Buffer<uint16_t>(width, height, 0);
+      Buffer<uint16_t> count_chk = Buffer<uint16_t>(width, height, 0);
 
-  a.Set(&rnd_, &ACMRandom::Rand8);
-  b.Set(&rnd_, &ACMRandom::Rand8);
+      for (int filter_strength = 0; filter_strength <= 6; ++filter_strength) {
+        for (int filter_weight = 0; filter_weight <= 2; ++filter_weight) {
+          for (int repeat = 0; repeat < 10; ++repeat) {
+            a.Set(&rnd_, &ACMRandom::Rand8);
+            b.Set(&rnd_, &ACMRandom::Rand8);
 
-  for (int filter_strength = 0; filter_strength <= 6; ++filter_strength) {
-    for (int filter_weight = 0; filter_weight <= 2; ++filter_weight) {
-      accum_ref.Set(rnd_.Rand8());
-      accum_chk.CopyFrom(accum_ref);
-      count_ref.Set(rnd_.Rand8());
-      count_chk.CopyFrom(count_ref);
-      reference_filter(a, b, width, height, filter_strength, filter_weight,
-                       &accum_ref, &count_ref);
-      filter_func_(a.TopLeftPixel(), a.stride(), b.TopLeftPixel(), width,
-                   height, filter_strength, filter_weight,
-                   accum_chk.TopLeftPixel(), count_chk.TopLeftPixel());
-      EXPECT_TRUE(accum_chk.CheckValues(accum_ref));
-      EXPECT_TRUE(count_chk.CheckValues(count_ref));
-      if (HasFailure()) {
-        printf("Weight: %d Strength: %d\n", filter_weight, filter_strength);
-        count_chk.PrintDifference(count_ref);
-        accum_chk.PrintDifference(accum_ref);
-        ASSERT_TRUE(false);
+            accum_ref.Set(rnd_.Rand8());
+            accum_chk.CopyFrom(accum_ref);
+            count_ref.Set(rnd_.Rand8());
+            count_chk.CopyFrom(count_ref);
+            reference_filter(a, b, width, height, filter_strength,
+                             filter_weight, &accum_ref, &count_ref);
+            ASM_REGISTER_STATE_CHECK(filter_func_(
+                a.TopLeftPixel(), a.stride(), b.TopLeftPixel(), width, height,
+                filter_strength, filter_weight, accum_chk.TopLeftPixel(),
+                count_chk.TopLeftPixel()));
+            EXPECT_TRUE(accum_chk.CheckValues(accum_ref));
+            EXPECT_TRUE(count_chk.CheckValues(count_ref));
+            if (HasFailure()) {
printf("Weight: %d Strength: %d\n", filter_weight, + filter_strength); + count_chk.PrintDifference(count_ref); + accum_chk.PrintDifference(accum_ref); + return; + } + } + } } } } @@ -222,9 +229,8 @@ TEST_P(TemporalFilterTest, DISABLED_Speed) { accum_chk.TopLeftPixel(), count_chk.TopLeftPixel()); } vpx_usec_timer_mark(&timer); - const int elapsed_time = - static_cast(vpx_usec_timer_elapsed(&timer) / 1000); - printf("Temporal filter %dx%d time: %5d ms\n", width, height, + const int elapsed_time = static_cast(vpx_usec_timer_elapsed(&timer)); + printf("Temporal filter %dx%d time: %5d us\n", width, height, elapsed_time); } } @@ -233,10 +239,8 @@ TEST_P(TemporalFilterTest, DISABLED_Speed) { INSTANTIATE_TEST_CASE_P(C, TemporalFilterTest, ::testing::Values(&vp9_temporal_filter_apply_c)); -/* TODO(johannkoenig): https://bugs.chromium.org/p/webm/issues/detail?id=1378 #if HAVE_SSE4_1 INSTANTIATE_TEST_CASE_P(SSE4_1, TemporalFilterTest, ::testing::Values(&vp9_temporal_filter_apply_sse4_1)); #endif // HAVE_SSE4_1 -*/ } // namespace diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl index bd607228a..47983aeef 100644 --- a/vp9/common/vp9_rtcd_defs.pl +++ b/vp9/common/vp9_rtcd_defs.pl @@ -198,7 +198,7 @@ add_proto qw/int vp9_diamond_search_sad/, "const struct macroblock *x, const str specialize qw/vp9_diamond_search_sad avx/; add_proto qw/void vp9_temporal_filter_apply/, "const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count"; -specialize qw/vp9_temporal_filter_apply sse2 msa/; +specialize qw/vp9_temporal_filter_apply sse4_1/; if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { diff --git a/vp9/encoder/mips/msa/vp9_temporal_filter_msa.c b/vp9/encoder/mips/msa/vp9_temporal_filter_msa.c deleted file mode 100644 index 1ab5f36cc..000000000 --- a/vp9/encoder/mips/msa/vp9_temporal_filter_msa.c +++ /dev/null @@ -1,285 +0,0 @@ -/* - * Copyright (c) 2015 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ - -#include "./vp9_rtcd.h" -#include "vpx_dsp/mips/macros_msa.h" - -static void temporal_filter_apply_8size_msa(const uint8_t *frm1_ptr, - uint32_t stride, - const uint8_t *frm2_ptr, - int32_t filt_sth, int32_t filt_wgt, - uint32_t *acc, uint16_t *cnt) { - uint32_t row; - uint64_t f0, f1, f2, f3; - v16i8 frm2, frm1 = { 0 }; - v16i8 frm4, frm3 = { 0 }; - v16u8 frm_r, frm_l; - v8i16 frm2_r, frm2_l; - v8i16 diff0, diff1, mod0_h, mod1_h; - v4i32 cnst3, cnst16, filt_wt, strength; - v4i32 mod0_w, mod1_w, mod2_w, mod3_w; - v4i32 diff0_r, diff0_l, diff1_r, diff1_l; - v4i32 frm2_rr, frm2_rl, frm2_lr, frm2_ll; - v4i32 acc0, acc1, acc2, acc3; - v8i16 cnt0, cnt1; - - filt_wt = __msa_fill_w(filt_wgt); - strength = __msa_fill_w(filt_sth); - cnst3 = __msa_ldi_w(3); - cnst16 = __msa_ldi_w(16); - - for (row = 2; row--;) { - LD4(frm1_ptr, stride, f0, f1, f2, f3); - frm1_ptr += (4 * stride); - - LD_SB2(frm2_ptr, 16, frm2, frm4); - frm2_ptr += 32; - - LD_SW2(acc, 4, acc0, acc1); - LD_SW2(acc + 8, 4, acc2, acc3); - LD_SH2(cnt, 8, cnt0, cnt1); - - INSERT_D2_SB(f0, f1, frm1); - INSERT_D2_SB(f2, f3, frm3); - ILVRL_B2_UB(frm1, frm2, frm_r, frm_l); - HSUB_UB2_SH(frm_r, frm_l, diff0, diff1); - UNPCK_SH_SW(diff0, diff0_r, diff0_l); - UNPCK_SH_SW(diff1, diff1_r, diff1_l); - MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, diff1_l, - mod0_w, mod1_w, mod2_w, mod3_w); - MUL4(mod0_w, cnst3, mod1_w, cnst3, mod2_w, cnst3, mod3_w, cnst3, mod0_w, - mod1_w, mod2_w, mod3_w); - SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength); - - diff0_r = (mod0_w < cnst16); - diff0_l = (mod1_w < cnst16); - diff1_r = (mod2_w < cnst16); - diff1_l = (mod3_w < cnst16); - - SUB4(cnst16, mod0_w, cnst16, mod1_w, cnst16, mod2_w, cnst16, mod3_w, mod0_w, - mod1_w, mod2_w, mod3_w); - - mod0_w = diff0_r & mod0_w; - mod1_w = diff0_l & mod1_w; - mod2_w = diff1_r & mod2_w; - mod3_w = diff1_l & mod3_w; - - MUL4(mod0_w, filt_wt, mod1_w, filt_wt, mod2_w, filt_wt, mod3_w, filt_wt, - mod0_w, mod1_w, mod2_w, mod3_w); - PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h); - ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h); - ST_SH2(mod0_h, mod1_h, cnt, 8); - cnt += 16; - - UNPCK_UB_SH(frm2, frm2_r, frm2_l); - UNPCK_SH_SW(frm2_r, frm2_rr, frm2_rl); - UNPCK_SH_SW(frm2_l, frm2_lr, frm2_ll); - MUL4(mod0_w, frm2_rr, mod1_w, frm2_rl, mod2_w, frm2_lr, mod3_w, frm2_ll, - mod0_w, mod1_w, mod2_w, mod3_w); - ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3, mod0_w, mod1_w, - mod2_w, mod3_w); - - ST_SW2(mod0_w, mod1_w, acc, 4); - acc += 8; - ST_SW2(mod2_w, mod3_w, acc, 4); - acc += 8; - - LD_SW2(acc, 4, acc0, acc1); - LD_SW2(acc + 8, 4, acc2, acc3); - LD_SH2(cnt, 8, cnt0, cnt1); - - ILVRL_B2_UB(frm3, frm4, frm_r, frm_l); - HSUB_UB2_SH(frm_r, frm_l, diff0, diff1); - UNPCK_SH_SW(diff0, diff0_r, diff0_l); - UNPCK_SH_SW(diff1, diff1_r, diff1_l); - MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, diff1_l, - mod0_w, mod1_w, mod2_w, mod3_w); - MUL4(mod0_w, cnst3, mod1_w, cnst3, mod2_w, cnst3, mod3_w, cnst3, mod0_w, - mod1_w, mod2_w, mod3_w); - SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength); - - diff0_r = (mod0_w < cnst16); - diff0_l = (mod1_w < cnst16); - diff1_r = (mod2_w < cnst16); - diff1_l = (mod3_w < cnst16); - - SUB4(cnst16, mod0_w, cnst16, mod1_w, cnst16, mod2_w, cnst16, mod3_w, mod0_w, - mod1_w, mod2_w, mod3_w); - - mod0_w = diff0_r & mod0_w; - mod1_w = diff0_l & mod1_w; - mod2_w = diff1_r & mod2_w; - mod3_w = diff1_l & mod3_w; - - MUL4(mod0_w, filt_wt, mod1_w, filt_wt, mod2_w, filt_wt, mod3_w, filt_wt, - 
mod0_w, mod1_w, mod2_w, mod3_w); - PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h); - ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h); - ST_SH2(mod0_h, mod1_h, cnt, 8); - cnt += 16; - UNPCK_UB_SH(frm4, frm2_r, frm2_l); - UNPCK_SH_SW(frm2_r, frm2_rr, frm2_rl); - UNPCK_SH_SW(frm2_l, frm2_lr, frm2_ll); - MUL4(mod0_w, frm2_rr, mod1_w, frm2_rl, mod2_w, frm2_lr, mod3_w, frm2_ll, - mod0_w, mod1_w, mod2_w, mod3_w); - ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3, mod0_w, mod1_w, - mod2_w, mod3_w); - - ST_SW2(mod0_w, mod1_w, acc, 4); - acc += 8; - ST_SW2(mod2_w, mod3_w, acc, 4); - acc += 8; - } -} - -static void temporal_filter_apply_16size_msa(const uint8_t *frm1_ptr, - uint32_t stride, - const uint8_t *frm2_ptr, - int32_t filt_sth, int32_t filt_wgt, - uint32_t *acc, uint16_t *cnt) { - uint32_t row; - v16i8 frm1, frm2, frm3, frm4; - v16u8 frm_r, frm_l; - v16i8 zero = { 0 }; - v8u16 frm2_r, frm2_l; - v8i16 diff0, diff1, mod0_h, mod1_h; - v4i32 cnst3, cnst16, filt_wt, strength; - v4i32 mod0_w, mod1_w, mod2_w, mod3_w; - v4i32 diff0_r, diff0_l, diff1_r, diff1_l; - v4i32 frm2_rr, frm2_rl, frm2_lr, frm2_ll; - v4i32 acc0, acc1, acc2, acc3; - v8i16 cnt0, cnt1; - - filt_wt = __msa_fill_w(filt_wgt); - strength = __msa_fill_w(filt_sth); - cnst3 = __msa_ldi_w(3); - cnst16 = __msa_ldi_w(16); - - for (row = 8; row--;) { - LD_SB2(frm1_ptr, stride, frm1, frm3); - frm1_ptr += stride; - - LD_SB2(frm2_ptr, 16, frm2, frm4); - frm2_ptr += 16; - - LD_SW2(acc, 4, acc0, acc1); - LD_SW2(acc, 4, acc2, acc3); - LD_SH2(cnt, 8, cnt0, cnt1); - - ILVRL_B2_UB(frm1, frm2, frm_r, frm_l); - HSUB_UB2_SH(frm_r, frm_l, diff0, diff1); - UNPCK_SH_SW(diff0, diff0_r, diff0_l); - UNPCK_SH_SW(diff1, diff1_r, diff1_l); - MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, diff1_l, - mod0_w, mod1_w, mod2_w, mod3_w); - MUL4(mod0_w, cnst3, mod1_w, cnst3, mod2_w, cnst3, mod3_w, cnst3, mod0_w, - mod1_w, mod2_w, mod3_w); - SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength); - - diff0_r = (mod0_w < cnst16); - diff0_l = (mod1_w < cnst16); - diff1_r = (mod2_w < cnst16); - diff1_l = (mod3_w < cnst16); - - SUB4(cnst16, mod0_w, cnst16, mod1_w, cnst16, mod2_w, cnst16, mod3_w, mod0_w, - mod1_w, mod2_w, mod3_w); - - mod0_w = diff0_r & mod0_w; - mod1_w = diff0_l & mod1_w; - mod2_w = diff1_r & mod2_w; - mod3_w = diff1_l & mod3_w; - - MUL4(mod0_w, filt_wt, mod1_w, filt_wt, mod2_w, filt_wt, mod3_w, filt_wt, - mod0_w, mod1_w, mod2_w, mod3_w); - PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h); - ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h); - ST_SH2(mod0_h, mod1_h, cnt, 8); - cnt += 16; - - ILVRL_B2_UH(zero, frm2, frm2_r, frm2_l); - UNPCK_SH_SW(frm2_r, frm2_rr, frm2_rl); - UNPCK_SH_SW(frm2_l, frm2_lr, frm2_ll); - MUL4(mod0_w, frm2_rr, mod1_w, frm2_rl, mod2_w, frm2_lr, mod3_w, frm2_ll, - mod0_w, mod1_w, mod2_w, mod3_w); - ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3, mod0_w, mod1_w, - mod2_w, mod3_w); - - ST_SW2(mod0_w, mod1_w, acc, 4); - acc += 8; - ST_SW2(mod2_w, mod3_w, acc, 4); - acc += 8; - - LD_SW2(acc, 4, acc0, acc1); - LD_SW2(acc + 8, 4, acc2, acc3); - LD_SH2(cnt, 8, cnt0, cnt1); - - ILVRL_B2_UB(frm3, frm4, frm_r, frm_l); - HSUB_UB2_SH(frm_r, frm_l, diff0, diff1); - UNPCK_SH_SW(diff0, diff0_r, diff0_l); - UNPCK_SH_SW(diff1, diff1_r, diff1_l); - MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, diff1_l, - mod0_w, mod1_w, mod2_w, mod3_w); - MUL4(mod0_w, cnst3, mod1_w, cnst3, mod2_w, cnst3, mod3_w, cnst3, mod0_w, - mod1_w, mod2_w, mod3_w); - SRAR_W4_SW(mod0_w, 
mod1_w, mod2_w, mod3_w, strength);
-
-    diff0_r = (mod0_w < cnst16);
-    diff0_l = (mod1_w < cnst16);
-    diff1_r = (mod2_w < cnst16);
-    diff1_l = (mod3_w < cnst16);
-
-    SUB4(cnst16, mod0_w, cnst16, mod1_w, cnst16, mod2_w, cnst16, mod3_w, mod0_w,
-         mod1_w, mod2_w, mod3_w);
-
-    mod0_w = diff0_r & mod0_w;
-    mod1_w = diff0_l & mod1_w;
-    mod2_w = diff1_r & mod2_w;
-    mod3_w = diff1_l & mod3_w;
-
-    MUL4(mod0_w, filt_wt, mod1_w, filt_wt, mod2_w, filt_wt, mod3_w, filt_wt,
-         mod0_w, mod1_w, mod2_w, mod3_w);
-    PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h);
-    ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h);
-    ST_SH2(mod0_h, mod1_h, cnt, 8);
-    cnt += 16;
-
-    ILVRL_B2_UH(zero, frm4, frm2_r, frm2_l);
-    UNPCK_SH_SW(frm2_r, frm2_rr, frm2_rl);
-    UNPCK_SH_SW(frm2_l, frm2_lr, frm2_ll);
-    MUL4(mod0_w, frm2_rr, mod1_w, frm2_rl, mod2_w, frm2_lr, mod3_w, frm2_ll,
-         mod0_w, mod1_w, mod2_w, mod3_w);
-    ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3, mod0_w, mod1_w,
-         mod2_w, mod3_w);
-    ST_SW2(mod0_w, mod1_w, acc, 4);
-    acc += 8;
-    ST_SW2(mod2_w, mod3_w, acc, 4);
-    acc += 8;
-
-    frm1_ptr += stride;
-    frm2_ptr += 16;
-  }
-}
-
-void vp9_temporal_filter_apply_msa(const uint8_t *frame1_ptr, uint32_t stride,
-                                   const uint8_t *frame2_ptr, uint32_t blk_w,
-                                   uint32_t blk_h, int32_t strength,
-                                   int32_t filt_wgt, uint32_t *accu,
-                                   uint16_t *cnt) {
-  if (8 == (blk_w * blk_h)) {
-    temporal_filter_apply_8size_msa(frame1_ptr, stride, frame2_ptr, strength,
-                                    filt_wgt, accu, cnt);
-  } else if (16 == (blk_w * blk_h)) {
-    temporal_filter_apply_16size_msa(frame1_ptr, stride, frame2_ptr, strength,
-                                     filt_wgt, accu, cnt);
-  } else {
-    vp9_temporal_filter_apply_c(frame1_ptr, stride, frame2_ptr, blk_w, blk_h,
-                                strength, filt_wgt, accu, cnt);
-  }
-}
diff --git a/vp9/encoder/vp9_temporal_filter.c b/vp9/encoder/vp9_temporal_filter.c
index d3c43faa8..6ca5be090 100644
--- a/vp9/encoder/vp9_temporal_filter.c
+++ b/vp9/encoder/vp9_temporal_filter.c
@@ -8,6 +8,7 @@
  * be found in the AUTHORS file in the root of the source tree.
  */
 
+#include <assert.h>
 #include <math.h>
 #include <limits.h>
 
@@ -100,6 +101,12 @@ void vp9_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride,
   int byte = 0;
   const int rounding = strength > 0 ?
1 << (strength - 1) : 0; + assert(strength >= 0); + assert(strength <= 6); + + assert(filter_weight >= 0); + assert(filter_weight <= 2); + for (i = 0, k = 0; i < block_height; i++) { for (j = 0; j < block_width; j++, k++) { int pixel_value = *frame2; @@ -376,45 +383,44 @@ void vp9_temporal_filter_iterate_row_c(VP9_COMP *cpi, ThreadData *td, if (mbd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { int adj_strength = strength + 2 * (mbd->bd - 8); // Apply the filter (YUV) - vp9_highbd_temporal_filter_apply_c( + vp9_highbd_temporal_filter_apply( f->y_buffer + mb_y_offset, f->y_stride, predictor, 16, 16, adj_strength, filter_weight, accumulator, count); - vp9_highbd_temporal_filter_apply_c( + vp9_highbd_temporal_filter_apply( f->u_buffer + mb_uv_offset, f->uv_stride, predictor + 256, mb_uv_width, mb_uv_height, adj_strength, filter_weight, accumulator + 256, count + 256); - vp9_highbd_temporal_filter_apply_c( + vp9_highbd_temporal_filter_apply( f->v_buffer + mb_uv_offset, f->uv_stride, predictor + 512, mb_uv_width, mb_uv_height, adj_strength, filter_weight, accumulator + 512, count + 512); } else { // Apply the filter (YUV) - vp9_temporal_filter_apply_c(f->y_buffer + mb_y_offset, f->y_stride, - predictor, 16, 16, strength, - filter_weight, accumulator, count); - vp9_temporal_filter_apply_c(f->u_buffer + mb_uv_offset, f->uv_stride, - predictor + 256, mb_uv_width, - mb_uv_height, strength, filter_weight, - accumulator + 256, count + 256); - vp9_temporal_filter_apply_c(f->v_buffer + mb_uv_offset, f->uv_stride, - predictor + 512, mb_uv_width, - mb_uv_height, strength, filter_weight, - accumulator + 512, count + 512); - } -#else - // Apply the filter (YUV) - // TODO(jingning): Need SIMD optimization for this. - vp9_temporal_filter_apply_c(f->y_buffer + mb_y_offset, f->y_stride, + vp9_temporal_filter_apply(f->y_buffer + mb_y_offset, f->y_stride, predictor, 16, 16, strength, filter_weight, accumulator, count); - vp9_temporal_filter_apply_c(f->u_buffer + mb_uv_offset, f->uv_stride, + vp9_temporal_filter_apply(f->u_buffer + mb_uv_offset, f->uv_stride, predictor + 256, mb_uv_width, mb_uv_height, strength, filter_weight, accumulator + 256, count + 256); - vp9_temporal_filter_apply_c(f->v_buffer + mb_uv_offset, f->uv_stride, + vp9_temporal_filter_apply(f->v_buffer + mb_uv_offset, f->uv_stride, predictor + 512, mb_uv_width, mb_uv_height, strength, filter_weight, accumulator + 512, count + 512); + } +#else + // Apply the filter (YUV) + vp9_temporal_filter_apply(f->y_buffer + mb_y_offset, f->y_stride, + predictor, 16, 16, strength, filter_weight, + accumulator, count); + vp9_temporal_filter_apply(f->u_buffer + mb_uv_offset, f->uv_stride, + predictor + 256, mb_uv_width, mb_uv_height, + strength, filter_weight, accumulator + 256, + count + 256); + vp9_temporal_filter_apply(f->v_buffer + mb_uv_offset, f->uv_stride, + predictor + 512, mb_uv_width, mb_uv_height, + strength, filter_weight, accumulator + 512, + count + 512); #endif // CONFIG_VP9_HIGHBITDEPTH } } diff --git a/vp9/encoder/x86/temporal_filter_sse4.c b/vp9/encoder/x86/temporal_filter_sse4.c new file mode 100644 index 000000000..0419aa170 --- /dev/null +++ b/vp9/encoder/x86/temporal_filter_sse4.c @@ -0,0 +1,378 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. 
All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <smmintrin.h>
+
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+
+// Division using multiplication and shifting. The C implementation does:
+// modifier *= 3;
+// modifier /= index;
+// where 'modifier' is a set of summed values and 'index' is the number of
+// summed values. 'index' may be 4, 6, or 9, representing a block of 9 values
+// which may be bound by the edges of the block being filtered.
+//
+// This equation works out to (m * 3) / i which reduces to:
+// m * 3/4
+// m * 1/2
+// m * 1/3
+//
+// By pairing the multiply with a down shift by 16 (_mm_mulhi_epu16):
+// m * C / 65536
+// we can create a C to replicate the division.
+//
+// m * 49152 / 65536 = m * 3/4
+// m * 32768 / 65536 = m * 1/2
+// m * 21846 / 65536 = m * 0.3333
+//
+// These are loaded using an instruction expecting int16_t values but are used
+// with _mm_mulhi_epu16(), which treats them as unsigned.
+#define NEIGHBOR_CONSTANT_4 (int16_t)49152
+#define NEIGHBOR_CONSTANT_6 (int16_t)32768
+#define NEIGHBOR_CONSTANT_9 (int16_t)21846
+
+// Load values from 'a' and 'b'. Compute the difference squared and sum
+// neighboring values such that:
+// sum[1] = (a[0]-b[0])^2 + (a[1]-b[1])^2 + (a[2]-b[2])^2
+// Values to the left and right of the row are set to 0.
+// The values are returned in sum_0 and sum_1 as *unsigned* 16 bit values.
+static void sum_8(const uint8_t *a, const uint8_t *b, __m128i *sum) {
+  const __m128i a_u8 = _mm_loadl_epi64((const __m128i *)a);
+  const __m128i b_u8 = _mm_loadl_epi64((const __m128i *)b);
+
+  const __m128i a_u16 = _mm_cvtepu8_epi16(a_u8);
+  const __m128i b_u16 = _mm_cvtepu8_epi16(b_u8);
+
+  const __m128i diff_s16 = _mm_sub_epi16(a_u16, b_u16);
+  const __m128i diff_sq_u16 = _mm_mullo_epi16(diff_s16, diff_s16);
+
+  // Shift all the values one place to the left/right so we can efficiently sum
+  // diff_sq_u16[i - 1] + diff_sq_u16[i] + diff_sq_u16[i + 1].
+  const __m128i shift_left = _mm_slli_si128(diff_sq_u16, 2);
+  const __m128i shift_right = _mm_srli_si128(diff_sq_u16, 2);
+
+  // It becomes necessary to treat the values as unsigned at this point. The
+  // 255^2 fits in uint16_t but not int16_t. Use saturating adds from this
+  // point forward since the filter is only applied to smooth small pixel
+  // changes. Once the value has saturated to uint16_t it is well outside the
+  // useful range.
+  __m128i sum_u16 = _mm_adds_epu16(diff_sq_u16, shift_left);
+  sum_u16 = _mm_adds_epu16(sum_u16, shift_right);
+
+  *sum = sum_u16;
+}
+
+static void sum_16(const uint8_t *a, const uint8_t *b, __m128i *sum_0,
+                   __m128i *sum_1) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i a_u8 = _mm_loadu_si128((const __m128i *)a);
+  const __m128i b_u8 = _mm_loadu_si128((const __m128i *)b);
+
+  const __m128i a_0_u16 = _mm_cvtepu8_epi16(a_u8);
+  const __m128i a_1_u16 = _mm_unpackhi_epi8(a_u8, zero);
+  const __m128i b_0_u16 = _mm_cvtepu8_epi16(b_u8);
+  const __m128i b_1_u16 = _mm_unpackhi_epi8(b_u8, zero);
+
+  const __m128i diff_0_s16 = _mm_sub_epi16(a_0_u16, b_0_u16);
+  const __m128i diff_1_s16 = _mm_sub_epi16(a_1_u16, b_1_u16);
+  const __m128i diff_sq_0_u16 = _mm_mullo_epi16(diff_0_s16, diff_0_s16);
+  const __m128i diff_sq_1_u16 = _mm_mullo_epi16(diff_1_s16, diff_1_s16);
+
+  __m128i shift_left = _mm_slli_si128(diff_sq_0_u16, 2);
+  // Use _mm_alignr_epi8() to "shift in" diff_sq_u16[8].
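+  // For illustration, with hypothetical lane values diff_sq_0_u16 =
+  // {s0..s7} and diff_sq_1_u16 = {s8..s15}: _mm_alignr_epi8(diff_sq_1_u16,
+  // diff_sq_0_u16, 2) concatenates the pair, drops the low two bytes, and
+  // yields {s1..s8}, so lane i holds diff_sq_u16[i + 1] for the row sum.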
+ __m128i shift_right = _mm_alignr_epi8(diff_sq_1_u16, diff_sq_0_u16, 2); + + __m128i sum_u16 = _mm_adds_epu16(diff_sq_0_u16, shift_left); + sum_u16 = _mm_adds_epu16(sum_u16, shift_right); + + *sum_0 = sum_u16; + + shift_left = _mm_alignr_epi8(diff_sq_1_u16, diff_sq_0_u16, 14); + shift_right = _mm_srli_si128(diff_sq_1_u16, 2); + + sum_u16 = _mm_adds_epu16(diff_sq_1_u16, shift_left); + sum_u16 = _mm_adds_epu16(sum_u16, shift_right); + + *sum_1 = sum_u16; +} + +// Average the value based on the number of values summed (9 for pixels away +// from the border, 4 for pixels in corners, and 6 for other edge values). +// +// Add in the rounding factor and shift, clamp to 16, invert and shift. Multiply +// by weight. +static __m128i average_8(__m128i sum, const __m128i mul_constants, + const int strength, const int rounding, + const int weight) { + // _mm_srl_epi16 uses the lower 64 bit value for the shift. + const __m128i strength_u128 = _mm_set_epi32(0, 0, 0, strength); + const __m128i rounding_u16 = _mm_set1_epi16(rounding); + const __m128i weight_u16 = _mm_set1_epi16(weight); + const __m128i sixteen = _mm_set1_epi16(16); + + // modifier * 3 / index; + sum = _mm_mulhi_epu16(sum, mul_constants); + + sum = _mm_adds_epu16(sum, rounding_u16); + sum = _mm_srl_epi16(sum, strength_u128); + + // The maximum input to this comparison is UINT16_MAX * NEIGHBOR_CONSTANT_4 + // >> 16 (also NEIGHBOR_CONSTANT_4 -1) which is 49151 / 0xbfff / -16385 + // So this needs to use the epu16 version which did not come until SSE4. + sum = _mm_min_epu16(sum, sixteen); + + sum = _mm_sub_epi16(sixteen, sum); + + return _mm_mullo_epi16(sum, weight_u16); +} + +static void average_16(__m128i *sum_0_u16, __m128i *sum_1_u16, + const __m128i mul_constants_0, + const __m128i mul_constants_1, const int strength, + const int rounding, const int weight) { + const __m128i strength_u128 = _mm_set_epi32(0, 0, 0, strength); + const __m128i rounding_u16 = _mm_set1_epi16(rounding); + const __m128i weight_u16 = _mm_set1_epi16(weight); + const __m128i sixteen = _mm_set1_epi16(16); + __m128i input_0, input_1; + + input_0 = _mm_mulhi_epu16(*sum_0_u16, mul_constants_0); + input_0 = _mm_adds_epu16(input_0, rounding_u16); + + input_1 = _mm_mulhi_epu16(*sum_1_u16, mul_constants_1); + input_1 = _mm_adds_epu16(input_1, rounding_u16); + + input_0 = _mm_srl_epi16(input_0, strength_u128); + input_1 = _mm_srl_epi16(input_1, strength_u128); + + input_0 = _mm_min_epu16(input_0, sixteen); + input_1 = _mm_min_epu16(input_1, sixteen); + input_0 = _mm_sub_epi16(sixteen, input_0); + input_1 = _mm_sub_epi16(sixteen, input_1); + + *sum_0_u16 = _mm_mullo_epi16(input_0, weight_u16); + *sum_1_u16 = _mm_mullo_epi16(input_1, weight_u16); +} + +// Add 'sum_u16' to 'count'. Multiply by 'pred' and add to 'accumulator.' 
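+// Per output lane i this amounts to (a sketch of the intended arithmetic):
+//   count[i]       += sum_u16[i];            // saturating 16-bit add
+//   accumulator[i] += sum_u16[i] * pred[i];  // 16-bit products, zero-extended
+//                                            // to 32 bits before the add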
+static void accumulate_and_store_8(const __m128i sum_u16, const uint8_t *pred, + uint16_t *count, unsigned int *accumulator) { + const __m128i pred_u8 = _mm_loadl_epi64((const __m128i *)pred); + const __m128i zero = _mm_setzero_si128(); + __m128i count_u16 = _mm_loadu_si128((const __m128i *)count); + __m128i pred_u16 = _mm_cvtepu8_epi16(pred_u8); + __m128i pred_0_u32, pred_1_u32; + __m128i accum_0_u32, accum_1_u32; + + count_u16 = _mm_adds_epu16(count_u16, sum_u16); + _mm_storeu_si128((__m128i *)count, count_u16); + + pred_u16 = _mm_mullo_epi16(sum_u16, pred_u16); + + pred_0_u32 = _mm_cvtepu16_epi32(pred_u16); + pred_1_u32 = _mm_unpackhi_epi16(pred_u16, zero); + + accum_0_u32 = _mm_loadu_si128((const __m128i *)accumulator); + accum_1_u32 = _mm_loadu_si128((const __m128i *)(accumulator + 4)); + + accum_0_u32 = _mm_add_epi32(pred_0_u32, accum_0_u32); + accum_1_u32 = _mm_add_epi32(pred_1_u32, accum_1_u32); + + _mm_storeu_si128((__m128i *)accumulator, accum_0_u32); + _mm_storeu_si128((__m128i *)(accumulator + 4), accum_1_u32); +} + +static void accumulate_and_store_16(const __m128i sum_0_u16, + const __m128i sum_1_u16, + const uint8_t *pred, uint16_t *count, + unsigned int *accumulator) { + const __m128i pred_u8 = _mm_loadu_si128((const __m128i *)pred); + const __m128i zero = _mm_setzero_si128(); + __m128i count_0_u16 = _mm_loadu_si128((const __m128i *)count), + count_1_u16 = _mm_loadu_si128((const __m128i *)(count + 8)); + __m128i pred_0_u16 = _mm_cvtepu8_epi16(pred_u8), + pred_1_u16 = _mm_unpackhi_epi8(pred_u8, zero); + __m128i pred_0_u32, pred_1_u32, pred_2_u32, pred_3_u32; + __m128i accum_0_u32, accum_1_u32, accum_2_u32, accum_3_u32; + + count_0_u16 = _mm_adds_epu16(count_0_u16, sum_0_u16); + _mm_storeu_si128((__m128i *)count, count_0_u16); + + count_1_u16 = _mm_adds_epu16(count_1_u16, sum_1_u16); + _mm_storeu_si128((__m128i *)(count + 8), count_1_u16); + + pred_0_u16 = _mm_mullo_epi16(sum_0_u16, pred_0_u16); + pred_1_u16 = _mm_mullo_epi16(sum_1_u16, pred_1_u16); + + pred_0_u32 = _mm_cvtepu16_epi32(pred_0_u16); + pred_1_u32 = _mm_unpackhi_epi16(pred_0_u16, zero); + pred_2_u32 = _mm_cvtepu16_epi32(pred_1_u16); + pred_3_u32 = _mm_unpackhi_epi16(pred_1_u16, zero); + + accum_0_u32 = _mm_loadu_si128((const __m128i *)accumulator); + accum_1_u32 = _mm_loadu_si128((const __m128i *)(accumulator + 4)); + accum_2_u32 = _mm_loadu_si128((const __m128i *)(accumulator + 8)); + accum_3_u32 = _mm_loadu_si128((const __m128i *)(accumulator + 12)); + + accum_0_u32 = _mm_add_epi32(pred_0_u32, accum_0_u32); + accum_1_u32 = _mm_add_epi32(pred_1_u32, accum_1_u32); + accum_2_u32 = _mm_add_epi32(pred_2_u32, accum_2_u32); + accum_3_u32 = _mm_add_epi32(pred_3_u32, accum_3_u32); + + _mm_storeu_si128((__m128i *)accumulator, accum_0_u32); + _mm_storeu_si128((__m128i *)(accumulator + 4), accum_1_u32); + _mm_storeu_si128((__m128i *)(accumulator + 8), accum_2_u32); + _mm_storeu_si128((__m128i *)(accumulator + 12), accum_3_u32); +} + +void vp9_temporal_filter_apply_sse4_1(const uint8_t *a, unsigned int stride, + const uint8_t *b, unsigned int width, + unsigned int height, int strength, + int weight, unsigned int *accumulator, + uint16_t *count) { + unsigned int h; + const int rounding = strength > 0 ? 1 << (strength - 1) : 0; + + assert(strength >= 0); + assert(strength <= 6); + + assert(weight >= 0); + assert(weight <= 2); + + assert(width == 8 || width == 16); + + // TODO(johannkoenig) Use uint32_t for accumulator. 
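+  // Until that TODO is resolved, the assert below records the assumption
+  // that 'unsigned int' is 32 bits wide on every target this file builds on.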
+ assert(sizeof(*accumulator) == sizeof(uint32_t)); + + if (width == 8) { + __m128i sum_row_a, sum_row_b, sum_row_c; + __m128i mul_constants = _mm_setr_epi16( + NEIGHBOR_CONSTANT_4, NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, + NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, + NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_4); + + sum_8(a, b, &sum_row_a); + sum_8(a + stride, b + width, &sum_row_b); + sum_row_c = _mm_adds_epu16(sum_row_a, sum_row_b); + sum_row_c = average_8(sum_row_c, mul_constants, strength, rounding, weight); + accumulate_and_store_8(sum_row_c, b, count, accumulator); + + a += stride + stride; + b += width; + count += width; + accumulator += width; + + mul_constants = _mm_setr_epi16(NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_9, + NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9, + NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9, + NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_6); + + for (h = 0; h < height - 2; ++h) { + sum_8(a, b + width, &sum_row_c); + sum_row_a = _mm_adds_epu16(sum_row_a, sum_row_b); + sum_row_a = _mm_adds_epu16(sum_row_a, sum_row_c); + sum_row_a = + average_8(sum_row_a, mul_constants, strength, rounding, weight); + accumulate_and_store_8(sum_row_a, b, count, accumulator); + + a += stride; + b += width; + count += width; + accumulator += width; + + sum_row_a = sum_row_b; + sum_row_b = sum_row_c; + } + + mul_constants = _mm_setr_epi16(NEIGHBOR_CONSTANT_4, NEIGHBOR_CONSTANT_6, + NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, + NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, + NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_4); + sum_row_a = _mm_adds_epu16(sum_row_a, sum_row_b); + sum_row_a = average_8(sum_row_a, mul_constants, strength, rounding, weight); + accumulate_and_store_8(sum_row_a, b, count, accumulator); + + } else { // width == 16 + __m128i sum_row_a_0, sum_row_a_1; + __m128i sum_row_b_0, sum_row_b_1; + __m128i sum_row_c_0, sum_row_c_1; + __m128i mul_constants_0 = _mm_setr_epi16( + NEIGHBOR_CONSTANT_4, NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, + NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, + NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6), + mul_constants_1 = _mm_setr_epi16( + NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, + NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, + NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_4); + + sum_16(a, b, &sum_row_a_0, &sum_row_a_1); + sum_16(a + stride, b + width, &sum_row_b_0, &sum_row_b_1); + + sum_row_c_0 = _mm_adds_epu16(sum_row_a_0, sum_row_b_0); + sum_row_c_1 = _mm_adds_epu16(sum_row_a_1, sum_row_b_1); + + average_16(&sum_row_c_0, &sum_row_c_1, mul_constants_0, mul_constants_1, + strength, rounding, weight); + accumulate_and_store_16(sum_row_c_0, sum_row_c_1, b, count, accumulator); + + a += stride + stride; + b += width; + count += width; + accumulator += width; + + mul_constants_0 = _mm_setr_epi16(NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_9, + NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9, + NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9, + NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9); + mul_constants_1 = _mm_setr_epi16(NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9, + NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9, + NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9, + NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_6); + for (h = 0; h < height - 2; ++h) { + sum_16(a, b + width, &sum_row_c_0, &sum_row_c_1); + + sum_row_a_0 = _mm_adds_epu16(sum_row_a_0, sum_row_b_0); + sum_row_a_0 = _mm_adds_epu16(sum_row_a_0, sum_row_c_0); + sum_row_a_1 = _mm_adds_epu16(sum_row_a_1, sum_row_b_1); + sum_row_a_1 = _mm_adds_epu16(sum_row_a_1, sum_row_c_1); + + 
average_16(&sum_row_a_0, &sum_row_a_1, mul_constants_0, mul_constants_1, + strength, rounding, weight); + accumulate_and_store_16(sum_row_a_0, sum_row_a_1, b, count, accumulator); + + a += stride; + b += width; + count += width; + accumulator += width; + + sum_row_a_0 = sum_row_b_0; + sum_row_a_1 = sum_row_b_1; + sum_row_b_0 = sum_row_c_0; + sum_row_b_1 = sum_row_c_1; + } + + mul_constants_0 = _mm_setr_epi16(NEIGHBOR_CONSTANT_4, NEIGHBOR_CONSTANT_6, + NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, + NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, + NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6); + mul_constants_1 = _mm_setr_epi16(NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, + NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, + NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, + NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_4); + sum_row_c_0 = _mm_adds_epu16(sum_row_a_0, sum_row_b_0); + sum_row_c_1 = _mm_adds_epu16(sum_row_a_1, sum_row_b_1); + + average_16(&sum_row_c_0, &sum_row_c_1, mul_constants_0, mul_constants_1, + strength, rounding, weight); + accumulate_and_store_16(sum_row_c_0, sum_row_c_1, b, count, accumulator); + } +} diff --git a/vp9/encoder/x86/vp9_temporal_filter_apply_sse2.asm b/vp9/encoder/x86/vp9_temporal_filter_apply_sse2.asm deleted file mode 100644 index 21aaa9383..000000000 --- a/vp9/encoder/x86/vp9_temporal_filter_apply_sse2.asm +++ /dev/null @@ -1,212 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - -%include "vpx_ports/x86_abi_support.asm" - -; void vp9_temporal_filter_apply_sse2 | arg -; (unsigned char *frame1, | 0 -; unsigned int stride, | 1 -; unsigned char *frame2, | 2 -; unsigned int block_width, | 3 -; unsigned int block_height, | 4 -; int strength, | 5 -; int filter_weight, | 6 -; unsigned int *accumulator, | 7 -; unsigned short *count) | 8 -global sym(vp9_temporal_filter_apply_sse2) PRIVATE -sym(vp9_temporal_filter_apply_sse2): - - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 9 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ALIGN_STACK 16, rax - %define block_width 0 - %define block_height 16 - %define strength 32 - %define filter_weight 48 - %define rounding_bit 64 - %define rbp_backup 80 - %define stack_size 96 - sub rsp, stack_size - mov [rsp + rbp_backup], rbp - ; end prolog - - mov edx, arg(3) - mov [rsp + block_width], rdx - mov edx, arg(4) - mov [rsp + block_height], rdx - movd xmm6, arg(5) - movdqa [rsp + strength], xmm6 ; where strength is used, all 16 bytes are read - - ; calculate the rounding bit outside the loop - ; 0x8000 >> (16 - strength) - mov rdx, 16 - sub rdx, arg(5) ; 16 - strength - movq xmm4, rdx ; can't use rdx w/ shift - movdqa xmm5, [GLOBAL(_const_top_bit)] - psrlw xmm5, xmm4 - movdqa [rsp + rounding_bit], xmm5 - - mov rsi, arg(0) ; src/frame1 - mov rdx, arg(2) ; predictor frame - mov rdi, arg(7) ; accumulator - mov rax, arg(8) ; count - - ; dup the filter weight and store for later - movd xmm0, arg(6) ; filter_weight - pshuflw xmm0, xmm0, 0 - punpcklwd xmm0, xmm0 - movdqa [rsp + filter_weight], xmm0 - - mov rbp, arg(1) ; stride - pxor xmm7, xmm7 ; zero for extraction - - mov rcx, [rsp + block_width] - imul rcx, [rsp + block_height] - add rcx, rdx - cmp dword ptr [rsp + block_width], 8 - jne 
.temporal_filter_apply_load_16 - -.temporal_filter_apply_load_8: - movq xmm0, [rsi] ; first row - lea rsi, [rsi + rbp] ; += stride - punpcklbw xmm0, xmm7 ; src[ 0- 7] - movq xmm1, [rsi] ; second row - lea rsi, [rsi + rbp] ; += stride - punpcklbw xmm1, xmm7 ; src[ 8-15] - jmp .temporal_filter_apply_load_finished - -.temporal_filter_apply_load_16: - movdqa xmm0, [rsi] ; src (frame1) - lea rsi, [rsi + rbp] ; += stride - movdqa xmm1, xmm0 - punpcklbw xmm0, xmm7 ; src[ 0- 7] - punpckhbw xmm1, xmm7 ; src[ 8-15] - -.temporal_filter_apply_load_finished: - movdqa xmm2, [rdx] ; predictor (frame2) - movdqa xmm3, xmm2 - punpcklbw xmm2, xmm7 ; pred[ 0- 7] - punpckhbw xmm3, xmm7 ; pred[ 8-15] - - ; modifier = src_byte - pixel_value - psubw xmm0, xmm2 ; src - pred[ 0- 7] - psubw xmm1, xmm3 ; src - pred[ 8-15] - - ; modifier *= modifier - pmullw xmm0, xmm0 ; modifer[ 0- 7]^2 - pmullw xmm1, xmm1 ; modifer[ 8-15]^2 - - ; modifier *= 3 - pmullw xmm0, [GLOBAL(_const_3w)] - pmullw xmm1, [GLOBAL(_const_3w)] - - ; modifer += 0x8000 >> (16 - strength) - paddw xmm0, [rsp + rounding_bit] - paddw xmm1, [rsp + rounding_bit] - - ; modifier >>= strength - psrlw xmm0, [rsp + strength] - psrlw xmm1, [rsp + strength] - - ; modifier = 16 - modifier - ; saturation takes care of modifier > 16 - movdqa xmm3, [GLOBAL(_const_16w)] - movdqa xmm2, [GLOBAL(_const_16w)] - psubusw xmm3, xmm1 - psubusw xmm2, xmm0 - - ; modifier *= filter_weight - pmullw xmm2, [rsp + filter_weight] - pmullw xmm3, [rsp + filter_weight] - - ; count - movdqa xmm4, [rax] - movdqa xmm5, [rax+16] - ; += modifier - paddw xmm4, xmm2 - paddw xmm5, xmm3 - ; write back - movdqa [rax], xmm4 - movdqa [rax+16], xmm5 - lea rax, [rax + 16*2] ; count += 16*(sizeof(short)) - - ; load and extract the predictor up to shorts - pxor xmm7, xmm7 - movdqa xmm0, [rdx] - lea rdx, [rdx + 16*1] ; pred += 16*(sizeof(char)) - movdqa xmm1, xmm0 - punpcklbw xmm0, xmm7 ; pred[ 0- 7] - punpckhbw xmm1, xmm7 ; pred[ 8-15] - - ; modifier *= pixel_value - pmullw xmm0, xmm2 - pmullw xmm1, xmm3 - - ; expand to double words - movdqa xmm2, xmm0 - punpcklwd xmm0, xmm7 ; [ 0- 3] - punpckhwd xmm2, xmm7 ; [ 4- 7] - movdqa xmm3, xmm1 - punpcklwd xmm1, xmm7 ; [ 8-11] - punpckhwd xmm3, xmm7 ; [12-15] - - ; accumulator - movdqa xmm4, [rdi] - movdqa xmm5, [rdi+16] - movdqa xmm6, [rdi+32] - movdqa xmm7, [rdi+48] - ; += modifier - paddd xmm4, xmm0 - paddd xmm5, xmm2 - paddd xmm6, xmm1 - paddd xmm7, xmm3 - ; write back - movdqa [rdi], xmm4 - movdqa [rdi+16], xmm5 - movdqa [rdi+32], xmm6 - movdqa [rdi+48], xmm7 - lea rdi, [rdi + 16*4] ; accumulator += 16*(sizeof(int)) - - cmp rdx, rcx - je .temporal_filter_apply_epilog - pxor xmm7, xmm7 ; zero for extraction - cmp dword ptr [rsp + block_width], 16 - je .temporal_filter_apply_load_16 - jmp .temporal_filter_apply_load_8 - -.temporal_filter_apply_epilog: - ; begin epilog - mov rbp, [rsp + rbp_backup] - add rsp, stack_size - pop rsp - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -SECTION_RODATA -align 16 -_const_3w: - times 8 dw 3 -align 16 -_const_top_bit: - times 8 dw 1<<15 -align 16 -_const_16w - times 8 dw 16 diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk index e0913bea3..fae603a91 100644 --- a/vp9/vp9cx.mk +++ b/vp9/vp9cx.mk @@ -100,7 +100,8 @@ VP9_CX_SRCS-yes += encoder/vp9_temporal_filter.h VP9_CX_SRCS-yes += encoder/vp9_mbgraph.c VP9_CX_SRCS-yes += encoder/vp9_mbgraph.h -VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_temporal_filter_apply_sse2.asm +VP9_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/temporal_filter_sse4.c + 
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_quantize_sse2.c VP9_CX_SRCS-$(HAVE_AVX) += encoder/x86/vp9_diamond_search_sad_avx.c ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes) @@ -135,6 +136,5 @@ VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct4x4_msa.c VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct8x8_msa.c VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct16x16_msa.c VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct_msa.h -VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_temporal_filter_msa.c VP9_CX_SRCS-yes := $(filter-out $(VP9_CX_SRCS_REMOVE-yes),$(VP9_CX_SRCS-yes))
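
Note for reviewers (not part of the patch): as a cross-check for the
intrinsics above, here is a minimal scalar sketch of the per-pixel arithmetic
the SSE4.1 path vectorizes, reconstructed from the comments in
temporal_filter_sse4.c. The helper name 'filter_one_pixel' and the explicit
'index' parameter are illustrative only; 'index' is 4, 6, or 9 depending on
whether the pixel sits in a corner, on an edge, or in the interior of the
block.

  // Illustrative scalar model; not compiled into the library.
  static void filter_one_pixel(unsigned int diff_sq_sum, int index,
                               int strength, int weight, uint8_t pred,
                               uint16_t *count, unsigned int *accumulator) {
    const int rounding = strength > 0 ? 1 << (strength - 1) : 0;
    // The SIMD code replaces this divide with _mm_mulhi_epu16() by 49152,
    // 32768, or 21846 (3/4, 1/2 and 1/3 in Q16, per NEIGHBOR_CONSTANT_*).
    int modifier = (int)(diff_sq_sum * 3 / index);
    modifier = (modifier + rounding) >> strength;
    if (modifier > 16) modifier = 16;     // clamp ...
    modifier = (16 - modifier) * weight;  // ... then invert and weight
    *count += (uint16_t)modifier;
    *accumulator += (unsigned int)modifier * pred;
  }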