vp9 temporal filter: sse4 implementation

Approximates division using multiply and shift.

Speeds up both sizes (8x8 and 16x16) by 30 times.

Fix the call sites to use the RTCD function.

Delete the sse2 and mips implementations. They were based on a previous
version of the filter, which was changed in Dec 2015:
ece4fd5d22

BUG=webm:1378

Change-Id: I0818e767a802966520b5c6e7999584ad13159276
Author: Johann
Date: 2017-03-15 10:40:58 -07:00
Parent: 58fe1bde59
Commit: 6dfeea6592

7 changed files with 450 additions and 559 deletions
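
The division approximation named in the message is easy to sanity-check in
scalar C. The sketch below models _mm_mulhi_epu16 as (m * C) >> 16 and
exhaustively verifies the three constants the new file uses (49152, 32768,
21846 for neighbor counts 4, 6, 9). This is an editorial sketch with
illustrative names, not part of the commit.

#include <assert.h>
#include <stdint.h>

/* Scalar model of one _mm_mulhi_epu16 lane: the high 16 bits of m * C. */
static uint16_t mulhi_u16(uint16_t m, uint16_t c) {
  return (uint16_t)(((uint32_t)m * c) >> 16);
}

int main(void) {
  const uint16_t constants[3] = { 49152, 32768, 21846 }; /* 3/i in Q16 */
  const int index[3] = { 4, 6, 9 }; /* number of summed neighbor values */
  int i;
  uint32_t m;
  for (i = 0; i < 3; ++i) {
    for (m = 0; m <= UINT16_MAX; ++m) {
      const uint16_t exact = (uint16_t)(m * 3 / (uint32_t)index[i]);
      const uint16_t approx = mulhi_u16((uint16_t)m, constants[i]);
      /* The fixed-point result never differs from exact integer division
         by more than 1, which the filter's clamp-to-16 step absorbs. */
      assert(approx == exact || approx + 1 == exact || exact + 1 == approx);
    }
  }
  return 0;
}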

File: test/temporal_filter_test.cc

@@ -141,53 +141,60 @@ TEST_P(TemporalFilterTest, SizeCombinations) {
count_chk.CopyFrom(count_ref);
reference_filter(a, b, width, height, filter_strength, filter_weight,
&accum_ref, &count_ref);
filter_func_(a.TopLeftPixel(), a.stride(), b.TopLeftPixel(), width,
height, filter_strength, filter_weight,
accum_chk.TopLeftPixel(), count_chk.TopLeftPixel());
ASM_REGISTER_STATE_CHECK(
filter_func_(a.TopLeftPixel(), a.stride(), b.TopLeftPixel(), width,
height, filter_strength, filter_weight,
accum_chk.TopLeftPixel(), count_chk.TopLeftPixel()));
EXPECT_TRUE(accum_chk.CheckValues(accum_ref));
EXPECT_TRUE(count_chk.CheckValues(count_ref));
if (HasFailure()) {
printf("Width: %d Height: %d\n", width, height);
count_chk.PrintDifference(count_ref);
accum_chk.PrintDifference(accum_ref);
ASSERT_TRUE(false);
return;
}
}
}
}
TEST_P(TemporalFilterTest, CompareReferenceRandom) {
const int width = 16;
const int height = 16;
Buffer<uint8_t> a = Buffer<uint8_t>(width, height, 8);
// The second buffer must not have any border.
Buffer<uint8_t> b = Buffer<uint8_t>(width, height, 0);
Buffer<unsigned int> accum_ref = Buffer<unsigned int>(width, height, 0);
Buffer<unsigned int> accum_chk = Buffer<unsigned int>(width, height, 0);
Buffer<uint16_t> count_ref = Buffer<uint16_t>(width, height, 0);
Buffer<uint16_t> count_chk = Buffer<uint16_t>(width, height, 0);
for (int width = 8; width <= 16; width += 8) {
for (int height = 8; height <= 16; height += 8) {
Buffer<uint8_t> a = Buffer<uint8_t>(width, height, 8);
// The second buffer must not have any border.
Buffer<uint8_t> b = Buffer<uint8_t>(width, height, 0);
Buffer<unsigned int> accum_ref = Buffer<unsigned int>(width, height, 0);
Buffer<unsigned int> accum_chk = Buffer<unsigned int>(width, height, 0);
Buffer<uint16_t> count_ref = Buffer<uint16_t>(width, height, 0);
Buffer<uint16_t> count_chk = Buffer<uint16_t>(width, height, 0);
a.Set(&rnd_, &ACMRandom::Rand8);
b.Set(&rnd_, &ACMRandom::Rand8);
for (int filter_strength = 0; filter_strength <= 6; ++filter_strength) {
for (int filter_weight = 0; filter_weight <= 2; ++filter_weight) {
for (int repeat = 0; repeat < 10; ++repeat) {
a.Set(&rnd_, &ACMRandom::Rand8);
b.Set(&rnd_, &ACMRandom::Rand8);
for (int filter_strength = 0; filter_strength <= 6; ++filter_strength) {
for (int filter_weight = 0; filter_weight <= 2; ++filter_weight) {
accum_ref.Set(rnd_.Rand8());
accum_chk.CopyFrom(accum_ref);
count_ref.Set(rnd_.Rand8());
count_chk.CopyFrom(count_ref);
reference_filter(a, b, width, height, filter_strength, filter_weight,
&accum_ref, &count_ref);
filter_func_(a.TopLeftPixel(), a.stride(), b.TopLeftPixel(), width,
height, filter_strength, filter_weight,
accum_chk.TopLeftPixel(), count_chk.TopLeftPixel());
EXPECT_TRUE(accum_chk.CheckValues(accum_ref));
EXPECT_TRUE(count_chk.CheckValues(count_ref));
if (HasFailure()) {
printf("Weight: %d Strength: %d\n", filter_weight, filter_strength);
count_chk.PrintDifference(count_ref);
accum_chk.PrintDifference(accum_ref);
ASSERT_TRUE(false);
accum_ref.Set(rnd_.Rand8());
accum_chk.CopyFrom(accum_ref);
count_ref.Set(rnd_.Rand8());
count_chk.CopyFrom(count_ref);
reference_filter(a, b, width, height, filter_strength,
filter_weight, &accum_ref, &count_ref);
ASM_REGISTER_STATE_CHECK(filter_func_(
a.TopLeftPixel(), a.stride(), b.TopLeftPixel(), width, height,
filter_strength, filter_weight, accum_chk.TopLeftPixel(),
count_chk.TopLeftPixel()));
EXPECT_TRUE(accum_chk.CheckValues(accum_ref));
EXPECT_TRUE(count_chk.CheckValues(count_ref));
if (HasFailure()) {
printf("Weight: %d Strength: %d\n", filter_weight,
filter_strength);
count_chk.PrintDifference(count_ref);
accum_chk.PrintDifference(accum_ref);
return;
}
}
}
}
}
}
@@ -222,9 +229,8 @@ TEST_P(TemporalFilterTest, DISABLED_Speed) {
accum_chk.TopLeftPixel(), count_chk.TopLeftPixel());
}
vpx_usec_timer_mark(&timer);
const int elapsed_time =
static_cast<int>(vpx_usec_timer_elapsed(&timer) / 1000);
printf("Temporal filter %dx%d time: %5d ms\n", width, height,
const int elapsed_time = static_cast<int>(vpx_usec_timer_elapsed(&timer));
printf("Temporal filter %dx%d time: %5d us\n", width, height,
elapsed_time);
}
}
@@ -233,10 +239,8 @@ TEST_P(TemporalFilterTest, DISABLED_Speed) {
INSTANTIATE_TEST_CASE_P(C, TemporalFilterTest,
::testing::Values(&vp9_temporal_filter_apply_c));
/* TODO(johannkoenig): https://bugs.chromium.org/p/webm/issues/detail?id=1378
#if HAVE_SSE4_1
INSTANTIATE_TEST_CASE_P(SSE4_1, TemporalFilterTest,
::testing::Values(&vp9_temporal_filter_apply_sse4_1));
#endif // HAVE_SSE4_1
*/
} // namespace

File: vp9/common/vp9_rtcd_defs.pl

@@ -198,7 +198,7 @@ add_proto qw/int vp9_diamond_search_sad/, "const struct macroblock *x, const str
specialize qw/vp9_diamond_search_sad avx/;
add_proto qw/void vp9_temporal_filter_apply/, "const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count";
specialize qw/vp9_temporal_filter_apply sse2 msa/;
specialize qw/vp9_temporal_filter_apply sse4_1/;
if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
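
A note on what specialize buys here: the RTCD generator emits a function
pointer named vp9_temporal_filter_apply plus an init routine that selects the
best variant the running CPU supports, which is what lets the call sites
further down drop the explicit _c suffix. A rough sketch of the generated
shape (x86_simd_caps() and HAS_SSE4_1 are real vpx_ports/x86.h names; the rest
is illustrative, since the actual code is generated into vp9_rtcd.h at build
time):

#include <stdint.h>
#include "vpx_ports/x86.h" /* x86_simd_caps(), HAS_SSE4_1 */

typedef void temporal_filter_fn(const uint8_t *frame1, unsigned int stride,
                                const uint8_t *frame2,
                                unsigned int block_width,
                                unsigned int block_height, int strength,
                                int filter_weight, unsigned int *accumulator,
                                uint16_t *count);

temporal_filter_fn vp9_temporal_filter_apply_c;      /* C reference */
temporal_filter_fn vp9_temporal_filter_apply_sse4_1; /* this commit's path */

/* Call sites go through this pointer; it is resolved once at startup. */
temporal_filter_fn *vp9_temporal_filter_apply = vp9_temporal_filter_apply_c;

static void setup_rtcd(void) {
  vp9_temporal_filter_apply = vp9_temporal_filter_apply_c;
  if (x86_simd_caps() & HAS_SSE4_1)
    vp9_temporal_filter_apply = vp9_temporal_filter_apply_sse4_1;
}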

File: vp9/encoder/mips/msa/vp9_temporal_filter_msa.c (deleted)

@@ -1,285 +0,0 @@
/*
* Copyright (c) 2015 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "./vp9_rtcd.h"
#include "vpx_dsp/mips/macros_msa.h"
static void temporal_filter_apply_8size_msa(const uint8_t *frm1_ptr,
uint32_t stride,
const uint8_t *frm2_ptr,
int32_t filt_sth, int32_t filt_wgt,
uint32_t *acc, uint16_t *cnt) {
uint32_t row;
uint64_t f0, f1, f2, f3;
v16i8 frm2, frm1 = { 0 };
v16i8 frm4, frm3 = { 0 };
v16u8 frm_r, frm_l;
v8i16 frm2_r, frm2_l;
v8i16 diff0, diff1, mod0_h, mod1_h;
v4i32 cnst3, cnst16, filt_wt, strength;
v4i32 mod0_w, mod1_w, mod2_w, mod3_w;
v4i32 diff0_r, diff0_l, diff1_r, diff1_l;
v4i32 frm2_rr, frm2_rl, frm2_lr, frm2_ll;
v4i32 acc0, acc1, acc2, acc3;
v8i16 cnt0, cnt1;
filt_wt = __msa_fill_w(filt_wgt);
strength = __msa_fill_w(filt_sth);
cnst3 = __msa_ldi_w(3);
cnst16 = __msa_ldi_w(16);
for (row = 2; row--;) {
LD4(frm1_ptr, stride, f0, f1, f2, f3);
frm1_ptr += (4 * stride);
LD_SB2(frm2_ptr, 16, frm2, frm4);
frm2_ptr += 32;
LD_SW2(acc, 4, acc0, acc1);
LD_SW2(acc + 8, 4, acc2, acc3);
LD_SH2(cnt, 8, cnt0, cnt1);
INSERT_D2_SB(f0, f1, frm1);
INSERT_D2_SB(f2, f3, frm3);
ILVRL_B2_UB(frm1, frm2, frm_r, frm_l);
HSUB_UB2_SH(frm_r, frm_l, diff0, diff1);
UNPCK_SH_SW(diff0, diff0_r, diff0_l);
UNPCK_SH_SW(diff1, diff1_r, diff1_l);
MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, diff1_l,
mod0_w, mod1_w, mod2_w, mod3_w);
MUL4(mod0_w, cnst3, mod1_w, cnst3, mod2_w, cnst3, mod3_w, cnst3, mod0_w,
mod1_w, mod2_w, mod3_w);
SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength);
diff0_r = (mod0_w < cnst16);
diff0_l = (mod1_w < cnst16);
diff1_r = (mod2_w < cnst16);
diff1_l = (mod3_w < cnst16);
SUB4(cnst16, mod0_w, cnst16, mod1_w, cnst16, mod2_w, cnst16, mod3_w, mod0_w,
mod1_w, mod2_w, mod3_w);
mod0_w = diff0_r & mod0_w;
mod1_w = diff0_l & mod1_w;
mod2_w = diff1_r & mod2_w;
mod3_w = diff1_l & mod3_w;
MUL4(mod0_w, filt_wt, mod1_w, filt_wt, mod2_w, filt_wt, mod3_w, filt_wt,
mod0_w, mod1_w, mod2_w, mod3_w);
PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h);
ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h);
ST_SH2(mod0_h, mod1_h, cnt, 8);
cnt += 16;
UNPCK_UB_SH(frm2, frm2_r, frm2_l);
UNPCK_SH_SW(frm2_r, frm2_rr, frm2_rl);
UNPCK_SH_SW(frm2_l, frm2_lr, frm2_ll);
MUL4(mod0_w, frm2_rr, mod1_w, frm2_rl, mod2_w, frm2_lr, mod3_w, frm2_ll,
mod0_w, mod1_w, mod2_w, mod3_w);
ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3, mod0_w, mod1_w,
mod2_w, mod3_w);
ST_SW2(mod0_w, mod1_w, acc, 4);
acc += 8;
ST_SW2(mod2_w, mod3_w, acc, 4);
acc += 8;
LD_SW2(acc, 4, acc0, acc1);
LD_SW2(acc + 8, 4, acc2, acc3);
LD_SH2(cnt, 8, cnt0, cnt1);
ILVRL_B2_UB(frm3, frm4, frm_r, frm_l);
HSUB_UB2_SH(frm_r, frm_l, diff0, diff1);
UNPCK_SH_SW(diff0, diff0_r, diff0_l);
UNPCK_SH_SW(diff1, diff1_r, diff1_l);
MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, diff1_l,
mod0_w, mod1_w, mod2_w, mod3_w);
MUL4(mod0_w, cnst3, mod1_w, cnst3, mod2_w, cnst3, mod3_w, cnst3, mod0_w,
mod1_w, mod2_w, mod3_w);
SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength);
diff0_r = (mod0_w < cnst16);
diff0_l = (mod1_w < cnst16);
diff1_r = (mod2_w < cnst16);
diff1_l = (mod3_w < cnst16);
SUB4(cnst16, mod0_w, cnst16, mod1_w, cnst16, mod2_w, cnst16, mod3_w, mod0_w,
mod1_w, mod2_w, mod3_w);
mod0_w = diff0_r & mod0_w;
mod1_w = diff0_l & mod1_w;
mod2_w = diff1_r & mod2_w;
mod3_w = diff1_l & mod3_w;
MUL4(mod0_w, filt_wt, mod1_w, filt_wt, mod2_w, filt_wt, mod3_w, filt_wt,
mod0_w, mod1_w, mod2_w, mod3_w);
PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h);
ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h);
ST_SH2(mod0_h, mod1_h, cnt, 8);
cnt += 16;
UNPCK_UB_SH(frm4, frm2_r, frm2_l);
UNPCK_SH_SW(frm2_r, frm2_rr, frm2_rl);
UNPCK_SH_SW(frm2_l, frm2_lr, frm2_ll);
MUL4(mod0_w, frm2_rr, mod1_w, frm2_rl, mod2_w, frm2_lr, mod3_w, frm2_ll,
mod0_w, mod1_w, mod2_w, mod3_w);
ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3, mod0_w, mod1_w,
mod2_w, mod3_w);
ST_SW2(mod0_w, mod1_w, acc, 4);
acc += 8;
ST_SW2(mod2_w, mod3_w, acc, 4);
acc += 8;
}
}
static void temporal_filter_apply_16size_msa(const uint8_t *frm1_ptr,
uint32_t stride,
const uint8_t *frm2_ptr,
int32_t filt_sth, int32_t filt_wgt,
uint32_t *acc, uint16_t *cnt) {
uint32_t row;
v16i8 frm1, frm2, frm3, frm4;
v16u8 frm_r, frm_l;
v16i8 zero = { 0 };
v8u16 frm2_r, frm2_l;
v8i16 diff0, diff1, mod0_h, mod1_h;
v4i32 cnst3, cnst16, filt_wt, strength;
v4i32 mod0_w, mod1_w, mod2_w, mod3_w;
v4i32 diff0_r, diff0_l, diff1_r, diff1_l;
v4i32 frm2_rr, frm2_rl, frm2_lr, frm2_ll;
v4i32 acc0, acc1, acc2, acc3;
v8i16 cnt0, cnt1;
filt_wt = __msa_fill_w(filt_wgt);
strength = __msa_fill_w(filt_sth);
cnst3 = __msa_ldi_w(3);
cnst16 = __msa_ldi_w(16);
for (row = 8; row--;) {
LD_SB2(frm1_ptr, stride, frm1, frm3);
frm1_ptr += stride;
LD_SB2(frm2_ptr, 16, frm2, frm4);
frm2_ptr += 16;
LD_SW2(acc, 4, acc0, acc1);
LD_SW2(acc + 8, 4, acc2, acc3);
LD_SH2(cnt, 8, cnt0, cnt1);
ILVRL_B2_UB(frm1, frm2, frm_r, frm_l);
HSUB_UB2_SH(frm_r, frm_l, diff0, diff1);
UNPCK_SH_SW(diff0, diff0_r, diff0_l);
UNPCK_SH_SW(diff1, diff1_r, diff1_l);
MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, diff1_l,
mod0_w, mod1_w, mod2_w, mod3_w);
MUL4(mod0_w, cnst3, mod1_w, cnst3, mod2_w, cnst3, mod3_w, cnst3, mod0_w,
mod1_w, mod2_w, mod3_w);
SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength);
diff0_r = (mod0_w < cnst16);
diff0_l = (mod1_w < cnst16);
diff1_r = (mod2_w < cnst16);
diff1_l = (mod3_w < cnst16);
SUB4(cnst16, mod0_w, cnst16, mod1_w, cnst16, mod2_w, cnst16, mod3_w, mod0_w,
mod1_w, mod2_w, mod3_w);
mod0_w = diff0_r & mod0_w;
mod1_w = diff0_l & mod1_w;
mod2_w = diff1_r & mod2_w;
mod3_w = diff1_l & mod3_w;
MUL4(mod0_w, filt_wt, mod1_w, filt_wt, mod2_w, filt_wt, mod3_w, filt_wt,
mod0_w, mod1_w, mod2_w, mod3_w);
PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h);
ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h);
ST_SH2(mod0_h, mod1_h, cnt, 8);
cnt += 16;
ILVRL_B2_UH(zero, frm2, frm2_r, frm2_l);
UNPCK_SH_SW(frm2_r, frm2_rr, frm2_rl);
UNPCK_SH_SW(frm2_l, frm2_lr, frm2_ll);
MUL4(mod0_w, frm2_rr, mod1_w, frm2_rl, mod2_w, frm2_lr, mod3_w, frm2_ll,
mod0_w, mod1_w, mod2_w, mod3_w);
ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3, mod0_w, mod1_w,
mod2_w, mod3_w);
ST_SW2(mod0_w, mod1_w, acc, 4);
acc += 8;
ST_SW2(mod2_w, mod3_w, acc, 4);
acc += 8;
LD_SW2(acc, 4, acc0, acc1);
LD_SW2(acc + 8, 4, acc2, acc3);
LD_SH2(cnt, 8, cnt0, cnt1);
ILVRL_B2_UB(frm3, frm4, frm_r, frm_l);
HSUB_UB2_SH(frm_r, frm_l, diff0, diff1);
UNPCK_SH_SW(diff0, diff0_r, diff0_l);
UNPCK_SH_SW(diff1, diff1_r, diff1_l);
MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, diff1_l,
mod0_w, mod1_w, mod2_w, mod3_w);
MUL4(mod0_w, cnst3, mod1_w, cnst3, mod2_w, cnst3, mod3_w, cnst3, mod0_w,
mod1_w, mod2_w, mod3_w);
SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength);
diff0_r = (mod0_w < cnst16);
diff0_l = (mod1_w < cnst16);
diff1_r = (mod2_w < cnst16);
diff1_l = (mod3_w < cnst16);
SUB4(cnst16, mod0_w, cnst16, mod1_w, cnst16, mod2_w, cnst16, mod3_w, mod0_w,
mod1_w, mod2_w, mod3_w);
mod0_w = diff0_r & mod0_w;
mod1_w = diff0_l & mod1_w;
mod2_w = diff1_r & mod2_w;
mod3_w = diff1_l & mod3_w;
MUL4(mod0_w, filt_wt, mod1_w, filt_wt, mod2_w, filt_wt, mod3_w, filt_wt,
mod0_w, mod1_w, mod2_w, mod3_w);
PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h);
ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h);
ST_SH2(mod0_h, mod1_h, cnt, 8);
cnt += 16;
ILVRL_B2_UH(zero, frm4, frm2_r, frm2_l);
UNPCK_SH_SW(frm2_r, frm2_rr, frm2_rl);
UNPCK_SH_SW(frm2_l, frm2_lr, frm2_ll);
MUL4(mod0_w, frm2_rr, mod1_w, frm2_rl, mod2_w, frm2_lr, mod3_w, frm2_ll,
mod0_w, mod1_w, mod2_w, mod3_w);
ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3, mod0_w, mod1_w,
mod2_w, mod3_w);
ST_SW2(mod0_w, mod1_w, acc, 4);
acc += 8;
ST_SW2(mod2_w, mod3_w, acc, 4);
acc += 8;
frm1_ptr += stride;
frm2_ptr += 16;
}
}
void vp9_temporal_filter_apply_msa(const uint8_t *frame1_ptr, uint32_t stride,
const uint8_t *frame2_ptr, uint32_t blk_w,
uint32_t blk_h, int32_t strength,
int32_t filt_wgt, uint32_t *accu,
uint16_t *cnt) {
if (8 == (blk_w * blk_h)) {
temporal_filter_apply_8size_msa(frame1_ptr, stride, frame2_ptr, strength,
filt_wgt, accu, cnt);
} else if (16 == (blk_w * blk_h)) {
temporal_filter_apply_16size_msa(frame1_ptr, stride, frame2_ptr, strength,
filt_wgt, accu, cnt);
} else {
vp9_temporal_filter_apply_c(frame1_ptr, stride, frame2_ptr, blk_w, blk_h,
strength, filt_wgt, accu, cnt);
}
}

File: vp9/encoder/vp9_temporal_filter.c

@@ -8,6 +8,7 @@
* be found in the AUTHORS file in the root of the source tree.
*/
#include <assert.h>
#include <math.h>
#include <limits.h>
@@ -100,6 +101,12 @@ void vp9_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride,
int byte = 0;
const int rounding = strength > 0 ? 1 << (strength - 1) : 0;
assert(strength >= 0);
assert(strength <= 6);
assert(filter_weight >= 0);
assert(filter_weight <= 2);
for (i = 0, k = 0; i < block_height; i++) {
for (j = 0; j < block_width; j++, k++) {
int pixel_value = *frame2;
@@ -376,45 +383,44 @@ void vp9_temporal_filter_iterate_row_c(VP9_COMP *cpi, ThreadData *td,
if (mbd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
int adj_strength = strength + 2 * (mbd->bd - 8);
// Apply the filter (YUV)
vp9_highbd_temporal_filter_apply_c(
vp9_highbd_temporal_filter_apply(
f->y_buffer + mb_y_offset, f->y_stride, predictor, 16, 16,
adj_strength, filter_weight, accumulator, count);
vp9_highbd_temporal_filter_apply_c(
vp9_highbd_temporal_filter_apply(
f->u_buffer + mb_uv_offset, f->uv_stride, predictor + 256,
mb_uv_width, mb_uv_height, adj_strength, filter_weight,
accumulator + 256, count + 256);
vp9_highbd_temporal_filter_apply_c(
vp9_highbd_temporal_filter_apply(
f->v_buffer + mb_uv_offset, f->uv_stride, predictor + 512,
mb_uv_width, mb_uv_height, adj_strength, filter_weight,
accumulator + 512, count + 512);
} else {
// Apply the filter (YUV)
vp9_temporal_filter_apply_c(f->y_buffer + mb_y_offset, f->y_stride,
predictor, 16, 16, strength,
filter_weight, accumulator, count);
vp9_temporal_filter_apply_c(f->u_buffer + mb_uv_offset, f->uv_stride,
predictor + 256, mb_uv_width,
mb_uv_height, strength, filter_weight,
accumulator + 256, count + 256);
vp9_temporal_filter_apply_c(f->v_buffer + mb_uv_offset, f->uv_stride,
predictor + 512, mb_uv_width,
mb_uv_height, strength, filter_weight,
accumulator + 512, count + 512);
}
#else
// Apply the filter (YUV)
// TODO(jingning): Need SIMD optimization for this.
vp9_temporal_filter_apply_c(f->y_buffer + mb_y_offset, f->y_stride,
vp9_temporal_filter_apply(f->y_buffer + mb_y_offset, f->y_stride,
predictor, 16, 16, strength, filter_weight,
accumulator, count);
vp9_temporal_filter_apply_c(f->u_buffer + mb_uv_offset, f->uv_stride,
vp9_temporal_filter_apply(f->u_buffer + mb_uv_offset, f->uv_stride,
predictor + 256, mb_uv_width, mb_uv_height,
strength, filter_weight, accumulator + 256,
count + 256);
vp9_temporal_filter_apply_c(f->v_buffer + mb_uv_offset, f->uv_stride,
vp9_temporal_filter_apply(f->v_buffer + mb_uv_offset, f->uv_stride,
predictor + 512, mb_uv_width, mb_uv_height,
strength, filter_weight, accumulator + 512,
count + 512);
}
#else
// Apply the filter (YUV)
vp9_temporal_filter_apply(f->y_buffer + mb_y_offset, f->y_stride,
predictor, 16, 16, strength, filter_weight,
accumulator, count);
vp9_temporal_filter_apply(f->u_buffer + mb_uv_offset, f->uv_stride,
predictor + 256, mb_uv_width, mb_uv_height,
strength, filter_weight, accumulator + 256,
count + 256);
vp9_temporal_filter_apply(f->v_buffer + mb_uv_offset, f->uv_stride,
predictor + 512, mb_uv_width, mb_uv_height,
strength, filter_weight, accumulator + 512,
count + 512);
#endif // CONFIG_VP9_HIGHBITDEPTH
}
}
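
A hedged scalar sketch of the per-pixel kernel these call sites dispatch to:
it sums squared differences over the pixel's 3x3 neighborhood, divides by the
in-bounds neighbor count (4 in corners, 6 on edges, 9 inside), and turns the
result into a blend weight. Names are illustrative rather than the exact body
of vp9_temporal_filter_apply_c.

#include <stdint.h>

static void filter_pixel(const uint8_t *frame1, int stride,
                         const uint8_t *frame2, int block_width,
                         int block_height, int i, int j, int strength,
                         int filter_weight, unsigned int *accumulator,
                         uint16_t *count) {
  const int rounding = strength > 0 ? 1 << (strength - 1) : 0;
  int modifier = 0, index = 0, idy, idx;
  for (idy = -1; idy <= 1; ++idy) {
    for (idx = -1; idx <= 1; ++idx) {
      const int row = i + idy, col = j + idx;
      if (row >= 0 && row < block_height && col >= 0 && col < block_width) {
        const int diff =
            frame1[row * stride + col] - frame2[row * block_width + col];
        modifier += diff * diff;
        ++index; /* ends up as 4, 6, or 9 */
      }
    }
  }
  modifier = modifier * 3 / index; /* the division the SIMD code approximates */
  modifier = (modifier + rounding) >> strength;
  if (modifier > 16) modifier = 16;
  modifier = (16 - modifier) * filter_weight;
  count[i * block_width + j] += (uint16_t)modifier;
  accumulator[i * block_width + j] +=
      (unsigned int)modifier * frame2[i * block_width + j];
}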

File: vp9/encoder/x86/temporal_filter_sse4.c (new)

@@ -0,0 +1,378 @@
/*
* Copyright (c) 2017 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <assert.h>
#include <smmintrin.h>
#include "./vpx_config.h"
#include "vpx/vpx_integer.h"
// Division using multiplication and shifting. The C implementation does:
// modifier *= 3;
// modifier /= index;
// where 'modifier' is a set of summed values and 'index' is the number of
// summed values. 'index' may be 4, 6, or 9, representing a block of 9 values
// which may be bound by the edges of the block being filtered.
//
// This equation works out to (m * 3) / i which reduces to:
// m * 3/4
// m * 1/2
// m * 1/3
//
// By pairing the multiply with a down shift by 16 (_mm_mulhi_epu16):
// m * C / 65536
// we can choose a constant C that replicates each division.
//
// m * 49152 / 65536 = m * 3/4
// m * 32768 / 65536 = m * 1/2
// m * 21846 / 65536 = m * 0.3333
//
// These are loaded using an instruction expecting int16_t values but are used
// with _mm_mulhi_epu16(), which treats them as unsigned.
#define NEIGHBOR_CONSTANT_4 (int16_t)49152
#define NEIGHBOR_CONSTANT_6 (int16_t)32768
#define NEIGHBOR_CONSTANT_9 (int16_t)21846
// Load values from 'a' and 'b'. Compute the difference squared and sum
// neighboring values such that:
// sum[1] = (a[0]-b[0])^2 + (a[1]-b[1])^2 + (a[2]-b[2])^2
// Values to the left and right of the row are set to 0.
// The values are returned in sum_0 and sum_1 as *unsigned* 16 bit values.
static void sum_8(const uint8_t *a, const uint8_t *b, __m128i *sum) {
const __m128i a_u8 = _mm_loadl_epi64((const __m128i *)a);
const __m128i b_u8 = _mm_loadl_epi64((const __m128i *)b);
const __m128i a_u16 = _mm_cvtepu8_epi16(a_u8);
const __m128i b_u16 = _mm_cvtepu8_epi16(b_u8);
const __m128i diff_s16 = _mm_sub_epi16(a_u16, b_u16);
const __m128i diff_sq_u16 = _mm_mullo_epi16(diff_s16, diff_s16);
// Shift all the values one place to the left/right so we can efficiently sum
// diff_sq_u16[i - 1] + diff_sq_u16[i] + diff_sq_u16[i + 1].
const __m128i shift_left = _mm_slli_si128(diff_sq_u16, 2);
const __m128i shift_right = _mm_srli_si128(diff_sq_u16, 2);
// It becomes necessary to treat the values as unsigned at this point. The
// 255^2 fits in uint16_t but not int16_t. Use saturating adds from this point
// forward since the filter is only applied to smooth small pixel changes.
// Once the value has saturated to uint16_t it is well outside the useful
// range.
__m128i sum_u16 = _mm_adds_epu16(diff_sq_u16, shift_left);
sum_u16 = _mm_adds_epu16(sum_u16, shift_right);
*sum = sum_u16;
}
static void sum_16(const uint8_t *a, const uint8_t *b, __m128i *sum_0,
__m128i *sum_1) {
const __m128i zero = _mm_setzero_si128();
const __m128i a_u8 = _mm_loadu_si128((const __m128i *)a);
const __m128i b_u8 = _mm_loadu_si128((const __m128i *)b);
const __m128i a_0_u16 = _mm_cvtepu8_epi16(a_u8);
const __m128i a_1_u16 = _mm_unpackhi_epi8(a_u8, zero);
const __m128i b_0_u16 = _mm_cvtepu8_epi16(b_u8);
const __m128i b_1_u16 = _mm_unpackhi_epi8(b_u8, zero);
const __m128i diff_0_s16 = _mm_sub_epi16(a_0_u16, b_0_u16);
const __m128i diff_1_s16 = _mm_sub_epi16(a_1_u16, b_1_u16);
const __m128i diff_sq_0_u16 = _mm_mullo_epi16(diff_0_s16, diff_0_s16);
const __m128i diff_sq_1_u16 = _mm_mullo_epi16(diff_1_s16, diff_1_s16);
__m128i shift_left = _mm_slli_si128(diff_sq_0_u16, 2);
// Use _mm_alignr_epi8() to "shift in" diff_sq_u16[8].
__m128i shift_right = _mm_alignr_epi8(diff_sq_1_u16, diff_sq_0_u16, 2);
__m128i sum_u16 = _mm_adds_epu16(diff_sq_0_u16, shift_left);
sum_u16 = _mm_adds_epu16(sum_u16, shift_right);
*sum_0 = sum_u16;
shift_left = _mm_alignr_epi8(diff_sq_1_u16, diff_sq_0_u16, 14);
shift_right = _mm_srli_si128(diff_sq_1_u16, 2);
sum_u16 = _mm_adds_epu16(diff_sq_1_u16, shift_left);
sum_u16 = _mm_adds_epu16(sum_u16, shift_right);
*sum_1 = sum_u16;
}
// Average the value based on the number of values summed (9 for pixels away
// from the border, 4 for pixels in corners, and 6 for other edge values).
//
// Add in the rounding factor and shift, clamp to 16, invert (16 - value), and
// multiply by weight.
static __m128i average_8(__m128i sum, const __m128i mul_constants,
const int strength, const int rounding,
const int weight) {
// _mm_srl_epi16 uses the lower 64 bit value for the shift.
const __m128i strength_u128 = _mm_set_epi32(0, 0, 0, strength);
const __m128i rounding_u16 = _mm_set1_epi16(rounding);
const __m128i weight_u16 = _mm_set1_epi16(weight);
const __m128i sixteen = _mm_set1_epi16(16);
// modifier * 3 / index;
sum = _mm_mulhi_epu16(sum, mul_constants);
sum = _mm_adds_epu16(sum, rounding_u16);
sum = _mm_srl_epi16(sum, strength_u128);
// The maximum input to this comparison is UINT16_MAX * NEIGHBOR_CONSTANT_4
// >> 16, i.e. NEIGHBOR_CONSTANT_4 - 1, which is 49151 (0xbfff, or -16385 as
// int16_t). So this needs the epu16 version, which did not arrive until
// SSE4.1.
sum = _mm_min_epu16(sum, sixteen);
sum = _mm_sub_epi16(sixteen, sum);
return _mm_mullo_epi16(sum, weight_u16);
}
static void average_16(__m128i *sum_0_u16, __m128i *sum_1_u16,
const __m128i mul_constants_0,
const __m128i mul_constants_1, const int strength,
const int rounding, const int weight) {
const __m128i strength_u128 = _mm_set_epi32(0, 0, 0, strength);
const __m128i rounding_u16 = _mm_set1_epi16(rounding);
const __m128i weight_u16 = _mm_set1_epi16(weight);
const __m128i sixteen = _mm_set1_epi16(16);
__m128i input_0, input_1;
input_0 = _mm_mulhi_epu16(*sum_0_u16, mul_constants_0);
input_0 = _mm_adds_epu16(input_0, rounding_u16);
input_1 = _mm_mulhi_epu16(*sum_1_u16, mul_constants_1);
input_1 = _mm_adds_epu16(input_1, rounding_u16);
input_0 = _mm_srl_epi16(input_0, strength_u128);
input_1 = _mm_srl_epi16(input_1, strength_u128);
input_0 = _mm_min_epu16(input_0, sixteen);
input_1 = _mm_min_epu16(input_1, sixteen);
input_0 = _mm_sub_epi16(sixteen, input_0);
input_1 = _mm_sub_epi16(sixteen, input_1);
*sum_0_u16 = _mm_mullo_epi16(input_0, weight_u16);
*sum_1_u16 = _mm_mullo_epi16(input_1, weight_u16);
}
// Add 'sum_u16' to 'count'. Multiply by 'pred' and add to 'accumulator.'
static void accumulate_and_store_8(const __m128i sum_u16, const uint8_t *pred,
uint16_t *count, unsigned int *accumulator) {
const __m128i pred_u8 = _mm_loadl_epi64((const __m128i *)pred);
const __m128i zero = _mm_setzero_si128();
__m128i count_u16 = _mm_loadu_si128((const __m128i *)count);
__m128i pred_u16 = _mm_cvtepu8_epi16(pred_u8);
__m128i pred_0_u32, pred_1_u32;
__m128i accum_0_u32, accum_1_u32;
count_u16 = _mm_adds_epu16(count_u16, sum_u16);
_mm_storeu_si128((__m128i *)count, count_u16);
pred_u16 = _mm_mullo_epi16(sum_u16, pred_u16);
pred_0_u32 = _mm_cvtepu16_epi32(pred_u16);
pred_1_u32 = _mm_unpackhi_epi16(pred_u16, zero);
accum_0_u32 = _mm_loadu_si128((const __m128i *)accumulator);
accum_1_u32 = _mm_loadu_si128((const __m128i *)(accumulator + 4));
accum_0_u32 = _mm_add_epi32(pred_0_u32, accum_0_u32);
accum_1_u32 = _mm_add_epi32(pred_1_u32, accum_1_u32);
_mm_storeu_si128((__m128i *)accumulator, accum_0_u32);
_mm_storeu_si128((__m128i *)(accumulator + 4), accum_1_u32);
}
static void accumulate_and_store_16(const __m128i sum_0_u16,
const __m128i sum_1_u16,
const uint8_t *pred, uint16_t *count,
unsigned int *accumulator) {
const __m128i pred_u8 = _mm_loadu_si128((const __m128i *)pred);
const __m128i zero = _mm_setzero_si128();
__m128i count_0_u16 = _mm_loadu_si128((const __m128i *)count),
count_1_u16 = _mm_loadu_si128((const __m128i *)(count + 8));
__m128i pred_0_u16 = _mm_cvtepu8_epi16(pred_u8),
pred_1_u16 = _mm_unpackhi_epi8(pred_u8, zero);
__m128i pred_0_u32, pred_1_u32, pred_2_u32, pred_3_u32;
__m128i accum_0_u32, accum_1_u32, accum_2_u32, accum_3_u32;
count_0_u16 = _mm_adds_epu16(count_0_u16, sum_0_u16);
_mm_storeu_si128((__m128i *)count, count_0_u16);
count_1_u16 = _mm_adds_epu16(count_1_u16, sum_1_u16);
_mm_storeu_si128((__m128i *)(count + 8), count_1_u16);
pred_0_u16 = _mm_mullo_epi16(sum_0_u16, pred_0_u16);
pred_1_u16 = _mm_mullo_epi16(sum_1_u16, pred_1_u16);
pred_0_u32 = _mm_cvtepu16_epi32(pred_0_u16);
pred_1_u32 = _mm_unpackhi_epi16(pred_0_u16, zero);
pred_2_u32 = _mm_cvtepu16_epi32(pred_1_u16);
pred_3_u32 = _mm_unpackhi_epi16(pred_1_u16, zero);
accum_0_u32 = _mm_loadu_si128((const __m128i *)accumulator);
accum_1_u32 = _mm_loadu_si128((const __m128i *)(accumulator + 4));
accum_2_u32 = _mm_loadu_si128((const __m128i *)(accumulator + 8));
accum_3_u32 = _mm_loadu_si128((const __m128i *)(accumulator + 12));
accum_0_u32 = _mm_add_epi32(pred_0_u32, accum_0_u32);
accum_1_u32 = _mm_add_epi32(pred_1_u32, accum_1_u32);
accum_2_u32 = _mm_add_epi32(pred_2_u32, accum_2_u32);
accum_3_u32 = _mm_add_epi32(pred_3_u32, accum_3_u32);
_mm_storeu_si128((__m128i *)accumulator, accum_0_u32);
_mm_storeu_si128((__m128i *)(accumulator + 4), accum_1_u32);
_mm_storeu_si128((__m128i *)(accumulator + 8), accum_2_u32);
_mm_storeu_si128((__m128i *)(accumulator + 12), accum_3_u32);
}
void vp9_temporal_filter_apply_sse4_1(const uint8_t *a, unsigned int stride,
const uint8_t *b, unsigned int width,
unsigned int height, int strength,
int weight, unsigned int *accumulator,
uint16_t *count) {
unsigned int h;
const int rounding = strength > 0 ? 1 << (strength - 1) : 0;
assert(strength >= 0);
assert(strength <= 6);
assert(weight >= 0);
assert(weight <= 2);
assert(width == 8 || width == 16);
// TODO(johannkoenig) Use uint32_t for accumulator.
assert(sizeof(*accumulator) == sizeof(uint32_t));
if (width == 8) {
__m128i sum_row_a, sum_row_b, sum_row_c;
__m128i mul_constants = _mm_setr_epi16(
NEIGHBOR_CONSTANT_4, NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,
NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,
NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_4);
sum_8(a, b, &sum_row_a);
sum_8(a + stride, b + width, &sum_row_b);
sum_row_c = _mm_adds_epu16(sum_row_a, sum_row_b);
sum_row_c = average_8(sum_row_c, mul_constants, strength, rounding, weight);
accumulate_and_store_8(sum_row_c, b, count, accumulator);
a += stride + stride;
b += width;
count += width;
accumulator += width;
mul_constants = _mm_setr_epi16(NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_9,
NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9,
NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9,
NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_6);
for (h = 0; h < height - 2; ++h) {
sum_8(a, b + width, &sum_row_c);
sum_row_a = _mm_adds_epu16(sum_row_a, sum_row_b);
sum_row_a = _mm_adds_epu16(sum_row_a, sum_row_c);
sum_row_a =
average_8(sum_row_a, mul_constants, strength, rounding, weight);
accumulate_and_store_8(sum_row_a, b, count, accumulator);
a += stride;
b += width;
count += width;
accumulator += width;
sum_row_a = sum_row_b;
sum_row_b = sum_row_c;
}
mul_constants = _mm_setr_epi16(NEIGHBOR_CONSTANT_4, NEIGHBOR_CONSTANT_6,
NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,
NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,
NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_4);
sum_row_a = _mm_adds_epu16(sum_row_a, sum_row_b);
sum_row_a = average_8(sum_row_a, mul_constants, strength, rounding, weight);
accumulate_and_store_8(sum_row_a, b, count, accumulator);
} else { // width == 16
__m128i sum_row_a_0, sum_row_a_1;
__m128i sum_row_b_0, sum_row_b_1;
__m128i sum_row_c_0, sum_row_c_1;
__m128i mul_constants_0 = _mm_setr_epi16(
NEIGHBOR_CONSTANT_4, NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,
NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,
NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6),
mul_constants_1 = _mm_setr_epi16(
NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,
NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,
NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_4);
sum_16(a, b, &sum_row_a_0, &sum_row_a_1);
sum_16(a + stride, b + width, &sum_row_b_0, &sum_row_b_1);
sum_row_c_0 = _mm_adds_epu16(sum_row_a_0, sum_row_b_0);
sum_row_c_1 = _mm_adds_epu16(sum_row_a_1, sum_row_b_1);
average_16(&sum_row_c_0, &sum_row_c_1, mul_constants_0, mul_constants_1,
strength, rounding, weight);
accumulate_and_store_16(sum_row_c_0, sum_row_c_1, b, count, accumulator);
a += stride + stride;
b += width;
count += width;
accumulator += width;
mul_constants_0 = _mm_setr_epi16(NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_9,
NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9,
NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9,
NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9);
mul_constants_1 = _mm_setr_epi16(NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9,
NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9,
NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9,
NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_6);
for (h = 0; h < height - 2; ++h) {
sum_16(a, b + width, &sum_row_c_0, &sum_row_c_1);
sum_row_a_0 = _mm_adds_epu16(sum_row_a_0, sum_row_b_0);
sum_row_a_0 = _mm_adds_epu16(sum_row_a_0, sum_row_c_0);
sum_row_a_1 = _mm_adds_epu16(sum_row_a_1, sum_row_b_1);
sum_row_a_1 = _mm_adds_epu16(sum_row_a_1, sum_row_c_1);
average_16(&sum_row_a_0, &sum_row_a_1, mul_constants_0, mul_constants_1,
strength, rounding, weight);
accumulate_and_store_16(sum_row_a_0, sum_row_a_1, b, count, accumulator);
a += stride;
b += width;
count += width;
accumulator += width;
sum_row_a_0 = sum_row_b_0;
sum_row_a_1 = sum_row_b_1;
sum_row_b_0 = sum_row_c_0;
sum_row_b_1 = sum_row_c_1;
}
mul_constants_0 = _mm_setr_epi16(NEIGHBOR_CONSTANT_4, NEIGHBOR_CONSTANT_6,
NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,
NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,
NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6);
mul_constants_1 = _mm_setr_epi16(NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,
NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,
NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,
NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_4);
sum_row_c_0 = _mm_adds_epu16(sum_row_a_0, sum_row_b_0);
sum_row_c_1 = _mm_adds_epu16(sum_row_a_1, sum_row_b_1);
average_16(&sum_row_c_0, &sum_row_c_1, mul_constants_0, mul_constants_1,
strength, rounding, weight);
accumulate_and_store_16(sum_row_c_0, sum_row_c_1, b, count, accumulator);
}
}
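
Two things in the file above deserve a plain-C restatement. sum_8()/sum_16()
build the horizontal 3-tap sums of squared differences for one row, with
positions past the left and right edges contributing 0; the main function then
keeps three such row sums live (sum_row_a/b/c), adds them vertically, and
rotates them so each source row is loaded and differenced exactly once. A
scalar model of that sliding window for the 8-wide case, with hypothetical
helper names:

#include <stdint.h>
#include <string.h>

/* Horizontal pass: out[j] = sq[j-1] + sq[j] + sq[j+1], zero past the edges. */
static void row_sum_sq_diff(const uint8_t *a, const uint8_t *b,
                            unsigned int out[8]) {
  unsigned int sq[8];
  int j;
  for (j = 0; j < 8; ++j) {
    const int diff = a[j] - b[j];
    sq[j] = (unsigned int)(diff * diff);
  }
  for (j = 0; j < 8; ++j) {
    out[j] = sq[j];
    if (j > 0) out[j] += sq[j - 1];
    if (j < 7) out[j] += sq[j + 1];
  }
}

/* Vertical pass: slide a three-row window down the block. row_a/row_b/row_c
   play the roles of sum_row_a/b/c in the SSE4.1 code. */
static void neighborhood_sums(const uint8_t *a, unsigned int stride,
                              const uint8_t *b, unsigned int height) {
  unsigned int row_a[8], row_b[8], row_c[8], total[8];
  unsigned int h;
  int j;
  row_sum_sq_diff(a, b, row_a);
  row_sum_sq_diff(a + stride, b + 8, row_b);
  /* First output row: only two source rows exist, hence constants 4 and 6. */
  for (j = 0; j < 8; ++j) total[j] = row_a[j] + row_b[j];
  /* ... average/accumulate output row 0 from 'total' ... */
  for (h = 0; h < height - 2; ++h) {
    row_sum_sq_diff(a + (h + 2) * stride, b + (h + 2) * 8, row_c);
    for (j = 0; j < 8; ++j) total[j] = row_a[j] + row_b[j] + row_c[j];
    /* ... average/accumulate output row h + 1 (constants 6 and 9) ... */
    memcpy(row_a, row_b, sizeof(row_a)); /* rotate the window down one row */
    memcpy(row_b, row_c, sizeof(row_b));
  }
  /* Last output row: again only two source rows (constants 4 and 6). */
  for (j = 0; j < 8; ++j) total[j] = row_a[j] + row_b[j];
  (void)total;
}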

File: vp9/encoder/x86/vp9_temporal_filter_apply_sse2.asm (deleted)

@@ -1,212 +0,0 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
%include "vpx_ports/x86_abi_support.asm"
; void vp9_temporal_filter_apply_sse2 | arg
; (unsigned char *frame1, | 0
; unsigned int stride, | 1
; unsigned char *frame2, | 2
; unsigned int block_width, | 3
; unsigned int block_height, | 4
; int strength, | 5
; int filter_weight, | 6
; unsigned int *accumulator, | 7
; unsigned short *count) | 8
global sym(vp9_temporal_filter_apply_sse2) PRIVATE
sym(vp9_temporal_filter_apply_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 9
SAVE_XMM 7
GET_GOT rbx
push rsi
push rdi
ALIGN_STACK 16, rax
%define block_width 0
%define block_height 16
%define strength 32
%define filter_weight 48
%define rounding_bit 64
%define rbp_backup 80
%define stack_size 96
sub rsp, stack_size
mov [rsp + rbp_backup], rbp
; end prolog
mov edx, arg(3)
mov [rsp + block_width], rdx
mov edx, arg(4)
mov [rsp + block_height], rdx
movd xmm6, arg(5)
movdqa [rsp + strength], xmm6 ; where strength is used, all 16 bytes are read
; calculate the rounding bit outside the loop
; 0x8000 >> (16 - strength)
mov rdx, 16
sub rdx, arg(5) ; 16 - strength
movq xmm4, rdx ; can't use rdx w/ shift
movdqa xmm5, [GLOBAL(_const_top_bit)]
psrlw xmm5, xmm4
movdqa [rsp + rounding_bit], xmm5
mov rsi, arg(0) ; src/frame1
mov rdx, arg(2) ; predictor frame
mov rdi, arg(7) ; accumulator
mov rax, arg(8) ; count
; dup the filter weight and store for later
movd xmm0, arg(6) ; filter_weight
pshuflw xmm0, xmm0, 0
punpcklwd xmm0, xmm0
movdqa [rsp + filter_weight], xmm0
mov rbp, arg(1) ; stride
pxor xmm7, xmm7 ; zero for extraction
mov rcx, [rsp + block_width]
imul rcx, [rsp + block_height]
add rcx, rdx
cmp dword ptr [rsp + block_width], 8
jne .temporal_filter_apply_load_16
.temporal_filter_apply_load_8:
movq xmm0, [rsi] ; first row
lea rsi, [rsi + rbp] ; += stride
punpcklbw xmm0, xmm7 ; src[ 0- 7]
movq xmm1, [rsi] ; second row
lea rsi, [rsi + rbp] ; += stride
punpcklbw xmm1, xmm7 ; src[ 8-15]
jmp .temporal_filter_apply_load_finished
.temporal_filter_apply_load_16:
movdqa xmm0, [rsi] ; src (frame1)
lea rsi, [rsi + rbp] ; += stride
movdqa xmm1, xmm0
punpcklbw xmm0, xmm7 ; src[ 0- 7]
punpckhbw xmm1, xmm7 ; src[ 8-15]
.temporal_filter_apply_load_finished:
movdqa xmm2, [rdx] ; predictor (frame2)
movdqa xmm3, xmm2
punpcklbw xmm2, xmm7 ; pred[ 0- 7]
punpckhbw xmm3, xmm7 ; pred[ 8-15]
; modifier = src_byte - pixel_value
psubw xmm0, xmm2 ; src - pred[ 0- 7]
psubw xmm1, xmm3 ; src - pred[ 8-15]
; modifier *= modifier
pmullw xmm0, xmm0 ; modifier[ 0- 7]^2
pmullw xmm1, xmm1 ; modifier[ 8-15]^2
; modifier *= 3
pmullw xmm0, [GLOBAL(_const_3w)]
pmullw xmm1, [GLOBAL(_const_3w)]
; modifier += 0x8000 >> (16 - strength)
paddw xmm0, [rsp + rounding_bit]
paddw xmm1, [rsp + rounding_bit]
; modifier >>= strength
psrlw xmm0, [rsp + strength]
psrlw xmm1, [rsp + strength]
; modifier = 16 - modifier
; saturation takes care of modifier > 16
movdqa xmm3, [GLOBAL(_const_16w)]
movdqa xmm2, [GLOBAL(_const_16w)]
psubusw xmm3, xmm1
psubusw xmm2, xmm0
; modifier *= filter_weight
pmullw xmm2, [rsp + filter_weight]
pmullw xmm3, [rsp + filter_weight]
; count
movdqa xmm4, [rax]
movdqa xmm5, [rax+16]
; += modifier
paddw xmm4, xmm2
paddw xmm5, xmm3
; write back
movdqa [rax], xmm4
movdqa [rax+16], xmm5
lea rax, [rax + 16*2] ; count += 16*(sizeof(short))
; load and extract the predictor up to shorts
pxor xmm7, xmm7
movdqa xmm0, [rdx]
lea rdx, [rdx + 16*1] ; pred += 16*(sizeof(char))
movdqa xmm1, xmm0
punpcklbw xmm0, xmm7 ; pred[ 0- 7]
punpckhbw xmm1, xmm7 ; pred[ 8-15]
; modifier *= pixel_value
pmullw xmm0, xmm2
pmullw xmm1, xmm3
; expand to double words
movdqa xmm2, xmm0
punpcklwd xmm0, xmm7 ; [ 0- 3]
punpckhwd xmm2, xmm7 ; [ 4- 7]
movdqa xmm3, xmm1
punpcklwd xmm1, xmm7 ; [ 8-11]
punpckhwd xmm3, xmm7 ; [12-15]
; accumulator
movdqa xmm4, [rdi]
movdqa xmm5, [rdi+16]
movdqa xmm6, [rdi+32]
movdqa xmm7, [rdi+48]
; += modifier
paddd xmm4, xmm0
paddd xmm5, xmm2
paddd xmm6, xmm1
paddd xmm7, xmm3
; write back
movdqa [rdi], xmm4
movdqa [rdi+16], xmm5
movdqa [rdi+32], xmm6
movdqa [rdi+48], xmm7
lea rdi, [rdi + 16*4] ; accumulator += 16*(sizeof(int))
cmp rdx, rcx
je .temporal_filter_apply_epilog
pxor xmm7, xmm7 ; zero for extraction
cmp dword ptr [rsp + block_width], 16
je .temporal_filter_apply_load_16
jmp .temporal_filter_apply_load_8
.temporal_filter_apply_epilog:
; begin epilog
mov rbp, [rsp + rbp_backup]
add rsp, stack_size
pop rsp
pop rdi
pop rsi
RESTORE_GOT
RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
SECTION_RODATA
align 16
_const_3w:
times 8 dw 3
align 16
_const_top_bit:
times 8 dw 1<<15
align 16
_const_16w:
times 8 dw 16
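
For contrast with the current C filter, the deleted kernel above reduces to
the following per-pixel math (a scalar transcription of the vector ops; note
the absence of the 3x3 neighborhood sum, which is why these implementations
stopped matching the filter after ece4fd5d22):

#include <stdint.h>

/* Sketch: the per-pixel kernel implemented by the deleted sse2 asm (and, in
   the same spirit, by the deleted msa code). */
static void old_filter_pixel(uint8_t src, uint8_t pred, int strength,
                             int filter_weight, unsigned int *accumulator,
                             uint16_t *count) {
  /* 0x8000 >> (16 - strength), i.e. 1 << (strength - 1) for strength > 0. */
  const int rounding = strength > 0 ? 1 << (strength - 1) : 0;
  int modifier = (src - pred) * (src - pred) * 3;
  modifier = (modifier + rounding) >> strength;
  if (modifier > 16) modifier = 16; /* the asm gets this from psubusw */
  modifier = (16 - modifier) * filter_weight;
  *count += (uint16_t)modifier;
  *accumulator += (unsigned int)(modifier * pred);
}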

File: vp9/vp9cx.mk

@@ -100,7 +100,8 @@ VP9_CX_SRCS-yes += encoder/vp9_temporal_filter.h
VP9_CX_SRCS-yes += encoder/vp9_mbgraph.c
VP9_CX_SRCS-yes += encoder/vp9_mbgraph.h
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_temporal_filter_apply_sse2.asm
VP9_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/temporal_filter_sse4.c
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_quantize_sse2.c
VP9_CX_SRCS-$(HAVE_AVX) += encoder/x86/vp9_diamond_search_sad_avx.c
ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
@@ -135,6 +136,5 @@ VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct4x4_msa.c
VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct8x8_msa.c
VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct16x16_msa.c
VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct_msa.h
VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_temporal_filter_msa.c
VP9_CX_SRCS-yes := $(filter-out $(VP9_CX_SRCS_REMOVE-yes),$(VP9_CX_SRCS-yes))