deblock filter : moved from vp8 code branch

The deblocking filters used in vp8 have been moved to vpx_dsp for
use by both vp8 and vp9.

Change-Id: I5209d76edafc894b550f751fc76d3aa6799b392d
Jim Bankoski 2016-07-08 10:06:54 -07:00
parent 45ed7effed
commit 88e6951465
19 changed files with 1015 additions and 2270 deletions
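What this means for callers: the post-processing entry points keep their signatures but move from the vp8-specific RTCD table to vpx_dsp, so the vp8_ prefix becomes vpx_ and the prototypes come from vpx_dsp_rtcd.h instead of vp8_rtcd.h. A minimal sketch of a migrated call site follows; the helper name and the buffer/stride/limit variables are placeholders for illustration, not taken from the patch.

#include "./vpx_dsp_rtcd.h"  /* replaces the vp8-specific ./vp8_rtcd.h */

/* Hypothetical helper: deblock one 16-pixel-high luma macroblock row using
 * the entry point that now lives in vpx_dsp. The argument order matches the
 * rtcd prototype: src, dst, src_pitch, dst_pitch, cols, flimits, size. */
static void deblock_luma_mb_row(unsigned char *src, unsigned char *dst,
                                int src_stride, int dst_stride,
                                int width, unsigned char *limits) {
  /* Was vp8_post_proc_down_and_across_mb_row() before this commit. */
  vpx_post_proc_down_and_across_mb_row(src, dst, src_stride, dst_stride,
                                       width, limits, 16);
}

The macroblock post filters follow the same rename, vp8_mbpost_proc_across_ip() -> vpx_mbpost_proc_across_ip() and vp8_mbpost_proc_down() -> vpx_mbpost_proc_down(); the vp8 and vp9 call sites in the diffs below are updated accordingly.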

@@ -11,7 +11,7 @@
#include "test/register_state_check.h"
#include "third_party/googletest/src/include/gtest/gtest.h"
#include "./vpx_config.h"
#include "./vp8_rtcd.h"
#include "./vpx_dsp_rtcd.h"
#include "vpx/vpx_integer.h"
#include "vpx_mem/vpx_mem.h"
@@ -25,7 +25,7 @@ typedef void (*PostProcFunc)(unsigned char *src_ptr,
namespace {
class VP8PostProcessingFilterTest
class VPxPostProcessingFilterTest
: public ::testing::TestWithParam<PostProcFunc> {
public:
virtual void TearDown() {
@@ -33,10 +33,10 @@ class VP8PostProcessingFilterTest
}
};
// Test routine for the VP8 post-processing function
// vp8_post_proc_down_and_across_mb_row_c.
// Test routine for the VPx post-processing function
// vpx_post_proc_down_and_across_mb_row_c.
TEST_P(VP8PostProcessingFilterTest, FilterOutputCheck) {
TEST_P(VPxPostProcessingFilterTest, FilterOutputCheck) {
// Size of the underlying data block that will be filtered.
const int block_width = 16;
const int block_height = 16;
@@ -92,7 +92,7 @@ TEST_P(VP8PostProcessingFilterTest, FilterOutputCheck) {
for (int i = 0; i < block_height; ++i) {
for (int j = 0; j < block_width; ++j) {
EXPECT_EQ(expected_data[i], pixel_ptr[j])
<< "VP8PostProcessingFilterTest failed with invalid filter output";
<< "VPxPostProcessingFilterTest failed with invalid filter output";
}
pixel_ptr += output_stride;
}
@@ -102,17 +102,17 @@ TEST_P(VP8PostProcessingFilterTest, FilterOutputCheck) {
vpx_free(flimits);
};
INSTANTIATE_TEST_CASE_P(C, VP8PostProcessingFilterTest,
::testing::Values(vp8_post_proc_down_and_across_mb_row_c));
INSTANTIATE_TEST_CASE_P(C, VPxPostProcessingFilterTest,
::testing::Values(vpx_post_proc_down_and_across_mb_row_c));
#if HAVE_SSE2
INSTANTIATE_TEST_CASE_P(SSE2, VP8PostProcessingFilterTest,
::testing::Values(vp8_post_proc_down_and_across_mb_row_sse2));
INSTANTIATE_TEST_CASE_P(SSE2, VPxPostProcessingFilterTest,
::testing::Values(vpx_post_proc_down_and_across_mb_row_sse2));
#endif
#if HAVE_MSA
INSTANTIATE_TEST_CASE_P(MSA, VP8PostProcessingFilterTest,
::testing::Values(vp8_post_proc_down_and_across_mb_row_msa));
INSTANTIATE_TEST_CASE_P(MSA, VPxPostProcessingFilterTest,
::testing::Values(vpx_post_proc_down_and_across_mb_row_msa));
#endif
} // namespace

@@ -1,801 +0,0 @@
/*
* Copyright (c) 2015 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <stdlib.h>
#include "./vp8_rtcd.h"
#include "./vpx_dsp_rtcd.h"
#include "vp8/common/mips/msa/vp8_macros_msa.h"
static const int16_t vp8_rv_msa[] =
{
8, 5, 2, 2, 8, 12, 4, 9, 8, 3,
0, 3, 9, 0, 0, 0, 8, 3, 14, 4,
10, 1, 11, 14, 1, 14, 9, 6, 12, 11,
8, 6, 10, 0, 0, 8, 9, 0, 3, 14,
8, 11, 13, 4, 2, 9, 0, 3, 9, 6,
1, 2, 3, 14, 13, 1, 8, 2, 9, 7,
3, 3, 1, 13, 13, 6, 6, 5, 2, 7,
11, 9, 11, 8, 7, 3, 2, 0, 13, 13,
14, 4, 12, 5, 12, 10, 8, 10, 13, 10,
4, 14, 4, 10, 0, 8, 11, 1, 13, 7,
7, 14, 6, 14, 13, 2, 13, 5, 4, 4,
0, 10, 0, 5, 13, 2, 12, 7, 11, 13,
8, 0, 4, 10, 7, 2, 7, 2, 2, 5,
3, 4, 7, 3, 3, 14, 14, 5, 9, 13,
3, 14, 3, 6, 3, 0, 11, 8, 13, 1,
13, 1, 12, 0, 10, 9, 7, 6, 2, 8,
5, 2, 13, 7, 1, 13, 14, 7, 6, 7,
9, 6, 10, 11, 7, 8, 7, 5, 14, 8,
4, 4, 0, 8, 7, 10, 0, 8, 14, 11,
3, 12, 5, 7, 14, 3, 14, 5, 2, 6,
11, 12, 12, 8, 0, 11, 13, 1, 2, 0,
5, 10, 14, 7, 8, 0, 4, 11, 0, 8,
0, 3, 10, 5, 8, 0, 11, 6, 7, 8,
10, 7, 13, 9, 2, 5, 1, 5, 10, 2,
4, 3, 5, 6, 10, 8, 9, 4, 11, 14,
0, 10, 0, 5, 13, 2, 12, 7, 11, 13,
8, 0, 4, 10, 7, 2, 7, 2, 2, 5,
3, 4, 7, 3, 3, 14, 14, 5, 9, 13,
3, 14, 3, 6, 3, 0, 11, 8, 13, 1,
13, 1, 12, 0, 10, 9, 7, 6, 2, 8,
5, 2, 13, 7, 1, 13, 14, 7, 6, 7,
9, 6, 10, 11, 7, 8, 7, 5, 14, 8,
4, 4, 0, 8, 7, 10, 0, 8, 14, 11,
3, 12, 5, 7, 14, 3, 14, 5, 2, 6,
11, 12, 12, 8, 0, 11, 13, 1, 2, 0,
5, 10, 14, 7, 8, 0, 4, 11, 0, 8,
0, 3, 10, 5, 8, 0, 11, 6, 7, 8,
10, 7, 13, 9, 2, 5, 1, 5, 10, 2,
4, 3, 5, 6, 10, 8, 9, 4, 11, 14,
3, 8, 3, 7, 8, 5, 11, 4, 12, 3,
11, 9, 14, 8, 14, 13, 4, 3, 1, 2,
14, 6, 5, 4, 4, 11, 4, 6, 2, 1,
5, 8, 8, 12, 13, 5, 14, 10, 12, 13,
0, 9, 5, 5, 11, 10, 13, 9, 10, 13,
};
#define VP8_TRANSPOSE8x16_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7, \
out0, out1, out2, out3, \
out4, out5, out6, out7, \
out8, out9, out10, out11, \
out12, out13, out14, out15) \
{ \
v8i16 temp0, temp1, temp2, temp3, temp4; \
v8i16 temp5, temp6, temp7, temp8, temp9; \
\
ILVR_B4_SH(in1, in0, in3, in2, in5, in4, in7, in6, \
temp0, temp1, temp2, temp3); \
ILVR_H2_SH(temp1, temp0, temp3, temp2, temp4, temp5); \
ILVRL_W2_SH(temp5, temp4, temp6, temp7); \
ILVL_H2_SH(temp1, temp0, temp3, temp2, temp4, temp5); \
ILVRL_W2_SH(temp5, temp4, temp8, temp9); \
ILVL_B4_SH(in1, in0, in3, in2, in5, in4, in7, in6, \
temp0, temp1, temp2, temp3); \
ILVR_H2_SH(temp1, temp0, temp3, temp2, temp4, temp5); \
ILVRL_W2_UB(temp5, temp4, out8, out10); \
ILVL_H2_SH(temp1, temp0, temp3, temp2, temp4, temp5); \
ILVRL_W2_UB(temp5, temp4, out12, out14); \
out0 = (v16u8)temp6; \
out2 = (v16u8)temp7; \
out4 = (v16u8)temp8; \
out6 = (v16u8)temp9; \
out9 = (v16u8)__msa_ilvl_d((v2i64)out8, (v2i64)out8); \
out11 = (v16u8)__msa_ilvl_d((v2i64)out10, (v2i64)out10); \
out13 = (v16u8)__msa_ilvl_d((v2i64)out12, (v2i64)out12); \
out15 = (v16u8)__msa_ilvl_d((v2i64)out14, (v2i64)out14); \
out1 = (v16u8)__msa_ilvl_d((v2i64)out0, (v2i64)out0); \
out3 = (v16u8)__msa_ilvl_d((v2i64)out2, (v2i64)out2); \
out5 = (v16u8)__msa_ilvl_d((v2i64)out4, (v2i64)out4); \
out7 = (v16u8)__msa_ilvl_d((v2i64)out6, (v2i64)out6); \
}
#define VP8_AVER_IF_RETAIN(above2_in, above1_in, src_in, \
below1_in, below2_in, ref, out) \
{ \
v16u8 temp0, temp1; \
\
temp1 = __msa_aver_u_b(above2_in, above1_in); \
temp0 = __msa_aver_u_b(below2_in, below1_in); \
temp1 = __msa_aver_u_b(temp1, temp0); \
out = __msa_aver_u_b(src_in, temp1); \
temp0 = __msa_asub_u_b(src_in, above2_in); \
temp1 = __msa_asub_u_b(src_in, above1_in); \
temp0 = (temp0 < ref); \
temp1 = (temp1 < ref); \
temp0 = temp0 & temp1; \
temp1 = __msa_asub_u_b(src_in, below1_in); \
temp1 = (temp1 < ref); \
temp0 = temp0 & temp1; \
temp1 = __msa_asub_u_b(src_in, below2_in); \
temp1 = (temp1 < ref); \
temp0 = temp0 & temp1; \
out = __msa_bmz_v(out, src_in, temp0); \
}
#define TRANSPOSE12x16_B(in0, in1, in2, in3, in4, in5, in6, in7, \
in8, in9, in10, in11, in12, in13, in14, in15) \
{ \
v8i16 temp0, temp1, temp2, temp3, temp4; \
v8i16 temp5, temp6, temp7, temp8, temp9; \
\
ILVR_B2_SH(in1, in0, in3, in2, temp0, temp1); \
ILVRL_H2_SH(temp1, temp0, temp2, temp3); \
ILVR_B2_SH(in5, in4, in7, in6, temp0, temp1); \
ILVRL_H2_SH(temp1, temp0, temp4, temp5); \
ILVRL_W2_SH(temp4, temp2, temp0, temp1); \
ILVRL_W2_SH(temp5, temp3, temp2, temp3); \
ILVR_B2_SH(in9, in8, in11, in10, temp4, temp5); \
ILVR_B2_SH(in9, in8, in11, in10, temp4, temp5); \
ILVRL_H2_SH(temp5, temp4, temp6, temp7); \
ILVR_B2_SH(in13, in12, in15, in14, temp4, temp5); \
ILVRL_H2_SH(temp5, temp4, temp8, temp9); \
ILVRL_W2_SH(temp8, temp6, temp4, temp5); \
ILVRL_W2_SH(temp9, temp7, temp6, temp7); \
ILVL_B2_SH(in1, in0, in3, in2, temp8, temp9); \
ILVR_D2_UB(temp4, temp0, temp5, temp1, in0, in2); \
in1 = (v16u8)__msa_ilvl_d((v2i64)temp4, (v2i64)temp0); \
in3 = (v16u8)__msa_ilvl_d((v2i64)temp5, (v2i64)temp1); \
ILVL_B2_SH(in5, in4, in7, in6, temp0, temp1); \
ILVR_D2_UB(temp6, temp2, temp7, temp3, in4, in6); \
in5 = (v16u8)__msa_ilvl_d((v2i64)temp6, (v2i64)temp2); \
in7 = (v16u8)__msa_ilvl_d((v2i64)temp7, (v2i64)temp3); \
ILVL_B4_SH(in9, in8, in11, in10, in13, in12, in15, in14, \
temp2, temp3, temp4, temp5); \
ILVR_H4_SH(temp9, temp8, temp1, temp0, temp3, temp2, temp5, temp4, \
temp6, temp7, temp8, temp9); \
ILVR_W2_SH(temp7, temp6, temp9, temp8, temp0, temp1); \
in8 = (v16u8)__msa_ilvr_d((v2i64)temp1, (v2i64)temp0); \
in9 = (v16u8)__msa_ilvl_d((v2i64)temp1, (v2i64)temp0); \
ILVL_W2_SH(temp7, temp6, temp9, temp8, temp2, temp3); \
in10 = (v16u8)__msa_ilvr_d((v2i64)temp3, (v2i64)temp2); \
in11 = (v16u8)__msa_ilvl_d((v2i64)temp3, (v2i64)temp2); \
}
#define VP8_TRANSPOSE12x8_UB_UB(in0, in1, in2, in3, in4, in5, \
in6, in7, in8, in9, in10, in11) \
{ \
v8i16 temp0, temp1, temp2, temp3; \
v8i16 temp4, temp5, temp6, temp7; \
\
ILVR_B2_SH(in1, in0, in3, in2, temp0, temp1); \
ILVRL_H2_SH(temp1, temp0, temp2, temp3); \
ILVR_B2_SH(in5, in4, in7, in6, temp0, temp1); \
ILVRL_H2_SH(temp1, temp0, temp4, temp5); \
ILVRL_W2_SH(temp4, temp2, temp0, temp1); \
ILVRL_W2_SH(temp5, temp3, temp2, temp3); \
ILVL_B2_SH(in1, in0, in3, in2, temp4, temp5); \
temp4 = __msa_ilvr_h(temp5, temp4); \
ILVL_B2_SH(in5, in4, in7, in6, temp6, temp7); \
temp5 = __msa_ilvr_h(temp7, temp6); \
ILVRL_W2_SH(temp5, temp4, temp6, temp7); \
in0 = (v16u8)temp0; \
in2 = (v16u8)temp1; \
in4 = (v16u8)temp2; \
in6 = (v16u8)temp3; \
in8 = (v16u8)temp6; \
in10 = (v16u8)temp7; \
in1 = (v16u8)__msa_ilvl_d((v2i64)temp0, (v2i64)temp0); \
in3 = (v16u8)__msa_ilvl_d((v2i64)temp1, (v2i64)temp1); \
in5 = (v16u8)__msa_ilvl_d((v2i64)temp2, (v2i64)temp2); \
in7 = (v16u8)__msa_ilvl_d((v2i64)temp3, (v2i64)temp3); \
in9 = (v16u8)__msa_ilvl_d((v2i64)temp6, (v2i64)temp6); \
in11 = (v16u8)__msa_ilvl_d((v2i64)temp7, (v2i64)temp7); \
}
static void postproc_down_across_chroma_msa(uint8_t *src_ptr, uint8_t *dst_ptr,
int32_t src_stride,
int32_t dst_stride,
int32_t cols, uint8_t *f)
{
uint8_t *p_src = src_ptr;
uint8_t *p_dst = dst_ptr;
uint8_t *f_orig = f;
uint8_t *p_dst_st = dst_ptr;
uint16_t col;
uint64_t out0, out1, out2, out3;
v16u8 above2, above1, below2, below1, src, ref, ref_temp;
v16u8 inter0, inter1, inter2, inter3, inter4, inter5;
v16u8 inter6, inter7, inter8, inter9, inter10, inter11;
for (col = (cols / 16); col--;)
{
ref = LD_UB(f);
LD_UB2(p_src - 2 * src_stride, src_stride, above2, above1);
src = LD_UB(p_src);
LD_UB2(p_src + 1 * src_stride, src_stride, below1, below2);
VP8_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter0);
above2 = LD_UB(p_src + 3 * src_stride);
VP8_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter1);
above1 = LD_UB(p_src + 4 * src_stride);
VP8_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter2);
src = LD_UB(p_src + 5 * src_stride);
VP8_AVER_IF_RETAIN(below1, below2, above2, above1, src, ref, inter3);
below1 = LD_UB(p_src + 6 * src_stride);
VP8_AVER_IF_RETAIN(below2, above2, above1, src, below1, ref, inter4);
below2 = LD_UB(p_src + 7 * src_stride);
VP8_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter5);
above2 = LD_UB(p_src + 8 * src_stride);
VP8_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter6);
above1 = LD_UB(p_src + 9 * src_stride);
VP8_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter7);
ST_UB8(inter0, inter1, inter2, inter3, inter4, inter5, inter6, inter7,
p_dst, dst_stride);
p_dst += 16;
p_src += 16;
f += 16;
}
if (0 != (cols / 16))
{
ref = LD_UB(f);
LD_UB2(p_src - 2 * src_stride, src_stride, above2, above1);
src = LD_UB(p_src);
LD_UB2(p_src + 1 * src_stride, src_stride, below1, below2);
VP8_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter0);
above2 = LD_UB(p_src + 3 * src_stride);
VP8_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter1);
above1 = LD_UB(p_src + 4 * src_stride);
VP8_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter2);
src = LD_UB(p_src + 5 * src_stride);
VP8_AVER_IF_RETAIN(below1, below2, above2, above1, src, ref, inter3);
below1 = LD_UB(p_src + 6 * src_stride);
VP8_AVER_IF_RETAIN(below2, above2, above1, src, below1, ref, inter4);
below2 = LD_UB(p_src + 7 * src_stride);
VP8_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter5);
above2 = LD_UB(p_src + 8 * src_stride);
VP8_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter6);
above1 = LD_UB(p_src + 9 * src_stride);
VP8_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter7);
out0 = __msa_copy_u_d((v2i64)inter0, 0);
out1 = __msa_copy_u_d((v2i64)inter1, 0);
out2 = __msa_copy_u_d((v2i64)inter2, 0);
out3 = __msa_copy_u_d((v2i64)inter3, 0);
SD4(out0, out1, out2, out3, p_dst, dst_stride);
out0 = __msa_copy_u_d((v2i64)inter4, 0);
out1 = __msa_copy_u_d((v2i64)inter5, 0);
out2 = __msa_copy_u_d((v2i64)inter6, 0);
out3 = __msa_copy_u_d((v2i64)inter7, 0);
SD4(out0, out1, out2, out3, p_dst + 4 * dst_stride, dst_stride);
}
f = f_orig;
p_dst = dst_ptr - 2;
LD_UB8(p_dst, dst_stride,
inter0, inter1, inter2, inter3, inter4, inter5, inter6, inter7);
for (col = 0; col < (cols / 8); ++col)
{
ref = LD_UB(f);
f += 8;
VP8_TRANSPOSE12x8_UB_UB(inter0, inter1, inter2, inter3,
inter4, inter5, inter6, inter7,
inter8, inter9, inter10, inter11);
if (0 == col)
{
above2 = inter2;
above1 = inter2;
}
else
{
above2 = inter0;
above1 = inter1;
}
src = inter2;
below1 = inter3;
below2 = inter4;
ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 0);
VP8_AVER_IF_RETAIN(above2, above1, src, below1, below2,
ref_temp, inter2);
above2 = inter5;
ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 1);
VP8_AVER_IF_RETAIN(above1, src, below1, below2, above2,
ref_temp, inter3);
above1 = inter6;
ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 2);
VP8_AVER_IF_RETAIN(src, below1, below2, above2, above1,
ref_temp, inter4);
src = inter7;
ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 3);
VP8_AVER_IF_RETAIN(below1, below2, above2, above1, src,
ref_temp, inter5);
below1 = inter8;
ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 4);
VP8_AVER_IF_RETAIN(below2, above2, above1, src, below1,
ref_temp, inter6);
below2 = inter9;
ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 5);
VP8_AVER_IF_RETAIN(above2, above1, src, below1, below2,
ref_temp, inter7);
if (col == (cols / 8 - 1))
{
above2 = inter9;
}
else
{
above2 = inter10;
}
ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 6);
VP8_AVER_IF_RETAIN(above1, src, below1, below2, above2,
ref_temp, inter8);
if (col == (cols / 8 - 1))
{
above1 = inter9;
}
else
{
above1 = inter11;
}
ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 7);
VP8_AVER_IF_RETAIN(src, below1, below2, above2, above1,
ref_temp, inter9);
TRANSPOSE8x8_UB_UB(inter2, inter3, inter4, inter5, inter6, inter7,
inter8, inter9, inter2, inter3, inter4, inter5,
inter6, inter7, inter8, inter9);
p_dst += 8;
LD_UB2(p_dst, dst_stride, inter0, inter1);
ST8x1_UB(inter2, p_dst_st);
ST8x1_UB(inter3, (p_dst_st + 1 * dst_stride));
LD_UB2(p_dst + 2 * dst_stride, dst_stride, inter2, inter3);
ST8x1_UB(inter4, (p_dst_st + 2 * dst_stride));
ST8x1_UB(inter5, (p_dst_st + 3 * dst_stride));
LD_UB2(p_dst + 4 * dst_stride, dst_stride, inter4, inter5);
ST8x1_UB(inter6, (p_dst_st + 4 * dst_stride));
ST8x1_UB(inter7, (p_dst_st + 5 * dst_stride));
LD_UB2(p_dst + 6 * dst_stride, dst_stride, inter6, inter7);
ST8x1_UB(inter8, (p_dst_st + 6 * dst_stride));
ST8x1_UB(inter9, (p_dst_st + 7 * dst_stride));
p_dst_st += 8;
}
}
static void postproc_down_across_luma_msa(uint8_t *src_ptr, uint8_t *dst_ptr,
int32_t src_stride,
int32_t dst_stride,
int32_t cols, uint8_t *f)
{
uint8_t *p_src = src_ptr;
uint8_t *p_dst = dst_ptr;
uint8_t *p_dst_st = dst_ptr;
uint8_t *f_orig = f;
uint16_t col;
v16u8 above2, above1, below2, below1;
v16u8 src, ref, ref_temp;
v16u8 inter0, inter1, inter2, inter3, inter4, inter5, inter6;
v16u8 inter7, inter8, inter9, inter10, inter11;
v16u8 inter12, inter13, inter14, inter15;
for (col = (cols / 16); col--;)
{
ref = LD_UB(f);
LD_UB2(p_src - 2 * src_stride, src_stride, above2, above1);
src = LD_UB(p_src);
LD_UB2(p_src + 1 * src_stride, src_stride, below1, below2);
VP8_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter0);
above2 = LD_UB(p_src + 3 * src_stride);
VP8_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter1);
above1 = LD_UB(p_src + 4 * src_stride);
VP8_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter2);
src = LD_UB(p_src + 5 * src_stride);
VP8_AVER_IF_RETAIN(below1, below2, above2, above1, src, ref, inter3);
below1 = LD_UB(p_src + 6 * src_stride);
VP8_AVER_IF_RETAIN(below2, above2, above1, src, below1, ref, inter4);
below2 = LD_UB(p_src + 7 * src_stride);
VP8_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter5);
above2 = LD_UB(p_src + 8 * src_stride);
VP8_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter6);
above1 = LD_UB(p_src + 9 * src_stride);
VP8_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter7);
src = LD_UB(p_src + 10 * src_stride);
VP8_AVER_IF_RETAIN(below1, below2, above2, above1, src, ref, inter8);
below1 = LD_UB(p_src + 11 * src_stride);
VP8_AVER_IF_RETAIN(below2, above2, above1, src, below1, ref, inter9);
below2 = LD_UB(p_src + 12 * src_stride);
VP8_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter10);
above2 = LD_UB(p_src + 13 * src_stride);
VP8_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter11);
above1 = LD_UB(p_src + 14 * src_stride);
VP8_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter12);
src = LD_UB(p_src + 15 * src_stride);
VP8_AVER_IF_RETAIN(below1, below2, above2, above1, src, ref, inter13);
below1 = LD_UB(p_src + 16 * src_stride);
VP8_AVER_IF_RETAIN(below2, above2, above1, src, below1, ref, inter14);
below2 = LD_UB(p_src + 17 * src_stride);
VP8_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter15);
ST_UB8(inter0, inter1, inter2, inter3, inter4, inter5, inter6, inter7,
p_dst, dst_stride);
ST_UB8(inter8, inter9, inter10, inter11, inter12, inter13,
inter14, inter15, p_dst + 8 * dst_stride, dst_stride);
p_src += 16;
p_dst += 16;
f += 16;
}
f = f_orig;
p_dst = dst_ptr - 2;
LD_UB8(p_dst, dst_stride,
inter0, inter1, inter2, inter3, inter4, inter5, inter6, inter7);
LD_UB8(p_dst + 8 * dst_stride, dst_stride,
inter8, inter9, inter10, inter11, inter12, inter13,
inter14, inter15);
for (col = 0; col < cols / 8; ++col)
{
ref = LD_UB(f);
f += 8;
TRANSPOSE12x16_B(inter0, inter1, inter2, inter3, inter4, inter5,
inter6, inter7, inter8, inter9, inter10, inter11,
inter12, inter13, inter14, inter15);
if (0 == col)
{
above2 = inter2;
above1 = inter2;
}
else
{
above2 = inter0;
above1 = inter1;
}
src = inter2;
below1 = inter3;
below2 = inter4;
ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 0);
VP8_AVER_IF_RETAIN(above2, above1, src, below1, below2,
ref_temp, inter2);
above2 = inter5;
ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 1);
VP8_AVER_IF_RETAIN(above1, src, below1, below2, above2,
ref_temp, inter3);
above1 = inter6;
ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 2);
VP8_AVER_IF_RETAIN(src, below1, below2, above2, above1,
ref_temp, inter4);
src = inter7;
ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 3);
VP8_AVER_IF_RETAIN(below1, below2, above2, above1, src,
ref_temp, inter5);
below1 = inter8;
ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 4);
VP8_AVER_IF_RETAIN(below2, above2, above1, src, below1,
ref_temp, inter6);
below2 = inter9;
ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 5);
VP8_AVER_IF_RETAIN(above2, above1, src, below1, below2,
ref_temp, inter7);
if (col == (cols / 8 - 1))
{
above2 = inter9;
}
else
{
above2 = inter10;
}
ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 6);
VP8_AVER_IF_RETAIN(above1, src, below1, below2, above2,
ref_temp, inter8);
if (col == (cols / 8 - 1))
{
above1 = inter9;
}
else
{
above1 = inter11;
}
ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 7);
VP8_AVER_IF_RETAIN(src, below1, below2, above2, above1,
ref_temp, inter9);
VP8_TRANSPOSE8x16_UB_UB(inter2, inter3, inter4, inter5,
inter6, inter7, inter8, inter9,
inter2, inter3, inter4, inter5,
inter6, inter7, inter8, inter9,
inter10, inter11, inter12, inter13,
inter14, inter15, above2, above1);
p_dst += 8;
LD_UB2(p_dst, dst_stride, inter0, inter1);
ST8x1_UB(inter2, p_dst_st);
ST8x1_UB(inter3, (p_dst_st + 1 * dst_stride));
LD_UB2(p_dst + 2 * dst_stride, dst_stride, inter2, inter3);
ST8x1_UB(inter4, (p_dst_st + 2 * dst_stride));
ST8x1_UB(inter5, (p_dst_st + 3 * dst_stride));
LD_UB2(p_dst + 4 * dst_stride, dst_stride, inter4, inter5);
ST8x1_UB(inter6, (p_dst_st + 4 * dst_stride));
ST8x1_UB(inter7, (p_dst_st + 5 * dst_stride));
LD_UB2(p_dst + 6 * dst_stride, dst_stride, inter6, inter7);
ST8x1_UB(inter8, (p_dst_st + 6 * dst_stride));
ST8x1_UB(inter9, (p_dst_st + 7 * dst_stride));
LD_UB2(p_dst + 8 * dst_stride, dst_stride, inter8, inter9);
ST8x1_UB(inter10, (p_dst_st + 8 * dst_stride));
ST8x1_UB(inter11, (p_dst_st + 9 * dst_stride));
LD_UB2(p_dst + 10 * dst_stride, dst_stride, inter10, inter11);
ST8x1_UB(inter12, (p_dst_st + 10 * dst_stride));
ST8x1_UB(inter13, (p_dst_st + 11 * dst_stride));
LD_UB2(p_dst + 12 * dst_stride, dst_stride, inter12, inter13);
ST8x1_UB(inter14, (p_dst_st + 12 * dst_stride));
ST8x1_UB(inter15, (p_dst_st + 13 * dst_stride));
LD_UB2(p_dst + 14 * dst_stride, dst_stride, inter14, inter15);
ST8x1_UB(above2, (p_dst_st + 14 * dst_stride));
ST8x1_UB(above1, (p_dst_st + 15 * dst_stride));
p_dst_st += 8;
}
}
void vp8_post_proc_down_and_across_mb_row_msa(uint8_t *src, uint8_t *dst,
int32_t src_stride,
int32_t dst_stride,
int32_t cols, uint8_t *f,
int32_t size)
{
if (8 == size)
{
postproc_down_across_chroma_msa(src, dst, src_stride, dst_stride,
cols, f);
}
else if (16 == size)
{
postproc_down_across_luma_msa(src, dst, src_stride, dst_stride,
cols, f);
}
}
void vp8_mbpost_proc_across_ip_msa(uint8_t *src_ptr, int32_t pitch,
int32_t rows, int32_t cols, int32_t flimit)
{
int32_t row, col, cnt;
uint8_t *src_dup = src_ptr;
v16u8 src0, src, tmp_orig;
v16u8 tmp = { 0 };
v16i8 zero = { 0 };
v8u16 sum_h, src_r_h, src_l_h;
v4u32 src_r_w, src_l_w;
v4i32 flimit_vec;
flimit_vec = __msa_fill_w(flimit);
for (row = rows; row--;)
{
int32_t sum_sq = 0;
int32_t sum = 0;
src0 = (v16u8)__msa_fill_b(src_dup[0]);
ST8x1_UB(src0, (src_dup - 8));
src0 = (v16u8)__msa_fill_b(src_dup[cols - 1]);
ST_UB(src0, src_dup + cols);
src_dup[cols + 16] = src_dup[cols - 1];
tmp_orig = (v16u8)__msa_ldi_b(0);
tmp_orig[15] = tmp[15];
src = LD_UB(src_dup - 8);
src[15] = 0;
ILVRL_B2_UH(zero, src, src_r_h, src_l_h);
src_r_w = __msa_dotp_u_w(src_r_h, src_r_h);
src_l_w = __msa_dotp_u_w(src_l_h, src_l_h);
sum_sq = HADD_SW_S32(src_r_w);
sum_sq += HADD_SW_S32(src_l_w);
sum_h = __msa_hadd_u_h(src, src);
sum = HADD_UH_U32(sum_h);
{
v16u8 src7, src8, src_r, src_l;
v16i8 mask;
v8u16 add_r, add_l;
v8i16 sub_r, sub_l, sum_r, sum_l, mask0, mask1;
v4i32 sum_sq0, sum_sq1, sum_sq2, sum_sq3;
v4i32 sub0, sub1, sub2, sub3;
v4i32 sum0_w, sum1_w, sum2_w, sum3_w;
v4i32 mul0, mul1, mul2, mul3;
v4i32 total0, total1, total2, total3;
v8i16 const8 = __msa_fill_h(8);
src7 = LD_UB(src_dup + 7);
src8 = LD_UB(src_dup - 8);
for (col = 0; col < (cols >> 4); ++col)
{
ILVRL_B2_UB(src7, src8, src_r, src_l);
HSUB_UB2_SH(src_r, src_l, sub_r, sub_l);
sum_r[0] = sum + sub_r[0];
for (cnt = 0; cnt < 7; ++cnt)
{
sum_r[cnt + 1] = sum_r[cnt] + sub_r[cnt + 1];
}
sum_l[0] = sum_r[7] + sub_l[0];
for (cnt = 0; cnt < 7; ++cnt)
{
sum_l[cnt + 1] = sum_l[cnt] + sub_l[cnt + 1];
}
sum = sum_l[7];
src = LD_UB(src_dup + 16 * col);
ILVRL_B2_UH(zero, src, src_r_h, src_l_h);
src7 = (v16u8)((const8 + sum_r + (v8i16)src_r_h) >> 4);
src8 = (v16u8)((const8 + sum_l + (v8i16)src_l_h) >> 4);
tmp = (v16u8)__msa_pckev_b((v16i8)src8, (v16i8)src7);
HADD_UB2_UH(src_r, src_l, add_r, add_l);
UNPCK_SH_SW(sub_r, sub0, sub1);
UNPCK_SH_SW(sub_l, sub2, sub3);
ILVR_H2_SW(zero, add_r, zero, add_l, sum0_w, sum2_w);
ILVL_H2_SW(zero, add_r, zero, add_l, sum1_w, sum3_w);
MUL4(sum0_w, sub0, sum1_w, sub1, sum2_w, sub2, sum3_w, sub3,
mul0, mul1, mul2, mul3);
sum_sq0[0] = sum_sq + mul0[0];
for (cnt = 0; cnt < 3; ++cnt)
{
sum_sq0[cnt + 1] = sum_sq0[cnt] + mul0[cnt + 1];
}
sum_sq1[0] = sum_sq0[3] + mul1[0];
for (cnt = 0; cnt < 3; ++cnt)
{
sum_sq1[cnt + 1] = sum_sq1[cnt] + mul1[cnt + 1];
}
sum_sq2[0] = sum_sq1[3] + mul2[0];
for (cnt = 0; cnt < 3; ++cnt)
{
sum_sq2[cnt + 1] = sum_sq2[cnt] + mul2[cnt + 1];
}
sum_sq3[0] = sum_sq2[3] + mul3[0];
for (cnt = 0; cnt < 3; ++cnt)
{
sum_sq3[cnt + 1] = sum_sq3[cnt] + mul3[cnt + 1];
}
sum_sq = sum_sq3[3];
UNPCK_SH_SW(sum_r, sum0_w, sum1_w);
UNPCK_SH_SW(sum_l, sum2_w, sum3_w);
total0 = sum_sq0 * __msa_ldi_w(15);
total0 -= sum0_w * sum0_w;
total1 = sum_sq1 * __msa_ldi_w(15);
total1 -= sum1_w * sum1_w;
total2 = sum_sq2 * __msa_ldi_w(15);
total2 -= sum2_w * sum2_w;
total3 = sum_sq3 * __msa_ldi_w(15);
total3 -= sum3_w * sum3_w;
total0 = (total0 < flimit_vec);
total1 = (total1 < flimit_vec);
total2 = (total2 < flimit_vec);
total3 = (total3 < flimit_vec);
PCKEV_H2_SH(total1, total0, total3, total2, mask0, mask1);
mask = __msa_pckev_b((v16i8)mask1, (v16i8)mask0);
tmp = __msa_bmz_v(tmp, src, (v16u8)mask);
if (col == 0)
{
uint64_t src_d;
src_d = __msa_copy_u_d((v2i64)tmp_orig, 1);
SD(src_d, (src_dup - 8));
}
src7 = LD_UB(src_dup + 16 * (col + 1) + 7);
src8 = LD_UB(src_dup + 16 * (col + 1) - 8);
ST_UB(tmp, (src_dup + (16 * col)));
}
src_dup += pitch;
}
}
}
void vp8_mbpost_proc_down_msa(uint8_t *dst_ptr, int32_t pitch, int32_t rows,
int32_t cols, int32_t flimit)
{
int32_t row, col, cnt, i;
const int16_t *rv3 = &vp8_rv_msa[63 & rand()];
v4i32 flimit_vec;
v16u8 dst7, dst8, dst_r_b, dst_l_b;
v16i8 mask;
v8u16 add_r, add_l;
v8i16 dst_r_h, dst_l_h, sub_r, sub_l, mask0, mask1;
v4i32 sub0, sub1, sub2, sub3, total0, total1, total2, total3;
flimit_vec = __msa_fill_w(flimit);
for (col = 0; col < (cols >> 4); ++col)
{
uint8_t *dst_tmp = &dst_ptr[col << 4];
v16u8 dst;
v16i8 zero = { 0 };
v16u8 tmp[16];
v8i16 mult0, mult1, rv2_0, rv2_1;
v8i16 sum0_h = { 0 };
v8i16 sum1_h = { 0 };
v4i32 mul0 = { 0 };
v4i32 mul1 = { 0 };
v4i32 mul2 = { 0 };
v4i32 mul3 = { 0 };
v4i32 sum0_w, sum1_w, sum2_w, sum3_w;
v4i32 add0, add1, add2, add3;
const int16_t *rv2[16];
dst = LD_UB(dst_tmp);
for (cnt = (col << 4), i = 0; i < 16; ++cnt)
{
rv2[i] = rv3 + ((cnt * 17) & 127);
++i;
}
for (cnt = -8; cnt < 0; ++cnt)
{
ST_UB(dst, dst_tmp + cnt * pitch);
}
dst = LD_UB((dst_tmp + (rows - 1) * pitch));
for (cnt = rows; cnt < rows + 17; ++cnt)
{
ST_UB(dst, dst_tmp + cnt * pitch);
}
for (cnt = -8; cnt <= 6; ++cnt)
{
dst = LD_UB(dst_tmp + (cnt * pitch));
UNPCK_UB_SH(dst, dst_r_h, dst_l_h);
MUL2(dst_r_h, dst_r_h, dst_l_h, dst_l_h, mult0, mult1);
mul0 += (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)mult0);
mul1 += (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)mult0);
mul2 += (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)mult1);
mul3 += (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)mult1);
ADD2(sum0_h, dst_r_h, sum1_h, dst_l_h, sum0_h, sum1_h);
}
for (row = 0; row < (rows + 8); ++row)
{
for (i = 0; i < 8; ++i)
{
rv2_0[i] = *(rv2[i] + (row & 127));
rv2_1[i] = *(rv2[i + 8] + (row & 127));
}
dst7 = LD_UB(dst_tmp + (7 * pitch));
dst8 = LD_UB(dst_tmp - (8 * pitch));
ILVRL_B2_UB(dst7, dst8, dst_r_b, dst_l_b);
HSUB_UB2_SH(dst_r_b, dst_l_b, sub_r, sub_l);
UNPCK_SH_SW(sub_r, sub0, sub1);
UNPCK_SH_SW(sub_l, sub2, sub3);
sum0_h += sub_r;
sum1_h += sub_l;
HADD_UB2_UH(dst_r_b, dst_l_b, add_r, add_l);
ILVRL_H2_SW(zero, add_r, add0, add1);
ILVRL_H2_SW(zero, add_l, add2, add3);
mul0 += add0 * sub0;
mul1 += add1 * sub1;
mul2 += add2 * sub2;
mul3 += add3 * sub3;
dst = LD_UB(dst_tmp);
ILVRL_B2_SH(zero, dst, dst_r_h, dst_l_h);
dst7 = (v16u8)((rv2_0 + sum0_h + dst_r_h) >> 4);
dst8 = (v16u8)((rv2_1 + sum1_h + dst_l_h) >> 4);
tmp[row & 15] = (v16u8)__msa_pckev_b((v16i8)dst8, (v16i8)dst7);
UNPCK_SH_SW(sum0_h, sum0_w, sum1_w);
UNPCK_SH_SW(sum1_h, sum2_w, sum3_w);
total0 = mul0 * __msa_ldi_w(15);
total0 -= sum0_w * sum0_w;
total1 = mul1 * __msa_ldi_w(15);
total1 -= sum1_w * sum1_w;
total2 = mul2 * __msa_ldi_w(15);
total2 -= sum2_w * sum2_w;
total3 = mul3 * __msa_ldi_w(15);
total3 -= sum3_w * sum3_w;
total0 = (total0 < flimit_vec);
total1 = (total1 < flimit_vec);
total2 = (total2 < flimit_vec);
total3 = (total3 < flimit_vec);
PCKEV_H2_SH(total1, total0, total3, total2, mask0, mask1);
mask = __msa_pckev_b((v16i8)mask1, (v16i8)mask0);
tmp[row & 15] = __msa_bmz_v(tmp[row & 15], dst, (v16u8)mask);
if (row >= 8)
{
ST_UB(tmp[(row - 8) & 15], (dst_tmp - 8 * pitch));
}
dst_tmp += pitch;
}
}
}

@@ -72,142 +72,11 @@ static const unsigned char MV_REFERENCE_FRAME_colors[MAX_REF_FRAMES][3] =
};
#endif
const short vp8_rv[] =
{
8, 5, 2, 2, 8, 12, 4, 9, 8, 3,
0, 3, 9, 0, 0, 0, 8, 3, 14, 4,
10, 1, 11, 14, 1, 14, 9, 6, 12, 11,
8, 6, 10, 0, 0, 8, 9, 0, 3, 14,
8, 11, 13, 4, 2, 9, 0, 3, 9, 6,
1, 2, 3, 14, 13, 1, 8, 2, 9, 7,
3, 3, 1, 13, 13, 6, 6, 5, 2, 7,
11, 9, 11, 8, 7, 3, 2, 0, 13, 13,
14, 4, 12, 5, 12, 10, 8, 10, 13, 10,
4, 14, 4, 10, 0, 8, 11, 1, 13, 7,
7, 14, 6, 14, 13, 2, 13, 5, 4, 4,
0, 10, 0, 5, 13, 2, 12, 7, 11, 13,
8, 0, 4, 10, 7, 2, 7, 2, 2, 5,
3, 4, 7, 3, 3, 14, 14, 5, 9, 13,
3, 14, 3, 6, 3, 0, 11, 8, 13, 1,
13, 1, 12, 0, 10, 9, 7, 6, 2, 8,
5, 2, 13, 7, 1, 13, 14, 7, 6, 7,
9, 6, 10, 11, 7, 8, 7, 5, 14, 8,
4, 4, 0, 8, 7, 10, 0, 8, 14, 11,
3, 12, 5, 7, 14, 3, 14, 5, 2, 6,
11, 12, 12, 8, 0, 11, 13, 1, 2, 0,
5, 10, 14, 7, 8, 0, 4, 11, 0, 8,
0, 3, 10, 5, 8, 0, 11, 6, 7, 8,
10, 7, 13, 9, 2, 5, 1, 5, 10, 2,
4, 3, 5, 6, 10, 8, 9, 4, 11, 14,
0, 10, 0, 5, 13, 2, 12, 7, 11, 13,
8, 0, 4, 10, 7, 2, 7, 2, 2, 5,
3, 4, 7, 3, 3, 14, 14, 5, 9, 13,
3, 14, 3, 6, 3, 0, 11, 8, 13, 1,
13, 1, 12, 0, 10, 9, 7, 6, 2, 8,
5, 2, 13, 7, 1, 13, 14, 7, 6, 7,
9, 6, 10, 11, 7, 8, 7, 5, 14, 8,
4, 4, 0, 8, 7, 10, 0, 8, 14, 11,
3, 12, 5, 7, 14, 3, 14, 5, 2, 6,
11, 12, 12, 8, 0, 11, 13, 1, 2, 0,
5, 10, 14, 7, 8, 0, 4, 11, 0, 8,
0, 3, 10, 5, 8, 0, 11, 6, 7, 8,
10, 7, 13, 9, 2, 5, 1, 5, 10, 2,
4, 3, 5, 6, 10, 8, 9, 4, 11, 14,
3, 8, 3, 7, 8, 5, 11, 4, 12, 3,
11, 9, 14, 8, 14, 13, 4, 3, 1, 2,
14, 6, 5, 4, 4, 11, 4, 6, 2, 1,
5, 8, 8, 12, 13, 5, 14, 10, 12, 13,
0, 9, 5, 5, 11, 10, 13, 9, 10, 13,
};
extern void vp8_blit_text(const char *msg, unsigned char *address, const int pitch);
extern void vp8_blit_line(int x0, int x1, int y0, int y1, unsigned char *image, const int pitch);
/***********************************************************************************************************
*/
void vp8_post_proc_down_and_across_mb_row_c
(
unsigned char *src_ptr,
unsigned char *dst_ptr,
int src_pixels_per_line,
int dst_pixels_per_line,
int cols,
unsigned char *f,
int size
)
{
unsigned char *p_src, *p_dst;
int row;
int col;
unsigned char v;
unsigned char d[4];
for (row = 0; row < size; row++)
{
/* post_proc_down for one row */
p_src = src_ptr;
p_dst = dst_ptr;
for (col = 0; col < cols; col++)
{
unsigned char p_above2 = p_src[col - 2 * src_pixels_per_line];
unsigned char p_above1 = p_src[col - src_pixels_per_line];
unsigned char p_below1 = p_src[col + src_pixels_per_line];
unsigned char p_below2 = p_src[col + 2 * src_pixels_per_line];
v = p_src[col];
if ((abs(v - p_above2) < f[col]) && (abs(v - p_above1) < f[col])
&& (abs(v - p_below1) < f[col]) && (abs(v - p_below2) < f[col]))
{
unsigned char k1, k2, k3;
k1 = (p_above2 + p_above1 + 1) >> 1;
k2 = (p_below2 + p_below1 + 1) >> 1;
k3 = (k1 + k2 + 1) >> 1;
v = (k3 + v + 1) >> 1;
}
p_dst[col] = v;
}
/* now post_proc_across */
p_src = dst_ptr;
p_dst = dst_ptr;
p_src[-2] = p_src[-1] = p_src[0];
p_src[cols] = p_src[cols + 1] = p_src[cols - 1];
for (col = 0; col < cols; col++)
{
v = p_src[col];
if ((abs(v - p_src[col - 2]) < f[col])
&& (abs(v - p_src[col - 1]) < f[col])
&& (abs(v - p_src[col + 1]) < f[col])
&& (abs(v - p_src[col + 2]) < f[col]))
{
unsigned char k1, k2, k3;
k1 = (p_src[col - 2] + p_src[col - 1] + 1) >> 1;
k2 = (p_src[col + 2] + p_src[col + 1] + 1) >> 1;
k3 = (k1 + k2 + 1) >> 1;
v = (k3 + v + 1) >> 1;
}
d[col & 3] = v;
if (col >= 2)
p_dst[col - 2] = d[(col - 2) & 3];
}
/* handle the last two pixels */
p_dst[col - 2] = d[(col - 2) & 3];
p_dst[col - 1] = d[(col - 1) & 3];
/* next row */
src_ptr += src_pixels_per_line;
dst_ptr += dst_pixels_per_line;
}
}
static int q2mbl(int x)
{
if (x < 20) x = 20;
@@ -216,108 +85,13 @@ static int q2mbl(int x)
return x * x / 3;
}
void vp8_mbpost_proc_across_ip_c(unsigned char *src, int pitch, int rows, int cols, int flimit)
{
int r, c, i;
unsigned char *s = src;
unsigned char d[16];
for (r = 0; r < rows; r++)
{
int sumsq = 0;
int sum = 0;
for (i = -8; i < 0; i++)
s[i]=s[0];
/* 17 avoids valgrind warning - we buffer values in c in d
* and only write them when we've read 8 ahead...
*/
for (i = 0; i < 17; i++)
s[i+cols]=s[cols-1];
for (i = -8; i <= 6; i++)
{
sumsq += s[i] * s[i];
sum += s[i];
d[i+8] = 0;
}
for (c = 0; c < cols + 8; c++)
{
int x = s[c+7] - s[c-8];
int y = s[c+7] + s[c-8];
sum += x;
sumsq += x * y;
d[c&15] = s[c];
if (sumsq * 15 - sum * sum < flimit)
{
d[c&15] = (8 + sum + s[c]) >> 4;
}
s[c-8] = d[(c-8)&15];
}
s += pitch;
}
}
void vp8_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int cols, int flimit)
{
int r, c, i;
const short *rv3 = &vp8_rv[63&rand()];
for (c = 0; c < cols; c++ )
{
unsigned char *s = &dst[c];
int sumsq = 0;
int sum = 0;
unsigned char d[16];
const short *rv2 = rv3 + ((c * 17) & 127);
for (i = -8; i < 0; i++)
s[i*pitch]=s[0];
/* 17 avoids valgrind warning - we buffer values in c in d
* and only write them when we've read 8 ahead...
*/
for (i = 0; i < 17; i++)
s[(i+rows)*pitch]=s[(rows-1)*pitch];
for (i = -8; i <= 6; i++)
{
sumsq += s[i*pitch] * s[i*pitch];
sum += s[i*pitch];
}
for (r = 0; r < rows + 8; r++)
{
sumsq += s[7*pitch] * s[ 7*pitch] - s[-8*pitch] * s[-8*pitch];
sum += s[7*pitch] - s[-8*pitch];
d[r&15] = s[0];
if (sumsq * 15 - sum * sum < flimit)
{
d[r&15] = (rv2[r&127] + sum + s[0]) >> 4;
}
if (r >= 8)
s[-8*pitch] = d[(r-8)&15];
s += pitch;
}
}
}
#if CONFIG_POSTPROC
static void vp8_de_mblock(YV12_BUFFER_CONFIG *post,
int q)
{
vp8_mbpost_proc_across_ip(post->y_buffer, post->y_stride, post->y_height,
vpx_mbpost_proc_across_ip(post->y_buffer, post->y_stride, post->y_height,
post->y_width, q2mbl(q));
vp8_mbpost_proc_down(post->y_buffer, post->y_stride, post->y_height,
vpx_mbpost_proc_down(post->y_buffer, post->y_stride, post->y_height,
post->y_width, q2mbl(q));
}
@@ -365,16 +139,16 @@ void vp8_deblock(VP8_COMMON *cm,
}
mode_info_context++;
vp8_post_proc_down_and_across_mb_row(
vpx_post_proc_down_and_across_mb_row(
source->y_buffer + 16 * mbr * source->y_stride,
post->y_buffer + 16 * mbr * post->y_stride, source->y_stride,
post->y_stride, source->y_width, ylimits, 16);
vp8_post_proc_down_and_across_mb_row(
vpx_post_proc_down_and_across_mb_row(
source->u_buffer + 8 * mbr * source->uv_stride,
post->u_buffer + 8 * mbr * post->uv_stride, source->uv_stride,
post->uv_stride, source->uv_width, uvlimits, 8);
vp8_post_proc_down_and_across_mb_row(
vpx_post_proc_down_and_across_mb_row(
source->v_buffer + 8 * mbr * source->uv_stride,
post->v_buffer + 8 * mbr * post->uv_stride, source->uv_stride,
post->uv_stride, source->uv_width, uvlimits, 8);
@@ -409,17 +183,17 @@ void vp8_de_noise(VP8_COMMON *cm,
/* TODO: The original code doesn't filter the 2 outer rows and columns. */
for (mbr = 0; mbr < mb_rows; mbr++)
{
vp8_post_proc_down_and_across_mb_row(
vpx_post_proc_down_and_across_mb_row(
source->y_buffer + 16 * mbr * source->y_stride,
source->y_buffer + 16 * mbr * source->y_stride,
source->y_stride, source->y_stride, source->y_width, limits, 16);
if (uvfilter == 1) {
vp8_post_proc_down_and_across_mb_row(
vpx_post_proc_down_and_across_mb_row(
source->u_buffer + 8 * mbr * source->uv_stride,
source->u_buffer + 8 * mbr * source->uv_stride,
source->uv_stride, source->uv_stride, source->uv_width, limits,
8);
vp8_post_proc_down_and_across_mb_row(
vpx_post_proc_down_and_across_mb_row(
source->v_buffer + 8 * mbr * source->uv_stride,
source->v_buffer + 8 * mbr * source->uv_stride,
source->uv_stride, source->uv_stride, source->uv_width, limits,

@@ -156,16 +156,6 @@ $vp8_copy_mem8x4_dspr2=vp8_copy_mem8x4_dspr2;
# Postproc
#
if (vpx_config("CONFIG_POSTPROC") eq "yes") {
add_proto qw/void vp8_mbpost_proc_down/, "unsigned char *dst, int pitch, int rows, int cols,int flimit";
specialize qw/vp8_mbpost_proc_down mmx sse2 msa/;
$vp8_mbpost_proc_down_sse2=vp8_mbpost_proc_down_xmm;
add_proto qw/void vp8_mbpost_proc_across_ip/, "unsigned char *dst, int pitch, int rows, int cols,int flimit";
specialize qw/vp8_mbpost_proc_across_ip sse2 msa/;
$vp8_mbpost_proc_across_ip_sse2=vp8_mbpost_proc_across_ip_xmm;
add_proto qw/void vp8_post_proc_down_and_across_mb_row/, "unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size";
specialize qw/vp8_post_proc_down_and_across_mb_row sse2 msa/;
add_proto qw/void vp8_blend_mb_inner/, "unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride";
# no asm yet

@@ -1,253 +0,0 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
%include "vpx_ports/x86_abi_support.asm"
%define VP8_FILTER_WEIGHT 128
%define VP8_FILTER_SHIFT 7
;void vp8_mbpost_proc_down_mmx(unsigned char *dst,
; int pitch, int rows, int cols,int flimit)
extern sym(vp8_rv)
global sym(vp8_mbpost_proc_down_mmx) PRIVATE
sym(vp8_mbpost_proc_down_mmx):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 5
GET_GOT rbx
push rsi
push rdi
; end prolog
ALIGN_STACK 16, rax
sub rsp, 136
; unsigned char d[16][8] at [rsp]
; create flimit2 at [rsp+128]
mov eax, dword ptr arg(4) ;flimit
mov [rsp+128], eax
mov [rsp+128+4], eax
%define flimit2 [rsp+128]
%if ABI_IS_32BIT=0
lea r8, [GLOBAL(sym(vp8_rv))]
%endif
;rows +=8;
add dword ptr arg(2), 8
;for(c=0; c<cols; c+=4)
.loop_col:
mov rsi, arg(0) ;s
pxor mm0, mm0 ;
movsxd rax, dword ptr arg(1) ;pitch ;
; this copies the last row down into the border 8 rows
mov rdi, rsi
mov rdx, arg(2)
sub rdx, 9
imul rdx, rax
lea rdi, [rdi+rdx]
movq mm1, QWORD ptr[rdi] ; first row
mov rcx, 8
.init_borderd: ; initialize borders
lea rdi, [rdi + rax]
movq [rdi], mm1
dec rcx
jne .init_borderd
neg rax ; rax = -pitch
; this copies the first row up into the border 8 rows
mov rdi, rsi
movq mm1, QWORD ptr[rdi] ; first row
mov rcx, 8
.init_border: ; initialize borders
lea rdi, [rdi + rax]
movq [rdi], mm1
dec rcx
jne .init_border
lea rsi, [rsi + rax*8]; ; rdi = s[-pitch*8]
neg rax
pxor mm5, mm5
pxor mm6, mm6 ;
pxor mm7, mm7 ;
mov rdi, rsi
mov rcx, 15 ;
.loop_initvar:
movd mm1, DWORD PTR [rdi];
punpcklbw mm1, mm0 ;
paddw mm5, mm1 ;
pmullw mm1, mm1 ;
movq mm2, mm1 ;
punpcklwd mm1, mm0 ;
punpckhwd mm2, mm0 ;
paddd mm6, mm1 ;
paddd mm7, mm2 ;
lea rdi, [rdi+rax] ;
dec rcx
jne .loop_initvar
;save the var and sum
xor rdx, rdx
.loop_row:
movd mm1, DWORD PTR [rsi] ; [s-pitch*8]
movd mm2, DWORD PTR [rdi] ; [s+pitch*7]
punpcklbw mm1, mm0
punpcklbw mm2, mm0
paddw mm5, mm2
psubw mm5, mm1
pmullw mm2, mm2
movq mm4, mm2
punpcklwd mm2, mm0
punpckhwd mm4, mm0
paddd mm6, mm2
paddd mm7, mm4
pmullw mm1, mm1
movq mm2, mm1
punpcklwd mm1, mm0
psubd mm6, mm1
punpckhwd mm2, mm0
psubd mm7, mm2
movq mm3, mm6
pslld mm3, 4
psubd mm3, mm6
movq mm1, mm5
movq mm4, mm5
pmullw mm1, mm1
pmulhw mm4, mm4
movq mm2, mm1
punpcklwd mm1, mm4
punpckhwd mm2, mm4
movq mm4, mm7
pslld mm4, 4
psubd mm4, mm7
psubd mm3, mm1
psubd mm4, mm2
psubd mm3, flimit2
psubd mm4, flimit2
psrad mm3, 31
psrad mm4, 31
packssdw mm3, mm4
packsswb mm3, mm0
movd mm1, DWORD PTR [rsi+rax*8]
movq mm2, mm1
punpcklbw mm1, mm0
paddw mm1, mm5
mov rcx, rdx
and rcx, 127
%if ABI_IS_32BIT=1 && CONFIG_PIC=1
push rax
lea rax, [GLOBAL(sym(vp8_rv))]
movq mm4, [rax + rcx*2] ;vp8_rv[rcx*2]
pop rax
%elif ABI_IS_32BIT=0
movq mm4, [r8 + rcx*2] ;vp8_rv[rcx*2]
%else
movq mm4, [sym(vp8_rv) + rcx*2]
%endif
paddw mm1, mm4
psraw mm1, 4
packuswb mm1, mm0
pand mm1, mm3
pandn mm3, mm2
por mm1, mm3
and rcx, 15
movd DWORD PTR [rsp+rcx*4], mm1 ;d[rcx*4]
cmp edx, 8
jl .skip_assignment
mov rcx, rdx
sub rcx, 8
and rcx, 15
movd mm1, DWORD PTR [rsp+rcx*4] ;d[rcx*4]
movd [rsi], mm1
.skip_assignment:
lea rsi, [rsi+rax]
lea rdi, [rdi+rax]
add rdx, 1
cmp edx, dword arg(2) ;rows
jl .loop_row
add dword arg(0), 4 ; s += 4
sub dword arg(3), 4 ; cols -= 4
cmp dword arg(3), 0
jg .loop_col
add rsp, 136
pop rsp
; begin epilog
pop rdi
pop rsi
RESTORE_GOT
UNSHADOW_ARGS
pop rbp
ret
%undef flimit2
SECTION_RODATA
align 16
Blur:
times 16 dw 16
times 8 dw 64
times 16 dw 16
times 8 dw 0
rd:
times 4 dw 0x40

@@ -96,9 +96,7 @@ VP8_COMMON_SRCS-$(HAVE_SSE3) += common/x86/copy_sse3.asm
VP8_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/subpixel_ssse3.asm
ifeq ($(CONFIG_POSTPROC),yes)
VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/postproc_mmx.asm
VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/mfqe_sse2.asm
VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/postproc_sse2.asm
endif
ifeq ($(ARCH_X86_64),yes)
@@ -123,7 +121,6 @@ VP8_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp8_macros_msa.h
ifeq ($(CONFIG_POSTPROC),yes)
VP8_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/mfqe_msa.c
VP8_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/postproc_msa.c
endif
# common (c)

@@ -103,6 +103,8 @@ void vp9_free_postproc_buffers(VP9_COMMON *cm) {
#if CONFIG_VP9_POSTPROC
vpx_free_frame_buffer(&cm->post_proc_buffer);
vpx_free_frame_buffer(&cm->post_proc_buffer_int);
vpx_free(cm->postproc_state.limits);
cm->postproc_state.limits = 0;
#else
(void)cm;
#endif

@@ -32,129 +32,9 @@ static const int16_t kernel5[] = {
1, 1, 4, 1, 1
};
const int16_t vp9_rv[] = {
8, 5, 2, 2, 8, 12, 4, 9, 8, 3,
0, 3, 9, 0, 0, 0, 8, 3, 14, 4,
10, 1, 11, 14, 1, 14, 9, 6, 12, 11,
8, 6, 10, 0, 0, 8, 9, 0, 3, 14,
8, 11, 13, 4, 2, 9, 0, 3, 9, 6,
1, 2, 3, 14, 13, 1, 8, 2, 9, 7,
3, 3, 1, 13, 13, 6, 6, 5, 2, 7,
11, 9, 11, 8, 7, 3, 2, 0, 13, 13,
14, 4, 12, 5, 12, 10, 8, 10, 13, 10,
4, 14, 4, 10, 0, 8, 11, 1, 13, 7,
7, 14, 6, 14, 13, 2, 13, 5, 4, 4,
0, 10, 0, 5, 13, 2, 12, 7, 11, 13,
8, 0, 4, 10, 7, 2, 7, 2, 2, 5,
3, 4, 7, 3, 3, 14, 14, 5, 9, 13,
3, 14, 3, 6, 3, 0, 11, 8, 13, 1,
13, 1, 12, 0, 10, 9, 7, 6, 2, 8,
5, 2, 13, 7, 1, 13, 14, 7, 6, 7,
9, 6, 10, 11, 7, 8, 7, 5, 14, 8,
4, 4, 0, 8, 7, 10, 0, 8, 14, 11,
3, 12, 5, 7, 14, 3, 14, 5, 2, 6,
11, 12, 12, 8, 0, 11, 13, 1, 2, 0,
5, 10, 14, 7, 8, 0, 4, 11, 0, 8,
0, 3, 10, 5, 8, 0, 11, 6, 7, 8,
10, 7, 13, 9, 2, 5, 1, 5, 10, 2,
4, 3, 5, 6, 10, 8, 9, 4, 11, 14,
0, 10, 0, 5, 13, 2, 12, 7, 11, 13,
8, 0, 4, 10, 7, 2, 7, 2, 2, 5,
3, 4, 7, 3, 3, 14, 14, 5, 9, 13,
3, 14, 3, 6, 3, 0, 11, 8, 13, 1,
13, 1, 12, 0, 10, 9, 7, 6, 2, 8,
5, 2, 13, 7, 1, 13, 14, 7, 6, 7,
9, 6, 10, 11, 7, 8, 7, 5, 14, 8,
4, 4, 0, 8, 7, 10, 0, 8, 14, 11,
3, 12, 5, 7, 14, 3, 14, 5, 2, 6,
11, 12, 12, 8, 0, 11, 13, 1, 2, 0,
5, 10, 14, 7, 8, 0, 4, 11, 0, 8,
0, 3, 10, 5, 8, 0, 11, 6, 7, 8,
10, 7, 13, 9, 2, 5, 1, 5, 10, 2,
4, 3, 5, 6, 10, 8, 9, 4, 11, 14,
3, 8, 3, 7, 8, 5, 11, 4, 12, 3,
11, 9, 14, 8, 14, 13, 4, 3, 1, 2,
14, 6, 5, 4, 4, 11, 4, 6, 2, 1,
5, 8, 8, 12, 13, 5, 14, 10, 12, 13,
0, 9, 5, 5, 11, 10, 13, 9, 10, 13,
};
static const uint8_t q_diff_thresh = 20;
static const uint8_t last_q_thresh = 170;
void vp9_post_proc_down_and_across_c(const uint8_t *src_ptr,
uint8_t *dst_ptr,
int src_pixels_per_line,
int dst_pixels_per_line,
int rows,
int cols,
int flimit) {
uint8_t const *p_src;
uint8_t *p_dst;
int row, col, i, v, kernel;
int pitch = src_pixels_per_line;
uint8_t d[8];
(void)dst_pixels_per_line;
for (row = 0; row < rows; row++) {
/* post_proc_down for one row */
p_src = src_ptr;
p_dst = dst_ptr;
for (col = 0; col < cols; col++) {
kernel = 4;
v = p_src[col];
for (i = -2; i <= 2; i++) {
if (abs(v - p_src[col + i * pitch]) > flimit)
goto down_skip_convolve;
kernel += kernel5[2 + i] * p_src[col + i * pitch];
}
v = (kernel >> 3);
down_skip_convolve:
p_dst[col] = v;
}
/* now post_proc_across */
p_src = dst_ptr;
p_dst = dst_ptr;
for (i = 0; i < 8; i++)
d[i] = p_src[i];
for (col = 0; col < cols; col++) {
kernel = 4;
v = p_src[col];
d[col & 7] = v;
for (i = -2; i <= 2; i++) {
if (abs(v - p_src[col + i]) > flimit)
goto across_skip_convolve;
kernel += kernel5[2 + i] * p_src[col + i];
}
d[col & 7] = (kernel >> 3);
across_skip_convolve:
if (col >= 2)
p_dst[col - 2] = d[(col - 2) & 7];
}
/* handle the last two pixels */
p_dst[col - 2] = d[(col - 2) & 7];
p_dst[col - 1] = d[(col - 1) & 7];
/* next row */
src_ptr += pitch;
dst_ptr += pitch;
}
}
#if CONFIG_VP9_HIGHBITDEPTH
void vp9_highbd_post_proc_down_and_across_c(const uint16_t *src_ptr,
uint16_t *dst_ptr,
@@ -237,41 +117,6 @@ static int q2mbl(int x) {
return x * x / 3;
}
void vp9_mbpost_proc_across_ip_c(uint8_t *src, int pitch,
int rows, int cols, int flimit) {
int r, c, i;
uint8_t *s = src;
uint8_t d[16];
for (r = 0; r < rows; r++) {
int sumsq = 0;
int sum = 0;
for (i = -8; i <= 6; i++) {
sumsq += s[i] * s[i];
sum += s[i];
d[i + 8] = 0;
}
for (c = 0; c < cols + 8; c++) {
int x = s[c + 7] - s[c - 8];
int y = s[c + 7] + s[c - 8];
sum += x;
sumsq += x * y;
d[c & 15] = s[c];
if (sumsq * 15 - sum * sum < flimit) {
d[c & 15] = (8 + sum + s[c]) >> 4;
}
s[c - 8] = d[(c - 8) & 15];
}
s += pitch;
}
}
#if CONFIG_VP9_HIGHBITDEPTH
void vp9_highbd_mbpost_proc_across_ip_c(uint16_t *src, int pitch,
int rows, int cols, int flimit) {
@@ -312,43 +157,12 @@ void vp9_highbd_mbpost_proc_across_ip_c(uint16_t *src, int pitch,
}
#endif // CONFIG_VP9_HIGHBITDEPTH
void vp9_mbpost_proc_down_c(uint8_t *dst, int pitch,
int rows, int cols, int flimit) {
int r, c, i;
const short *rv3 = &vp9_rv[63 & rand()]; // NOLINT
for (c = 0; c < cols; c++) {
uint8_t *s = &dst[c];
int sumsq = 0;
int sum = 0;
uint8_t d[16];
const int16_t *rv2 = rv3 + ((c * 17) & 127);
for (i = -8; i <= 6; i++) {
sumsq += s[i * pitch] * s[i * pitch];
sum += s[i * pitch];
}
for (r = 0; r < rows + 8; r++) {
sumsq += s[7 * pitch] * s[ 7 * pitch] - s[-8 * pitch] * s[-8 * pitch];
sum += s[7 * pitch] - s[-8 * pitch];
d[r & 15] = s[0];
if (sumsq * 15 - sum * sum < flimit) {
d[r & 15] = (rv2[r & 127] + sum + s[0]) >> 4;
}
s[-8 * pitch] = d[(r - 8) & 15];
s += pitch;
}
}
}
#if CONFIG_VP9_HIGHBITDEPTH
void vp9_highbd_mbpost_proc_down_c(uint16_t *dst, int pitch,
int rows, int cols, int flimit) {
int r, c, i;
const int16_t *rv3 = &vp9_rv[63 & rand()]; // NOLINT
const int16_t *rv3 = &vpx_rv[63 & rand()]; // NOLINT
for (c = 0; c < cols; c++) {
uint16_t *s = &dst[c];
@@ -382,14 +196,14 @@ static void deblock_and_de_macro_block(YV12_BUFFER_CONFIG *source,
YV12_BUFFER_CONFIG *post,
int q,
int low_var_thresh,
int flag) {
double level = 6.0e-05 * q * q * q - .0067 * q * q + .306 * q + .0065;
int ppl = (int)(level + .5);
int flag,
uint8_t *limits) {
(void) low_var_thresh;
(void) flag;
#if CONFIG_VP9_HIGHBITDEPTH
if (source->flags & YV12_FLAG_HIGHBITDEPTH) {
double level = 6.0e-05 * q * q * q - .0067 * q * q + .306 * q + .0065;
int ppl = (int)(level + .5);
vp9_highbd_post_proc_down_and_across(CONVERT_TO_SHORTPTR(source->y_buffer),
CONVERT_TO_SHORTPTR(post->y_buffer),
source->y_stride, post->y_stride,
@@ -415,124 +229,68 @@ static void deblock_and_de_macro_block(YV12_BUFFER_CONFIG *source,
source->uv_height, source->uv_width,
ppl);
} else {
vp9_post_proc_down_and_across(source->y_buffer, post->y_buffer,
source->y_stride, post->y_stride,
source->y_height, source->y_width, ppl);
vp9_mbpost_proc_across_ip(post->y_buffer, post->y_stride, post->y_height,
#endif // CONFIG_VP9_HIGHBITDEPTH
vp9_deblock(source, post, q, limits);
vpx_mbpost_proc_across_ip(post->y_buffer, post->y_stride, post->y_height,
post->y_width, q2mbl(q));
vp9_mbpost_proc_down(post->y_buffer, post->y_stride, post->y_height,
vpx_mbpost_proc_down(post->y_buffer, post->y_stride, post->y_height,
post->y_width, q2mbl(q));
vp9_post_proc_down_and_across(source->u_buffer, post->u_buffer,
source->uv_stride, post->uv_stride,
source->uv_height, source->uv_width, ppl);
vp9_post_proc_down_and_across(source->v_buffer, post->v_buffer,
source->uv_stride, post->uv_stride,
source->uv_height, source->uv_width, ppl);
#if CONFIG_VP9_HIGHBITDEPTH
}
#else
vp9_post_proc_down_and_across(source->y_buffer, post->y_buffer,
source->y_stride, post->y_stride,
source->y_height, source->y_width, ppl);
vp9_mbpost_proc_across_ip(post->y_buffer, post->y_stride, post->y_height,
post->y_width, q2mbl(q));
vp9_mbpost_proc_down(post->y_buffer, post->y_stride, post->y_height,
post->y_width, q2mbl(q));
vp9_post_proc_down_and_across(source->u_buffer, post->u_buffer,
source->uv_stride, post->uv_stride,
source->uv_height, source->uv_width, ppl);
vp9_post_proc_down_and_across(source->v_buffer, post->v_buffer,
source->uv_stride, post->uv_stride,
source->uv_height, source->uv_width, ppl);
#endif // CONFIG_VP9_HIGHBITDEPTH
}
void vp9_deblock(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst,
int q) {
int q, uint8_t *limits) {
const int ppl = (int)(6.0e-05 * q * q * q - 0.0067 * q * q + 0.306 * q
+ 0.0065 + 0.5);
int i;
const uint8_t *const srcs[3] = {src->y_buffer, src->u_buffer, src->v_buffer};
const int src_strides[3] = {src->y_stride, src->uv_stride, src->uv_stride};
const int src_widths[3] = {src->y_width, src->uv_width, src->uv_width};
const int src_heights[3] = {src->y_height, src->uv_height, src->uv_height};
uint8_t *const dsts[3] = {dst->y_buffer, dst->u_buffer, dst->v_buffer};
const int dst_strides[3] = {dst->y_stride, dst->uv_stride, dst->uv_stride};
for (i = 0; i < MAX_MB_PLANE; ++i) {
#if CONFIG_VP9_HIGHBITDEPTH
assert((src->flags & YV12_FLAG_HIGHBITDEPTH) ==
(dst->flags & YV12_FLAG_HIGHBITDEPTH));
if (src->flags & YV12_FLAG_HIGHBITDEPTH) {
if (src->flags & YV12_FLAG_HIGHBITDEPTH) {
int i;
const uint8_t * const srcs[3] =
{src->y_buffer, src->u_buffer, src->v_buffer};
const int src_strides[3] = {src->y_stride, src->uv_stride, src->uv_stride};
const int src_widths[3] = {src->y_width, src->uv_width, src->uv_width};
const int src_heights[3] = {src->y_height, src->uv_height, src->uv_height};
uint8_t * const dsts[3] = {dst->y_buffer, dst->u_buffer, dst->v_buffer};
const int dst_strides[3] = {dst->y_stride, dst->uv_stride, dst->uv_stride};
for (i = 0; i < MAX_MB_PLANE; ++i) {
vp9_highbd_post_proc_down_and_across(CONVERT_TO_SHORTPTR(srcs[i]),
CONVERT_TO_SHORTPTR(dsts[i]),
src_strides[i], dst_strides[i],
src_heights[i], src_widths[i], ppl);
} else {
vp9_post_proc_down_and_across(srcs[i], dsts[i],
src_strides[i], dst_strides[i],
src_heights[i], src_widths[i], ppl);
}
#else
vp9_post_proc_down_and_across(srcs[i], dsts[i],
src_strides[i], dst_strides[i],
src_heights[i], src_widths[i], ppl);
} else {
#endif // CONFIG_VP9_HIGHBITDEPTH
int mbr;
const int mb_rows = src->y_height / 16;
const int mb_cols = src->y_width / 16;
memset(limits, (unsigned char) ppl, 16 * mb_cols);
for (mbr = 0; mbr < mb_rows; mbr++) {
vpx_post_proc_down_and_across_mb_row(
src->y_buffer + 16 * mbr * src->y_stride,
dst->y_buffer + 16 * mbr * dst->y_stride, src->y_stride,
dst->y_stride, src->y_width, limits, 16);
vpx_post_proc_down_and_across_mb_row(
src->u_buffer + 8 * mbr * src->uv_stride,
dst->u_buffer + 8 * mbr * dst->uv_stride, src->uv_stride,
dst->uv_stride, src->uv_width, limits, 8);
vpx_post_proc_down_and_across_mb_row(
src->v_buffer + 8 * mbr * src->uv_stride,
dst->v_buffer + 8 * mbr * dst->uv_stride, src->uv_stride,
dst->uv_stride, src->uv_width, limits, 8);
}
#if CONFIG_VP9_HIGHBITDEPTH
}
#endif // CONFIG_VP9_HIGHBITDEPTH
}
void vp9_denoise(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst,
int q) {
const int ppl = (int)(6.0e-05 * q * q * q - 0.0067 * q * q + 0.306 * q
+ 0.0065 + 0.5);
int i;
const uint8_t *const srcs[3] = {src->y_buffer, src->u_buffer, src->v_buffer};
const int src_strides[3] = {src->y_stride, src->uv_stride, src->uv_stride};
const int src_widths[3] = {src->y_width, src->uv_width, src->uv_width};
const int src_heights[3] = {src->y_height, src->uv_height, src->uv_height};
uint8_t *const dsts[3] = {dst->y_buffer, dst->u_buffer, dst->v_buffer};
const int dst_strides[3] = {dst->y_stride, dst->uv_stride, dst->uv_stride};
for (i = 0; i < MAX_MB_PLANE; ++i) {
const int src_stride = src_strides[i];
const int src_width = src_widths[i] - 4;
const int src_height = src_heights[i] - 4;
const int dst_stride = dst_strides[i];
#if CONFIG_VP9_HIGHBITDEPTH
assert((src->flags & YV12_FLAG_HIGHBITDEPTH) ==
(dst->flags & YV12_FLAG_HIGHBITDEPTH));
if (src->flags & YV12_FLAG_HIGHBITDEPTH) {
const uint16_t *const src_plane = CONVERT_TO_SHORTPTR(
srcs[i] + 2 * src_stride + 2);
uint16_t *const dst_plane = CONVERT_TO_SHORTPTR(
dsts[i] + 2 * dst_stride + 2);
vp9_highbd_post_proc_down_and_across(src_plane, dst_plane, src_stride,
dst_stride, src_height, src_width,
ppl);
} else {
const uint8_t *const src_plane = srcs[i] + 2 * src_stride + 2;
uint8_t *const dst_plane = dsts[i] + 2 * dst_stride + 2;
vp9_post_proc_down_and_across(src_plane, dst_plane, src_stride,
dst_stride, src_height, src_width, ppl);
}
#else
const uint8_t *const src_plane = srcs[i] + 2 * src_stride + 2;
uint8_t *const dst_plane = dsts[i] + 2 * dst_stride + 2;
vp9_post_proc_down_and_across(src_plane, dst_plane, src_stride, dst_stride,
src_height, src_width, ppl);
#endif
}
int q, uint8_t *limits) {
vp9_deblock(src, dst, q, limits);
}
static double gaussian(double sigma, double mu, double x) {
@@ -664,6 +422,14 @@ int vp9_post_proc_frame(struct VP9Common *cm,
vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
"Failed to allocate post-processing buffer");
if (flags & (VP9D_DEMACROBLOCK | VP9D_DEBLOCK)) {
if (!cm->postproc_state.limits) {
cm->postproc_state.limits = vpx_calloc(
cm->width, sizeof(*cm->postproc_state.limits));
}
}
if ((flags & VP9D_MFQE) && cm->current_video_frame >= 2 &&
ppstate->last_frame_valid && cm->bit_depth == 8 &&
ppstate->last_base_qindex <= last_q_thresh &&
@@ -678,17 +444,19 @@
if ((flags & VP9D_DEMACROBLOCK) && cm->post_proc_buffer_int.buffer_alloc) {
deblock_and_de_macro_block(&cm->post_proc_buffer_int, ppbuf,
q + (ppflags->deblocking_level - 5) * 10,
1, 0);
1, 0, cm->postproc_state.limits);
} else if (flags & VP9D_DEBLOCK) {
vp9_deblock(&cm->post_proc_buffer_int, ppbuf, q);
vp9_deblock(&cm->post_proc_buffer_int, ppbuf, q,
cm->postproc_state.limits);
} else {
vp8_yv12_copy_frame(&cm->post_proc_buffer_int, ppbuf);
}
} else if (flags & VP9D_DEMACROBLOCK) {
deblock_and_de_macro_block(cm->frame_to_show, ppbuf,
q + (ppflags->deblocking_level - 5) * 10, 1, 0);
q + (ppflags->deblocking_level - 5) * 10, 1, 0,
cm->postproc_state.limits);
} else if (flags & VP9D_DEBLOCK) {
vp9_deblock(cm->frame_to_show, ppbuf, q);
vp9_deblock(cm->frame_to_show, ppbuf, q, cm->postproc_state.limits);
} else {
vp8_yv12_copy_frame(cm->frame_to_show, ppbuf);
}

@@ -33,6 +33,7 @@ struct postproc_state {
DECLARE_ALIGNED(16, char, blackclamp[16]);
DECLARE_ALIGNED(16, char, whiteclamp[16]);
DECLARE_ALIGNED(16, char, bothclamp[16]);
uint8_t *limits;
};
struct VP9Common;
@@ -42,9 +43,11 @@ struct VP9Common;
int vp9_post_proc_frame(struct VP9Common *cm,
YV12_BUFFER_CONFIG *dest, vp9_ppflags_t *flags);
void vp9_denoise(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, int q);
void vp9_denoise(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, int q,
uint8_t *limits);
void vp9_deblock(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, int q);
void vp9_deblock(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, int q,
uint8_t *limits);
#ifdef __cplusplus
} // extern "C"

@@ -35,18 +35,6 @@ if ($opts{arch} eq "x86_64") {
# post proc
#
if (vpx_config("CONFIG_VP9_POSTPROC") eq "yes") {
add_proto qw/void vp9_mbpost_proc_down/, "uint8_t *dst, int pitch, int rows, int cols, int flimit";
specialize qw/vp9_mbpost_proc_down sse2/;
$vp9_mbpost_proc_down_sse2=vp9_mbpost_proc_down_xmm;
add_proto qw/void vp9_mbpost_proc_across_ip/, "uint8_t *src, int pitch, int rows, int cols, int flimit";
specialize qw/vp9_mbpost_proc_across_ip sse2/;
$vp9_mbpost_proc_across_ip_sse2=vp9_mbpost_proc_across_ip_xmm;
add_proto qw/void vp9_post_proc_down_and_across/, "const uint8_t *src_ptr, uint8_t *dst_ptr, int src_pixels_per_line, int dst_pixels_per_line, int rows, int cols, int flimit";
specialize qw/vp9_post_proc_down_and_across sse2/;
$vp9_post_proc_down_and_across_sse2=vp9_post_proc_down_and_across_xmm;
add_proto qw/void vp9_filter_by_weight16x16/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int src_weight";
specialize qw/vp9_filter_by_weight16x16 sse2 msa/;

@@ -1,632 +0,0 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
%include "vpx_ports/x86_abi_support.asm"
;void vp9_post_proc_down_and_across_xmm
;(
; unsigned char *src_ptr,
; unsigned char *dst_ptr,
; int src_pixels_per_line,
; int dst_pixels_per_line,
; int rows,
; int cols,
; int flimit
;)
global sym(vp9_post_proc_down_and_across_xmm) PRIVATE
sym(vp9_post_proc_down_and_across_xmm):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
SAVE_XMM 7
GET_GOT rbx
push rsi
push rdi
; end prolog
%if ABI_IS_32BIT=1 && CONFIG_PIC=1
ALIGN_STACK 16, rax
; move the global rd onto the stack, since we don't have enough registers
; to do PIC addressing
movdqa xmm0, [GLOBAL(rd42)]
sub rsp, 16
movdqa [rsp], xmm0
%define RD42 [rsp]
%else
%define RD42 [GLOBAL(rd42)]
%endif
movd xmm2, dword ptr arg(6) ;flimit
punpcklwd xmm2, xmm2
punpckldq xmm2, xmm2
punpcklqdq xmm2, xmm2
mov rsi, arg(0) ;src_ptr
mov rdi, arg(1) ;dst_ptr
movsxd rcx, DWORD PTR arg(4) ;rows
movsxd rax, DWORD PTR arg(2) ;src_pixels_per_line ; destination pitch?
pxor xmm0, xmm0 ; mm0 = 00000000
.nextrow:
xor rdx, rdx ; clear out rdx for use as loop counter
.nextcol:
movq xmm3, QWORD PTR [rsi] ; mm4 = r0 p0..p7
punpcklbw xmm3, xmm0 ; mm3 = p0..p3
movdqa xmm1, xmm3 ; mm1 = p0..p3
psllw xmm3, 2 ;
movq xmm5, QWORD PTR [rsi + rax] ; mm4 = r1 p0..p7
punpcklbw xmm5, xmm0 ; mm5 = r1 p0..p3
paddusw xmm3, xmm5 ; mm3 += mm6
; thresholding
movdqa xmm7, xmm1 ; mm7 = r0 p0..p3
psubusw xmm7, xmm5 ; mm7 = r0 p0..p3 - r1 p0..p3
psubusw xmm5, xmm1 ; mm5 = r1 p0..p3 - r0 p0..p3
paddusw xmm7, xmm5 ; mm7 = abs(r0 p0..p3 - r1 p0..p3)
pcmpgtw xmm7, xmm2
movq xmm5, QWORD PTR [rsi + 2*rax] ; mm4 = r2 p0..p7
punpcklbw xmm5, xmm0 ; mm5 = r2 p0..p3
paddusw xmm3, xmm5 ; mm3 += mm5
; thresholding
movdqa xmm6, xmm1 ; mm6 = r0 p0..p3
psubusw xmm6, xmm5 ; mm6 = r0 p0..p3 - r2 p0..p3
psubusw xmm5, xmm1 ; mm5 = r2 p0..p3 - r0 p0..p3
paddusw xmm6, xmm5 ; mm6 = abs(r0 p0..p3 - r2 p0..p3)
pcmpgtw xmm6, xmm2
por xmm7, xmm6 ; accumulate thresholds
neg rax
movq xmm5, QWORD PTR [rsi+2*rax] ; mm4 = r-2 p0..p7
punpcklbw xmm5, xmm0 ; mm5 = r-2 p0..p3
paddusw xmm3, xmm5 ; mm3 += mm5
; thresholding
movdqa xmm6, xmm1 ; mm6 = r0 p0..p3
psubusw xmm6, xmm5 ; mm6 = p0..p3 - r-2 p0..p3
psubusw xmm5, xmm1 ; mm5 = r-2 p0..p3 - p0..p3
paddusw xmm6, xmm5 ; mm6 = abs(r0 p0..p3 - r-2 p0..p3)
pcmpgtw xmm6, xmm2
por xmm7, xmm6 ; accumulate thresholds
movq xmm4, QWORD PTR [rsi+rax] ; mm4 = r-1 p0..p7
punpcklbw xmm4, xmm0 ; mm4 = r-1 p0..p3
paddusw xmm3, xmm4 ; mm3 += mm5
; thresholding
movdqa xmm6, xmm1 ; mm6 = r0 p0..p3
psubusw xmm6, xmm4 ; mm6 = p0..p3 - r-1 p0..p3
psubusw xmm4, xmm1 ; mm5 = r-1 p0..p3 - p0..p3
paddusw xmm6, xmm4 ; mm6 = abs(r0 p0..p3 - r-1 p0..p3)
pcmpgtw xmm6, xmm2
por xmm7, xmm6 ; accumulate thresholds
paddusw xmm3, RD42 ; mm3 += round value
psraw xmm3, 3 ; mm3 /= 8
pand xmm1, xmm7 ; mm1 select vals > thresh from source
pandn xmm7, xmm3 ; mm7 select vals < thresh from blurred result
paddusw xmm1, xmm7 ; combination
packuswb xmm1, xmm0 ; pack to bytes
movq QWORD PTR [rdi], xmm1 ;
neg rax ; pitch is positive
add rsi, 8
add rdi, 8
add rdx, 8
cmp edx, dword arg(5) ;cols
jl .nextcol
; done with all the cols, start the across filtering in place
sub rsi, rdx
sub rdi, rdx
xor rdx, rdx
movq mm0, QWORD PTR [rdi-8];
.acrossnextcol:
movq xmm7, QWORD PTR [rdi +rdx -2]
movd xmm4, DWORD PTR [rdi +rdx +6]
pslldq xmm4, 8
por xmm4, xmm7
movdqa xmm3, xmm4
psrldq xmm3, 2
punpcklbw xmm3, xmm0 ; mm3 = p0..p3
movdqa xmm1, xmm3 ; mm1 = p0..p3
psllw xmm3, 2
movdqa xmm5, xmm4
psrldq xmm5, 3
punpcklbw xmm5, xmm0 ; mm5 = p1..p4
paddusw xmm3, xmm5 ; mm3 += mm6
; thresholding
movdqa xmm7, xmm1 ; mm7 = p0..p3
psubusw xmm7, xmm5 ; mm7 = p0..p3 - p1..p4
psubusw xmm5, xmm1 ; mm5 = p1..p4 - p0..p3
paddusw xmm7, xmm5 ; mm7 = abs(p0..p3 - p1..p4)
pcmpgtw xmm7, xmm2
movdqa xmm5, xmm4
psrldq xmm5, 4
punpcklbw xmm5, xmm0 ; mm5 = p2..p5
paddusw xmm3, xmm5 ; mm3 += mm5
; thresholding
movdqa xmm6, xmm1 ; mm6 = p0..p3
psubusw xmm6, xmm5 ; mm6 = p0..p3 - p1..p4
psubusw xmm5, xmm1 ; mm5 = p1..p4 - p0..p3
paddusw xmm6, xmm5 ; mm6 = abs(p0..p3 - p1..p4)
pcmpgtw xmm6, xmm2
por xmm7, xmm6 ; accumulate thresholds
movdqa xmm5, xmm4 ; mm5 = p-2..p5
punpcklbw xmm5, xmm0 ; mm5 = p-2..p1
paddusw xmm3, xmm5 ; mm3 += mm5
; thresholding
movdqa xmm6, xmm1 ; mm6 = p0..p3
psubusw xmm6, xmm5 ; mm6 = p0..p3 - p1..p4
psubusw xmm5, xmm1 ; mm5 = p1..p4 - p0..p3
paddusw xmm6, xmm5 ; mm6 = abs(p0..p3 - p1..p4)
pcmpgtw xmm6, xmm2
por xmm7, xmm6 ; accumulate thresholds
psrldq xmm4, 1 ; mm4 = p-1..p5
punpcklbw xmm4, xmm0 ; mm4 = p-1..p2
paddusw xmm3, xmm4 ; mm3 += mm5
; thresholding
movdqa xmm6, xmm1 ; mm6 = p0..p3
psubusw xmm6, xmm4 ; mm6 = p0..p3 - p1..p4
psubusw xmm4, xmm1 ; mm5 = p1..p4 - p0..p3
paddusw xmm6, xmm4 ; mm6 = abs(p0..p3 - p1..p4)
pcmpgtw xmm6, xmm2
por xmm7, xmm6 ; accumulate thresholds
paddusw xmm3, RD42 ; mm3 += round value
psraw xmm3, 3 ; mm3 /= 8
pand xmm1, xmm7 ; mm1 select vals > thresh from source
pandn xmm7, xmm3 ; mm7 select vals < thresh from blurred result
paddusw xmm1, xmm7 ; combination
packuswb xmm1, xmm0 ; pack to bytes
movq QWORD PTR [rdi+rdx-8], mm0 ; store previous four bytes
movdq2q mm0, xmm1
add rdx, 8
cmp edx, dword arg(5) ;cols
jl .acrossnextcol;
; last 8 pixels
movq QWORD PTR [rdi+rdx-8], mm0
; done with this row
add rsi,rax ; next line
mov eax, dword arg(3) ;dst_pixels_per_line ; destination pitch?
add rdi,rax ; next destination
mov eax, dword arg(2) ;src_pixels_per_line ; destination pitch?
dec rcx ; decrement count
jnz .nextrow ; next row
%if ABI_IS_32BIT=1 && CONFIG_PIC=1
add rsp,16
pop rsp
%endif
; begin epilog
pop rdi
pop rsi
RESTORE_GOT
RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
%undef RD42
;void vp9_mbpost_proc_down_xmm(unsigned char *dst,
; int pitch, int rows, int cols,int flimit)
extern sym(vp9_rv)
global sym(vp9_mbpost_proc_down_xmm) PRIVATE
sym(vp9_mbpost_proc_down_xmm):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 5
SAVE_XMM 7
GET_GOT rbx
push rsi
push rdi
; end prolog
ALIGN_STACK 16, rax
sub rsp, 128+16
; unsigned char d[16][8] at [rsp]
; create flimit2 at [rsp+128]
mov eax, dword ptr arg(4) ;flimit
mov [rsp+128], eax
mov [rsp+128+4], eax
mov [rsp+128+8], eax
mov [rsp+128+12], eax
%define flimit4 [rsp+128]
%if ABI_IS_32BIT=0
lea r8, [GLOBAL(sym(vp9_rv))]
%endif
;rows +=8;
add dword arg(2), 8
;for(c=0; c<cols; c+=8)
.loop_col:
mov rsi, arg(0) ; s
pxor xmm0, xmm0 ;
movsxd rax, dword ptr arg(1) ;pitch ;
neg rax ; rax = -pitch
lea rsi, [rsi + rax*8]; ; rdi = s[-pitch*8]
neg rax
pxor xmm5, xmm5
pxor xmm6, xmm6 ;
pxor xmm7, xmm7 ;
mov rdi, rsi
mov rcx, 15 ;
.loop_initvar:
movq xmm1, QWORD PTR [rdi];
punpcklbw xmm1, xmm0 ;
paddw xmm5, xmm1 ;
pmullw xmm1, xmm1 ;
movdqa xmm2, xmm1 ;
punpcklwd xmm1, xmm0 ;
punpckhwd xmm2, xmm0 ;
paddd xmm6, xmm1 ;
paddd xmm7, xmm2 ;
lea rdi, [rdi+rax] ;
dec rcx
jne .loop_initvar
;save the var and sum
xor rdx, rdx
.loop_row:
movq xmm1, QWORD PTR [rsi] ; [s-pitch*8]
movq xmm2, QWORD PTR [rdi] ; [s+pitch*7]
punpcklbw xmm1, xmm0
punpcklbw xmm2, xmm0
paddw xmm5, xmm2
psubw xmm5, xmm1
pmullw xmm2, xmm2
movdqa xmm4, xmm2
punpcklwd xmm2, xmm0
punpckhwd xmm4, xmm0
paddd xmm6, xmm2
paddd xmm7, xmm4
pmullw xmm1, xmm1
movdqa xmm2, xmm1
punpcklwd xmm1, xmm0
psubd xmm6, xmm1
punpckhwd xmm2, xmm0
psubd xmm7, xmm2
movdqa xmm3, xmm6
pslld xmm3, 4
psubd xmm3, xmm6
movdqa xmm1, xmm5
movdqa xmm4, xmm5
pmullw xmm1, xmm1
pmulhw xmm4, xmm4
movdqa xmm2, xmm1
punpcklwd xmm1, xmm4
punpckhwd xmm2, xmm4
movdqa xmm4, xmm7
pslld xmm4, 4
psubd xmm4, xmm7
psubd xmm3, xmm1
psubd xmm4, xmm2
psubd xmm3, flimit4
psubd xmm4, flimit4
psrad xmm3, 31
psrad xmm4, 31
packssdw xmm3, xmm4
packsswb xmm3, xmm0
movq xmm1, QWORD PTR [rsi+rax*8]
movq xmm2, xmm1
punpcklbw xmm1, xmm0
paddw xmm1, xmm5
mov rcx, rdx
and rcx, 127
%if ABI_IS_32BIT=1 && CONFIG_PIC=1
push rax
lea rax, [GLOBAL(sym(vp9_rv))]
movdqu xmm4, [rax + rcx*2] ;vp9_rv[rcx*2]
pop rax
%elif ABI_IS_32BIT=0
movdqu xmm4, [r8 + rcx*2] ;vp9_rv[rcx*2]
%else
movdqu xmm4, [sym(vp9_rv) + rcx*2]
%endif
paddw xmm1, xmm4
;paddw xmm1, eight8s
psraw xmm1, 4
packuswb xmm1, xmm0
pand xmm1, xmm3
pandn xmm3, xmm2
por xmm1, xmm3
and rcx, 15
movq QWORD PTR [rsp + rcx*8], xmm1 ;d[rcx*8]
mov rcx, rdx
sub rcx, 8
and rcx, 15
movq mm0, [rsp + rcx*8] ;d[rcx*8]
movq [rsi], mm0
lea rsi, [rsi+rax]
lea rdi, [rdi+rax]
add rdx, 1
cmp edx, dword arg(2) ;rows
jl .loop_row
add dword arg(0), 8 ; s += 8
sub dword arg(3), 8 ; cols -= 8
cmp dword arg(3), 0
jg .loop_col
add rsp, 128+16
pop rsp
; begin epilog
pop rdi
pop rsi
RESTORE_GOT
RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
%undef flimit4
;void vp9_mbpost_proc_across_ip_xmm(unsigned char *src,
; int pitch, int rows, int cols,int flimit)
global sym(vp9_mbpost_proc_across_ip_xmm) PRIVATE
sym(vp9_mbpost_proc_across_ip_xmm):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 5
SAVE_XMM 7
GET_GOT rbx
push rsi
push rdi
; end prolog
ALIGN_STACK 16, rax
sub rsp, 16
; create flimit4 at [rsp]
mov eax, dword ptr arg(4) ;flimit
mov [rsp], eax
mov [rsp+4], eax
mov [rsp+8], eax
mov [rsp+12], eax
%define flimit4 [rsp]
;for(r=0;r<rows;r++)
.ip_row_loop:
xor rdx, rdx ;sumsq=0;
xor rcx, rcx ;sum=0;
mov rsi, arg(0); s
mov rdi, -8
.ip_var_loop:
;for(i=-8;i<=6;i++)
;{
; sumsq += s[i]*s[i];
; sum += s[i];
;}
movzx eax, byte [rsi+rdi]
add ecx, eax
mul al
add edx, eax
add rdi, 1
cmp rdi, 6
jle .ip_var_loop
;mov rax, sumsq
;movd xmm7, rax
movd xmm7, edx
;mov rax, sum
;movd xmm6, rax
movd xmm6, ecx
mov rsi, arg(0) ;s
xor rcx, rcx
movsxd rdx, dword arg(3) ;cols
add rdx, 8
pxor mm0, mm0
pxor mm1, mm1
pxor xmm0, xmm0
.nextcol4:
movd xmm1, DWORD PTR [rsi+rcx-8] ; -8 -7 -6 -5
movd xmm2, DWORD PTR [rsi+rcx+7] ; +7 +8 +9 +10
punpcklbw xmm1, xmm0 ; expanding
punpcklbw xmm2, xmm0 ; expanding
punpcklwd xmm1, xmm0 ; expanding to dwords
punpcklwd xmm2, xmm0 ; expanding to dwords
psubd xmm2, xmm1 ; 7--8 8--7 9--6 10--5
paddd xmm1, xmm1 ; -8*2 -7*2 -6*2 -5*2
paddd xmm1, xmm2 ; 7+-8 8+-7 9+-6 10+-5
pmaddwd xmm1, xmm2 ; squared of 7+-8 8+-7 9+-6 10+-5
paddd xmm6, xmm2
paddd xmm7, xmm1
pshufd xmm6, xmm6, 0 ; duplicate the last ones
pshufd xmm7, xmm7, 0 ; duplicate the last ones
psrldq xmm1, 4 ; 8--7 9--6 10--5 0000
psrldq xmm2, 4 ; 8--7 9--6 10--5 0000
pshufd xmm3, xmm1, 3 ; 0000 8--7 8--7 8--7 squared
pshufd xmm4, xmm2, 3 ; 0000 8--7 8--7 8--7 squared
paddd xmm6, xmm4
paddd xmm7, xmm3
pshufd xmm3, xmm1, 01011111b ; 0000 0000 9--6 9--6 squared
pshufd xmm4, xmm2, 01011111b ; 0000 0000 9--6 9--6 squared
paddd xmm7, xmm3
paddd xmm6, xmm4
pshufd xmm3, xmm1, 10111111b ; 0000 0000 8--7 8--7 squared
pshufd xmm4, xmm2, 10111111b ; 0000 0000 8--7 8--7 squared
paddd xmm7, xmm3
paddd xmm6, xmm4
movdqa xmm3, xmm6
pmaddwd xmm3, xmm3
movdqa xmm5, xmm7
pslld xmm5, 4
psubd xmm5, xmm7
psubd xmm5, xmm3
psubd xmm5, flimit4
psrad xmm5, 31
packssdw xmm5, xmm0
packsswb xmm5, xmm0
movd xmm1, DWORD PTR [rsi+rcx]
movq xmm2, xmm1
punpcklbw xmm1, xmm0
punpcklwd xmm1, xmm0
paddd xmm1, xmm6
paddd xmm1, [GLOBAL(four8s)]
psrad xmm1, 4
packssdw xmm1, xmm0
packuswb xmm1, xmm0
pand xmm1, xmm5
pandn xmm5, xmm2
por xmm5, xmm1
movd [rsi+rcx-8], mm0
movq mm0, mm1
movdq2q mm1, xmm5
psrldq xmm7, 12
psrldq xmm6, 12
add rcx, 4
cmp rcx, rdx
jl .nextcol4
;s+=pitch;
movsxd rax, dword arg(1)
add arg(0), rax
sub dword arg(2), 1 ;rows-=1
cmp dword arg(2), 0
jg .ip_row_loop
add rsp, 16
pop rsp
; begin epilog
pop rdi
pop rsi
RESTORE_GOT
RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
%undef flimit4
SECTION_RODATA
align 16
rd42:
times 8 dw 0x04
four8s:
times 4 dd 8

View File

@ -3114,7 +3114,11 @@ static void set_size_dependent_vars(VP9_COMP *cpi, int *q,
l = 150;
break;
}
vp9_denoise(cpi->Source, cpi->Source, l);
if (!cpi->common.postproc_state.limits) {
cpi->common.postproc_state.limits = vpx_calloc(
cpi->common.width, sizeof(*cpi->common.postproc_state.limits));
}
vp9_denoise(cpi->Source, cpi->Source, l, cpi->common.postproc_state.limits);
}
#endif // CONFIG_VP9_POSTPROC
}
@ -4914,7 +4918,7 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
}
vp9_deblock(cm->frame_to_show, pp,
cm->lf.filter_level * 10 / 6);
cm->lf.filter_level * 10 / 6, cm->postproc_state.limits);
#endif
vpx_clear_system_state();

View File

@ -67,7 +67,6 @@ VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_mfqe.h
VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_mfqe.c
ifeq ($(CONFIG_VP9_POSTPROC),yes)
VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_mfqe_sse2.asm
VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_postproc_sse2.asm
endif
ifneq ($(CONFIG_VP9_HIGHBITDEPTH),yes)

204
vpx_dsp/deblock.c Normal file
View File

@ -0,0 +1,204 @@
/*
* Copyright (c) 2016 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <stdlib.h>
#include "vpx/vpx_integer.h"
const int16_t vpx_rv[] = {8, 5, 2, 2, 8, 12, 4, 9, 8, 3, 0, 3, 9, 0, 0, 0, 8, 3,
14, 4, 10, 1, 11, 14, 1, 14, 9, 6, 12, 11, 8, 6, 10, 0, 0, 8, 9, 0, 3, 14,
8, 11, 13, 4, 2, 9, 0, 3, 9, 6, 1, 2, 3, 14, 13, 1, 8, 2, 9, 7, 3, 3, 1, 13,
13, 6, 6, 5, 2, 7, 11, 9, 11, 8, 7, 3, 2, 0, 13, 13, 14, 4, 12, 5, 12, 10,
8, 10, 13, 10, 4, 14, 4, 10, 0, 8, 11, 1, 13, 7, 7, 14, 6, 14, 13, 2, 13, 5,
4, 4, 0, 10, 0, 5, 13, 2, 12, 7, 11, 13, 8, 0, 4, 10, 7, 2, 7, 2, 2, 5, 3,
4, 7, 3, 3, 14, 14, 5, 9, 13, 3, 14, 3, 6, 3, 0, 11, 8, 13, 1, 13, 1, 12, 0,
10, 9, 7, 6, 2, 8, 5, 2, 13, 7, 1, 13, 14, 7, 6, 7, 9, 6, 10, 11, 7, 8, 7,
5, 14, 8, 4, 4, 0, 8, 7, 10, 0, 8, 14, 11, 3, 12, 5, 7, 14, 3, 14, 5, 2, 6,
11, 12, 12, 8, 0, 11, 13, 1, 2, 0, 5, 10, 14, 7, 8, 0, 4, 11, 0, 8, 0, 3,
10, 5, 8, 0, 11, 6, 7, 8, 10, 7, 13, 9, 2, 5, 1, 5, 10, 2, 4, 3, 5, 6, 10,
8, 9, 4, 11, 14, 0, 10, 0, 5, 13, 2, 12, 7, 11, 13, 8, 0, 4, 10, 7, 2, 7, 2,
2, 5, 3, 4, 7, 3, 3, 14, 14, 5, 9, 13, 3, 14, 3, 6, 3, 0, 11, 8, 13, 1, 13,
1, 12, 0, 10, 9, 7, 6, 2, 8, 5, 2, 13, 7, 1, 13, 14, 7, 6, 7, 9, 6, 10, 11,
7, 8, 7, 5, 14, 8, 4, 4, 0, 8, 7, 10, 0, 8, 14, 11, 3, 12, 5, 7, 14, 3, 14,
5, 2, 6, 11, 12, 12, 8, 0, 11, 13, 1, 2, 0, 5, 10, 14, 7, 8, 0, 4, 11, 0, 8,
0, 3, 10, 5, 8, 0, 11, 6, 7, 8, 10, 7, 13, 9, 2, 5, 1, 5, 10, 2, 4, 3, 5, 6,
10, 8, 9, 4, 11, 14, 3, 8, 3, 7, 8, 5, 11, 4, 12, 3, 11, 9, 14, 8, 14, 13,
4, 3, 1, 2, 14, 6, 5, 4, 4, 11, 4, 6, 2, 1, 5, 8, 8, 12, 13, 5, 14, 10, 12,
13, 0, 9, 5, 5, 11, 10, 13, 9, 10, 13, };
void vpx_post_proc_down_and_across_mb_row_c(unsigned char *src_ptr,
unsigned char *dst_ptr,
int src_pixels_per_line,
int dst_pixels_per_line, int cols,
unsigned char *f, int size) {
unsigned char *p_src, *p_dst;
int row;
int col;
unsigned char v;
unsigned char d[4];
for (row = 0; row < size; row++) {
/* post_proc_down for one row */
p_src = src_ptr;
p_dst = dst_ptr;
for (col = 0; col < cols; col++) {
unsigned char p_above2 = p_src[col - 2 * src_pixels_per_line];
unsigned char p_above1 = p_src[col - src_pixels_per_line];
unsigned char p_below1 = p_src[col + src_pixels_per_line];
unsigned char p_below2 = p_src[col + 2 * src_pixels_per_line];
v = p_src[col];
if ((abs(v - p_above2) < f[col]) && (abs(v - p_above1) < f[col])
&& (abs(v - p_below1) < f[col]) && (abs(v - p_below2) < f[col])) {
unsigned char k1, k2, k3;
k1 = (p_above2 + p_above1 + 1) >> 1;
k2 = (p_below2 + p_below1 + 1) >> 1;
k3 = (k1 + k2 + 1) >> 1;
v = (k3 + v + 1) >> 1;
}
p_dst[col] = v;
}
/* now post_proc_across */
p_src = dst_ptr;
p_dst = dst_ptr;
p_src[-2] = p_src[-1] = p_src[0];
p_src[cols] = p_src[cols + 1] = p_src[cols - 1];
for (col = 0; col < cols; col++) {
v = p_src[col];
if ((abs(v - p_src[col - 2]) < f[col])
&& (abs(v - p_src[col - 1]) < f[col])
&& (abs(v - p_src[col + 1]) < f[col])
&& (abs(v - p_src[col + 2]) < f[col])) {
unsigned char k1, k2, k3;
k1 = (p_src[col - 2] + p_src[col - 1] + 1) >> 1;
k2 = (p_src[col + 2] + p_src[col + 1] + 1) >> 1;
k3 = (k1 + k2 + 1) >> 1;
v = (k3 + v + 1) >> 1;
}
d[col & 3] = v;
if (col >= 2)
p_dst[col - 2] = d[(col - 2) & 3];
}
/* handle the last two pixels */
p_dst[col - 2] = d[(col - 2) & 3];
p_dst[col - 1] = d[(col - 1) & 3];
/* next row */
src_ptr += src_pixels_per_line;
dst_ptr += dst_pixels_per_line;
}
}
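For one pixel the loops above reduce to a gated 5-tap blur: if all four neighbours along the filtering direction are within f[col] of the centre value, the centre is replaced by a rounded average of the two pairwise neighbour averages, otherwise it passes through. A scalar restatement, as an illustrative helper rather than part of the file:

static unsigned char filter_pixel(int m2, int m1, int v, int p1, int p2,
                                  int limit) {
  if (abs(v - m2) < limit && abs(v - m1) < limit && abs(v - p1) < limit &&
      abs(v - p2) < limit) {
    const int k1 = (m2 + m1 + 1) >> 1; /* far/near neighbours on one side */
    const int k2 = (p2 + p1 + 1) >> 1; /* far/near neighbours on the other */
    const int k3 = (k1 + k2 + 1) >> 1;
    v = (k3 + v + 1) >> 1;             /* blend the blur with the centre */
  }
  return (unsigned char)v;
}

For example, filter_pixel(10, 12, 11, 13, 12, 4) returns 12, while widening any single neighbour difference to 4 or more leaves the centre value 11 untouched.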
void vpx_mbpost_proc_across_ip_c(unsigned char *src, int pitch, int rows,
int cols, int flimit) {
int r, c, i;
unsigned char *s = src;
unsigned char d[16];
for (r = 0; r < rows; r++) {
int sumsq = 0;
int sum = 0;
for (i = -8; i < 0; i++)
s[i] = s[0];
/* 17 avoids a valgrind warning - we buffer the values at position c in d
* and only write them back once we have read 8 ahead...
*/
for (i = 0; i < 17; i++)
s[i + cols] = s[cols - 1];
for (i = -8; i <= 6; i++) {
sumsq += s[i] * s[i];
sum += s[i];
d[i + 8] = 0;
}
for (c = 0; c < cols + 8; c++) {
int x = s[c + 7] - s[c - 8];
int y = s[c + 7] + s[c - 8];
sum += x;
sumsq += x * y;
d[c & 15] = s[c];
if (sumsq * 15 - sum * sum < flimit) {
d[c & 15] = (8 + sum + s[c]) >> 4;
}
s[c - 8] = d[(c - 8) & 15];
}
s += pitch;
}
}
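The gate sumsq * 15 - sum * sum < flimit is maintained incrementally: before pixel c is tested, s[c + 7] is added and s[c - 8] dropped, so the window is the 15 taps s[c - 7]..s[c + 7]. For n = 15 samples, n * sum(x^2) - (sum x)^2 equals n^2 times the sample variance, so the blur only fires on low-activity spans. A direct, non-incremental restatement of the same quantity (illustrative only):

static int window_activity(const unsigned char *s, int c) {
  int i, sum = 0, sumsq = 0;
  for (i = c - 7; i <= c + 7; i++) { /* 15 taps centred on pixel c */
    sum += s[i];
    sumsq += s[i] * s[i];
  }
  return sumsq * 15 - sum * sum; /* the value compared against flimit above */
}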
void vpx_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int cols,
int flimit) {
int r, c, i;
unsigned int seed;
const int16_t *rv3 = &vpx_rv[63 & rand_r(&seed)];
for (c = 0; c < cols; c++) {
unsigned char *s = &dst[c];
int sumsq = 0;
int sum = 0;
unsigned char d[16];
const int16_t *rv2 = rv3 + ((c * 17) & 127);
for (i = -8; i < 0; i++)
s[i * pitch] = s[0];
/* 17 avoids a valgrind warning - we buffer the values at position c in d
* and only write them back once we have read 8 ahead...
*/
for (i = 0; i < 17; i++)
s[(i + rows) * pitch] = s[(rows - 1) * pitch];
for (i = -8; i <= 6; i++) {
sumsq += s[i * pitch] * s[i * pitch];
sum += s[i * pitch];
}
for (r = 0; r < rows + 8; r++) {
sumsq += s[7 * pitch] * s[7 * pitch] - s[-8 * pitch] * s[-8 * pitch];
sum += s[7 * pitch] - s[-8 * pitch];
d[r & 15] = s[0];
if (sumsq * 15 - sum * sum < flimit) {
d[r & 15] = (rv2[r & 127] + sum + s[0]) >> 4;
}
if (r >= 8)
s[-8 * pitch] = d[(r - 8) & 15];
s += pitch;
}
}
}
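The down pass applies the same 15-tap variance gate, per column this time; the difference is the rounding term: instead of a fixed +8, rv2[r & 127] pulls a small pseudo-random offset from vpx_rv so the rounding dithers from row to row and banding in flat regions is reduced (the rand_r() seed is left uninitialised here, so only the starting phase of the dither is affected). A small illustrative check that the indexing stays inside the table:

/* Worst-case offset into vpx_rv for the indexing used above. */
static int vpx_rv_max_index(void) {
  const int phase = 63;        /* 63 & rand_r(&seed) */
  const int column = 127;      /* (c * 17) & 127 */
  const int row = 127;         /* r & 127 */
  return phase + column + row; /* 317, so the table needs >= 318 entries */
}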
#if CONFIG_POSTPROC
static void vpx_de_mblock(YV12_BUFFER_CONFIG *post,
int q) {
vpx_mbpost_proc_across_ip(post->y_buffer, post->y_stride, post->y_height,
post->y_width, q2mbl(q));
vpx_mbpost_proc_down(post->y_buffer, post->y_stride, post->y_height,
post->y_width, q2mbl(q));
}
#endif

683
vpx_dsp/mips/deblock_msa.c Normal file
View File

@ -0,0 +1,683 @@
/*
* Copyright (c) 2016 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <stdlib.h>
#include "./macros_msa.h"
extern const int16_t vpx_rv[];
#define VPX_TRANSPOSE8x16_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7, \
out0, out1, out2, out3, \
out4, out5, out6, out7, \
out8, out9, out10, out11, \
out12, out13, out14, out15) \
{ \
v8i16 temp0, temp1, temp2, temp3, temp4; \
v8i16 temp5, temp6, temp7, temp8, temp9; \
\
ILVR_B4_SH(in1, in0, in3, in2, in5, in4, in7, in6, \
temp0, temp1, temp2, temp3); \
ILVR_H2_SH(temp1, temp0, temp3, temp2, temp4, temp5); \
ILVRL_W2_SH(temp5, temp4, temp6, temp7); \
ILVL_H2_SH(temp1, temp0, temp3, temp2, temp4, temp5); \
ILVRL_W2_SH(temp5, temp4, temp8, temp9); \
ILVL_B4_SH(in1, in0, in3, in2, in5, in4, in7, in6, \
temp0, temp1, temp2, temp3); \
ILVR_H2_SH(temp1, temp0, temp3, temp2, temp4, temp5); \
ILVRL_W2_UB(temp5, temp4, out8, out10); \
ILVL_H2_SH(temp1, temp0, temp3, temp2, temp4, temp5); \
ILVRL_W2_UB(temp5, temp4, out12, out14); \
out0 = (v16u8)temp6; \
out2 = (v16u8)temp7; \
out4 = (v16u8)temp8; \
out6 = (v16u8)temp9; \
out9 = (v16u8)__msa_ilvl_d((v2i64)out8, (v2i64)out8); \
out11 = (v16u8)__msa_ilvl_d((v2i64)out10, (v2i64)out10); \
out13 = (v16u8)__msa_ilvl_d((v2i64)out12, (v2i64)out12); \
out15 = (v16u8)__msa_ilvl_d((v2i64)out14, (v2i64)out14); \
out1 = (v16u8)__msa_ilvl_d((v2i64)out0, (v2i64)out0); \
out3 = (v16u8)__msa_ilvl_d((v2i64)out2, (v2i64)out2); \
out5 = (v16u8)__msa_ilvl_d((v2i64)out4, (v2i64)out4); \
out7 = (v16u8)__msa_ilvl_d((v2i64)out6, (v2i64)out6); \
}
#define VPX_AVER_IF_RETAIN(above2_in, above1_in, src_in, \
below1_in, below2_in, ref, out) \
{ \
v16u8 temp0, temp1; \
\
temp1 = __msa_aver_u_b(above2_in, above1_in); \
temp0 = __msa_aver_u_b(below2_in, below1_in); \
temp1 = __msa_aver_u_b(temp1, temp0); \
out = __msa_aver_u_b(src_in, temp1); \
temp0 = __msa_asub_u_b(src_in, above2_in); \
temp1 = __msa_asub_u_b(src_in, above1_in); \
temp0 = (temp0 < ref); \
temp1 = (temp1 < ref); \
temp0 = temp0 & temp1; \
temp1 = __msa_asub_u_b(src_in, below1_in); \
temp1 = (temp1 < ref); \
temp0 = temp0 & temp1; \
temp1 = __msa_asub_u_b(src_in, below2_in); \
temp1 = (temp1 < ref); \
temp0 = temp0 & temp1; \
out = __msa_bmz_v(out, src_in, temp0); \
}
#define TRANSPOSE12x16_B(in0, in1, in2, in3, in4, in5, in6, in7, \
in8, in9, in10, in11, in12, in13, in14, in15) \
{ \
v8i16 temp0, temp1, temp2, temp3, temp4; \
v8i16 temp5, temp6, temp7, temp8, temp9; \
\
ILVR_B2_SH(in1, in0, in3, in2, temp0, temp1); \
ILVRL_H2_SH(temp1, temp0, temp2, temp3); \
ILVR_B2_SH(in5, in4, in7, in6, temp0, temp1); \
ILVRL_H2_SH(temp1, temp0, temp4, temp5); \
ILVRL_W2_SH(temp4, temp2, temp0, temp1); \
ILVRL_W2_SH(temp5, temp3, temp2, temp3); \
ILVR_B2_SH(in9, in8, in11, in10, temp4, temp5); \
ILVR_B2_SH(in9, in8, in11, in10, temp4, temp5); \
ILVRL_H2_SH(temp5, temp4, temp6, temp7); \
ILVR_B2_SH(in13, in12, in15, in14, temp4, temp5); \
ILVRL_H2_SH(temp5, temp4, temp8, temp9); \
ILVRL_W2_SH(temp8, temp6, temp4, temp5); \
ILVRL_W2_SH(temp9, temp7, temp6, temp7); \
ILVL_B2_SH(in1, in0, in3, in2, temp8, temp9); \
ILVR_D2_UB(temp4, temp0, temp5, temp1, in0, in2); \
in1 = (v16u8)__msa_ilvl_d((v2i64)temp4, (v2i64)temp0); \
in3 = (v16u8)__msa_ilvl_d((v2i64)temp5, (v2i64)temp1); \
ILVL_B2_SH(in5, in4, in7, in6, temp0, temp1); \
ILVR_D2_UB(temp6, temp2, temp7, temp3, in4, in6); \
in5 = (v16u8)__msa_ilvl_d((v2i64)temp6, (v2i64)temp2); \
in7 = (v16u8)__msa_ilvl_d((v2i64)temp7, (v2i64)temp3); \
ILVL_B4_SH(in9, in8, in11, in10, in13, in12, in15, in14, \
temp2, temp3, temp4, temp5); \
ILVR_H4_SH(temp9, temp8, temp1, temp0, temp3, temp2, temp5, temp4, \
temp6, temp7, temp8, temp9); \
ILVR_W2_SH(temp7, temp6, temp9, temp8, temp0, temp1); \
in8 = (v16u8)__msa_ilvr_d((v2i64)temp1, (v2i64)temp0); \
in9 = (v16u8)__msa_ilvl_d((v2i64)temp1, (v2i64)temp0); \
ILVL_W2_SH(temp7, temp6, temp9, temp8, temp2, temp3); \
in10 = (v16u8)__msa_ilvr_d((v2i64)temp3, (v2i64)temp2); \
in11 = (v16u8)__msa_ilvl_d((v2i64)temp3, (v2i64)temp2); \
}
#define VPX_TRANSPOSE12x8_UB_UB(in0, in1, in2, in3, in4, in5, \
in6, in7, in8, in9, in10, in11) \
{ \
v8i16 temp0, temp1, temp2, temp3; \
v8i16 temp4, temp5, temp6, temp7; \
\
ILVR_B2_SH(in1, in0, in3, in2, temp0, temp1); \
ILVRL_H2_SH(temp1, temp0, temp2, temp3); \
ILVR_B2_SH(in5, in4, in7, in6, temp0, temp1); \
ILVRL_H2_SH(temp1, temp0, temp4, temp5); \
ILVRL_W2_SH(temp4, temp2, temp0, temp1); \
ILVRL_W2_SH(temp5, temp3, temp2, temp3); \
ILVL_B2_SH(in1, in0, in3, in2, temp4, temp5); \
temp4 = __msa_ilvr_h(temp5, temp4); \
ILVL_B2_SH(in5, in4, in7, in6, temp6, temp7); \
temp5 = __msa_ilvr_h(temp7, temp6); \
ILVRL_W2_SH(temp5, temp4, temp6, temp7); \
in0 = (v16u8)temp0; \
in2 = (v16u8)temp1; \
in4 = (v16u8)temp2; \
in6 = (v16u8)temp3; \
in8 = (v16u8)temp6; \
in10 = (v16u8)temp7; \
in1 = (v16u8)__msa_ilvl_d((v2i64)temp0, (v2i64)temp0); \
in3 = (v16u8)__msa_ilvl_d((v2i64)temp1, (v2i64)temp1); \
in5 = (v16u8)__msa_ilvl_d((v2i64)temp2, (v2i64)temp2); \
in7 = (v16u8)__msa_ilvl_d((v2i64)temp3, (v2i64)temp3); \
in9 = (v16u8)__msa_ilvl_d((v2i64)temp6, (v2i64)temp6); \
in11 = (v16u8)__msa_ilvl_d((v2i64)temp7, (v2i64)temp7); \
}
static void postproc_down_across_chroma_msa(uint8_t *src_ptr, uint8_t *dst_ptr,
int32_t src_stride,
int32_t dst_stride, int32_t cols,
uint8_t *f) {
uint8_t *p_src = src_ptr;
uint8_t *p_dst = dst_ptr;
uint8_t *f_orig = f;
uint8_t *p_dst_st = dst_ptr;
uint16_t col;
uint64_t out0, out1, out2, out3;
v16u8 above2, above1, below2, below1, src, ref, ref_temp;
v16u8 inter0, inter1, inter2, inter3, inter4, inter5;
v16u8 inter6, inter7, inter8, inter9, inter10, inter11;
for (col = (cols / 16); col--;) {
ref = LD_UB(f);
LD_UB2(p_src - 2 * src_stride, src_stride, above2, above1);
src = LD_UB(p_src);
LD_UB2(p_src + 1 * src_stride, src_stride, below1, below2);
VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter0);
above2 = LD_UB(p_src + 3 * src_stride);
VPX_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter1);
above1 = LD_UB(p_src + 4 * src_stride);
VPX_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter2);
src = LD_UB(p_src + 5 * src_stride);
VPX_AVER_IF_RETAIN(below1, below2, above2, above1, src, ref, inter3);
below1 = LD_UB(p_src + 6 * src_stride);
VPX_AVER_IF_RETAIN(below2, above2, above1, src, below1, ref, inter4);
below2 = LD_UB(p_src + 7 * src_stride);
VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter5);
above2 = LD_UB(p_src + 8 * src_stride);
VPX_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter6);
above1 = LD_UB(p_src + 9 * src_stride);
VPX_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter7);
ST_UB8(inter0, inter1, inter2, inter3, inter4, inter5, inter6, inter7,
p_dst, dst_stride);
p_dst += 16;
p_src += 16;
f += 16;
}
if (0 != (cols / 16)) {
ref = LD_UB(f);
LD_UB2(p_src - 2 * src_stride, src_stride, above2, above1);
src = LD_UB(p_src);
LD_UB2(p_src + 1 * src_stride, src_stride, below1, below2);
VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter0);
above2 = LD_UB(p_src + 3 * src_stride);
VPX_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter1);
above1 = LD_UB(p_src + 4 * src_stride);
VPX_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter2);
src = LD_UB(p_src + 5 * src_stride);
VPX_AVER_IF_RETAIN(below1, below2, above2, above1, src, ref, inter3);
below1 = LD_UB(p_src + 6 * src_stride);
VPX_AVER_IF_RETAIN(below2, above2, above1, src, below1, ref, inter4);
below2 = LD_UB(p_src + 7 * src_stride);
VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter5);
above2 = LD_UB(p_src + 8 * src_stride);
VPX_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter6);
above1 = LD_UB(p_src + 9 * src_stride);
VPX_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter7);
out0 = __msa_copy_u_d((v2i64) inter0, 0);
out1 = __msa_copy_u_d((v2i64) inter1, 0);
out2 = __msa_copy_u_d((v2i64) inter2, 0);
out3 = __msa_copy_u_d((v2i64) inter3, 0);
SD4(out0, out1, out2, out3, p_dst, dst_stride);
out0 = __msa_copy_u_d((v2i64) inter4, 0);
out1 = __msa_copy_u_d((v2i64) inter5, 0);
out2 = __msa_copy_u_d((v2i64) inter6, 0);
out3 = __msa_copy_u_d((v2i64) inter7, 0);
SD4(out0, out1, out2, out3, p_dst + 4 * dst_stride, dst_stride);
}
f = f_orig;
p_dst = dst_ptr - 2;
LD_UB8(p_dst, dst_stride, inter0, inter1, inter2, inter3, inter4, inter5,
inter6, inter7);
for (col = 0; col < (cols / 8); ++col) {
ref = LD_UB(f);
f += 8;
VPX_TRANSPOSE12x8_UB_UB(inter0, inter1, inter2, inter3, inter4, inter5,
inter6, inter7, inter8, inter9, inter10, inter11);
if (0 == col) {
above2 = inter2;
above1 = inter2;
} else {
above2 = inter0;
above1 = inter1;
}
src = inter2;
below1 = inter3;
below2 = inter4;
ref_temp = (v16u8) __msa_splati_b((v16i8) ref, 0);
VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref_temp, inter2);
above2 = inter5;
ref_temp = (v16u8) __msa_splati_b((v16i8) ref, 1);
VPX_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref_temp, inter3);
above1 = inter6;
ref_temp = (v16u8) __msa_splati_b((v16i8) ref, 2);
VPX_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref_temp, inter4);
src = inter7;
ref_temp = (v16u8) __msa_splati_b((v16i8) ref, 3);
VPX_AVER_IF_RETAIN(below1, below2, above2, above1, src, ref_temp, inter5);
below1 = inter8;
ref_temp = (v16u8) __msa_splati_b((v16i8) ref, 4);
VPX_AVER_IF_RETAIN(below2, above2, above1, src, below1, ref_temp, inter6);
below2 = inter9;
ref_temp = (v16u8) __msa_splati_b((v16i8) ref, 5);
VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref_temp, inter7);
if (col == (cols / 8 - 1)) {
above2 = inter9;
} else {
above2 = inter10;
}
ref_temp = (v16u8) __msa_splati_b((v16i8) ref, 6);
VPX_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref_temp, inter8);
if (col == (cols / 8 - 1)) {
above1 = inter9;
} else {
above1 = inter11;
}
ref_temp = (v16u8) __msa_splati_b((v16i8) ref, 7);
VPX_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref_temp, inter9);
TRANSPOSE8x8_UB_UB(inter2, inter3, inter4, inter5, inter6, inter7, inter8,
inter9, inter2, inter3, inter4, inter5, inter6, inter7,
inter8, inter9);
p_dst += 8;
LD_UB2(p_dst, dst_stride, inter0, inter1);
ST8x1_UB(inter2, p_dst_st);
ST8x1_UB(inter3, (p_dst_st + 1 * dst_stride));
LD_UB2(p_dst + 2 * dst_stride, dst_stride, inter2, inter3);
ST8x1_UB(inter4, (p_dst_st + 2 * dst_stride));
ST8x1_UB(inter5, (p_dst_st + 3 * dst_stride));
LD_UB2(p_dst + 4 * dst_stride, dst_stride, inter4, inter5);
ST8x1_UB(inter6, (p_dst_st + 4 * dst_stride));
ST8x1_UB(inter7, (p_dst_st + 5 * dst_stride));
LD_UB2(p_dst + 6 * dst_stride, dst_stride, inter6, inter7);
ST8x1_UB(inter8, (p_dst_st + 6 * dst_stride));
ST8x1_UB(inter9, (p_dst_st + 7 * dst_stride));
p_dst_st += 8;
}
}
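/* The chroma helper above and the luma helper below share the same two-pass
 * structure: a vertical ("down") pass filters each 16-column group against
 * the per-column limits in f and writes to dst, then the horizontal
 * ("across") pass re-reads dst starting two columns to the left, transposes
 * the block, reuses the same gated filter on the transposed data, transposes
 * back and stores in place. The chroma variant covers 8 rows per call, the
 * luma variant 16. */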
static void postproc_down_across_luma_msa(uint8_t *src_ptr, uint8_t *dst_ptr,
int32_t src_stride,
int32_t dst_stride, int32_t cols,
uint8_t *f) {
uint8_t *p_src = src_ptr;
uint8_t *p_dst = dst_ptr;
uint8_t *p_dst_st = dst_ptr;
uint8_t *f_orig = f;
uint16_t col;
v16u8 above2, above1, below2, below1;
v16u8 src, ref, ref_temp;
v16u8 inter0, inter1, inter2, inter3, inter4, inter5, inter6;
v16u8 inter7, inter8, inter9, inter10, inter11;
v16u8 inter12, inter13, inter14, inter15;
for (col = (cols / 16); col--;) {
ref = LD_UB(f);
LD_UB2(p_src - 2 * src_stride, src_stride, above2, above1);
src = LD_UB(p_src);
LD_UB2(p_src + 1 * src_stride, src_stride, below1, below2);
VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter0);
above2 = LD_UB(p_src + 3 * src_stride);
VPX_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter1);
above1 = LD_UB(p_src + 4 * src_stride);
VPX_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter2);
src = LD_UB(p_src + 5 * src_stride);
VPX_AVER_IF_RETAIN(below1, below2, above2, above1, src, ref, inter3);
below1 = LD_UB(p_src + 6 * src_stride);
VPX_AVER_IF_RETAIN(below2, above2, above1, src, below1, ref, inter4);
below2 = LD_UB(p_src + 7 * src_stride);
VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter5);
above2 = LD_UB(p_src + 8 * src_stride);
VPX_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter6);
above1 = LD_UB(p_src + 9 * src_stride);
VPX_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter7);
src = LD_UB(p_src + 10 * src_stride);
VPX_AVER_IF_RETAIN(below1, below2, above2, above1, src, ref, inter8);
below1 = LD_UB(p_src + 11 * src_stride);
VPX_AVER_IF_RETAIN(below2, above2, above1, src, below1, ref, inter9);
below2 = LD_UB(p_src + 12 * src_stride);
VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter10);
above2 = LD_UB(p_src + 13 * src_stride);
VPX_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter11);
above1 = LD_UB(p_src + 14 * src_stride);
VPX_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter12);
src = LD_UB(p_src + 15 * src_stride);
VPX_AVER_IF_RETAIN(below1, below2, above2, above1, src, ref, inter13);
below1 = LD_UB(p_src + 16 * src_stride);
VPX_AVER_IF_RETAIN(below2, above2, above1, src, below1, ref, inter14);
below2 = LD_UB(p_src + 17 * src_stride);
VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter15);
ST_UB8(inter0, inter1, inter2, inter3, inter4, inter5, inter6, inter7,
p_dst, dst_stride);
ST_UB8(inter8, inter9, inter10, inter11, inter12, inter13, inter14, inter15,
p_dst + 8 * dst_stride, dst_stride);
p_src += 16;
p_dst += 16;
f += 16;
}
f = f_orig;
p_dst = dst_ptr - 2;
LD_UB8(p_dst, dst_stride, inter0, inter1, inter2, inter3, inter4, inter5,
inter6, inter7);
LD_UB8(p_dst + 8 * dst_stride, dst_stride, inter8, inter9, inter10, inter11,
inter12, inter13, inter14, inter15);
for (col = 0; col < cols / 8; ++col) {
ref = LD_UB(f);
f += 8;
TRANSPOSE12x16_B(inter0, inter1, inter2, inter3, inter4, inter5, inter6,
inter7, inter8, inter9, inter10, inter11, inter12, inter13,
inter14, inter15);
if (0 == col) {
above2 = inter2;
above1 = inter2;
} else {
above2 = inter0;
above1 = inter1;
}
src = inter2;
below1 = inter3;
below2 = inter4;
ref_temp = (v16u8) __msa_splati_b((v16i8) ref, 0);
VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref_temp, inter2);
above2 = inter5;
ref_temp = (v16u8) __msa_splati_b((v16i8) ref, 1);
VPX_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref_temp, inter3);
above1 = inter6;
ref_temp = (v16u8) __msa_splati_b((v16i8) ref, 2);
VPX_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref_temp, inter4);
src = inter7;
ref_temp = (v16u8) __msa_splati_b((v16i8) ref, 3);
VPX_AVER_IF_RETAIN(below1, below2, above2, above1, src, ref_temp, inter5);
below1 = inter8;
ref_temp = (v16u8) __msa_splati_b((v16i8) ref, 4);
VPX_AVER_IF_RETAIN(below2, above2, above1, src, below1, ref_temp, inter6);
below2 = inter9;
ref_temp = (v16u8) __msa_splati_b((v16i8) ref, 5);
VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref_temp, inter7);
if (col == (cols / 8 - 1)) {
above2 = inter9;
} else {
above2 = inter10;
}
ref_temp = (v16u8) __msa_splati_b((v16i8) ref, 6);
VPX_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref_temp, inter8);
if (col == (cols / 8 - 1)) {
above1 = inter9;
} else {
above1 = inter11;
}
ref_temp = (v16u8) __msa_splati_b((v16i8) ref, 7);
VPX_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref_temp, inter9);
VPX_TRANSPOSE8x16_UB_UB(inter2, inter3, inter4, inter5, inter6, inter7,
inter8, inter9, inter2, inter3, inter4, inter5,
inter6, inter7, inter8, inter9, inter10, inter11,
inter12, inter13, inter14, inter15, above2, above1);
p_dst += 8;
LD_UB2(p_dst, dst_stride, inter0, inter1);
ST8x1_UB(inter2, p_dst_st);
ST8x1_UB(inter3, (p_dst_st + 1 * dst_stride));
LD_UB2(p_dst + 2 * dst_stride, dst_stride, inter2, inter3);
ST8x1_UB(inter4, (p_dst_st + 2 * dst_stride));
ST8x1_UB(inter5, (p_dst_st + 3 * dst_stride));
LD_UB2(p_dst + 4 * dst_stride, dst_stride, inter4, inter5);
ST8x1_UB(inter6, (p_dst_st + 4 * dst_stride));
ST8x1_UB(inter7, (p_dst_st + 5 * dst_stride));
LD_UB2(p_dst + 6 * dst_stride, dst_stride, inter6, inter7);
ST8x1_UB(inter8, (p_dst_st + 6 * dst_stride));
ST8x1_UB(inter9, (p_dst_st + 7 * dst_stride));
LD_UB2(p_dst + 8 * dst_stride, dst_stride, inter8, inter9);
ST8x1_UB(inter10, (p_dst_st + 8 * dst_stride));
ST8x1_UB(inter11, (p_dst_st + 9 * dst_stride));
LD_UB2(p_dst + 10 * dst_stride, dst_stride, inter10, inter11);
ST8x1_UB(inter12, (p_dst_st + 10 * dst_stride));
ST8x1_UB(inter13, (p_dst_st + 11 * dst_stride));
LD_UB2(p_dst + 12 * dst_stride, dst_stride, inter12, inter13);
ST8x1_UB(inter14, (p_dst_st + 12 * dst_stride));
ST8x1_UB(inter15, (p_dst_st + 13 * dst_stride));
LD_UB2(p_dst + 14 * dst_stride, dst_stride, inter14, inter15);
ST8x1_UB(above2, (p_dst_st + 14 * dst_stride));
ST8x1_UB(above1, (p_dst_st + 15 * dst_stride));
p_dst_st += 8;
}
}
void vpx_post_proc_down_and_across_mb_row_msa(uint8_t *src, uint8_t *dst,
int32_t src_stride,
int32_t dst_stride, int32_t cols,
uint8_t *f, int32_t size) {
if (8 == size) {
postproc_down_across_chroma_msa(src, dst, src_stride, dst_stride, cols, f);
} else if (16 == size) {
postproc_down_across_luma_msa(src, dst, src_stride, dst_stride, cols, f);
}
}
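As the C version earlier in this change suggests, size selects how many lines one call covers: 16 for a luma macroblock row, 8 for chroma. A hedged per-plane sketch; the pointer, stride and limit names are hypothetical and assumed to be set up by the caller:

static void postproc_mb_row(uint8_t *y_src, uint8_t *y_dst, int32_t y_stride,
                            int32_t y_cols, uint8_t *y_limits, uint8_t *uv_src,
                            uint8_t *uv_dst, int32_t uv_stride,
                            int32_t uv_cols, uint8_t *uv_limits) {
  vpx_post_proc_down_and_across_mb_row_msa(y_src, y_dst, y_stride, y_stride,
                                           y_cols, y_limits, 16);
  vpx_post_proc_down_and_across_mb_row_msa(uv_src, uv_dst, uv_stride,
                                           uv_stride, uv_cols, uv_limits, 8);
}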
void vpx_mbpost_proc_across_ip_msa(uint8_t *src_ptr, int32_t pitch,
int32_t rows, int32_t cols, int32_t flimit) {
int32_t row, col, cnt;
uint8_t *src_dup = src_ptr;
v16u8 src0, src, tmp_orig;
v16u8 tmp = {0};
v16i8 zero = {0};
v8u16 sum_h, src_r_h, src_l_h;
v4u32 src_r_w, src_l_w;
v4i32 flimit_vec;
flimit_vec = __msa_fill_w(flimit);
for (row = rows; row--;) {
int32_t sum_sq = 0;
int32_t sum = 0;
src0 = (v16u8) __msa_fill_b(src_dup[0]);
ST8x1_UB(src0, (src_dup - 8));
src0 = (v16u8) __msa_fill_b(src_dup[cols - 1]);
ST_UB(src0, src_dup + cols);
src_dup[cols + 16] = src_dup[cols - 1];
tmp_orig = (v16u8) __msa_ldi_b(0);
tmp_orig[15] = tmp[15];
src = LD_UB(src_dup - 8);
src[15] = 0;
ILVRL_B2_UH(zero, src, src_r_h, src_l_h);
src_r_w = __msa_dotp_u_w(src_r_h, src_r_h);
src_l_w = __msa_dotp_u_w(src_l_h, src_l_h);
sum_sq = HADD_SW_S32(src_r_w);
sum_sq += HADD_SW_S32(src_l_w);
sum_h = __msa_hadd_u_h(src, src);
sum = HADD_UH_U32(sum_h);
{
v16u8 src7, src8, src_r, src_l;
v16i8 mask;
v8u16 add_r, add_l;
v8i16 sub_r, sub_l, sum_r, sum_l, mask0, mask1;
v4i32 sum_sq0, sum_sq1, sum_sq2, sum_sq3;
v4i32 sub0, sub1, sub2, sub3;
v4i32 sum0_w, sum1_w, sum2_w, sum3_w;
v4i32 mul0, mul1, mul2, mul3;
v4i32 total0, total1, total2, total3;
v8i16 const8 = __msa_fill_h(8);
src7 = LD_UB(src_dup + 7);
src8 = LD_UB(src_dup - 8);
for (col = 0; col < (cols >> 4); ++col) {
ILVRL_B2_UB(src7, src8, src_r, src_l);
HSUB_UB2_SH(src_r, src_l, sub_r, sub_l);
sum_r[0] = sum + sub_r[0];
for (cnt = 0; cnt < 7; ++cnt) {
sum_r[cnt + 1] = sum_r[cnt] + sub_r[cnt + 1];
}
sum_l[0] = sum_r[7] + sub_l[0];
for (cnt = 0; cnt < 7; ++cnt) {
sum_l[cnt + 1] = sum_l[cnt] + sub_l[cnt + 1];
}
sum = sum_l[7];
src = LD_UB(src_dup + 16 * col);
ILVRL_B2_UH(zero, src, src_r_h, src_l_h);
src7 = (v16u8)((const8 + sum_r + (v8i16) src_r_h) >> 4);
src8 = (v16u8)((const8 + sum_l + (v8i16) src_l_h) >> 4);
tmp = (v16u8) __msa_pckev_b((v16i8) src8, (v16i8) src7);
HADD_UB2_UH(src_r, src_l, add_r, add_l);
UNPCK_SH_SW(sub_r, sub0, sub1);
UNPCK_SH_SW(sub_l, sub2, sub3);
ILVR_H2_SW(zero, add_r, zero, add_l, sum0_w, sum2_w);
ILVL_H2_SW(zero, add_r, zero, add_l, sum1_w, sum3_w);
MUL4(sum0_w, sub0, sum1_w, sub1, sum2_w, sub2, sum3_w, sub3, mul0, mul1,
mul2, mul3);
sum_sq0[0] = sum_sq + mul0[0];
for (cnt = 0; cnt < 3; ++cnt) {
sum_sq0[cnt + 1] = sum_sq0[cnt] + mul0[cnt + 1];
}
sum_sq1[0] = sum_sq0[3] + mul1[0];
for (cnt = 0; cnt < 3; ++cnt) {
sum_sq1[cnt + 1] = sum_sq1[cnt] + mul1[cnt + 1];
}
sum_sq2[0] = sum_sq1[3] + mul2[0];
for (cnt = 0; cnt < 3; ++cnt) {
sum_sq2[cnt + 1] = sum_sq2[cnt] + mul2[cnt + 1];
}
sum_sq3[0] = sum_sq2[3] + mul3[0];
for (cnt = 0; cnt < 3; ++cnt) {
sum_sq3[cnt + 1] = sum_sq3[cnt] + mul3[cnt + 1];
}
sum_sq = sum_sq3[3];
UNPCK_SH_SW(sum_r, sum0_w, sum1_w);
UNPCK_SH_SW(sum_l, sum2_w, sum3_w);
total0 = sum_sq0 * __msa_ldi_w(15);
total0 -= sum0_w * sum0_w;
total1 = sum_sq1 * __msa_ldi_w(15);
total1 -= sum1_w * sum1_w;
total2 = sum_sq2 * __msa_ldi_w(15);
total2 -= sum2_w * sum2_w;
total3 = sum_sq3 * __msa_ldi_w(15);
total3 -= sum3_w * sum3_w;
total0 = (total0 < flimit_vec);
total1 = (total1 < flimit_vec);
total2 = (total2 < flimit_vec);
total3 = (total3 < flimit_vec);
PCKEV_H2_SH(total1, total0, total3, total2, mask0, mask1);
mask = __msa_pckev_b((v16i8) mask1, (v16i8) mask0);
tmp = __msa_bmz_v(tmp, src, (v16u8) mask);
if (col == 0) {
uint64_t src_d;
src_d = __msa_copy_u_d((v2i64) tmp_orig, 1);
SD(src_d, (src_dup - 8));
}
src7 = LD_UB(src_dup + 16 * (col + 1) + 7);
src8 = LD_UB(src_dup + 16 * (col + 1) - 8);
ST_UB(tmp, (src_dup + (16 * col)));
}
src_dup += pitch;
}
}
}
void vpx_mbpost_proc_down_msa(uint8_t *dst_ptr, int32_t pitch, int32_t rows,
int32_t cols, int32_t flimit) {
int32_t row, col, cnt, i;
unsigned int seed;
const int16_t *rv3 = &vpx_rv[63 & rand_r(&seed)];
v4i32 flimit_vec;
v16u8 dst7, dst8, dst_r_b, dst_l_b;
v16i8 mask;
v8u16 add_r, add_l;
v8i16 dst_r_h, dst_l_h, sub_r, sub_l, mask0, mask1;
v4i32 sub0, sub1, sub2, sub3, total0, total1, total2, total3;
flimit_vec = __msa_fill_w(flimit);
for (col = 0; col < (cols >> 4); ++col) {
uint8_t *dst_tmp = &dst_ptr[col << 4];
v16u8 dst;
v16i8 zero = {0};
v16u8 tmp[16];
v8i16 mult0, mult1, rv2_0, rv2_1;
v8i16 sum0_h = {0};
v8i16 sum1_h = {0};
v4i32 mul0 = {0};
v4i32 mul1 = {0};
v4i32 mul2 = {0};
v4i32 mul3 = {0};
v4i32 sum0_w, sum1_w, sum2_w, sum3_w;
v4i32 add0, add1, add2, add3;
const int16_t *rv2[16];
dst = LD_UB(dst_tmp);
for (cnt = (col << 4), i = 0; i < 16; ++cnt) {
rv2[i] = rv3 + ((cnt * 17) & 127);
++i;
}
for (cnt = -8; cnt < 0; ++cnt) {
ST_UB(dst, dst_tmp + cnt * pitch);
}
dst = LD_UB((dst_tmp + (rows - 1) * pitch));
for (cnt = rows; cnt < rows + 17; ++cnt) {
ST_UB(dst, dst_tmp + cnt * pitch);
}
for (cnt = -8; cnt <= 6; ++cnt) {
dst = LD_UB(dst_tmp + (cnt * pitch));
UNPCK_UB_SH(dst, dst_r_h, dst_l_h);
MUL2(dst_r_h, dst_r_h, dst_l_h, dst_l_h, mult0, mult1);
mul0 += (v4i32) __msa_ilvr_h((v8i16) zero, (v8i16) mult0);
mul1 += (v4i32) __msa_ilvl_h((v8i16) zero, (v8i16) mult0);
mul2 += (v4i32) __msa_ilvr_h((v8i16) zero, (v8i16) mult1);
mul3 += (v4i32) __msa_ilvl_h((v8i16) zero, (v8i16) mult1);
ADD2(sum0_h, dst_r_h, sum1_h, dst_l_h, sum0_h, sum1_h);
}
for (row = 0; row < (rows + 8); ++row) {
for (i = 0; i < 8; ++i) {
rv2_0[i] = *(rv2[i] + (row & 127));
rv2_1[i] = *(rv2[i + 8] + (row & 127));
}
dst7 = LD_UB(dst_tmp + (7 * pitch));
dst8 = LD_UB(dst_tmp - (8 * pitch));
ILVRL_B2_UB(dst7, dst8, dst_r_b, dst_l_b);
HSUB_UB2_SH(dst_r_b, dst_l_b, sub_r, sub_l);
UNPCK_SH_SW(sub_r, sub0, sub1);
UNPCK_SH_SW(sub_l, sub2, sub3);
sum0_h += sub_r;
sum1_h += sub_l;
HADD_UB2_UH(dst_r_b, dst_l_b, add_r, add_l);
ILVRL_H2_SW(zero, add_r, add0, add1);
ILVRL_H2_SW(zero, add_l, add2, add3);
mul0 += add0 * sub0;
mul1 += add1 * sub1;
mul2 += add2 * sub2;
mul3 += add3 * sub3;
dst = LD_UB(dst_tmp);
ILVRL_B2_SH(zero, dst, dst_r_h, dst_l_h);
dst7 = (v16u8)((rv2_0 + sum0_h + dst_r_h) >> 4);
dst8 = (v16u8)((rv2_1 + sum1_h + dst_l_h) >> 4);
tmp[row & 15] = (v16u8) __msa_pckev_b((v16i8) dst8, (v16i8) dst7);
UNPCK_SH_SW(sum0_h, sum0_w, sum1_w);
UNPCK_SH_SW(sum1_h, sum2_w, sum3_w);
total0 = mul0 * __msa_ldi_w(15);
total0 -= sum0_w * sum0_w;
total1 = mul1 * __msa_ldi_w(15);
total1 -= sum1_w * sum1_w;
total2 = mul2 * __msa_ldi_w(15);
total2 -= sum2_w * sum2_w;
total3 = mul3 * __msa_ldi_w(15);
total3 -= sum3_w * sum3_w;
total0 = (total0 < flimit_vec);
total1 = (total1 < flimit_vec);
total2 = (total2 < flimit_vec);
total3 = (total3 < flimit_vec);
PCKEV_H2_SH(total1, total0, total3, total2, mask0, mask1);
mask = __msa_pckev_b((v16i8) mask1, (v16i8) mask0);
tmp[row & 15] = __msa_bmz_v(tmp[row & 15], dst, (v16u8) mask);
if (row >= 8) {
ST_UB(tmp[(row - 8) & 15], (dst_tmp - 8 * pitch));
}
dst_tmp += pitch;
}
}
}

View File

@ -1060,6 +1060,7 @@
ILVL_B2(RTYPE, in4, in5, in6, in7, out2, out3); \
}
#define ILVL_B4_SB(...) ILVL_B4(v16i8, __VA_ARGS__)
#define ILVL_B4_SH(...) ILVL_B4(v8i16, __VA_ARGS__)
#define ILVL_B4_UH(...) ILVL_B4(v8u16, __VA_ARGS__)
/* Description : Interleave left half of halfword elements from vectors
@ -1074,6 +1075,7 @@
out1 = (RTYPE)__msa_ilvl_h((v8i16)in2, (v8i16)in3); \
}
#define ILVL_H2_SH(...) ILVL_H2(v8i16, __VA_ARGS__)
#define ILVL_H2_SW(...) ILVL_H2(v4i32, __VA_ARGS__)
/* Description : Interleave left half of word elements from vectors
Arguments : Inputs - in0, in1, in2, in3
@ -1137,6 +1139,7 @@
out1 = (RTYPE)__msa_ilvr_h((v8i16)in2, (v8i16)in3); \
}
#define ILVR_H2_SH(...) ILVR_H2(v8i16, __VA_ARGS__)
#define ILVR_H2_SW(...) ILVR_H2(v4i32, __VA_ARGS__)
#define ILVR_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
out0, out1, out2, out3) { \
@ -1215,6 +1218,7 @@
out0 = (RTYPE)__msa_ilvr_w((v4i32)in0, (v4i32)in1); \
out1 = (RTYPE)__msa_ilvl_w((v4i32)in0, (v4i32)in1); \
}
#define ILVRL_W2_UB(...) ILVRL_W2(v16u8, __VA_ARGS__)
#define ILVRL_W2_SH(...) ILVRL_W2(v8i16, __VA_ARGS__)
#define ILVRL_W2_SW(...) ILVRL_W2(v4i32, __VA_ARGS__)

View File

@ -52,8 +52,11 @@ endif # CONFIG_VP9_HIGHBITDEPTH
ifneq ($(filter yes,$(CONFIG_POSTPROC) $(CONFIG_VP9_POSTPROC)),)
DSP_SRCS-yes += add_noise.c
DSP_SRCS-yes += deblock.c
DSP_SRCS-$(HAVE_MSA) += mips/add_noise_msa.c
DSP_SRCS-$(HAVE_MSA) += mips/deblock_msa.c
DSP_SRCS-$(HAVE_SSE2) += x86/add_noise_sse2.asm
DSP_SRCS-$(HAVE_SSE2) += x86/deblock_sse2.asm
endif # CONFIG_POSTPROC
DSP_SRCS-$(HAVE_NEON_ASM) += arm/intrapred_neon_asm$(ASM)

View File

@ -1894,6 +1894,18 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
if (vpx_config("CONFIG_POSTPROC") eq "yes" || vpx_config("CONFIG_VP9_POSTPROC") eq "yes") {
add_proto qw/void vpx_plane_add_noise/, "uint8_t *Start, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int Width, unsigned int Height, int Pitch";
specialize qw/vpx_plane_add_noise sse2 msa/;
add_proto qw/void vpx_mbpost_proc_down/, "unsigned char *dst, int pitch, int rows, int cols,int flimit";
specialize qw/vpx_mbpost_proc_down sse2 msa/;
$vpx_mbpost_proc_down_sse2=vpx_mbpost_proc_down_xmm;
add_proto qw/void vpx_mbpost_proc_across_ip/, "unsigned char *dst, int pitch, int rows, int cols,int flimit";
specialize qw/vpx_mbpost_proc_across_ip sse2 msa/;
$vpx_mbpost_proc_across_ip_sse2=vpx_mbpost_proc_across_ip_xmm;
add_proto qw/void vpx_post_proc_down_and_across_mb_row/, "unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size";
specialize qw/vpx_post_proc_down_and_across_mb_row sse2 msa/;
}
} # CONFIG_ENCODERS || CONFIG_POSTPROC || CONFIG_VP9_POSTPROC
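
Here add_proto declares the C prototype, specialize registers the optimised variants, and the $..._sse2=..._xmm assignments map the SSE2 slots onto the _xmm symbols kept in x86/deblock_sse2.asm. Callers include the generated ./vpx_dsp_rtcd.h and use the unsuffixed name, which resolves to the best available implementation at run time. A hedged call-site sketch; the wrapper name is illustrative:

#include "./vpx_dsp_rtcd.h"

static void deblock_mb_row(unsigned char *src, unsigned char *dst, int stride,
                           int cols, unsigned char *limits) {
  /* Dispatches to the C, SSE2 or MSA version selected during RTCD setup. */
  vpx_post_proc_down_and_across_mb_row(src, dst, stride, stride, cols, limits,
                                       16);
}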

View File

@ -83,7 +83,7 @@
add rbx, 16
%endmacro
;void vp8_post_proc_down_and_across_mb_row_sse2
;void vpx_post_proc_down_and_across_mb_row_sse2
;(
; unsigned char *src_ptr,
; unsigned char *dst_ptr,
@ -93,8 +93,8 @@
; int *flimits,
; int size
;)
global sym(vp8_post_proc_down_and_across_mb_row_sse2) PRIVATE
sym(vp8_post_proc_down_and_across_mb_row_sse2):
global sym(vpx_post_proc_down_and_across_mb_row_sse2) PRIVATE
sym(vpx_post_proc_down_and_across_mb_row_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
@ -230,11 +230,11 @@ sym(vp8_post_proc_down_and_across_mb_row_sse2):
ret
%undef flimit
;void vp8_mbpost_proc_down_xmm(unsigned char *dst,
;void vpx_mbpost_proc_down_xmm(unsigned char *dst,
; int pitch, int rows, int cols,int flimit)
extern sym(vp8_rv)
global sym(vp8_mbpost_proc_down_xmm) PRIVATE
sym(vp8_mbpost_proc_down_xmm):
extern sym(vpx_rv)
global sym(vpx_mbpost_proc_down_xmm) PRIVATE
sym(vpx_mbpost_proc_down_xmm):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 5
@ -257,7 +257,7 @@ sym(vp8_mbpost_proc_down_xmm):
%define flimit4 [rsp+128]
%if ABI_IS_32BIT=0
lea r8, [GLOBAL(sym(vp8_rv))]
lea r8, [GLOBAL(sym(vpx_rv))]
%endif
;rows +=8;
@ -403,13 +403,13 @@ sym(vp8_mbpost_proc_down_xmm):
and rcx, 127
%if ABI_IS_32BIT=1 && CONFIG_PIC=1
push rax
lea rax, [GLOBAL(sym(vp8_rv))]
movdqu xmm4, [rax + rcx*2] ;vp8_rv[rcx*2]
lea rax, [GLOBAL(sym(vpx_rv))]
movdqu xmm4, [rax + rcx*2] ;vpx_rv[rcx*2]
pop rax
%elif ABI_IS_32BIT=0
movdqu xmm4, [r8 + rcx*2] ;vp8_rv[rcx*2]
movdqu xmm4, [r8 + rcx*2] ;vpx_rv[rcx*2]
%else
movdqu xmm4, [sym(vp8_rv) + rcx*2]
movdqu xmm4, [sym(vpx_rv) + rcx*2]
%endif
paddw xmm1, xmm4
@ -462,10 +462,10 @@ sym(vp8_mbpost_proc_down_xmm):
%undef flimit4
;void vp8_mbpost_proc_across_ip_xmm(unsigned char *src,
;void vpx_mbpost_proc_across_ip_xmm(unsigned char *src,
; int pitch, int rows, int cols,int flimit)
global sym(vp8_mbpost_proc_across_ip_xmm) PRIVATE
sym(vp8_mbpost_proc_across_ip_xmm):
global sym(vpx_mbpost_proc_across_ip_xmm) PRIVATE
sym(vpx_mbpost_proc_across_ip_xmm):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 5