MIPS MSA optimization of the VP8 temporal filter
Average improvement: ~2x-3x.
Change-Id: I05593bed583234dc7809aaec6cab82773a29505d
This commit is contained in:
parent
0e3f494b21
commit
8fbc641540
@ -24,6 +24,10 @@
|
|||||||
#define LD_UH(...) LD_H(v8u16, __VA_ARGS__)
|
#define LD_UH(...) LD_H(v8u16, __VA_ARGS__)
|
||||||
#define LD_SH(...) LD_H(v8i16, __VA_ARGS__)
|
#define LD_SH(...) LD_H(v8i16, __VA_ARGS__)
|
||||||
|
|
||||||
|
/* Dereference 'psrc' as a const RTYPE and yield the loaded value.
   The expansion is fully parenthesized so that operators written after the
   macro call apply to the loaded value and not to the pointer cast.
   LD_UW / LD_SW are the v4u32 / v4i32 instantiations. */
#define LD_W(RTYPE, psrc) (*((const RTYPE *)(psrc)))
#define LD_UW(...) LD_W(v4u32, __VA_ARGS__)
#define LD_SW(...) LD_W(v4i32, __VA_ARGS__)
|
||||||
|
|
||||||
#define ST_B(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in)
|
#define ST_B(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in)
|
||||||
#define ST_UB(...) ST_B(v16u8, __VA_ARGS__)
|
#define ST_UB(...) ST_B(v16u8, __VA_ARGS__)
|
||||||
#define ST_SB(...) ST_B(v16i8, __VA_ARGS__)
|
#define ST_SB(...) ST_B(v16i8, __VA_ARGS__)
|
||||||
@ -32,6 +36,9 @@
|
|||||||
#define ST_UH(...) ST_H(v8u16, __VA_ARGS__)
|
#define ST_UH(...) ST_H(v8u16, __VA_ARGS__)
|
||||||
#define ST_SH(...) ST_H(v8i16, __VA_ARGS__)
|
#define ST_SH(...) ST_H(v8i16, __VA_ARGS__)
|
||||||
|
|
||||||
|
/* Store 'in' through 'pdst', treating the destination as an RTYPE object.
   The whole assignment is parenthesized so that an operator following the
   macro call cannot be absorbed into the stored value.
   ST_SW is the v4i32 instantiation. */
#define ST_W(RTYPE, in, pdst) (*((RTYPE *)(pdst)) = (in))
#define ST_SW(...) ST_W(v4i32, __VA_ARGS__)
|
||||||
|
|
||||||
#if (__mips_isa_rev >= 6)
|
#if (__mips_isa_rev >= 6)
|
||||||
#define LW(psrc) \
|
#define LW(psrc) \
|
||||||
({ \
|
({ \
|
||||||
@ -337,6 +344,17 @@
|
|||||||
}
|
}
|
||||||
#define LD_SH4(...) LD_H4(v8i16, __VA_ARGS__)
|
#define LD_SH4(...) LD_H4(v8i16, __VA_ARGS__)
|
||||||
|
|
||||||
|
/* Description : Load 2 vectors of signed word elements with stride
   Arguments   : Inputs  - psrc, stride
                 Outputs - out0, out1
                 Return Type - signed word
   Details     : 'out0' is loaded from (psrc) and 'out1' from
                 (psrc + stride). 'stride' is added to 'psrc' with ordinary
                 pointer arithmetic, so it counts elements of the type
                 'psrc' points to. 'psrc' is expanded twice.
*/
#define LD_SW2(psrc, stride, out0, out1)  \
{                                         \
    out0 = LD_SW((psrc));                 \
    out1 = LD_SW((psrc) + (stride));      \
}
|
||||||
|
|
||||||
/* Description : Store vectors of 16 byte elements with stride
|
/* Description : Store vectors of 16 byte elements with stride
|
||||||
Arguments : Inputs - in0, in1, pdst, stride
|
Arguments : Inputs - in0, in1, pdst, stride
|
||||||
Details : Store 16 byte elements from 'in0' to (pdst)
|
Details : Store 16 byte elements from 'in0' to (pdst)
|
||||||
@ -377,6 +395,17 @@
|
|||||||
}
|
}
|
||||||
#define ST_SH2(...) ST_H2(v8i16, __VA_ARGS__)
|
#define ST_SH2(...) ST_H2(v8i16, __VA_ARGS__)
|
||||||
|
|
||||||
|
/* Description : Store vectors of word elements with stride
   Arguments   : Inputs - in0, in1, pdst, stride
   Details     : Store 4 word elements from 'in0' to (pdst)
                 Store 4 word elements from 'in1' to (pdst + stride)
                 'stride' is added to 'pdst' with ordinary pointer
                 arithmetic, so it counts elements of the type 'pdst'
                 points to. 'pdst' is expanded twice.
*/
#define ST_SW2(in0, in1, pdst, stride)  \
{                                       \
    ST_SW(in0, (pdst));                 \
    ST_SW(in1, (pdst) + (stride));      \
}
|
||||||
|
|
||||||
/* Description : Store 2x4 byte block to destination memory from input vector
|
/* Description : Store 2x4 byte block to destination memory from input vector
|
||||||
Arguments : Inputs - in, stidx, pdst, stride
|
Arguments : Inputs - in, stidx, pdst, stride
|
||||||
Details : Index 'stidx' halfword element from 'in' vector is copied to
|
Details : Index 'stidx' halfword element from 'in' vector is copied to
|
||||||
@ -1099,6 +1128,38 @@
|
|||||||
#define ILVRL_W2_SH(...) ILVRL_W2(v8i16, __VA_ARGS__)
|
#define ILVRL_W2_SH(...) ILVRL_W2(v8i16, __VA_ARGS__)
|
||||||
#define ILVRL_W2_SW(...) ILVRL_W2(v4i32, __VA_ARGS__)
|
#define ILVRL_W2_SW(...) ILVRL_W2(v4i32, __VA_ARGS__)
|
||||||
|
|
||||||
|
/* Description : Maximum values between signed elements of vector and
                 5-bit signed immediate value are copied to the output vector
   Arguments   : Inputs  - in0, in1, max_val
                 Outputs - in place operation
                 Return Type - as per RTYPE
   Details     : Maximum of signed halfword element values from 'in0' and
                 'max_val' are written in place; 'in1' is processed the
                 same way
*/
#define MAXI_SH2(RTYPE, in0, in1, max_val)                 \
{                                                          \
    in0 = (RTYPE)__msa_maxi_s_h((v8i16)(in0), (max_val));  \
    in1 = (RTYPE)__msa_maxi_s_h((v8i16)(in1), (max_val));  \
}
#define MAXI_SH2_SH(...) MAXI_SH2(v8i16, __VA_ARGS__)
|
||||||
|
|
||||||
|
/* Description : Saturate the halfword element values to the max
                 unsigned value of (sat_val + 1) bits
                 The element data width remains unchanged
   Arguments   : Inputs  - in0, in1, sat_val
                 Outputs - in place operation
                 Return Type - as per RTYPE
   Details     : Each unsigned halfword element from 'in0' is saturated to the
                 value generated with (sat_val + 1) bit range.
                 'in1' is processed the same way.
                 The results are written in place
*/
#define SAT_UH2(RTYPE, in0, in1, sat_val)                 \
{                                                         \
    in0 = (RTYPE)__msa_sat_u_h((v8u16)(in0), (sat_val));  \
    in1 = (RTYPE)__msa_sat_u_h((v8u16)(in1), (sat_val));  \
}
#define SAT_UH2_SH(...) SAT_UH2(v8i16, __VA_ARGS__)
|
||||||
|
|
||||||
/* Description : Saturate the halfword element values to the max
|
/* Description : Saturate the halfword element values to the max
|
||||||
unsigned value of (sat_val + 1) bits
|
unsigned value of (sat_val + 1) bits
|
||||||
The element data width remains unchanged
|
The element data width remains unchanged
|
||||||
@ -1323,6 +1384,29 @@
|
|||||||
in3 = in3 >> shift; \
|
in3 = in3 >> shift; \
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Description : Shift right arithmetic rounded words
   Arguments   : Inputs  - in0, in1, shift          (SRAR_W2)
                           in0, in1, in2, in3, shift (SRAR_W4)
                 Outputs - in place operation
                 Return Type - as per RTYPE
   Details     : Each element of vector 'in0' is shifted right arithmetically by
                 the number of bits in the corresponding element in the vector
                 'shift'. The last discarded bit is added to shifted value for
                 rounding and the result is written in-place.
                 'shift' is a vector.
                 The remaining input vectors are processed the same way.
*/
#define SRAR_W2(RTYPE, in0, in1, shift)                        \
{                                                              \
    in0 = (RTYPE)__msa_srar_w((v4i32)(in0), (v4i32)(shift));   \
    in1 = (RTYPE)__msa_srar_w((v4i32)(in1), (v4i32)(shift));   \
}

#define SRAR_W4(RTYPE, in0, in1, in2, in3, shift)  \
{                                                  \
    SRAR_W2(RTYPE, in0, in1, shift);               \
    SRAR_W2(RTYPE, in2, in3, shift);               \
}
#define SRAR_W4_SW(...) SRAR_W4(v4i32, __VA_ARGS__)
|
||||||
|
|
||||||
/* Description : Shift right arithmetic rounded (immediate)
|
/* Description : Shift right arithmetic rounded (immediate)
|
||||||
Arguments : Inputs - in0, in1, shift
|
Arguments : Inputs - in0, in1, shift
|
||||||
Outputs - in place operation
|
Outputs - in place operation
|
||||||
@ -1408,6 +1492,14 @@
|
|||||||
out0 = in0 - in1; \
|
out0 = in0 - in1; \
|
||||||
out1 = in2 - in3; \
|
out1 = in2 - in3; \
|
||||||
}
|
}
|
||||||
|
/* Description : Subtraction of 4 pairs of values
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                 Outputs - out0, out1, out2, out3
   Details     : out0 = in0 - in1, out1 = in2 - in3,
                 out2 = in4 - in5, out3 = in6 - in7.
                 Every input is parenthesized so compound expressions may be
                 passed as operands.
*/
#define SUB4(in0, in1, in2, in3, in4, in5, in6, in7,  \
             out0, out1, out2, out3)                  \
{                                                     \
    out0 = (in0) - (in1);                             \
    out1 = (in2) - (in3);                             \
    out2 = (in4) - (in5);                             \
    out3 = (in6) - (in7);                             \
}
|
||||||
|
|
||||||
/* Description : Sign extend halfword elements from right half of the vector
|
/* Description : Sign extend halfword elements from right half of the vector
|
||||||
Arguments : Input - in (halfword vector)
|
Arguments : Input - in (halfword vector)
|
||||||
|
@ -326,7 +326,7 @@ $vp8_diamond_search_sad_sse3=vp8_diamond_search_sadx4;
|
|||||||
#
|
#
|
||||||
if (vpx_config("CONFIG_REALTIME_ONLY") ne "yes") {
|
if (vpx_config("CONFIG_REALTIME_ONLY") ne "yes") {
|
||||||
add_proto qw/void vp8_temporal_filter_apply/, "unsigned char *frame1, unsigned int stride, unsigned char *frame2, unsigned int block_size, int strength, int filter_weight, unsigned int *accumulator, unsigned short *count";
|
add_proto qw/void vp8_temporal_filter_apply/, "unsigned char *frame1, unsigned int stride, unsigned char *frame2, unsigned int block_size, int strength, int filter_weight, unsigned int *accumulator, unsigned short *count";
|
||||||
specialize qw/vp8_temporal_filter_apply sse2/;
|
specialize qw/vp8_temporal_filter_apply sse2 msa/;
|
||||||
}
|
}
|
||||||
|
|
||||||
#
|
#
|
||||||
|
303
vp8/encoder/mips/msa/temporal_filter_msa.c
Normal file
303
vp8/encoder/mips/msa/temporal_filter_msa.c
Normal file
@ -0,0 +1,303 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (c) 2015 The WebM project authors. All Rights Reserved.
|
||||||
|
*
|
||||||
|
* Use of this source code is governed by a BSD-style license
|
||||||
|
* that can be found in the LICENSE file in the root of the source
|
||||||
|
* tree. An additional intellectual property rights grant can be found
|
||||||
|
* in the file PATENTS. All contributing project authors may
|
||||||
|
* be found in the AUTHORS file in the root of the source tree.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "./vp8_rtcd.h"
|
||||||
|
#include "vp8/common/mips/msa/vp8_macros_msa.h"
|
||||||
|
|
||||||
|
static void temporal_filter_apply_16size_msa(uint8_t *frame1_ptr,
|
||||||
|
uint32_t stride,
|
||||||
|
uint8_t *frame2_ptr,
|
||||||
|
int32_t strength_in,
|
||||||
|
int32_t filter_wt_in,
|
||||||
|
uint32_t *acc, uint16_t *cnt)
|
||||||
|
{
|
||||||
|
uint32_t row;
|
||||||
|
v16i8 frame1_0_b, frame1_1_b, frame2_0_b, frame2_1_b;
|
||||||
|
v16u8 frame_l, frame_h;
|
||||||
|
v16i8 zero = { 0 };
|
||||||
|
v8i16 frame2_0_h, frame2_1_h, mod0_h, mod1_h;
|
||||||
|
v8i16 diff0, diff1, cnt0, cnt1;
|
||||||
|
v4i32 const3, const16, filter_wt, strength;
|
||||||
|
v4i32 mod0_w, mod1_w, mod2_w, mod3_w;
|
||||||
|
v4i32 diff0_r, diff0_l, diff1_r, diff1_l;
|
||||||
|
v4i32 frame2_0, frame2_1, frame2_2, frame2_3;
|
||||||
|
v4i32 acc0, acc1, acc2, acc3;
|
||||||
|
|
||||||
|
filter_wt = __msa_fill_w(filter_wt_in);
|
||||||
|
strength = __msa_fill_w(strength_in);
|
||||||
|
const3 = __msa_ldi_w(3);
|
||||||
|
const16 = __msa_ldi_w(16);
|
||||||
|
|
||||||
|
for (row = 8; row--;)
|
||||||
|
{
|
||||||
|
frame1_0_b = LD_SB(frame1_ptr);
|
||||||
|
frame2_0_b = LD_SB(frame2_ptr);
|
||||||
|
frame1_ptr += stride;
|
||||||
|
frame2_ptr += 16;
|
||||||
|
frame1_1_b = LD_SB(frame1_ptr);
|
||||||
|
frame2_1_b = LD_SB(frame2_ptr);
|
||||||
|
LD_SW2(acc, 4, acc0, acc1);
|
||||||
|
LD_SW2(acc + 8, 4, acc2, acc3);
|
||||||
|
LD_SH2(cnt, 8, cnt0, cnt1);
|
||||||
|
ILVRL_B2_UB(frame1_0_b, frame2_0_b, frame_l, frame_h);
|
||||||
|
HSUB_UB2_SH(frame_l, frame_h, diff0, diff1);
|
||||||
|
UNPCK_SH_SW(diff0, diff0_r, diff0_l);
|
||||||
|
UNPCK_SH_SW(diff1, diff1_r, diff1_l);
|
||||||
|
MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l,
|
||||||
|
diff1_l, mod0_w, mod1_w, mod2_w, mod3_w);
|
||||||
|
MUL4(mod0_w, const3, mod1_w, const3, mod2_w, const3, mod3_w, const3,
|
||||||
|
mod0_w, mod1_w, mod2_w, mod3_w);
|
||||||
|
SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength);
|
||||||
|
diff0_r = (mod0_w < const16);
|
||||||
|
diff0_l = (mod1_w < const16);
|
||||||
|
diff1_r = (mod2_w < const16);
|
||||||
|
diff1_l = (mod3_w < const16);
|
||||||
|
SUB4(const16, mod0_w, const16, mod1_w, const16, mod2_w, const16, mod3_w,
|
||||||
|
mod0_w, mod1_w, mod2_w, mod3_w);
|
||||||
|
mod0_w = diff0_r & mod0_w;
|
||||||
|
mod1_w = diff0_l & mod1_w;
|
||||||
|
mod2_w = diff1_r & mod2_w;
|
||||||
|
mod3_w = diff1_l & mod3_w;
|
||||||
|
MUL4(mod0_w, filter_wt, mod1_w, filter_wt, mod2_w, filter_wt, mod3_w,
|
||||||
|
filter_wt, mod0_w, mod1_w, mod2_w, mod3_w);
|
||||||
|
PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h)
|
||||||
|
ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h);
|
||||||
|
ST_SH2(mod0_h, mod1_h, cnt, 8);
|
||||||
|
cnt += 16;
|
||||||
|
ILVRL_B2_SH(zero, frame2_0_b, frame2_0_h, frame2_1_h);
|
||||||
|
UNPCK_SH_SW(frame2_0_h, frame2_0, frame2_1);
|
||||||
|
UNPCK_SH_SW(frame2_1_h, frame2_2, frame2_3);
|
||||||
|
MUL4(mod0_w, frame2_0, mod1_w, frame2_1, mod2_w, frame2_2, mod3_w,
|
||||||
|
frame2_3, mod0_w, mod1_w, mod2_w, mod3_w);
|
||||||
|
ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3,
|
||||||
|
mod0_w, mod1_w, mod2_w, mod3_w);
|
||||||
|
ST_SW2(mod0_w, mod1_w, acc, 4);
|
||||||
|
ST_SW2(mod2_w, mod3_w, acc + 8, 4);
|
||||||
|
acc += 16;
|
||||||
|
LD_SW2(acc, 4, acc0, acc1);
|
||||||
|
LD_SW2(acc + 8, 4, acc2, acc3);
|
||||||
|
LD_SH2(cnt, 8, cnt0, cnt1);
|
||||||
|
ILVRL_B2_UB(frame1_1_b, frame2_1_b, frame_l, frame_h);
|
||||||
|
HSUB_UB2_SH(frame_l, frame_h, diff0, diff1);
|
||||||
|
UNPCK_SH_SW(diff0, diff0_r, diff0_l);
|
||||||
|
UNPCK_SH_SW(diff1, diff1_r, diff1_l);
|
||||||
|
MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l,
|
||||||
|
diff1_l, mod0_w, mod1_w, mod2_w, mod3_w);
|
||||||
|
MUL4(mod0_w, const3, mod1_w, const3, mod2_w, const3, mod3_w, const3,
|
||||||
|
mod0_w, mod1_w, mod2_w, mod3_w);
|
||||||
|
SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength);
|
||||||
|
diff0_r = (mod0_w < const16);
|
||||||
|
diff0_l = (mod1_w < const16);
|
||||||
|
diff1_r = (mod2_w < const16);
|
||||||
|
diff1_l = (mod3_w < const16);
|
||||||
|
SUB4(const16, mod0_w, const16, mod1_w, const16, mod2_w, const16, mod3_w,
|
||||||
|
mod0_w, mod1_w, mod2_w, mod3_w);
|
||||||
|
mod0_w = diff0_r & mod0_w;
|
||||||
|
mod1_w = diff0_l & mod1_w;
|
||||||
|
mod2_w = diff1_r & mod2_w;
|
||||||
|
mod3_w = diff1_l & mod3_w;
|
||||||
|
MUL4(mod0_w, filter_wt, mod1_w, filter_wt, mod2_w, filter_wt, mod3_w,
|
||||||
|
filter_wt, mod0_w, mod1_w, mod2_w, mod3_w);
|
||||||
|
PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h);
|
||||||
|
ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h);
|
||||||
|
ST_SH2(mod0_h, mod1_h, cnt, 8);
|
||||||
|
cnt += 16;
|
||||||
|
|
||||||
|
UNPCK_UB_SH(frame2_1_b, frame2_0_h, frame2_1_h);
|
||||||
|
UNPCK_SH_SW(frame2_0_h, frame2_0, frame2_1);
|
||||||
|
UNPCK_SH_SW(frame2_1_h, frame2_2, frame2_3);
|
||||||
|
MUL4(mod0_w, frame2_0, mod1_w, frame2_1, mod2_w, frame2_2, mod3_w,
|
||||||
|
frame2_3, mod0_w, mod1_w, mod2_w, mod3_w);
|
||||||
|
ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3,
|
||||||
|
mod0_w, mod1_w, mod2_w, mod3_w);
|
||||||
|
ST_SW2(mod0_w, mod1_w, acc, 4);
|
||||||
|
ST_SW2(mod2_w, mod3_w, acc + 8, 4);
|
||||||
|
acc += 16;
|
||||||
|
frame1_ptr += stride;
|
||||||
|
frame2_ptr += 16;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static void temporal_filter_apply_8size_msa(uint8_t *frame1_ptr,
|
||||||
|
uint32_t stride,
|
||||||
|
uint8_t *frame2_ptr,
|
||||||
|
int32_t strength_in,
|
||||||
|
int32_t filter_wt_in,
|
||||||
|
uint32_t *acc, uint16_t *cnt)
|
||||||
|
{
|
||||||
|
uint32_t row;
|
||||||
|
uint64_t f0, f1, f2, f3, f4, f5, f6, f7;
|
||||||
|
v16i8 frame1 = { 0 };
|
||||||
|
v16i8 frame2 = { 0 };
|
||||||
|
v16i8 frame3 = { 0 };
|
||||||
|
v16i8 frame4 = { 0 };
|
||||||
|
v16u8 frame_l, frame_h;
|
||||||
|
v8i16 frame2_0_h, frame2_1_h, mod0_h, mod1_h;
|
||||||
|
v8i16 diff0, diff1, cnt0, cnt1;
|
||||||
|
v4i32 const3, const16;
|
||||||
|
v4i32 filter_wt, strength;
|
||||||
|
v4i32 mod0_w, mod1_w, mod2_w, mod3_w;
|
||||||
|
v4i32 diff0_r, diff0_l, diff1_r, diff1_l;
|
||||||
|
v4i32 frame2_0, frame2_1, frame2_2, frame2_3;
|
||||||
|
v4i32 acc0, acc1, acc2, acc3;
|
||||||
|
|
||||||
|
filter_wt = __msa_fill_w(filter_wt_in);
|
||||||
|
strength = __msa_fill_w(strength_in);
|
||||||
|
const3 = __msa_ldi_w(3);
|
||||||
|
const16 = __msa_ldi_w(16);
|
||||||
|
|
||||||
|
for (row = 2; row--;)
|
||||||
|
{
|
||||||
|
LD2(frame1_ptr, stride, f0, f1);
|
||||||
|
frame1_ptr += (2 * stride);
|
||||||
|
LD2(frame2_ptr, 8, f2, f3);
|
||||||
|
frame2_ptr += 16;
|
||||||
|
LD2(frame1_ptr, stride, f4, f5);
|
||||||
|
frame1_ptr += (2 * stride);
|
||||||
|
LD2(frame2_ptr, 8, f6, f7);
|
||||||
|
frame2_ptr += 16;
|
||||||
|
|
||||||
|
LD_SW2(acc, 4, acc0, acc1);
|
||||||
|
LD_SW2(acc + 8, 4, acc2, acc3);
|
||||||
|
LD_SH2(cnt, 8, cnt0, cnt1);
|
||||||
|
INSERT_D2_SB(f0, f1, frame1);
|
||||||
|
INSERT_D2_SB(f2, f3, frame2);
|
||||||
|
INSERT_D2_SB(f4, f5, frame3);
|
||||||
|
INSERT_D2_SB(f6, f7, frame4);
|
||||||
|
ILVRL_B2_UB(frame1, frame2, frame_l, frame_h);
|
||||||
|
HSUB_UB2_SH(frame_l, frame_h, diff0, diff1);
|
||||||
|
UNPCK_SH_SW(diff0, diff0_r, diff0_l);
|
||||||
|
UNPCK_SH_SW(diff1, diff1_r, diff1_l);
|
||||||
|
MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l,
|
||||||
|
diff1_l, mod0_w, mod1_w, mod2_w, mod3_w);
|
||||||
|
MUL4(mod0_w, const3, mod1_w, const3, mod2_w, const3, mod3_w, const3,
|
||||||
|
mod0_w, mod1_w, mod2_w, mod3_w);
|
||||||
|
SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength);
|
||||||
|
diff0_r = (mod0_w < const16);
|
||||||
|
diff0_l = (mod1_w < const16);
|
||||||
|
diff1_r = (mod2_w < const16);
|
||||||
|
diff1_l = (mod3_w < const16);
|
||||||
|
SUB4(const16, mod0_w, const16, mod1_w, const16, mod2_w, const16, mod3_w,
|
||||||
|
mod0_w, mod1_w, mod2_w, mod3_w);
|
||||||
|
mod0_w = diff0_r & mod0_w;
|
||||||
|
mod1_w = diff0_l & mod1_w;
|
||||||
|
mod2_w = diff1_r & mod2_w;
|
||||||
|
mod3_w = diff1_l & mod3_w;
|
||||||
|
MUL4(mod0_w, filter_wt, mod1_w, filter_wt, mod2_w, filter_wt, mod3_w,
|
||||||
|
filter_wt, mod0_w, mod1_w, mod2_w, mod3_w);
|
||||||
|
PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h);
|
||||||
|
ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h);
|
||||||
|
ST_SH2(mod0_h, mod1_h, cnt, 8);
|
||||||
|
cnt += 16;
|
||||||
|
|
||||||
|
UNPCK_UB_SH(frame2, frame2_0_h, frame2_1_h);
|
||||||
|
UNPCK_SH_SW(frame2_0_h, frame2_0, frame2_1);
|
||||||
|
UNPCK_SH_SW(frame2_1_h, frame2_2, frame2_3);
|
||||||
|
MUL4(mod0_w, frame2_0, mod1_w, frame2_1, mod2_w, frame2_2, mod3_w,
|
||||||
|
frame2_3, mod0_w, mod1_w, mod2_w, mod3_w);
|
||||||
|
ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3,
|
||||||
|
mod0_w, mod1_w, mod2_w, mod3_w);
|
||||||
|
ST_SW2(mod0_w, mod1_w, acc, 4);
|
||||||
|
ST_SW2(mod2_w, mod3_w, acc + 8, 4);
|
||||||
|
acc += 16;
|
||||||
|
|
||||||
|
LD_SW2(acc, 4, acc0, acc1);
|
||||||
|
LD_SW2(acc + 8, 4, acc2, acc3);
|
||||||
|
LD_SH2(cnt, 8, cnt0, cnt1);
|
||||||
|
ILVRL_B2_UB(frame3, frame4, frame_l, frame_h);
|
||||||
|
HSUB_UB2_SH(frame_l, frame_h, diff0, diff1);
|
||||||
|
UNPCK_SH_SW(diff0, diff0_r, diff0_l);
|
||||||
|
UNPCK_SH_SW(diff1, diff1_r, diff1_l);
|
||||||
|
MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l,
|
||||||
|
diff1_l, mod0_w, mod1_w, mod2_w, mod3_w);
|
||||||
|
MUL4(mod0_w, const3, mod1_w, const3, mod2_w, const3, mod3_w, const3,
|
||||||
|
mod0_w, mod1_w, mod2_w, mod3_w);
|
||||||
|
SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength);
|
||||||
|
diff0_r = (mod0_w < const16);
|
||||||
|
diff0_l = (mod1_w < const16);
|
||||||
|
diff1_r = (mod2_w < const16);
|
||||||
|
diff1_l = (mod3_w < const16);
|
||||||
|
SUB4(const16, mod0_w, const16, mod1_w, const16, mod2_w, const16, mod3_w,
|
||||||
|
mod0_w, mod1_w, mod2_w, mod3_w);
|
||||||
|
mod0_w = diff0_r & mod0_w;
|
||||||
|
mod1_w = diff0_l & mod1_w;
|
||||||
|
mod2_w = diff1_r & mod2_w;
|
||||||
|
mod3_w = diff1_l & mod3_w;
|
||||||
|
MUL4(mod0_w, filter_wt, mod1_w, filter_wt, mod2_w, filter_wt, mod3_w,
|
||||||
|
filter_wt, mod0_w, mod1_w, mod2_w, mod3_w);
|
||||||
|
PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h);
|
||||||
|
ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h);
|
||||||
|
ST_SH2(mod0_h, mod1_h, cnt, 8);
|
||||||
|
cnt += 16;
|
||||||
|
|
||||||
|
UNPCK_UB_SH(frame4, frame2_0_h, frame2_1_h);
|
||||||
|
UNPCK_SH_SW(frame2_0_h, frame2_0, frame2_1);
|
||||||
|
UNPCK_SH_SW(frame2_1_h, frame2_2, frame2_3);
|
||||||
|
MUL4(mod0_w, frame2_0, mod1_w, frame2_1, mod2_w, frame2_2, mod3_w,
|
||||||
|
frame2_3, mod0_w, mod1_w, mod2_w, mod3_w);
|
||||||
|
ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3,
|
||||||
|
mod0_w, mod1_w, mod2_w, mod3_w);
|
||||||
|
ST_SW2(mod0_w, mod1_w, acc, 4);
|
||||||
|
ST_SW2(mod2_w, mod3_w, acc + 8, 4);
|
||||||
|
acc += 16;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Temporal filter entry point for MSA builds.
 *
 * Blocks of 8x8 and 16x16 pixels are handed to the vector helpers; every
 * other block_size is handled by the scalar loop below.
 *
 * frame1        - source pixels; rows are 'stride' bytes apart
 * frame2        - predictor pixels, read sequentially
 * block_size    - width and height of the block in pixels
 * strength      - right-shift applied to the scaled squared difference
 * filter_weight - multiplier applied to the per-pixel modifier
 * accumulator   - block_size * block_size sums, updated in place
 * count         - block_size * block_size weights, updated in place
 */
void vp8_temporal_filter_apply_msa(uint8_t *frame1, uint32_t stride,
                                   uint8_t *frame2, uint32_t block_size,
                                   int32_t strength, int32_t filter_weight,
                                   uint32_t *accumulator, uint16_t *count)
{
    uint32_t row, col, idx;
    int32_t src_off;
    int32_t round_term;

    switch (block_size)
    {
        case 8:
            temporal_filter_apply_8size_msa(frame1, stride, frame2, strength,
                                            filter_weight, accumulator,
                                            count);
            return;

        case 16:
            temporal_filter_apply_16size_msa(frame1, stride, frame2, strength,
                                             filter_weight, accumulator,
                                             count);
            return;

        default:
            break;
    }

    /* Half of the shift divisor, so the shift below rounds to nearest. */
    round_term = (strength > 0) ? (1 << (strength - 1)) : 0;
    src_off = 0;
    idx = 0;

    for (row = 0; row < block_size; ++row)
    {
        for (col = 0; col < block_size; ++col)
        {
            const int src_px = frame1[src_off];
            const int pred_px = frame2[idx];
            int32_t weight = src_px - pred_px;

            weight *= weight;
            weight *= 3;
            weight += round_term;
            weight >>= strength;

            /* Clamp to 16, then invert: larger differences weigh less. */
            weight = (weight > 16) ? 0 : (16 - weight);
            weight *= filter_weight;

            count[idx] += weight;
            accumulator[idx] += weight * pred_px;

            ++src_off;
            ++idx;
        }

        /* Skip from the end of this source row to the start of the next. */
        src_off += stride - block_size;
    }
}
|
@ -106,5 +106,10 @@ endif
|
|||||||
VP8_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/dct_msa.c
|
VP8_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/dct_msa.c
|
||||||
VP8_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/encodeopt_msa.c
|
VP8_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/encodeopt_msa.c
|
||||||
VP8_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/quantize_msa.c
|
VP8_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/quantize_msa.c
|
||||||
|
VP8_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/temporal_filter_msa.c
|
||||||
|
|
||||||
|
ifeq ($(CONFIG_REALTIME_ONLY),yes)
|
||||||
|
VP8_CX_SRCS_REMOVE-$(HAVE_MSA) += encoder/mips/msa/temporal_filter_msa.c
|
||||||
|
endif
|
||||||
|
|
||||||
VP8_CX_SRCS-yes := $(filter-out $(VP8_CX_SRCS_REMOVE-yes),$(VP8_CX_SRCS-yes))
|
VP8_CX_SRCS-yes := $(filter-out $(VP8_CX_SRCS_REMOVE-yes),$(VP8_CX_SRCS-yes))
|
||||||
|
Loading…
Reference in New Issue
Block a user