mips msa vp8 temporal filter optimization

average improvement ~2x-3x

Change-Id: I05593bed583234dc7809aaec6cab82773a29505d
Author: Parag Salasakar, 2015-07-31 12:03:19 +05:30
Parent: 0e3f494b21
Commit: 8fbc641540
4 changed files, 401 insertions(+), 1 deletion(-)

File: vp8/common/mips/msa/vp8_macros_msa.h

@@ -24,6 +24,10 @@
#define LD_UH(...) LD_H(v8u16, __VA_ARGS__)
#define LD_SH(...) LD_H(v8i16, __VA_ARGS__)
#define LD_W(RTYPE, psrc) *((const RTYPE *)(psrc))
#define LD_UW(...) LD_W(v4u32, __VA_ARGS__)
#define LD_SW(...) LD_W(v4i32, __VA_ARGS__)
#define ST_B(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in)
#define ST_UB(...) ST_B(v16u8, __VA_ARGS__)
#define ST_SB(...) ST_B(v16i8, __VA_ARGS__)
@@ -32,6 +36,9 @@
#define ST_UH(...) ST_H(v8u16, __VA_ARGS__)
#define ST_SH(...) ST_H(v8i16, __VA_ARGS__)
#define ST_W(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in)
#define ST_SW(...) ST_W(v4i32, __VA_ARGS__)
#if (__mips_isa_rev >= 6)
#define LW(psrc) \
({ \
@@ -337,6 +344,17 @@
}
#define LD_SH4(...) LD_H4(v8i16, __VA_ARGS__)
/* Description : Load 2 vectors of signed word elements with stride
   Arguments   : Inputs  - psrc, stride
                 Outputs - out0, out1
                 Return Type - signed word
*/
#define LD_SW2(psrc, stride, out0, out1)  \
{                                         \
    out0 = LD_SW((psrc));                 \
    out1 = LD_SW((psrc) + stride);        \
}

/* Description : Store vectors of 16 byte elements with stride
   Arguments   : Inputs - in0, in1, pdst, stride
   Details     : Store 16 byte elements from 'in0' to (pdst)
@@ -377,6 +395,17 @@
}
#define ST_SH2(...) ST_H2(v8i16, __VA_ARGS__)
/* Description : Store vectors of word elements with stride
   Arguments   : Inputs - in0, in1, pdst, stride
   Details     : Store 4 word elements from 'in0' to (pdst)
                 Store 4 word elements from 'in1' to (pdst + stride)
*/
#define ST_SW2(in0, in1, pdst, stride)  \
{                                       \
    ST_SW(in0, (pdst));                 \
    ST_SW(in1, (pdst) + stride);        \
}
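A minimal usage sketch of this load/store pair (hypothetical buffers; note the stride is in word elements, since it is added to the typed pointer):

    v4i32 v0, v1;
    int32_t src[8], dst[8];    /* hypothetical buffers */
    LD_SW2(src, 4, v0, v1);    /* v0 = src[0..3], v1 = src[4..7] */
    ST_SW2(v0, v1, dst, 4);    /* dst[0..3] = v0, dst[4..7] = v1 */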
/* Description : Store 2x4 byte block to destination memory from input vector
   Arguments   : Inputs - in, stidx, pdst, stride
   Details     : Index 'stidx' halfword element from 'in' vector is copied to
@@ -1099,6 +1128,38 @@
#define ILVRL_W2_SH(...) ILVRL_W2(v8i16, __VA_ARGS__)
#define ILVRL_W2_SW(...) ILVRL_W2(v4i32, __VA_ARGS__)
/* Description : Maximum values between signed elements of vector and
                 5-bit signed immediate value are copied to the output vector
   Arguments   : Inputs  - in0, in1, max_val
                 Outputs - in place operation
                 Return Type - as per RTYPE
   Details     : Maximum of signed halfword element values from 'in0' and
                 'max_val' are written in place
*/
#define MAXI_SH2(RTYPE, in0, in1, max_val)               \
{                                                        \
    in0 = (RTYPE)__msa_maxi_s_h((v8i16)in0, (max_val));  \
    in1 = (RTYPE)__msa_maxi_s_h((v8i16)in1, (max_val));  \
}
#define MAXI_SH2_SH(...) MAXI_SH2(v8i16, __VA_ARGS__)
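For illustration, clamping all halfword elements of two vectors to be non-negative (the immediate must fit in 5 signed bits, -16..15):

    v8i16 a, b;              /* hypothetical operands */
    MAXI_SH2_SH(a, b, 0);    /* each element becomes max(element, 0) */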
/* Description : Saturate the halfword element values to the max
                 unsigned value of (sat_val + 1) bits
                 The element data width remains unchanged
   Arguments   : Inputs  - in0, in1, sat_val
                 Outputs - in place operation
                 Return Type - as per RTYPE
   Details     : Each unsigned halfword element from 'in0' is saturated to the
                 value generated with (sat_val + 1) bit range.
                 The results are written in place
*/
#define SAT_UH2(RTYPE, in0, in1, sat_val)             \
{                                                     \
    in0 = (RTYPE)__msa_sat_u_h((v8u16)in0, sat_val);  \
    in1 = (RTYPE)__msa_sat_u_h((v8u16)in1, sat_val);  \
}
#define SAT_UH2_SH(...) SAT_UH2(v8i16, __VA_ARGS__)
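For illustration, sat_val = 7 saturates each unsigned halfword element to the 8-bit maximum:

    SAT_UH2_SH(a, b, 7);    /* elements clamped to [0, 255] */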
/* Description : Saturate the halfword element values to the max
                 unsigned value of (sat_val + 1) bits
                 The element data width remains unchanged
@@ -1323,6 +1384,29 @@
    in3 = in3 >> shift;  \
}

/* Description : Shift right arithmetic rounded words
   Arguments   : Inputs  - in0, in1, shift
                 Outputs - in place operation
                 Return Type - as per RTYPE
   Details     : Each element of vector 'in0' is shifted right arithmetically
                 by the number of bits in the corresponding element of the
                 vector 'shift'. The last discarded bit is added to the
                 shifted value for rounding, and the result is written
                 in place. 'shift' is a vector.
*/
#define SRAR_W2(RTYPE, in0, in1, shift)                   \
{                                                         \
    in0 = (RTYPE)__msa_srar_w((v4i32)in0, (v4i32)shift);  \
    in1 = (RTYPE)__msa_srar_w((v4i32)in1, (v4i32)shift);  \
}

#define SRAR_W4(RTYPE, in0, in1, in2, in3, shift)  \
{                                                  \
    SRAR_W2(RTYPE, in0, in1, shift);               \
    SRAR_W2(RTYPE, in2, in3, shift);               \
}
#define SRAR_W4_SW(...) SRAR_W4(v4i32, __VA_ARGS__)
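A quick numeric check of the rounding behaviour (illustrative values):

    v4i32 v0 = __msa_fill_w(7), v1 = v0, v2 = v0, v3 = v0;
    v4i32 sh = __msa_fill_w(2);
    SRAR_W4_SW(v0, v1, v2, v3, sh);
    /* 7 >> 2 = 1 and the last discarded bit is 1, so each element becomes 1 + 1 = 2 */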
/* Description : Shift right arithmetic rounded (immediate)
   Arguments   : Inputs  - in0, in1, shift
                 Outputs - in place operation
@@ -1408,6 +1492,14 @@
    out0 = in0 - in1;  \
    out1 = in2 - in3;  \
}
#define SUB4(in0, in1, in2, in3, in4, in5, in6, in7,  \
             out0, out1, out2, out3)                  \
{                                                     \
    out0 = in0 - in1;                                 \
    out1 = in2 - in3;                                 \
    out2 = in4 - in5;                                 \
    out3 = in6 - in7;                                 \
}

/* Description : Sign extend halfword elements from right half of the vector
   Arguments   : Input - in (halfword vector)

File: vp8/common/rtcd_defs.pl

@@ -326,7 +326,7 @@ $vp8_diamond_search_sad_sse3=vp8_diamond_search_sadx4;
#
if (vpx_config("CONFIG_REALTIME_ONLY") ne "yes") {
add_proto qw/void vp8_temporal_filter_apply/, "unsigned char *frame1, unsigned int stride, unsigned char *frame2, unsigned int block_size, int strength, int filter_weight, unsigned int *accumulator, unsigned short *count";
specialize qw/vp8_temporal_filter_apply sse2/;
specialize qw/vp8_temporal_filter_apply sse2 msa/;
}
#
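For context, specialize registers the optimized variants the rtcd generator may dispatch to. On a MIPS target built with MSA (no runtime CPU detection), the generated vp8_rtcd.h resolves the name statically; roughly, as an illustrative sketch of the generator's output, not part of this patch:

    void vp8_temporal_filter_apply_c(unsigned char *frame1, unsigned int stride,
                                     unsigned char *frame2, unsigned int block_size,
                                     int strength, int filter_weight,
                                     unsigned int *accumulator,
                                     unsigned short *count);
    void vp8_temporal_filter_apply_msa(unsigned char *frame1, unsigned int stride,
                                       unsigned char *frame2, unsigned int block_size,
                                       int strength, int filter_weight,
                                       unsigned int *accumulator,
                                       unsigned short *count);
    #define vp8_temporal_filter_apply vp8_temporal_filter_apply_msa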

File: vp8/encoder/mips/msa/temporal_filter_msa.c (new file)

@@ -0,0 +1,303 @@
/*
* Copyright (c) 2015 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "./vp8_rtcd.h"
#include "vp8/common/mips/msa/vp8_macros_msa.h"

static void temporal_filter_apply_16size_msa(uint8_t *frame1_ptr,
                                             uint32_t stride,
                                             uint8_t *frame2_ptr,
                                             int32_t strength_in,
                                             int32_t filter_wt_in,
                                             uint32_t *acc, uint16_t *cnt)
{
    uint32_t row;
    v16i8 frame1_0_b, frame1_1_b, frame2_0_b, frame2_1_b;
    v16u8 frame_l, frame_h;
    v16i8 zero = { 0 };
    v8i16 frame2_0_h, frame2_1_h, mod0_h, mod1_h;
    v8i16 diff0, diff1, cnt0, cnt1;
    v4i32 const3, const16, filter_wt, strength;
    v4i32 mod0_w, mod1_w, mod2_w, mod3_w;
    v4i32 diff0_r, diff0_l, diff1_r, diff1_l;
    v4i32 frame2_0, frame2_1, frame2_2, frame2_3;
    v4i32 acc0, acc1, acc2, acc3;

    filter_wt = __msa_fill_w(filter_wt_in);
    strength = __msa_fill_w(strength_in);
    const3 = __msa_ldi_w(3);
    const16 = __msa_ldi_w(16);
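    /* Per pixel (see the scalar fallback at the end of this file):
     * diff = frame1 - frame2; mod = rounded((diff * diff * 3) >> strength);
     * mod = (16 - min(mod, 16)) * filter_weight;
     * then count += mod and accumulator += mod * frame2 pixel,
     * computed here 16 pixels per pass, two rows per loop iteration. */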
    for (row = 8; row--;)
    {
        frame1_0_b = LD_SB(frame1_ptr);
        frame2_0_b = LD_SB(frame2_ptr);
        frame1_ptr += stride;
        frame2_ptr += 16;
        frame1_1_b = LD_SB(frame1_ptr);
        frame2_1_b = LD_SB(frame2_ptr);
        LD_SW2(acc, 4, acc0, acc1);
        LD_SW2(acc + 8, 4, acc2, acc3);
        LD_SH2(cnt, 8, cnt0, cnt1);
        ILVRL_B2_UB(frame1_0_b, frame2_0_b, frame_l, frame_h);
        HSUB_UB2_SH(frame_l, frame_h, diff0, diff1);
        UNPCK_SH_SW(diff0, diff0_r, diff0_l);
        UNPCK_SH_SW(diff1, diff1_r, diff1_l);
        MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l,
             diff1_l, mod0_w, mod1_w, mod2_w, mod3_w);
        MUL4(mod0_w, const3, mod1_w, const3, mod2_w, const3, mod3_w, const3,
             mod0_w, mod1_w, mod2_w, mod3_w);
        SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength);
        diff0_r = (mod0_w < const16);
        diff0_l = (mod1_w < const16);
        diff1_r = (mod2_w < const16);
        diff1_l = (mod3_w < const16);
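        /* Each compare yields an all-ones lane where mod < 16.  ANDing that
         * mask with (16 - mod) below is a branchless form of the scalar
         * "if (modifier > 16) modifier = 16; modifier = 16 - modifier;". */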
        SUB4(const16, mod0_w, const16, mod1_w, const16, mod2_w, const16, mod3_w,
             mod0_w, mod1_w, mod2_w, mod3_w);
        mod0_w = diff0_r & mod0_w;
        mod1_w = diff0_l & mod1_w;
        mod2_w = diff1_r & mod2_w;
        mod3_w = diff1_l & mod3_w;
        MUL4(mod0_w, filter_wt, mod1_w, filter_wt, mod2_w, filter_wt, mod3_w,
             filter_wt, mod0_w, mod1_w, mod2_w, mod3_w);
        PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h);
        ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h);
        ST_SH2(mod0_h, mod1_h, cnt, 8);
        cnt += 16;
        ILVRL_B2_SH(zero, frame2_0_b, frame2_0_h, frame2_1_h);
        UNPCK_SH_SW(frame2_0_h, frame2_0, frame2_1);
        UNPCK_SH_SW(frame2_1_h, frame2_2, frame2_3);
        MUL4(mod0_w, frame2_0, mod1_w, frame2_1, mod2_w, frame2_2, mod3_w,
             frame2_3, mod0_w, mod1_w, mod2_w, mod3_w);
        ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3,
             mod0_w, mod1_w, mod2_w, mod3_w);
        ST_SW2(mod0_w, mod1_w, acc, 4);
        ST_SW2(mod2_w, mod3_w, acc + 8, 4);
        acc += 16;

        LD_SW2(acc, 4, acc0, acc1);
        LD_SW2(acc + 8, 4, acc2, acc3);
        LD_SH2(cnt, 8, cnt0, cnt1);
        ILVRL_B2_UB(frame1_1_b, frame2_1_b, frame_l, frame_h);
        HSUB_UB2_SH(frame_l, frame_h, diff0, diff1);
        UNPCK_SH_SW(diff0, diff0_r, diff0_l);
        UNPCK_SH_SW(diff1, diff1_r, diff1_l);
        MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l,
             diff1_l, mod0_w, mod1_w, mod2_w, mod3_w);
        MUL4(mod0_w, const3, mod1_w, const3, mod2_w, const3, mod3_w, const3,
             mod0_w, mod1_w, mod2_w, mod3_w);
        SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength);
        diff0_r = (mod0_w < const16);
        diff0_l = (mod1_w < const16);
        diff1_r = (mod2_w < const16);
        diff1_l = (mod3_w < const16);
        SUB4(const16, mod0_w, const16, mod1_w, const16, mod2_w, const16, mod3_w,
             mod0_w, mod1_w, mod2_w, mod3_w);
        mod0_w = diff0_r & mod0_w;
        mod1_w = diff0_l & mod1_w;
        mod2_w = diff1_r & mod2_w;
        mod3_w = diff1_l & mod3_w;
        MUL4(mod0_w, filter_wt, mod1_w, filter_wt, mod2_w, filter_wt, mod3_w,
             filter_wt, mod0_w, mod1_w, mod2_w, mod3_w);
        PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h);
        ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h);
        ST_SH2(mod0_h, mod1_h, cnt, 8);
        cnt += 16;
        UNPCK_UB_SH(frame2_1_b, frame2_0_h, frame2_1_h);
        UNPCK_SH_SW(frame2_0_h, frame2_0, frame2_1);
        UNPCK_SH_SW(frame2_1_h, frame2_2, frame2_3);
        MUL4(mod0_w, frame2_0, mod1_w, frame2_1, mod2_w, frame2_2, mod3_w,
             frame2_3, mod0_w, mod1_w, mod2_w, mod3_w);
        ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3,
             mod0_w, mod1_w, mod2_w, mod3_w);
        ST_SW2(mod0_w, mod1_w, acc, 4);
        ST_SW2(mod2_w, mod3_w, acc + 8, 4);
        acc += 16;
        frame1_ptr += stride;
        frame2_ptr += 16;
    }
}

static void temporal_filter_apply_8size_msa(uint8_t *frame1_ptr,
                                            uint32_t stride,
                                            uint8_t *frame2_ptr,
                                            int32_t strength_in,
                                            int32_t filter_wt_in,
                                            uint32_t *acc, uint16_t *cnt)
{
    uint32_t row;
    uint64_t f0, f1, f2, f3, f4, f5, f6, f7;
    v16i8 frame1 = { 0 };
    v16i8 frame2 = { 0 };
    v16i8 frame3 = { 0 };
    v16i8 frame4 = { 0 };
    v16u8 frame_l, frame_h;
    v8i16 frame2_0_h, frame2_1_h, mod0_h, mod1_h;
    v8i16 diff0, diff1, cnt0, cnt1;
    v4i32 const3, const16;
    v4i32 filter_wt, strength;
    v4i32 mod0_w, mod1_w, mod2_w, mod3_w;
    v4i32 diff0_r, diff0_l, diff1_r, diff1_l;
    v4i32 frame2_0, frame2_1, frame2_2, frame2_3;
    v4i32 acc0, acc1, acc2, acc3;

    filter_wt = __msa_fill_w(filter_wt_in);
    strength = __msa_fill_w(strength_in);
    const3 = __msa_ldi_w(3);
    const16 = __msa_ldi_w(16);

    for (row = 2; row--;)
    {
        LD2(frame1_ptr, stride, f0, f1);
        frame1_ptr += (2 * stride);
        LD2(frame2_ptr, 8, f2, f3);
        frame2_ptr += 16;
        LD2(frame1_ptr, stride, f4, f5);
        frame1_ptr += (2 * stride);
        LD2(frame2_ptr, 8, f6, f7);
        frame2_ptr += 16;
        LD_SW2(acc, 4, acc0, acc1);
        LD_SW2(acc + 8, 4, acc2, acc3);
        LD_SH2(cnt, 8, cnt0, cnt1);
        INSERT_D2_SB(f0, f1, frame1);
        INSERT_D2_SB(f2, f3, frame2);
        INSERT_D2_SB(f4, f5, frame3);
        INSERT_D2_SB(f6, f7, frame4);
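        /* Each 16-byte vector now packs two consecutive 8-pixel rows, so this
         * 8-wide path reuses the same 16-element pipeline as the 16-wide case. */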
        ILVRL_B2_UB(frame1, frame2, frame_l, frame_h);
        HSUB_UB2_SH(frame_l, frame_h, diff0, diff1);
        UNPCK_SH_SW(diff0, diff0_r, diff0_l);
        UNPCK_SH_SW(diff1, diff1_r, diff1_l);
        MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l,
             diff1_l, mod0_w, mod1_w, mod2_w, mod3_w);
        MUL4(mod0_w, const3, mod1_w, const3, mod2_w, const3, mod3_w, const3,
             mod0_w, mod1_w, mod2_w, mod3_w);
        SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength);
        diff0_r = (mod0_w < const16);
        diff0_l = (mod1_w < const16);
        diff1_r = (mod2_w < const16);
        diff1_l = (mod3_w < const16);
        SUB4(const16, mod0_w, const16, mod1_w, const16, mod2_w, const16, mod3_w,
             mod0_w, mod1_w, mod2_w, mod3_w);
        mod0_w = diff0_r & mod0_w;
        mod1_w = diff0_l & mod1_w;
        mod2_w = diff1_r & mod2_w;
        mod3_w = diff1_l & mod3_w;
        MUL4(mod0_w, filter_wt, mod1_w, filter_wt, mod2_w, filter_wt, mod3_w,
             filter_wt, mod0_w, mod1_w, mod2_w, mod3_w);
        PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h);
        ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h);
        ST_SH2(mod0_h, mod1_h, cnt, 8);
        cnt += 16;
        UNPCK_UB_SH(frame2, frame2_0_h, frame2_1_h);
        UNPCK_SH_SW(frame2_0_h, frame2_0, frame2_1);
        UNPCK_SH_SW(frame2_1_h, frame2_2, frame2_3);
        MUL4(mod0_w, frame2_0, mod1_w, frame2_1, mod2_w, frame2_2, mod3_w,
             frame2_3, mod0_w, mod1_w, mod2_w, mod3_w);
        ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3,
             mod0_w, mod1_w, mod2_w, mod3_w);
        ST_SW2(mod0_w, mod1_w, acc, 4);
        ST_SW2(mod2_w, mod3_w, acc + 8, 4);
        acc += 16;

        LD_SW2(acc, 4, acc0, acc1);
        LD_SW2(acc + 8, 4, acc2, acc3);
        LD_SH2(cnt, 8, cnt0, cnt1);
        ILVRL_B2_UB(frame3, frame4, frame_l, frame_h);
        HSUB_UB2_SH(frame_l, frame_h, diff0, diff1);
        UNPCK_SH_SW(diff0, diff0_r, diff0_l);
        UNPCK_SH_SW(diff1, diff1_r, diff1_l);
        MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l,
             diff1_l, mod0_w, mod1_w, mod2_w, mod3_w);
        MUL4(mod0_w, const3, mod1_w, const3, mod2_w, const3, mod3_w, const3,
             mod0_w, mod1_w, mod2_w, mod3_w);
        SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength);
        diff0_r = (mod0_w < const16);
        diff0_l = (mod1_w < const16);
        diff1_r = (mod2_w < const16);
        diff1_l = (mod3_w < const16);
        SUB4(const16, mod0_w, const16, mod1_w, const16, mod2_w, const16, mod3_w,
             mod0_w, mod1_w, mod2_w, mod3_w);
        mod0_w = diff0_r & mod0_w;
        mod1_w = diff0_l & mod1_w;
        mod2_w = diff1_r & mod2_w;
        mod3_w = diff1_l & mod3_w;
        MUL4(mod0_w, filter_wt, mod1_w, filter_wt, mod2_w, filter_wt, mod3_w,
             filter_wt, mod0_w, mod1_w, mod2_w, mod3_w);
        PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h);
        ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h);
        ST_SH2(mod0_h, mod1_h, cnt, 8);
        cnt += 16;
        UNPCK_UB_SH(frame4, frame2_0_h, frame2_1_h);
        UNPCK_SH_SW(frame2_0_h, frame2_0, frame2_1);
        UNPCK_SH_SW(frame2_1_h, frame2_2, frame2_3);
        MUL4(mod0_w, frame2_0, mod1_w, frame2_1, mod2_w, frame2_2, mod3_w,
             frame2_3, mod0_w, mod1_w, mod2_w, mod3_w);
        ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3,
             mod0_w, mod1_w, mod2_w, mod3_w);
        ST_SW2(mod0_w, mod1_w, acc, 4);
        ST_SW2(mod2_w, mod3_w, acc + 8, 4);
        acc += 16;
    }
}

void vp8_temporal_filter_apply_msa(uint8_t *frame1, uint32_t stride,
                                   uint8_t *frame2, uint32_t block_size,
                                   int32_t strength, int32_t filter_weight,
                                   uint32_t *accumulator, uint16_t *count)
{
    if (8 == block_size)
    {
        temporal_filter_apply_8size_msa(frame1, stride, frame2, strength,
                                        filter_weight, accumulator, count);
    }
    else if (16 == block_size)
    {
        temporal_filter_apply_16size_msa(frame1, stride, frame2, strength,
                                         filter_weight, accumulator, count);
    }
    else
    {
        uint32_t i, j, k;
        int32_t modifier;
        int32_t byte = 0;
        const int32_t rounding = strength > 0 ? 1 << (strength - 1) : 0;

        for (i = 0, k = 0; i < block_size; ++i)
        {
            for (j = 0; j < block_size; ++j, ++k)
            {
                int src_byte = frame1[byte];
                int pixel_value = *frame2++;

                modifier = src_byte - pixel_value;
                modifier *= modifier;
                modifier *= 3;
                modifier += rounding;
                modifier >>= strength;

                if (modifier > 16) modifier = 16;

                modifier = 16 - modifier;
                modifier *= filter_weight;

                count[k] += modifier;
                accumulator[k] += modifier * pixel_value;

                byte++;
            }

            byte += stride - block_size;
        }
    }
}
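As a sanity check on the arithmetic (illustrative values): with strength = 6 and filter_weight = 2, a source/predictor pixel pair of 110 and 100 gives modifier = 10 * 10 * 3 = 300; adding the rounding term 1 << 5 = 32 and shifting right by 6 yields 5; 16 - 5 = 11, times the filter weight gives 22, so count[k] increases by 22 and accumulator[k] by 22 * 100 = 2200. The vector paths above compute exactly this, 16 pixels at a time.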

File: vp8/vp8cx.mk

@@ -106,5 +106,10 @@ endif
VP8_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/dct_msa.c
VP8_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/encodeopt_msa.c
VP8_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/quantize_msa.c
VP8_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/temporal_filter_msa.c
ifeq ($(CONFIG_REALTIME_ONLY),yes)
VP8_CX_SRCS_REMOVE-$(HAVE_MSA) += encoder/mips/msa/temporal_filter_msa.c
endif
VP8_CX_SRCS-yes := $(filter-out $(VP8_CX_SRCS_REMOVE-yes),$(VP8_CX_SRCS-yes))
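The REMOVE guard mirrors the rtcd change above: vp8_temporal_filter_apply is only declared when CONFIG_REALTIME_ONLY is off, so the MSA source is dropped from realtime-only builds, where nothing would reference it.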