From 8fbc641540d78000be05a5b0357d9b04e1cb2132 Mon Sep 17 00:00:00 2001 From: Parag Salasakar Date: Fri, 31 Jul 2015 12:03:19 +0530 Subject: [PATCH] mips msa vp8 temporal filter optimization average improvement ~2x-3x Change-Id: I05593bed583234dc7809aaec6cab82773a29505d --- vp8/common/mips/msa/vp8_macros_msa.h | 92 +++++++ vp8/common/rtcd_defs.pl | 2 +- vp8/encoder/mips/msa/temporal_filter_msa.c | 303 +++++++++++++++++++++ vp8/vp8cx.mk | 5 + 4 files changed, 401 insertions(+), 1 deletion(-) create mode 100644 vp8/encoder/mips/msa/temporal_filter_msa.c diff --git a/vp8/common/mips/msa/vp8_macros_msa.h b/vp8/common/mips/msa/vp8_macros_msa.h index 0486348be..27d592995 100644 --- a/vp8/common/mips/msa/vp8_macros_msa.h +++ b/vp8/common/mips/msa/vp8_macros_msa.h @@ -24,6 +24,10 @@ #define LD_UH(...) LD_H(v8u16, __VA_ARGS__) #define LD_SH(...) LD_H(v8i16, __VA_ARGS__) +#define LD_W(RTYPE, psrc) *((const RTYPE *)(psrc)) +#define LD_UW(...) LD_W(v4u32, __VA_ARGS__) +#define LD_SW(...) LD_W(v4i32, __VA_ARGS__) + #define ST_B(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in) #define ST_UB(...) ST_B(v16u8, __VA_ARGS__) #define ST_SB(...) ST_B(v16i8, __VA_ARGS__) @@ -32,6 +36,9 @@ #define ST_UH(...) ST_H(v8u16, __VA_ARGS__) #define ST_SH(...) ST_H(v8i16, __VA_ARGS__) +#define ST_W(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in) +#define ST_SW(...) ST_W(v4i32, __VA_ARGS__) + #if (__mips_isa_rev >= 6) #define LW(psrc) \ ({ \ @@ -337,6 +344,17 @@ } #define LD_SH4(...) LD_H4(v8i16, __VA_ARGS__) +/* Description : Load 2 vectors of signed word elements with stride + Arguments : Inputs - psrc, stride + Outputs - out0, out1 + Return Type - signed word +*/ +#define LD_SW2(psrc, stride, out0, out1) \ +{ \ + out0 = LD_SW((psrc)); \ + out1 = LD_SW((psrc) + stride); \ +} + /* Description : Store vectors of 16 byte elements with stride Arguments : Inputs - in0, in1, pdst, stride Details : Store 16 byte elements from 'in0' to (pdst) @@ -377,6 +395,17 @@ } #define ST_SH2(...) 
ST_H2(v8i16, __VA_ARGS__) +/* Description : Store vectors of word elements with stride + Arguments : Inputs - in0, in1, pdst, stride + Details : Store 4 word elements from 'in0' to (pdst) + Store 4 word elements from 'in1' to (pdst + stride) +*/ +#define ST_SW2(in0, in1, pdst, stride) \ +{ \ + ST_SW(in0, (pdst)); \ + ST_SW(in1, (pdst) + stride); \ +} + /* Description : Store 2x4 byte block to destination memory from input vector Arguments : Inputs - in, stidx, pdst, stride Details : Index 'stidx' halfword element from 'in' vector is copied to @@ -1099,6 +1128,38 @@ #define ILVRL_W2_SH(...) ILVRL_W2(v8i16, __VA_ARGS__) #define ILVRL_W2_SW(...) ILVRL_W2(v4i32, __VA_ARGS__) +/* Description : Maximum values between signed elements of vector and + 5-bit signed immediate value are copied to the output vector + Arguments : Inputs - in0, in1, max_val + Outputs - in place operation + Return Type - as per RTYPE + Details : Maximum of each signed halfword element of 'in0' / 'in1' and + 'max_val' is written in place +*/ +#define MAXI_SH2(RTYPE, in0, in1, max_val) \ +{ \ + in0 = (RTYPE)__msa_maxi_s_h((v8i16)in0, (max_val)); \ + in1 = (RTYPE)__msa_maxi_s_h((v8i16)in1, (max_val)); \ +} +#define MAXI_SH2_SH(...) MAXI_SH2(v8i16, __VA_ARGS__) + +/* Description : Saturate the halfword element values to the max + unsigned value of (sat_val + 1) bits + The element data width remains unchanged + Arguments : Inputs - in0, in1, sat_val + Outputs - in place operation + Return Type - as per RTYPE + Details : Each unsigned halfword element from 'in0' is saturated to the + value generated with (sat_val + 1) bit range. + The results are written in place +*/ +#define SAT_UH2(RTYPE, in0, in1, sat_val) \ +{ \ + in0 = (RTYPE)__msa_sat_u_h((v8u16)in0, sat_val); \ + in1 = (RTYPE)__msa_sat_u_h((v8u16)in1, sat_val); \ +} +#define SAT_UH2_SH(...)
SAT_UH2(v8i16, __VA_ARGS__) + /* Description : Saturate the halfword element values to the max unsigned value of (sat_val + 1) bits The element data width remains unchanged @@ -1323,6 +1384,29 @@ in3 = in3 >> shift; \ } +/* Description : Shift right arithmetic rounded words + Arguments : Inputs - in0, in1, shift + Outputs - in place operation + Return Type - as per RTYPE + Details : Each element of vector 'in0' is shifted right arithmetically by + the number of bits in the corresponding element in the vector + 'shift'. The last discarded bit is added to shifted value for + rounding and the result is written in-place. + 'shift' is a vector. +*/ +#define SRAR_W2(RTYPE, in0, in1, shift) \ +{ \ + in0 = (RTYPE)__msa_srar_w((v4i32)in0, (v4i32)shift); \ + in1 = (RTYPE)__msa_srar_w((v4i32)in1, (v4i32)shift); \ +} + +#define SRAR_W4(RTYPE, in0, in1, in2, in3, shift) \ +{ \ + SRAR_W2(RTYPE, in0, in1, shift); \ + SRAR_W2(RTYPE, in2, in3, shift); \ +} +#define SRAR_W4_SW(...) SRAR_W4(v4i32, __VA_ARGS__) + /* Description : Shift right arithmetic rounded (immediate) Arguments : Inputs - in0, in1, shift Outputs - in place operation @@ -1408,6 +1492,14 @@ out0 = in0 - in1; \ out1 = in2 - in3; \ } +#define SUB4(in0, in1, in2, in3, in4, in5, in6, in7, \ + out0, out1, out2, out3) \ +{ \ + out0 = in0 - in1; \ + out1 = in2 - in3; \ + out2 = in4 - in5; \ + out3 = in6 - in7; \ +} /* Description : Sign extend halfword elements from right half of the vector Arguments : Input - in (halfword vector) diff --git a/vp8/common/rtcd_defs.pl b/vp8/common/rtcd_defs.pl index 8261dd2ae..6fe070b42 100644 --- a/vp8/common/rtcd_defs.pl +++ b/vp8/common/rtcd_defs.pl @@ -326,7 +326,7 @@ $vp8_diamond_search_sad_sse3=vp8_diamond_search_sadx4; # if (vpx_config("CONFIG_REALTIME_ONLY") ne "yes") { add_proto qw/void vp8_temporal_filter_apply/, "unsigned char *frame1, unsigned int stride, unsigned char *frame2, unsigned int block_size, int strength, int filter_weight, unsigned int *accumulator, unsigned short 
*count"; - specialize qw/vp8_temporal_filter_apply sse2/; + specialize qw/vp8_temporal_filter_apply sse2 msa/; } # diff --git a/vp8/encoder/mips/msa/temporal_filter_msa.c b/vp8/encoder/mips/msa/temporal_filter_msa.c new file mode 100644 index 000000000..5cca5e087 --- /dev/null +++ b/vp8/encoder/mips/msa/temporal_filter_msa.c @@ -0,0 +1,303 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vp8_rtcd.h" +#include "vp8/common/mips/msa/vp8_macros_msa.h" + +static void temporal_filter_apply_16size_msa(uint8_t *frame1_ptr, + uint32_t stride, + uint8_t *frame2_ptr, + int32_t strength_in, + int32_t filter_wt_in, + uint32_t *acc, uint16_t *cnt) +{ + uint32_t row; + v16i8 frame1_0_b, frame1_1_b, frame2_0_b, frame2_1_b; + v16u8 frame_l, frame_h; + v16i8 zero = { 0 }; + v8i16 frame2_0_h, frame2_1_h, mod0_h, mod1_h; + v8i16 diff0, diff1, cnt0, cnt1; + v4i32 const3, const16, filter_wt, strength; + v4i32 mod0_w, mod1_w, mod2_w, mod3_w; + v4i32 diff0_r, diff0_l, diff1_r, diff1_l; + v4i32 frame2_0, frame2_1, frame2_2, frame2_3; + v4i32 acc0, acc1, acc2, acc3; + + filter_wt = __msa_fill_w(filter_wt_in); + strength = __msa_fill_w(strength_in); + const3 = __msa_ldi_w(3); + const16 = __msa_ldi_w(16); + + for (row = 8; row--;) + { + frame1_0_b = LD_SB(frame1_ptr); + frame2_0_b = LD_SB(frame2_ptr); + frame1_ptr += stride; + frame2_ptr += 16; + frame1_1_b = LD_SB(frame1_ptr); + frame2_1_b = LD_SB(frame2_ptr); + LD_SW2(acc, 4, acc0, acc1); + LD_SW2(acc + 8, 4, acc2, acc3); + LD_SH2(cnt, 8, cnt0, cnt1); + ILVRL_B2_UB(frame1_0_b, frame2_0_b, frame_l, frame_h); + HSUB_UB2_SH(frame_l, frame_h, diff0, diff1); + 
UNPCK_SH_SW(diff0, diff0_r, diff0_l); + UNPCK_SH_SW(diff1, diff1_r, diff1_l); + MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, + diff1_l, mod0_w, mod1_w, mod2_w, mod3_w); + MUL4(mod0_w, const3, mod1_w, const3, mod2_w, const3, mod3_w, const3, + mod0_w, mod1_w, mod2_w, mod3_w); + SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength); + diff0_r = (mod0_w < const16); + diff0_l = (mod1_w < const16); + diff1_r = (mod2_w < const16); + diff1_l = (mod3_w < const16); + SUB4(const16, mod0_w, const16, mod1_w, const16, mod2_w, const16, mod3_w, + mod0_w, mod1_w, mod2_w, mod3_w); + mod0_w = diff0_r & mod0_w; + mod1_w = diff0_l & mod1_w; + mod2_w = diff1_r & mod2_w; + mod3_w = diff1_l & mod3_w; + MUL4(mod0_w, filter_wt, mod1_w, filter_wt, mod2_w, filter_wt, mod3_w, + filter_wt, mod0_w, mod1_w, mod2_w, mod3_w); + PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h) + ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h); + ST_SH2(mod0_h, mod1_h, cnt, 8); + cnt += 16; + ILVRL_B2_SH(zero, frame2_0_b, frame2_0_h, frame2_1_h); + UNPCK_SH_SW(frame2_0_h, frame2_0, frame2_1); + UNPCK_SH_SW(frame2_1_h, frame2_2, frame2_3); + MUL4(mod0_w, frame2_0, mod1_w, frame2_1, mod2_w, frame2_2, mod3_w, + frame2_3, mod0_w, mod1_w, mod2_w, mod3_w); + ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3, + mod0_w, mod1_w, mod2_w, mod3_w); + ST_SW2(mod0_w, mod1_w, acc, 4); + ST_SW2(mod2_w, mod3_w, acc + 8, 4); + acc += 16; + LD_SW2(acc, 4, acc0, acc1); + LD_SW2(acc + 8, 4, acc2, acc3); + LD_SH2(cnt, 8, cnt0, cnt1); + ILVRL_B2_UB(frame1_1_b, frame2_1_b, frame_l, frame_h); + HSUB_UB2_SH(frame_l, frame_h, diff0, diff1); + UNPCK_SH_SW(diff0, diff0_r, diff0_l); + UNPCK_SH_SW(diff1, diff1_r, diff1_l); + MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, + diff1_l, mod0_w, mod1_w, mod2_w, mod3_w); + MUL4(mod0_w, const3, mod1_w, const3, mod2_w, const3, mod3_w, const3, + mod0_w, mod1_w, mod2_w, mod3_w); + SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength); + diff0_r 
= (mod0_w < const16); + diff0_l = (mod1_w < const16); + diff1_r = (mod2_w < const16); + diff1_l = (mod3_w < const16); + SUB4(const16, mod0_w, const16, mod1_w, const16, mod2_w, const16, mod3_w, + mod0_w, mod1_w, mod2_w, mod3_w); + mod0_w = diff0_r & mod0_w; + mod1_w = diff0_l & mod1_w; + mod2_w = diff1_r & mod2_w; + mod3_w = diff1_l & mod3_w; + MUL4(mod0_w, filter_wt, mod1_w, filter_wt, mod2_w, filter_wt, mod3_w, + filter_wt, mod0_w, mod1_w, mod2_w, mod3_w); + PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h); + ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h); + ST_SH2(mod0_h, mod1_h, cnt, 8); + cnt += 16; + + UNPCK_UB_SH(frame2_1_b, frame2_0_h, frame2_1_h); + UNPCK_SH_SW(frame2_0_h, frame2_0, frame2_1); + UNPCK_SH_SW(frame2_1_h, frame2_2, frame2_3); + MUL4(mod0_w, frame2_0, mod1_w, frame2_1, mod2_w, frame2_2, mod3_w, + frame2_3, mod0_w, mod1_w, mod2_w, mod3_w); + ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3, + mod0_w, mod1_w, mod2_w, mod3_w); + ST_SW2(mod0_w, mod1_w, acc, 4); + ST_SW2(mod2_w, mod3_w, acc + 8, 4); + acc += 16; + frame1_ptr += stride; + frame2_ptr += 16; + } +} + +static void temporal_filter_apply_8size_msa(uint8_t *frame1_ptr, + uint32_t stride, + uint8_t *frame2_ptr, + int32_t strength_in, + int32_t filter_wt_in, + uint32_t *acc, uint16_t *cnt) +{ + uint32_t row; + uint64_t f0, f1, f2, f3, f4, f5, f6, f7; + v16i8 frame1 = { 0 }; + v16i8 frame2 = { 0 }; + v16i8 frame3 = { 0 }; + v16i8 frame4 = { 0 }; + v16u8 frame_l, frame_h; + v8i16 frame2_0_h, frame2_1_h, mod0_h, mod1_h; + v8i16 diff0, diff1, cnt0, cnt1; + v4i32 const3, const16; + v4i32 filter_wt, strength; + v4i32 mod0_w, mod1_w, mod2_w, mod3_w; + v4i32 diff0_r, diff0_l, diff1_r, diff1_l; + v4i32 frame2_0, frame2_1, frame2_2, frame2_3; + v4i32 acc0, acc1, acc2, acc3; + + filter_wt = __msa_fill_w(filter_wt_in); + strength = __msa_fill_w(strength_in); + const3 = __msa_ldi_w(3); + const16 = __msa_ldi_w(16); + + for (row = 2; row--;) + { + LD2(frame1_ptr, stride, f0, f1); + 
frame1_ptr += (2 * stride); + LD2(frame2_ptr, 8, f2, f3); + frame2_ptr += 16; + LD2(frame1_ptr, stride, f4, f5); + frame1_ptr += (2 * stride); + LD2(frame2_ptr, 8, f6, f7); + frame2_ptr += 16; + + LD_SW2(acc, 4, acc0, acc1); + LD_SW2(acc + 8, 4, acc2, acc3); + LD_SH2(cnt, 8, cnt0, cnt1); + INSERT_D2_SB(f0, f1, frame1); + INSERT_D2_SB(f2, f3, frame2); + INSERT_D2_SB(f4, f5, frame3); + INSERT_D2_SB(f6, f7, frame4); + ILVRL_B2_UB(frame1, frame2, frame_l, frame_h); + HSUB_UB2_SH(frame_l, frame_h, diff0, diff1); + UNPCK_SH_SW(diff0, diff0_r, diff0_l); + UNPCK_SH_SW(diff1, diff1_r, diff1_l); + MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, + diff1_l, mod0_w, mod1_w, mod2_w, mod3_w); + MUL4(mod0_w, const3, mod1_w, const3, mod2_w, const3, mod3_w, const3, + mod0_w, mod1_w, mod2_w, mod3_w); + SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength); + diff0_r = (mod0_w < const16); + diff0_l = (mod1_w < const16); + diff1_r = (mod2_w < const16); + diff1_l = (mod3_w < const16); + SUB4(const16, mod0_w, const16, mod1_w, const16, mod2_w, const16, mod3_w, + mod0_w, mod1_w, mod2_w, mod3_w); + mod0_w = diff0_r & mod0_w; + mod1_w = diff0_l & mod1_w; + mod2_w = diff1_r & mod2_w; + mod3_w = diff1_l & mod3_w; + MUL4(mod0_w, filter_wt, mod1_w, filter_wt, mod2_w, filter_wt, mod3_w, + filter_wt, mod0_w, mod1_w, mod2_w, mod3_w); + PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h); + ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h); + ST_SH2(mod0_h, mod1_h, cnt, 8); + cnt += 16; + + UNPCK_UB_SH(frame2, frame2_0_h, frame2_1_h); + UNPCK_SH_SW(frame2_0_h, frame2_0, frame2_1); + UNPCK_SH_SW(frame2_1_h, frame2_2, frame2_3); + MUL4(mod0_w, frame2_0, mod1_w, frame2_1, mod2_w, frame2_2, mod3_w, + frame2_3, mod0_w, mod1_w, mod2_w, mod3_w); + ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3, + mod0_w, mod1_w, mod2_w, mod3_w); + ST_SW2(mod0_w, mod1_w, acc, 4); + ST_SW2(mod2_w, mod3_w, acc + 8, 4); + acc += 16; + + LD_SW2(acc, 4, acc0, acc1); + LD_SW2(acc + 8, 4, 
acc2, acc3); + LD_SH2(cnt, 8, cnt0, cnt1); + ILVRL_B2_UB(frame3, frame4, frame_l, frame_h); + HSUB_UB2_SH(frame_l, frame_h, diff0, diff1); + UNPCK_SH_SW(diff0, diff0_r, diff0_l); + UNPCK_SH_SW(diff1, diff1_r, diff1_l); + MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, + diff1_l, mod0_w, mod1_w, mod2_w, mod3_w); + MUL4(mod0_w, const3, mod1_w, const3, mod2_w, const3, mod3_w, const3, + mod0_w, mod1_w, mod2_w, mod3_w); + SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength); + diff0_r = (mod0_w < const16); + diff0_l = (mod1_w < const16); + diff1_r = (mod2_w < const16); + diff1_l = (mod3_w < const16); + SUB4(const16, mod0_w, const16, mod1_w, const16, mod2_w, const16, mod3_w, + mod0_w, mod1_w, mod2_w, mod3_w); + mod0_w = diff0_r & mod0_w; + mod1_w = diff0_l & mod1_w; + mod2_w = diff1_r & mod2_w; + mod3_w = diff1_l & mod3_w; + MUL4(mod0_w, filter_wt, mod1_w, filter_wt, mod2_w, filter_wt, mod3_w, + filter_wt, mod0_w, mod1_w, mod2_w, mod3_w); + PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h); + ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h); + ST_SH2(mod0_h, mod1_h, cnt, 8); + cnt += 16; + + UNPCK_UB_SH(frame4, frame2_0_h, frame2_1_h); + UNPCK_SH_SW(frame2_0_h, frame2_0, frame2_1); + UNPCK_SH_SW(frame2_1_h, frame2_2, frame2_3); + MUL4(mod0_w, frame2_0, mod1_w, frame2_1, mod2_w, frame2_2, mod3_w, + frame2_3, mod0_w, mod1_w, mod2_w, mod3_w); + ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3, + mod0_w, mod1_w, mod2_w, mod3_w); + ST_SW2(mod0_w, mod1_w, acc, 4); + ST_SW2(mod2_w, mod3_w, acc + 8, 4); + acc += 16; + } +} + +void vp8_temporal_filter_apply_msa(uint8_t *frame1, uint32_t stride, + uint8_t *frame2, uint32_t block_size, + int32_t strength, int32_t filter_weight, + uint32_t *accumulator, uint16_t *count) +{ + if (8 == block_size) + { + temporal_filter_apply_8size_msa(frame1, stride, frame2, strength, + filter_weight, accumulator, count); + } + else if (16 == block_size) + { + temporal_filter_apply_16size_msa(frame1, stride, 
frame2, strength, + filter_weight, accumulator, count); + } + else + { + uint32_t i, j, k; + int32_t modifier; + int32_t byte = 0; + const int32_t rounding = strength > 0 ? 1 << (strength - 1) : 0; + + for (i = 0, k = 0; i < block_size; ++i) + { + for (j = 0; j < block_size; ++j, ++k) + { + int src_byte = frame1[byte]; + int pixel_value = *frame2++; + + modifier = src_byte - pixel_value; + modifier *= modifier; + modifier *= 3; + modifier += rounding; + modifier >>= strength; + + if (modifier > 16) + modifier = 16; + + modifier = 16 - modifier; + modifier *= filter_weight; + + count[k] += modifier; + accumulator[k] += modifier * pixel_value; + + byte++; + } + + byte += stride - block_size; + } + } +} diff --git a/vp8/vp8cx.mk b/vp8/vp8cx.mk index 25d3d9ff4..1bafbfea0 100644 --- a/vp8/vp8cx.mk +++ b/vp8/vp8cx.mk @@ -106,5 +106,10 @@ endif VP8_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/dct_msa.c VP8_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/encodeopt_msa.c VP8_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/quantize_msa.c +VP8_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/temporal_filter_msa.c + +ifeq ($(CONFIG_REALTIME_ONLY),yes) +VP8_CX_SRCS_REMOVE-$(HAVE_MSA) += encoder/mips/msa/temporal_filter_msa.c +endif VP8_CX_SRCS-yes := $(filter-out $(VP8_CX_SRCS_REMOVE-yes),$(VP8_CX_SRCS-yes))