From 8fbc641540d78000be05a5b0357d9b04e1cb2132 Mon Sep 17 00:00:00 2001
From: Parag Salasakar <img.mips1@gmail.com>
Date: Fri, 31 Jul 2015 12:03:19 +0530
Subject: [PATCH] mips msa vp8 temporal filter optimization

average improvement ~2x-3x

Change-Id: I05593bed583234dc7809aaec6cab82773a29505d
---
 vp8/common/mips/msa/vp8_macros_msa.h       |  92 +++++++
 vp8/common/rtcd_defs.pl                    |   2 +-
 vp8/encoder/mips/msa/temporal_filter_msa.c | 303 +++++++++++++++++++++
 vp8/vp8cx.mk                               |   5 +
 4 files changed, 401 insertions(+), 1 deletion(-)
 create mode 100644 vp8/encoder/mips/msa/temporal_filter_msa.c

diff --git a/vp8/common/mips/msa/vp8_macros_msa.h b/vp8/common/mips/msa/vp8_macros_msa.h
index 0486348be..27d592995 100644
--- a/vp8/common/mips/msa/vp8_macros_msa.h
+++ b/vp8/common/mips/msa/vp8_macros_msa.h
@@ -24,6 +24,10 @@
 #define LD_UH(...) LD_H(v8u16, __VA_ARGS__)
 #define LD_SH(...) LD_H(v8i16, __VA_ARGS__)
 
+#define LD_W(RTYPE, psrc) *((const RTYPE *)(psrc))
+#define LD_UW(...) LD_W(v4u32, __VA_ARGS__)
+#define LD_SW(...) LD_W(v4i32, __VA_ARGS__)
+
 #define ST_B(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in)
 #define ST_UB(...) ST_B(v16u8, __VA_ARGS__)
 #define ST_SB(...) ST_B(v16i8, __VA_ARGS__)
@@ -32,6 +36,9 @@
 #define ST_UH(...) ST_H(v8u16, __VA_ARGS__)
 #define ST_SH(...) ST_H(v8i16, __VA_ARGS__)
 
+#define ST_W(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in)
+#define ST_SW(...) ST_W(v4i32, __VA_ARGS__)
+
 #if (__mips_isa_rev >= 6)
 #define LW(psrc)                                      \
 ({                                                    \
@@ -337,6 +344,17 @@
 }
 #define LD_SH4(...) LD_H4(v8i16, __VA_ARGS__)
 
+/* Description : Load 2 vectors of signed word elements with stride
+   Arguments   : Inputs  - psrc, stride
+                 Outputs - out0, out1
+                 Return Type - signed word
+*/
+#define LD_SW2(psrc, stride, out0, out1)  \
+{                                         \
+    out0 = LD_SW((psrc));                 \
+    out1 = LD_SW((psrc) + stride);        \
+}
+
 /* Description : Store vectors of 16 byte elements with stride
    Arguments   : Inputs - in0, in1, pdst, stride
    Details     : Store 16 byte elements from 'in0' to (pdst)
@@ -377,6 +395,17 @@
 }
 #define ST_SH2(...) ST_H2(v8i16, __VA_ARGS__)
 
+/* Description : Store vectors of word elements with stride
+   Arguments   : Inputs - in0, in1, pdst, stride
+   Details     : Store 4 word elements from 'in0' to (pdst)
+                 Store 4 word elements from 'in1' to (pdst + stride)
+*/
+#define ST_SW2(in0, in1, pdst, stride)  \
+{                                       \
+    ST_SW(in0, (pdst));                 \
+    ST_SW(in1, (pdst) + stride);        \
+}
+
 /* Description : Store 2x4 byte block to destination memory from input vector
    Arguments   : Inputs - in, stidx, pdst, stride
    Details     : Index 'stidx' halfword element from 'in' vector is copied to
@@ -1099,6 +1128,38 @@
 #define ILVRL_W2_SH(...) ILVRL_W2(v8i16, __VA_ARGS__)
 #define ILVRL_W2_SW(...) ILVRL_W2(v4i32, __VA_ARGS__)
 
+/* Description : Maximum values between signed elements of vector and
+                 5-bit signed immediate value are copied to the output vector
+   Arguments   : Inputs  - in0, in1, in2, in3, max_val
+                 Outputs - in place operation
+                 Return Type - unsigned halfword
+   Details     : Maximum of signed halfword element values from 'in0' and
+                 'max_val' are written in place
+*/
+#define MAXI_SH2(RTYPE, in0, in1, max_val)               \
+{                                                        \
+    in0 = (RTYPE)__msa_maxi_s_h((v8i16)in0, (max_val));  \
+    in1 = (RTYPE)__msa_maxi_s_h((v8i16)in1, (max_val));  \
+}
+#define MAXI_SH2_SH(...) MAXI_SH2(v8i16, __VA_ARGS__)
+
+/* Description : Saturate the halfword element values to the max
+                 unsigned value of (sat_val + 1) bits
+                 The element data width remains unchanged
+   Arguments   : Inputs  - in0, in1, sat_val
+                 Outputs - in place operation
+                 Return Type - as per RTYPE
+   Details     : Each unsigned halfword element from 'in0' is saturated to the
+                 value generated with (sat_val + 1) bit range.
+                 The results are written in place
+*/
+#define SAT_UH2(RTYPE, in0, in1, sat_val)             \
+{                                                     \
+    in0 = (RTYPE)__msa_sat_u_h((v8u16)in0, sat_val);  \
+    in1 = (RTYPE)__msa_sat_u_h((v8u16)in1, sat_val);  \
+}
+#define SAT_UH2_SH(...) SAT_UH2(v8i16, __VA_ARGS__)
+
 /* Description : Saturate the halfword element values to the max
                  unsigned value of (sat_val + 1) bits
                  The element data width remains unchanged
@@ -1323,6 +1384,29 @@
     in3 = in3 >> shift;                    \
 }
 
+/* Description : Shift right arithmetic rounded words
+   Arguments   : Inputs  - in0, in1, shift
+                 Outputs - in place operation
+                 Return Type - as per RTYPE
+   Details     : Each element of vector 'in0' is shifted right arithmetically by
+                 the number of bits in the corresponding element in the vector
+                 'shift'. The last discarded bit is added to shifted value for
+                 rounding and the result is written in-place.
+                 'shift' is a vector.
+*/
+#define SRAR_W2(RTYPE, in0, in1, shift)                   \
+{                                                         \
+    in0 = (RTYPE)__msa_srar_w((v4i32)in0, (v4i32)shift);  \
+    in1 = (RTYPE)__msa_srar_w((v4i32)in1, (v4i32)shift);  \
+}
+
+#define SRAR_W4(RTYPE, in0, in1, in2, in3, shift)  \
+{                                                  \
+    SRAR_W2(RTYPE, in0, in1, shift);               \
+    SRAR_W2(RTYPE, in2, in3, shift);               \
+}
+#define SRAR_W4_SW(...) SRAR_W4(v4i32, __VA_ARGS__)
+
 /* Description : Shift right arithmetic rounded (immediate)
    Arguments   : Inputs  - in0, in1, shift
                  Outputs - in place operation
@@ -1408,6 +1492,14 @@
     out0 = in0 - in1;                         \
     out1 = in2 - in3;                         \
 }
+#define SUB4(in0, in1, in2, in3, in4, in5, in6, in7,  \
+             out0, out1, out2, out3)                  \
+{                                                     \
+    out0 = in0 - in1;                                 \
+    out1 = in2 - in3;                                 \
+    out2 = in4 - in5;                                 \
+    out3 = in6 - in7;                                 \
+}
 
 /* Description : Sign extend halfword elements from right half of the vector
    Arguments   : Input  - in    (halfword vector)
diff --git a/vp8/common/rtcd_defs.pl b/vp8/common/rtcd_defs.pl
index 8261dd2ae..6fe070b42 100644
--- a/vp8/common/rtcd_defs.pl
+++ b/vp8/common/rtcd_defs.pl
@@ -326,7 +326,7 @@ $vp8_diamond_search_sad_sse3=vp8_diamond_search_sadx4;
 #
 if (vpx_config("CONFIG_REALTIME_ONLY") ne "yes") {
     add_proto qw/void vp8_temporal_filter_apply/, "unsigned char *frame1, unsigned int stride, unsigned char *frame2, unsigned int block_size, int strength, int filter_weight, unsigned int *accumulator, unsigned short *count";
-    specialize qw/vp8_temporal_filter_apply sse2/;
+    specialize qw/vp8_temporal_filter_apply sse2 msa/;
 }
 
 #
diff --git a/vp8/encoder/mips/msa/temporal_filter_msa.c b/vp8/encoder/mips/msa/temporal_filter_msa.c
new file mode 100644
index 000000000..5cca5e087
--- /dev/null
+++ b/vp8/encoder/mips/msa/temporal_filter_msa.c
@@ -0,0 +1,303 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vp8_rtcd.h"
+#include "vp8/common/mips/msa/vp8_macros_msa.h"
+
+static void temporal_filter_apply_16size_msa(uint8_t *frame1_ptr,
+                                             uint32_t stride,
+                                             uint8_t *frame2_ptr,
+                                             int32_t strength_in,
+                                             int32_t filter_wt_in,
+                                             uint32_t *acc, uint16_t *cnt)
+{
+    uint32_t row;
+    v16i8 frame1_0_b, frame1_1_b, frame2_0_b, frame2_1_b;
+    v16u8 frame_l, frame_h;
+    v16i8 zero = { 0 };
+    v8i16 frame2_0_h, frame2_1_h, mod0_h, mod1_h;
+    v8i16 diff0, diff1, cnt0, cnt1;
+    v4i32 const3, const16, filter_wt, strength;
+    v4i32 mod0_w, mod1_w, mod2_w, mod3_w;
+    v4i32 diff0_r, diff0_l, diff1_r, diff1_l;
+    v4i32 frame2_0, frame2_1, frame2_2, frame2_3;
+    v4i32 acc0, acc1, acc2, acc3;
+
+    filter_wt = __msa_fill_w(filter_wt_in);
+    strength = __msa_fill_w(strength_in);
+    const3 = __msa_ldi_w(3);
+    const16 = __msa_ldi_w(16);
+
+    for (row = 8; row--;)
+    {
+        frame1_0_b = LD_SB(frame1_ptr);
+        frame2_0_b = LD_SB(frame2_ptr);
+        frame1_ptr += stride;
+        frame2_ptr += 16;
+        frame1_1_b = LD_SB(frame1_ptr);
+        frame2_1_b = LD_SB(frame2_ptr);
+        LD_SW2(acc, 4, acc0, acc1);
+        LD_SW2(acc + 8, 4, acc2, acc3);
+        LD_SH2(cnt, 8, cnt0, cnt1);
+        ILVRL_B2_UB(frame1_0_b, frame2_0_b, frame_l, frame_h);
+        HSUB_UB2_SH(frame_l, frame_h, diff0, diff1);
+        UNPCK_SH_SW(diff0, diff0_r, diff0_l);
+        UNPCK_SH_SW(diff1, diff1_r, diff1_l);
+        MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l,
+             diff1_l, mod0_w, mod1_w, mod2_w, mod3_w);
+        MUL4(mod0_w, const3, mod1_w, const3, mod2_w, const3, mod3_w, const3,
+             mod0_w, mod1_w, mod2_w, mod3_w);
+        SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength);
+        diff0_r = (mod0_w < const16);
+        diff0_l = (mod1_w < const16);
+        diff1_r = (mod2_w < const16);
+        diff1_l = (mod3_w < const16);
+        SUB4(const16, mod0_w, const16, mod1_w, const16, mod2_w, const16, mod3_w,
+             mod0_w, mod1_w, mod2_w, mod3_w);
+        mod0_w = diff0_r & mod0_w;
+        mod1_w = diff0_l & mod1_w;
+        mod2_w = diff1_r & mod2_w;
+        mod3_w = diff1_l & mod3_w;
+        MUL4(mod0_w, filter_wt, mod1_w, filter_wt, mod2_w, filter_wt, mod3_w,
+             filter_wt, mod0_w, mod1_w, mod2_w, mod3_w);
+        PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h)
+        ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h);
+        ST_SH2(mod0_h, mod1_h, cnt, 8);
+        cnt += 16;
+        ILVRL_B2_SH(zero, frame2_0_b, frame2_0_h, frame2_1_h);
+        UNPCK_SH_SW(frame2_0_h, frame2_0, frame2_1);
+        UNPCK_SH_SW(frame2_1_h, frame2_2, frame2_3);
+        MUL4(mod0_w, frame2_0, mod1_w, frame2_1, mod2_w, frame2_2, mod3_w,
+             frame2_3, mod0_w, mod1_w, mod2_w, mod3_w);
+        ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3,
+             mod0_w, mod1_w, mod2_w, mod3_w);
+        ST_SW2(mod0_w, mod1_w, acc, 4);
+        ST_SW2(mod2_w, mod3_w, acc + 8, 4);
+        acc += 16;
+        LD_SW2(acc, 4, acc0, acc1);
+        LD_SW2(acc + 8, 4, acc2, acc3);
+        LD_SH2(cnt, 8, cnt0, cnt1);
+        ILVRL_B2_UB(frame1_1_b, frame2_1_b, frame_l, frame_h);
+        HSUB_UB2_SH(frame_l, frame_h, diff0, diff1);
+        UNPCK_SH_SW(diff0, diff0_r, diff0_l);
+        UNPCK_SH_SW(diff1, diff1_r, diff1_l);
+        MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l,
+             diff1_l, mod0_w, mod1_w, mod2_w, mod3_w);
+        MUL4(mod0_w, const3, mod1_w, const3, mod2_w, const3, mod3_w, const3,
+             mod0_w, mod1_w, mod2_w, mod3_w);
+        SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength);
+        diff0_r = (mod0_w < const16);
+        diff0_l = (mod1_w < const16);
+        diff1_r = (mod2_w < const16);
+        diff1_l = (mod3_w < const16);
+        SUB4(const16, mod0_w, const16, mod1_w, const16, mod2_w, const16, mod3_w,
+             mod0_w, mod1_w, mod2_w, mod3_w);
+        mod0_w = diff0_r & mod0_w;
+        mod1_w = diff0_l & mod1_w;
+        mod2_w = diff1_r & mod2_w;
+        mod3_w = diff1_l & mod3_w;
+        MUL4(mod0_w, filter_wt, mod1_w, filter_wt, mod2_w, filter_wt, mod3_w,
+             filter_wt, mod0_w, mod1_w, mod2_w, mod3_w);
+        PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h);
+        ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h);
+        ST_SH2(mod0_h, mod1_h, cnt, 8);
+        cnt += 16;
+
+        UNPCK_UB_SH(frame2_1_b, frame2_0_h, frame2_1_h);
+        UNPCK_SH_SW(frame2_0_h, frame2_0, frame2_1);
+        UNPCK_SH_SW(frame2_1_h, frame2_2, frame2_3);
+        MUL4(mod0_w, frame2_0, mod1_w, frame2_1, mod2_w, frame2_2, mod3_w,
+             frame2_3, mod0_w, mod1_w, mod2_w, mod3_w);
+        ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3,
+             mod0_w, mod1_w, mod2_w, mod3_w);
+        ST_SW2(mod0_w, mod1_w, acc, 4);
+        ST_SW2(mod2_w, mod3_w, acc + 8, 4);
+        acc += 16;
+        frame1_ptr += stride;
+        frame2_ptr += 16;
+    }
+}
+
+static void temporal_filter_apply_8size_msa(uint8_t *frame1_ptr,
+                                            uint32_t stride,
+                                            uint8_t *frame2_ptr,
+                                            int32_t strength_in,
+                                            int32_t filter_wt_in,
+                                            uint32_t *acc, uint16_t *cnt)
+{
+    uint32_t row;
+    uint64_t f0, f1, f2, f3, f4, f5, f6, f7;
+    v16i8 frame1 = { 0 };
+    v16i8 frame2 = { 0 };
+    v16i8 frame3 = { 0 };
+    v16i8 frame4 = { 0 };
+    v16u8 frame_l, frame_h;
+    v8i16 frame2_0_h, frame2_1_h, mod0_h, mod1_h;
+    v8i16 diff0, diff1, cnt0, cnt1;
+    v4i32 const3, const16;
+    v4i32 filter_wt, strength;
+    v4i32 mod0_w, mod1_w, mod2_w, mod3_w;
+    v4i32 diff0_r, diff0_l, diff1_r, diff1_l;
+    v4i32 frame2_0, frame2_1, frame2_2, frame2_3;
+    v4i32 acc0, acc1, acc2, acc3;
+
+    filter_wt = __msa_fill_w(filter_wt_in);
+    strength = __msa_fill_w(strength_in);
+    const3 = __msa_ldi_w(3);
+    const16 = __msa_ldi_w(16);
+
+    for (row = 2; row--;)
+    {
+        LD2(frame1_ptr, stride, f0, f1);
+        frame1_ptr += (2 * stride);
+        LD2(frame2_ptr, 8, f2, f3);
+        frame2_ptr += 16;
+        LD2(frame1_ptr, stride, f4, f5);
+        frame1_ptr += (2 * stride);
+        LD2(frame2_ptr, 8, f6, f7);
+        frame2_ptr += 16;
+
+        LD_SW2(acc, 4, acc0, acc1);
+        LD_SW2(acc + 8, 4, acc2, acc3);
+        LD_SH2(cnt, 8, cnt0, cnt1);
+        INSERT_D2_SB(f0, f1, frame1);
+        INSERT_D2_SB(f2, f3, frame2);
+        INSERT_D2_SB(f4, f5, frame3);
+        INSERT_D2_SB(f6, f7, frame4);
+        ILVRL_B2_UB(frame1, frame2, frame_l, frame_h);
+        HSUB_UB2_SH(frame_l, frame_h, diff0, diff1);
+        UNPCK_SH_SW(diff0, diff0_r, diff0_l);
+        UNPCK_SH_SW(diff1, diff1_r, diff1_l);
+        MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l,
+             diff1_l, mod0_w, mod1_w, mod2_w, mod3_w);
+        MUL4(mod0_w, const3, mod1_w, const3, mod2_w, const3, mod3_w, const3,
+             mod0_w, mod1_w, mod2_w, mod3_w);
+        SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength);
+        diff0_r = (mod0_w < const16);
+        diff0_l = (mod1_w < const16);
+        diff1_r = (mod2_w < const16);
+        diff1_l = (mod3_w < const16);
+        SUB4(const16, mod0_w, const16, mod1_w, const16, mod2_w, const16, mod3_w,
+             mod0_w, mod1_w, mod2_w, mod3_w);
+        mod0_w = diff0_r & mod0_w;
+        mod1_w = diff0_l & mod1_w;
+        mod2_w = diff1_r & mod2_w;
+        mod3_w = diff1_l & mod3_w;
+        MUL4(mod0_w, filter_wt, mod1_w, filter_wt, mod2_w, filter_wt, mod3_w,
+             filter_wt, mod0_w, mod1_w, mod2_w, mod3_w);
+        PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h);
+        ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h);
+        ST_SH2(mod0_h, mod1_h, cnt, 8);
+        cnt += 16;
+
+        UNPCK_UB_SH(frame2, frame2_0_h, frame2_1_h);
+        UNPCK_SH_SW(frame2_0_h, frame2_0, frame2_1);
+        UNPCK_SH_SW(frame2_1_h, frame2_2, frame2_3);
+        MUL4(mod0_w, frame2_0, mod1_w, frame2_1, mod2_w, frame2_2, mod3_w,
+             frame2_3, mod0_w, mod1_w, mod2_w, mod3_w);
+        ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3,
+             mod0_w, mod1_w, mod2_w, mod3_w);
+        ST_SW2(mod0_w, mod1_w, acc, 4);
+        ST_SW2(mod2_w, mod3_w, acc + 8, 4);
+        acc += 16;
+
+        LD_SW2(acc, 4, acc0, acc1);
+        LD_SW2(acc + 8, 4, acc2, acc3);
+        LD_SH2(cnt, 8, cnt0, cnt1);
+        ILVRL_B2_UB(frame3, frame4, frame_l, frame_h);
+        HSUB_UB2_SH(frame_l, frame_h, diff0, diff1);
+        UNPCK_SH_SW(diff0, diff0_r, diff0_l);
+        UNPCK_SH_SW(diff1, diff1_r, diff1_l);
+        MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l,
+             diff1_l, mod0_w, mod1_w, mod2_w, mod3_w);
+        MUL4(mod0_w, const3, mod1_w, const3, mod2_w, const3, mod3_w, const3,
+             mod0_w, mod1_w, mod2_w, mod3_w);
+        SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength);
+        diff0_r = (mod0_w < const16);
+        diff0_l = (mod1_w < const16);
+        diff1_r = (mod2_w < const16);
+        diff1_l = (mod3_w < const16);
+        SUB4(const16, mod0_w, const16, mod1_w, const16, mod2_w, const16, mod3_w,
+             mod0_w, mod1_w, mod2_w, mod3_w);
+        mod0_w = diff0_r & mod0_w;
+        mod1_w = diff0_l & mod1_w;
+        mod2_w = diff1_r & mod2_w;
+        mod3_w = diff1_l & mod3_w;
+        MUL4(mod0_w, filter_wt, mod1_w, filter_wt, mod2_w, filter_wt, mod3_w,
+             filter_wt, mod0_w, mod1_w, mod2_w, mod3_w);
+        PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h);
+        ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h);
+        ST_SH2(mod0_h, mod1_h, cnt, 8);
+        cnt += 16;
+
+        UNPCK_UB_SH(frame4, frame2_0_h, frame2_1_h);
+        UNPCK_SH_SW(frame2_0_h, frame2_0, frame2_1);
+        UNPCK_SH_SW(frame2_1_h, frame2_2, frame2_3);
+        MUL4(mod0_w, frame2_0, mod1_w, frame2_1, mod2_w, frame2_2, mod3_w,
+             frame2_3, mod0_w, mod1_w, mod2_w, mod3_w);
+        ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3,
+             mod0_w, mod1_w, mod2_w, mod3_w);
+        ST_SW2(mod0_w, mod1_w, acc, 4);
+        ST_SW2(mod2_w, mod3_w, acc + 8, 4);
+        acc += 16;
+    }
+}
+
+void vp8_temporal_filter_apply_msa(uint8_t *frame1, uint32_t stride,
+                                   uint8_t *frame2, uint32_t block_size,
+                                   int32_t strength,  int32_t filter_weight,
+                                   uint32_t *accumulator, uint16_t *count)
+{
+    if (8 == block_size)
+    {
+        temporal_filter_apply_8size_msa(frame1, stride, frame2, strength,
+                                        filter_weight, accumulator, count);
+    }
+    else if (16 == block_size)
+    {
+        temporal_filter_apply_16size_msa(frame1, stride, frame2, strength,
+                                         filter_weight, accumulator, count);
+    }
+    else
+    {
+        uint32_t i, j, k;
+        int32_t modifier;
+        int32_t byte = 0;
+        const int32_t rounding = strength > 0 ? 1 << (strength - 1) : 0;
+
+        for (i = 0, k = 0; i < block_size; ++i)
+        {
+            for (j = 0; j < block_size; ++j, ++k)
+            {
+                int src_byte = frame1[byte];
+                int pixel_value = *frame2++;
+
+                modifier = src_byte - pixel_value;
+                modifier *= modifier;
+                modifier *= 3;
+                modifier += rounding;
+                modifier >>= strength;
+
+                if (modifier > 16)
+                    modifier = 16;
+
+                modifier = 16 - modifier;
+                modifier *= filter_weight;
+
+                count[k] += modifier;
+                accumulator[k] += modifier * pixel_value;
+
+                byte++;
+            }
+
+            byte += stride - block_size;
+        }
+    }
+}
diff --git a/vp8/vp8cx.mk b/vp8/vp8cx.mk
index 25d3d9ff4..1bafbfea0 100644
--- a/vp8/vp8cx.mk
+++ b/vp8/vp8cx.mk
@@ -106,5 +106,10 @@ endif
 VP8_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/dct_msa.c
 VP8_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/encodeopt_msa.c
 VP8_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/quantize_msa.c
+VP8_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/temporal_filter_msa.c
+
+ifeq ($(CONFIG_REALTIME_ONLY),yes)
+VP8_CX_SRCS_REMOVE-$(HAVE_MSA) += encoder/mips/msa/temporal_filter_msa.c
+endif
 
 VP8_CX_SRCS-yes := $(filter-out $(VP8_CX_SRCS_REMOVE-yes),$(VP8_CX_SRCS-yes))