diff --git a/src/dsp/dec_mips32.c b/src/dsp/dec_mips32.c
index 24b2b45f..385d2753 100644
--- a/src/dsp/dec_mips32.c
+++ b/src/dsp/dec_mips32.c
@@ -16,6 +16,8 @@
 
 #if defined(WEBP_USE_MIPS32)
 
+#include "./mips_macro.h"
+
 static const int kC1 = 20091 + (1 << 16);
 static const int kC2 = 35468;
 
@@ -389,7 +391,7 @@ static void TransformOne(const int16_t* in, uint8_t* dst) {
     "sra      %[temp7],  %[temp7],  3                  \n\t"
     "sra      %[temp4],  %[temp4],  3                  \n\t"
     "addiu    %[temp6],  $zero,     255                \n\t"
-    "lbu      %[temp1],  0(%[dst])                     \n\t"
+    "lbu      %[temp1],  0+0*"XSTR(BPS)"(%[dst])       \n\t"
     "addu     %[temp1],  %[temp1],  %[temp5]           \n\t"
     "sra      %[temp5],  %[temp1],  8                  \n\t"
     "sra      %[temp18], %[temp1],  31                 \n\t"
@@ -397,8 +399,8 @@ static void TransformOne(const int16_t* in, uint8_t* dst) {
     "xor      %[temp1],  %[temp1],  %[temp1]           \n\t"
     "movz     %[temp1],  %[temp6],  %[temp18]          \n\t"
   "1:                                                  \n\t"
-    "lbu      %[temp18], 1(%[dst])                     \n\t"
-    "sb       %[temp1],  0(%[dst])                     \n\t"
+    "lbu      %[temp18], 1+0*"XSTR(BPS)"(%[dst])       \n\t"
+    "sb       %[temp1],  0+0*"XSTR(BPS)"(%[dst])       \n\t"
     "addu     %[temp18], %[temp18], %[temp11]          \n\t"
     "sra      %[temp11], %[temp18], 8                  \n\t"
     "sra      %[temp1],  %[temp18], 31                 \n\t"
@@ -406,8 +408,8 @@ static void TransformOne(const int16_t* in, uint8_t* dst) {
     "xor      %[temp18], %[temp18], %[temp18]          \n\t"
     "movz     %[temp18], %[temp6],  %[temp1]           \n\t"
   "2:                                                  \n\t"
-    "lbu      %[temp1],  2(%[dst])                     \n\t"
-    "sb       %[temp18], 1(%[dst])                     \n\t"
+    "lbu      %[temp1],  2+0*"XSTR(BPS)"(%[dst])       \n\t"
+    "sb       %[temp18], 1+0*"XSTR(BPS)"(%[dst])       \n\t"
     "addu     %[temp1],  %[temp1],  %[temp8]           \n\t"
     "sra      %[temp8],  %[temp1],  8                  \n\t"
     "sra      %[temp18], %[temp1],  31                 \n\t"
@@ -415,8 +417,8 @@ static void TransformOne(const int16_t* in, uint8_t* dst) {
     "xor      %[temp1],  %[temp1],  %[temp1]           \n\t"
     "movz     %[temp1],  %[temp6],  %[temp18]          \n\t"
   "3:                                                  \n\t"
-    "lbu      %[temp18], 3(%[dst])                     \n\t"
-    "sb       %[temp1],  2(%[dst])                     \n\t"
+    "lbu      %[temp18], 3+0*"XSTR(BPS)"(%[dst])       \n\t"
+    "sb       %[temp1],  2+0*"XSTR(BPS)"(%[dst])       \n\t"
     "addu     %[temp18], %[temp18], %[temp16]          \n\t"
     "sra      %[temp16], %[temp18], 8                  \n\t"
     "sra      %[temp1],  %[temp18], 31                 \n\t"
@@ -424,11 +426,11 @@ static void TransformOne(const int16_t* in, uint8_t* dst) {
     "xor      %[temp18], %[temp18], %[temp18]          \n\t"
     "movz     %[temp18], %[temp6],  %[temp1]           \n\t"
   "4:                                                  \n\t"
-    "sb       %[temp18], 3(%[dst])                     \n\t"
-    "lbu      %[temp5],  32(%[dst])                    \n\t"
-    "lbu      %[temp8],  33(%[dst])                    \n\t"
-    "lbu      %[temp11], 34(%[dst])                    \n\t"
-    "lbu      %[temp16], 35(%[dst])                    \n\t"
+    "sb       %[temp18], 3+0*"XSTR(BPS)"(%[dst])       \n\t"
+    "lbu      %[temp5],  0+1*"XSTR(BPS)"(%[dst])       \n\t"
+    "lbu      %[temp8],  1+1*"XSTR(BPS)"(%[dst])       \n\t"
+    "lbu      %[temp11], 2+1*"XSTR(BPS)"(%[dst])       \n\t"
+    "lbu      %[temp16], 3+1*"XSTR(BPS)"(%[dst])       \n\t"
     "addu     %[temp5],  %[temp5],  %[temp17]          \n\t"
     "addu     %[temp8],  %[temp8],  %[temp15]          \n\t"
     "addu     %[temp11], %[temp11], %[temp12]          \n\t"
@@ -457,14 +459,14 @@ static void TransformOne(const int16_t* in, uint8_t* dst) {
     "xor      %[temp16], %[temp16], %[temp16]          \n\t"
     "movz     %[temp16], %[temp6],  %[temp15]          \n\t"
   "8:                                                  \n\t"
-    "sb       %[temp5],  32(%[dst])                    \n\t"
-    "sb       %[temp8],  33(%[dst])                    \n\t"
-    "sb       %[temp11], 34(%[dst])                    \n\t"
-    "sb       %[temp16], 35(%[dst])                    \n\t"
-    "lbu      %[temp5],  64(%[dst])                    \n\t"
-    "lbu      %[temp8],  65(%[dst])                    \n\t"
-    "lbu      %[temp11], 66(%[dst])                    \n\t"
-    "lbu      %[temp16], 67(%[dst])                    \n\t"
+    "sb       %[temp5],  0+1*"XSTR(BPS)"(%[dst])       \n\t"
+    "sb       %[temp8],  1+1*"XSTR(BPS)"(%[dst])       \n\t"
+    "sb       %[temp11], 2+1*"XSTR(BPS)"(%[dst])       \n\t"
+    "sb       %[temp16], 3+1*"XSTR(BPS)"(%[dst])       \n\t"
+    "lbu      %[temp5],  0+2*"XSTR(BPS)"(%[dst])       \n\t"
+    "lbu      %[temp8],  1+2*"XSTR(BPS)"(%[dst])       \n\t"
+    "lbu      %[temp11], 2+2*"XSTR(BPS)"(%[dst])       \n\t"
+    "lbu      %[temp16], 3+2*"XSTR(BPS)"(%[dst])       \n\t"
     "addu     %[temp5],  %[temp5],  %[temp9]           \n\t"
     "addu     %[temp8],  %[temp8],  %[temp3]           \n\t"
     "addu     %[temp11], %[temp11], %[temp0]           \n\t"
@@ -493,14 +495,14 @@ static void TransformOne(const int16_t* in, uint8_t* dst) {
     "xor      %[temp16], %[temp16], %[temp16]          \n\t"
     "movz     %[temp16], %[temp6],  %[temp3]           \n\t"
   "12:                                                 \n\t"
-    "sb       %[temp5],  64(%[dst])                    \n\t"
-    "sb       %[temp8],  65(%[dst])                    \n\t"
-    "sb       %[temp11], 66(%[dst])                    \n\t"
-    "sb       %[temp16], 67(%[dst])                    \n\t"
-    "lbu      %[temp5],  96(%[dst])                    \n\t"
-    "lbu      %[temp8],  97(%[dst])                    \n\t"
-    "lbu      %[temp11], 98(%[dst])                    \n\t"
-    "lbu      %[temp16], 99(%[dst])                    \n\t"
+    "sb       %[temp5],  0+2*"XSTR(BPS)"(%[dst])       \n\t"
+    "sb       %[temp8],  1+2*"XSTR(BPS)"(%[dst])       \n\t"
+    "sb       %[temp11], 2+2*"XSTR(BPS)"(%[dst])       \n\t"
+    "sb       %[temp16], 3+2*"XSTR(BPS)"(%[dst])       \n\t"
+    "lbu      %[temp5],  0+3*"XSTR(BPS)"(%[dst])       \n\t"
+    "lbu      %[temp8],  1+3*"XSTR(BPS)"(%[dst])       \n\t"
+    "lbu      %[temp11], 2+3*"XSTR(BPS)"(%[dst])       \n\t"
+    "lbu      %[temp16], 3+3*"XSTR(BPS)"(%[dst])       \n\t"
     "addu     %[temp5],  %[temp5],  %[temp13]          \n\t"
     "addu     %[temp8],  %[temp8],  %[temp7]           \n\t"
     "addu     %[temp11], %[temp11], %[temp4]           \n\t"
@@ -529,10 +531,10 @@ static void TransformOne(const int16_t* in, uint8_t* dst) {
     "xor      %[temp16], %[temp16], %[temp16]          \n\t"
     "movz     %[temp16], %[temp6],  %[temp3]           \n\t"
   "16:                                                 \n\t"
-    "sb       %[temp5],  96(%[dst])                    \n\t"
-    "sb       %[temp8],  97(%[dst])                    \n\t"
-    "sb       %[temp11], 98(%[dst])                    \n\t"
-    "sb       %[temp16], 99(%[dst])                    \n\t"
+    "sb       %[temp5],  0+3*"XSTR(BPS)"(%[dst])       \n\t"
+    "sb       %[temp8],  1+3*"XSTR(BPS)"(%[dst])       \n\t"
+    "sb       %[temp11], 2+3*"XSTR(BPS)"(%[dst])       \n\t"
+    "sb       %[temp16], 3+3*"XSTR(BPS)"(%[dst])       \n\t"
 
     : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
       [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
diff --git a/src/dsp/dec_mips_dsp_r2.c b/src/dsp/dec_mips_dsp_r2.c
index dca15eb5..7c86dfba 100644
--- a/src/dsp/dec_mips_dsp_r2.c
+++ b/src/dsp/dec_mips_dsp_r2.c
@@ -27,7 +27,10 @@ static void TransformDC(const int16_t* in, uint8_t* dst) {
   int temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9, temp10;
 
   __asm__ volatile (
-    LOAD_WITH_OFFSET_X4(temp1, temp2, temp3, temp4, dst, 0, 32, 64, 96)
+    LOAD_WITH_OFFSET_X4(temp1, temp2, temp3, temp4, dst,
+                        0, 0, 0, 0,
+                        0, 1, 2, 3,
+                        BPS)
     "lh               %[temp5],  0(%[in])               \n\t"
     "addiu            %[temp5],  %[temp5],  4           \n\t"
     "ins              %[temp5],  %[temp5],  16, 16      \n\t"
@@ -36,7 +39,7 @@ static void TransformDC(const int16_t* in, uint8_t* dst) {
                             temp3, temp1, temp2, temp3, temp4)
     STORE_SAT_SUM_X2(temp6, temp7, temp8, temp9, temp10, temp1, temp2, temp3,
                      temp5, temp5, temp5, temp5, temp5, temp5, temp5, temp5,
-                     dst, 0, 32, 64, 96)
+                     dst, 0, 1, 2, 3, BPS)
 
     OUTPUT_EARLY_CLOBBER_REGS_10()
     : [in]"r"(in), [dst]"r"(dst)
@@ -61,14 +64,17 @@ static void TransformAC3(const int16_t* in, uint8_t* dst) {
     "replv.ph         %[temp5],   %[c1]                      \n\t"
     SHIFT_R_SUM_X2(temp1, temp6, temp7, temp8, temp2, temp9, temp10, temp4,
                    temp2, temp2, temp3, temp3, temp4, temp5, temp4, temp5)
-    LOAD_WITH_OFFSET_X4(temp3, temp5, temp11, temp12, dst, 0, 32, 64, 96)
+    LOAD_WITH_OFFSET_X4(temp3, temp5, temp11, temp12, dst,
+                        0, 0, 0, 0,
+                        0, 1, 2, 3,
+                        BPS)
     CONVERT_2_BYTES_TO_HALF(temp13, temp14, temp3, temp15, temp5, temp16,
                             temp11, temp17, temp3, temp5, temp11, temp12)
     PACK_2_HALVES_TO_WORD(temp12, temp18, temp7, temp6, temp1, temp8, temp2,
                           temp4, temp7, temp6, temp10, temp9)
     STORE_SAT_SUM_X2(temp13, temp14, temp3, temp15, temp5, temp16, temp11,
                      temp17, temp12, temp18, temp1, temp8, temp2, temp4,
-                     temp7, temp6, dst, 0, 32, 64, 96)
+                     temp7, temp6, dst, 0, 1, 2, 3, BPS)
 
     OUTPUT_EARLY_CLOBBER_REGS_18(),
       [c4]"+&r"(c4)
@@ -128,12 +134,15 @@ static void TransformOne(const int16_t* in, uint8_t* dst) {
                    temp6)
     PACK_2_HALVES_TO_WORD(temp1, temp2, temp3, temp4, temp9, temp12, temp13,
                           temp16, temp11, temp10, temp15, temp14)
-    LOAD_WITH_OFFSET_X4(temp10, temp11, temp14, temp15, dst, 0, 32, 64, 96)
+    LOAD_WITH_OFFSET_X4(temp10, temp11, temp14, temp15, dst,
+                        0, 0, 0, 0,
+                        0, 1, 2, 3,
+                        BPS)
     CONVERT_2_BYTES_TO_HALF(temp5, temp6, temp7, temp8, temp17, temp18, temp10,
                             temp11, temp10, temp11, temp14, temp15)
     STORE_SAT_SUM_X2(temp5, temp6, temp7, temp8, temp17, temp18, temp10, temp11,
                      temp9, temp12, temp1, temp2, temp13, temp16, temp3, temp4,
-                     dst, 0, 32, 64, 96)
+                     dst, 0, 1, 2, 3, BPS)
 
     OUTPUT_EARLY_CLOBBER_REGS_18()
     : [dst]"r"(dst), [in]"r"(in), [kC1]"r"(kC1), [kC2]"r"(kC2)
diff --git a/src/dsp/enc_mips32.c b/src/dsp/enc_mips32.c
index 2f3fe981..f3121132 100644
--- a/src/dsp/enc_mips32.c
+++ b/src/dsp/enc_mips32.c
@@ -17,6 +17,7 @@
 
 #if defined(WEBP_USE_MIPS32)
 
+#include "./mips_macro.h"
 #include "../enc/vp8enci.h"
 #include "../enc/cost.h"
 
@@ -59,9 +60,9 @@ static const int kC2 = 35468;
 // MUL and STORE macros inlined
 // a = clip_8b(a) is replaced with: a = max(a, 0); a = min(a, 255)
 // temp0..temp15 holds tmp[0]..tmp[15]
-// A..D - offsets in bytes to load from ref and store to dst buffer
+// A - offset in bytes to load from ref and store to dst buffer
 // TEMP0, TEMP4, TEMP8 and TEMP12 - registers for corresponding tmp elements
-#define HORIZONTAL_PASS(A, B, C, D, TEMP0, TEMP4, TEMP8, TEMP12)            \
+#define HORIZONTAL_PASS(A, TEMP0, TEMP4, TEMP8, TEMP12)                     \
   "addiu   %["#TEMP0"],    %["#TEMP0"],    4               \n\t"            \
   "addu    %[temp16],      %["#TEMP0"],    %["#TEMP8"]     \n\t"            \
   "subu    %[temp17],      %["#TEMP0"],    %["#TEMP8"]     \n\t"            \
@@ -84,10 +85,10 @@ static const int kC2 = 35468;
   "sra     %["#TEMP4"],    %["#TEMP4"],    3               \n\t"            \
   "sra     %["#TEMP8"],    %["#TEMP8"],    3               \n\t"            \
   "sra     %["#TEMP12"],   %["#TEMP12"],   3               \n\t"            \
-  "lbu     %[temp16],      "#A"(%[temp20])                 \n\t"            \
-  "lbu     %[temp17],      "#B"(%[temp20])                 \n\t"            \
-  "lbu     %[temp18],      "#C"(%[temp20])                 \n\t"            \
-  "lbu     %[temp19],      "#D"(%[temp20])                 \n\t"            \
+  "lbu     %[temp16],      0+"XSTR(BPS)"*"#A"(%[temp20])   \n\t"            \
+  "lbu     %[temp17],      1+"XSTR(BPS)"*"#A"(%[temp20])   \n\t"            \
+  "lbu     %[temp18],      2+"XSTR(BPS)"*"#A"(%[temp20])   \n\t"            \
+  "lbu     %[temp19],      3+"XSTR(BPS)"*"#A"(%[temp20])   \n\t"            \
   "addu    %["#TEMP0"],    %[temp16],      %["#TEMP0"]     \n\t"            \
   "addu    %["#TEMP4"],    %[temp17],      %["#TEMP4"]     \n\t"            \
   "addu    %["#TEMP8"],    %[temp18],      %["#TEMP8"]     \n\t"            \
@@ -110,10 +111,10 @@ static const int kC2 = 35468;
   "lw      %[temp16],      8(%[args])                      \n\t"            \
   "movz    %["#TEMP8"],    %[temp20],      %[temp18]       \n\t"            \
   "movz    %["#TEMP12"],   %[temp20],      %[temp19]       \n\t"            \
-  "sb      %["#TEMP0"],    "#A"(%[temp16])                 \n\t"            \
-  "sb      %["#TEMP4"],    "#B"(%[temp16])                 \n\t"            \
-  "sb      %["#TEMP8"],    "#C"(%[temp16])                 \n\t"            \
-  "sb      %["#TEMP12"],   "#D"(%[temp16])                 \n\t"
+  "sb      %["#TEMP0"],    0+"XSTR(BPS)"*"#A"(%[temp16])   \n\t"            \
+  "sb      %["#TEMP4"],    1+"XSTR(BPS)"*"#A"(%[temp16])   \n\t"            \
+  "sb      %["#TEMP8"],    2+"XSTR(BPS)"*"#A"(%[temp16])   \n\t"            \
+  "sb      %["#TEMP12"],   3+"XSTR(BPS)"*"#A"(%[temp16])   \n\t"
 
 // Does one or two inverse transforms.
 static WEBP_INLINE void ITransformOne(const uint8_t* ref, const int16_t* in,
@@ -130,10 +131,10 @@ static WEBP_INLINE void ITransformOne(const uint8_t* ref, const int16_t* in,
     VERTICAL_PASS(4, 20, 12, 28, temp12, temp8,  temp9,  temp10, temp11)
     VERTICAL_PASS(6, 22, 14, 30, temp20, temp12, temp13, temp14, temp15)
 
-    HORIZONTAL_PASS( 0,  1,  2,  3, temp0, temp4, temp8,  temp12)
-    HORIZONTAL_PASS(16, 17, 18, 19, temp1, temp5, temp9,  temp13)
-    HORIZONTAL_PASS(32, 33, 34, 35, temp2, temp6, temp10, temp14)
-    HORIZONTAL_PASS(48, 49, 50, 51, temp3, temp7, temp11, temp15)
+    HORIZONTAL_PASS(0, temp0, temp4, temp8,  temp12)
+    HORIZONTAL_PASS(1, temp1, temp5, temp9,  temp13)
+    HORIZONTAL_PASS(2, temp2, temp6, temp10, temp14)
+    HORIZONTAL_PASS(3, temp3, temp7, temp11, temp15)
 
     : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
       [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
@@ -253,42 +254,42 @@ static int Quantize2Blocks(int16_t in[32], int16_t out[32],
 
 // macro for one horizontal pass in Disto4x4 (TTransform)
 // two calls of function TTransform are merged into single one
-// A..D - offsets in bytes to load from a and b buffers
+// A - offset in bytes to load from a and b buffers
 // E..H - offsets in bytes to store first results to tmp buffer
 // E1..H1 - offsets in bytes to store second results to tmp buffer
-#define HORIZONTAL_PASS(A, B, C, D, E, F, G, H, E1, F1, G1, H1)   \
-  "lbu    %[temp0],  "#A"(%[a])              \n\t"                \
-  "lbu    %[temp1],  "#B"(%[a])              \n\t"                \
-  "lbu    %[temp2],  "#C"(%[a])              \n\t"                \
-  "lbu    %[temp3],  "#D"(%[a])              \n\t"                \
-  "lbu    %[temp4],  "#A"(%[b])              \n\t"                \
-  "lbu    %[temp5],  "#B"(%[b])              \n\t"                \
-  "lbu    %[temp6],  "#C"(%[b])              \n\t"                \
-  "lbu    %[temp7],  "#D"(%[b])              \n\t"                \
-  "addu   %[temp8],  %[temp0],    %[temp2]   \n\t"                \
-  "subu   %[temp0],  %[temp0],    %[temp2]   \n\t"                \
-  "addu   %[temp2],  %[temp1],    %[temp3]   \n\t"                \
-  "subu   %[temp1],  %[temp1],    %[temp3]   \n\t"                \
-  "addu   %[temp3],  %[temp4],    %[temp6]   \n\t"                \
-  "subu   %[temp4],  %[temp4],    %[temp6]   \n\t"                \
-  "addu   %[temp6],  %[temp5],    %[temp7]   \n\t"                \
-  "subu   %[temp5],  %[temp5],    %[temp7]   \n\t"                \
-  "addu   %[temp7],  %[temp8],    %[temp2]   \n\t"                \
-  "subu   %[temp2],  %[temp8],    %[temp2]   \n\t"                \
-  "addu   %[temp8],  %[temp0],    %[temp1]   \n\t"                \
-  "subu   %[temp0],  %[temp0],    %[temp1]   \n\t"                \
-  "addu   %[temp1],  %[temp3],    %[temp6]   \n\t"                \
-  "subu   %[temp3],  %[temp3],    %[temp6]   \n\t"                \
-  "addu   %[temp6],  %[temp4],    %[temp5]   \n\t"                \
-  "subu   %[temp4],  %[temp4],    %[temp5]   \n\t"                \
-  "sw     %[temp7],  "#E"(%[tmp])            \n\t"                \
-  "sw     %[temp2],  "#H"(%[tmp])            \n\t"                \
-  "sw     %[temp8],  "#F"(%[tmp])            \n\t"                \
-  "sw     %[temp0],  "#G"(%[tmp])            \n\t"                \
-  "sw     %[temp1],  "#E1"(%[tmp])           \n\t"                \
-  "sw     %[temp3],  "#H1"(%[tmp])           \n\t"                \
-  "sw     %[temp6],  "#F1"(%[tmp])           \n\t"                \
-  "sw     %[temp4],  "#G1"(%[tmp])           \n\t"
+#define HORIZONTAL_PASS(A, E, F, G, H, E1, F1, G1, H1)              \
+  "lbu    %[temp0],  0+"XSTR(BPS)"*"#A"(%[a])  \n\t"                \
+  "lbu    %[temp1],  1+"XSTR(BPS)"*"#A"(%[a])  \n\t"                \
+  "lbu    %[temp2],  2+"XSTR(BPS)"*"#A"(%[a])  \n\t"                \
+  "lbu    %[temp3],  3+"XSTR(BPS)"*"#A"(%[a])  \n\t"                \
+  "lbu    %[temp4],  0+"XSTR(BPS)"*"#A"(%[b])  \n\t"                \
+  "lbu    %[temp5],  1+"XSTR(BPS)"*"#A"(%[b])  \n\t"                \
+  "lbu    %[temp6],  2+"XSTR(BPS)"*"#A"(%[b])  \n\t"                \
+  "lbu    %[temp7],  3+"XSTR(BPS)"*"#A"(%[b])  \n\t"                \
+  "addu   %[temp8],  %[temp0],    %[temp2]     \n\t"                \
+  "subu   %[temp0],  %[temp0],    %[temp2]     \n\t"                \
+  "addu   %[temp2],  %[temp1],    %[temp3]     \n\t"                \
+  "subu   %[temp1],  %[temp1],    %[temp3]     \n\t"                \
+  "addu   %[temp3],  %[temp4],    %[temp6]     \n\t"                \
+  "subu   %[temp4],  %[temp4],    %[temp6]     \n\t"                \
+  "addu   %[temp6],  %[temp5],    %[temp7]     \n\t"                \
+  "subu   %[temp5],  %[temp5],    %[temp7]     \n\t"                \
+  "addu   %[temp7],  %[temp8],    %[temp2]     \n\t"                \
+  "subu   %[temp2],  %[temp8],    %[temp2]     \n\t"                \
+  "addu   %[temp8],  %[temp0],    %[temp1]     \n\t"                \
+  "subu   %[temp0],  %[temp0],    %[temp1]     \n\t"                \
+  "addu   %[temp1],  %[temp3],    %[temp6]     \n\t"                \
+  "subu   %[temp3],  %[temp3],    %[temp6]     \n\t"                \
+  "addu   %[temp6],  %[temp4],    %[temp5]     \n\t"                \
+  "subu   %[temp4],  %[temp4],    %[temp5]     \n\t"                \
+  "sw     %[temp7],  "#E"(%[tmp])              \n\t"                \
+  "sw     %[temp2],  "#H"(%[tmp])              \n\t"                \
+  "sw     %[temp8],  "#F"(%[tmp])              \n\t"                \
+  "sw     %[temp0],  "#G"(%[tmp])              \n\t"                \
+  "sw     %[temp1],  "#E1"(%[tmp])             \n\t"                \
+  "sw     %[temp3],  "#H1"(%[tmp])             \n\t"                \
+  "sw     %[temp6],  "#F1"(%[tmp])             \n\t"                \
+  "sw     %[temp4],  "#G1"(%[tmp])             \n\t"
 
 // macro for one vertical pass in Disto4x4 (TTransform)
 // two calls of function TTransform are merged into single one
@@ -370,10 +371,10 @@ static int Disto4x4(const uint8_t* const a, const uint8_t* const b,
   int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8;
 
   __asm__ volatile(
-    HORIZONTAL_PASS( 0,  1,  2,  3,    0,  4,  8, 12,    64,  68,  72,  76)
-    HORIZONTAL_PASS(16, 17, 18, 19,   16, 20, 24, 28,    80,  84,  88,  92)
-    HORIZONTAL_PASS(32, 33, 34, 35,   32, 36, 40, 44,    96, 100, 104, 108)
-    HORIZONTAL_PASS(48, 49, 50, 51,   48, 52, 56, 60,   112, 116, 120, 124)
+    HORIZONTAL_PASS(0,   0,  4,  8, 12,    64,  68,  72,  76)
+    HORIZONTAL_PASS(1,  16, 20, 24, 28,    80,  84,  88,  92)
+    HORIZONTAL_PASS(2,  32, 36, 40, 44,    96, 100, 104, 108)
+    HORIZONTAL_PASS(3,  48, 52, 56, 60,   112, 116, 120, 124)
     "mthi   $zero                             \n\t"
     "mtlo   $zero                             \n\t"
     VERTICAL_PASS( 0, 16, 32, 48,     64, 80,  96, 112,   0,  8, 16, 24)
@@ -413,41 +414,41 @@ static int Disto16x16(const uint8_t* const a, const uint8_t* const b,
 
 // macro for one horizontal pass in FTransform
 // temp0..temp15 holds tmp[0]..tmp[15]
-// A..D - offsets in bytes to load from src and ref buffers
+// A - offset in bytes to load from src and ref buffers
 // TEMP0..TEMP3 - registers for corresponding tmp elements
-#define HORIZONTAL_PASS(A, B, C, D, TEMP0, TEMP1, TEMP2, TEMP3) \
-  "lw     %["#TEMP1"],  0(%[args])                     \n\t"    \
-  "lw     %["#TEMP2"],  4(%[args])                     \n\t"    \
-  "lbu    %[temp16],    "#A"(%["#TEMP1"])              \n\t"    \
-  "lbu    %[temp17],    "#A"(%["#TEMP2"])              \n\t"    \
-  "lbu    %[temp18],    "#B"(%["#TEMP1"])              \n\t"    \
-  "lbu    %[temp19],    "#B"(%["#TEMP2"])              \n\t"    \
-  "subu   %[temp20],    %[temp16],    %[temp17]        \n\t"    \
-  "lbu    %[temp16],    "#C"(%["#TEMP1"])              \n\t"    \
-  "lbu    %[temp17],    "#C"(%["#TEMP2"])              \n\t"    \
-  "subu   %["#TEMP0"],  %[temp18],    %[temp19]        \n\t"    \
-  "lbu    %[temp18],    "#D"(%["#TEMP1"])              \n\t"    \
-  "lbu    %[temp19],    "#D"(%["#TEMP2"])              \n\t"    \
-  "subu   %["#TEMP1"],  %[temp16],    %[temp17]        \n\t"    \
-  "subu   %["#TEMP2"],  %[temp18],    %[temp19]        \n\t"    \
-  "addu   %["#TEMP3"],  %[temp20],    %["#TEMP2"]      \n\t"    \
-  "subu   %["#TEMP2"],  %[temp20],    %["#TEMP2"]      \n\t"    \
-  "addu   %[temp20],    %["#TEMP0"],  %["#TEMP1"]      \n\t"    \
-  "subu   %["#TEMP0"],  %["#TEMP0"],  %["#TEMP1"]      \n\t"    \
-  "mul    %[temp16],    %["#TEMP2"],  %[c5352]         \n\t"    \
-  "mul    %[temp17],    %["#TEMP2"],  %[c2217]         \n\t"    \
-  "mul    %[temp18],    %["#TEMP0"],  %[c5352]         \n\t"    \
-  "mul    %[temp19],    %["#TEMP0"],  %[c2217]         \n\t"    \
-  "addu   %["#TEMP1"],  %["#TEMP3"],  %[temp20]        \n\t"    \
-  "subu   %[temp20],    %["#TEMP3"],  %[temp20]        \n\t"    \
-  "sll    %["#TEMP0"],  %["#TEMP1"],  3                \n\t"    \
-  "sll    %["#TEMP2"],  %[temp20],    3                \n\t"    \
-  "addiu  %[temp16],    %[temp16],    1812             \n\t"    \
-  "addiu  %[temp17],    %[temp17],    937              \n\t"    \
-  "addu   %[temp16],    %[temp16],    %[temp19]        \n\t"    \
-  "subu   %[temp17],    %[temp17],    %[temp18]        \n\t"    \
-  "sra    %["#TEMP1"],  %[temp16],    9                \n\t"    \
-  "sra    %["#TEMP3"],  %[temp17],    9                \n\t"
+#define HORIZONTAL_PASS(A, TEMP0, TEMP1, TEMP2, TEMP3)            \
+  "lw     %["#TEMP1"],  0(%[args])                       \n\t"    \
+  "lw     %["#TEMP2"],  4(%[args])                       \n\t"    \
+  "lbu    %[temp16],    0+"XSTR(BPS)"*"#A"(%["#TEMP1"])  \n\t"    \
+  "lbu    %[temp17],    0+"XSTR(BPS)"*"#A"(%["#TEMP2"])  \n\t"    \
+  "lbu    %[temp18],    1+"XSTR(BPS)"*"#A"(%["#TEMP1"])  \n\t"    \
+  "lbu    %[temp19],    1+"XSTR(BPS)"*"#A"(%["#TEMP2"])  \n\t"    \
+  "subu   %[temp20],    %[temp16],    %[temp17]          \n\t"    \
+  "lbu    %[temp16],    2+"XSTR(BPS)"*"#A"(%["#TEMP1"])  \n\t"    \
+  "lbu    %[temp17],    2+"XSTR(BPS)"*"#A"(%["#TEMP2"])  \n\t"    \
+  "subu   %["#TEMP0"],  %[temp18],    %[temp19]          \n\t"    \
+  "lbu    %[temp18],    3+"XSTR(BPS)"*"#A"(%["#TEMP1"])  \n\t"    \
+  "lbu    %[temp19],    3+"XSTR(BPS)"*"#A"(%["#TEMP2"])  \n\t"    \
+  "subu   %["#TEMP1"],  %[temp16],    %[temp17]          \n\t"    \
+  "subu   %["#TEMP2"],  %[temp18],    %[temp19]          \n\t"    \
+  "addu   %["#TEMP3"],  %[temp20],    %["#TEMP2"]        \n\t"    \
+  "subu   %["#TEMP2"],  %[temp20],    %["#TEMP2"]        \n\t"    \
+  "addu   %[temp20],    %["#TEMP0"],  %["#TEMP1"]        \n\t"    \
+  "subu   %["#TEMP0"],  %["#TEMP0"],  %["#TEMP1"]        \n\t"    \
+  "mul    %[temp16],    %["#TEMP2"],  %[c5352]           \n\t"    \
+  "mul    %[temp17],    %["#TEMP2"],  %[c2217]           \n\t"    \
+  "mul    %[temp18],    %["#TEMP0"],  %[c5352]           \n\t"    \
+  "mul    %[temp19],    %["#TEMP0"],  %[c2217]           \n\t"    \
+  "addu   %["#TEMP1"],  %["#TEMP3"],  %[temp20]          \n\t"    \
+  "subu   %[temp20],    %["#TEMP3"],  %[temp20]          \n\t"    \
+  "sll    %["#TEMP0"],  %["#TEMP1"],  3                  \n\t"    \
+  "sll    %["#TEMP2"],  %[temp20],    3                  \n\t"    \
+  "addiu  %[temp16],    %[temp16],    1812               \n\t"    \
+  "addiu  %[temp17],    %[temp17],    937                \n\t"    \
+  "addu   %[temp16],    %[temp16],    %[temp19]          \n\t"    \
+  "subu   %[temp17],    %[temp17],    %[temp18]          \n\t"    \
+  "sra    %["#TEMP1"],  %[temp16],    9                  \n\t"    \
+  "sra    %["#TEMP3"],  %[temp17],    9                  \n\t"
 
 // macro for one vertical pass in FTransform
 // temp0..temp15 holds tmp[0]..tmp[15]
@@ -491,10 +492,10 @@ static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) {
       { (const int*)src, (const int*)ref, (const int*)out };
 
   __asm__ volatile(
-    HORIZONTAL_PASS( 0,  1,  2,  3, temp0,  temp1,  temp2,  temp3)
-    HORIZONTAL_PASS(16, 17, 18, 19, temp4,  temp5,  temp6,  temp7)
-    HORIZONTAL_PASS(32, 33, 34, 35, temp8,  temp9,  temp10, temp11)
-    HORIZONTAL_PASS(48, 49, 50, 51, temp12, temp13, temp14, temp15)
+    HORIZONTAL_PASS(0, temp0,  temp1,  temp2,  temp3)
+    HORIZONTAL_PASS(1, temp4,  temp5,  temp6,  temp7)
+    HORIZONTAL_PASS(2, temp8,  temp9,  temp10, temp11)
+    HORIZONTAL_PASS(3, temp12, temp13, temp14, temp15)
     "lw   %[temp20],    8(%[args])                     \n\t"
     VERTICAL_PASS(0,  8, 16, 24, temp0, temp4, temp8,  temp12)
     VERTICAL_PASS(2, 10, 18, 26, temp1, temp5, temp9,  temp13)
@@ -661,22 +662,22 @@ static int SSE16x16(const uint8_t* a, const uint8_t* b) {
   __asm__ volatile(
      "mult   $zero,    $zero                            \n\t"
 
-     GET_SSE(  0,   4,   8,  12)
-     GET_SSE( 16,  20,  24,  28)
-     GET_SSE( 32,  36,  40,  44)
-     GET_SSE( 48,  52,  56,  60)
-     GET_SSE( 64,  68,  72,  76)
-     GET_SSE( 80,  84,  88,  92)
-     GET_SSE( 96, 100, 104, 108)
-     GET_SSE(112, 116, 120, 124)
-     GET_SSE(128, 132, 136, 140)
-     GET_SSE(144, 148, 152, 156)
-     GET_SSE(160, 164, 168, 172)
-     GET_SSE(176, 180, 184, 188)
-     GET_SSE(192, 196, 200, 204)
-     GET_SSE(208, 212, 216, 220)
-     GET_SSE(224, 228, 232, 236)
-     GET_SSE(240, 244, 248, 252)
+     GET_SSE( 0 * BPS, 4 +  0 * BPS, 8 +  0 * BPS, 12 +  0 * BPS)
+     GET_SSE( 1 * BPS, 4 +  1 * BPS, 8 +  1 * BPS, 12 +  1 * BPS)
+     GET_SSE( 2 * BPS, 4 +  2 * BPS, 8 +  2 * BPS, 12 +  2 * BPS)
+     GET_SSE( 3 * BPS, 4 +  3 * BPS, 8 +  3 * BPS, 12 +  3 * BPS)
+     GET_SSE( 4 * BPS, 4 +  4 * BPS, 8 +  4 * BPS, 12 +  4 * BPS)
+     GET_SSE( 5 * BPS, 4 +  5 * BPS, 8 +  5 * BPS, 12 +  5 * BPS)
+     GET_SSE( 6 * BPS, 4 +  6 * BPS, 8 +  6 * BPS, 12 +  6 * BPS)
+     GET_SSE( 7 * BPS, 4 +  7 * BPS, 8 +  7 * BPS, 12 +  7 * BPS)
+     GET_SSE( 8 * BPS, 4 +  8 * BPS, 8 +  8 * BPS, 12 +  8 * BPS)
+     GET_SSE( 9 * BPS, 4 +  9 * BPS, 8 +  9 * BPS, 12 +  9 * BPS)
+     GET_SSE(10 * BPS, 4 + 10 * BPS, 8 + 10 * BPS, 12 + 10 * BPS)
+     GET_SSE(11 * BPS, 4 + 11 * BPS, 8 + 11 * BPS, 12 + 11 * BPS)
+     GET_SSE(12 * BPS, 4 + 12 * BPS, 8 + 12 * BPS, 12 + 12 * BPS)
+     GET_SSE(13 * BPS, 4 + 13 * BPS, 8 + 13 * BPS, 12 + 13 * BPS)
+     GET_SSE(14 * BPS, 4 + 14 * BPS, 8 + 14 * BPS, 12 + 14 * BPS)
+     GET_SSE(15 * BPS, 4 + 15 * BPS, 8 + 15 * BPS, 12 + 15 * BPS)
 
     "mflo    %[count]                                   \n\t"
     : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
@@ -695,14 +696,14 @@ static int SSE16x8(const uint8_t* a, const uint8_t* b) {
   __asm__ volatile(
      "mult   $zero,    $zero                            \n\t"
 
-     GET_SSE(  0,   4,   8,  12)
-     GET_SSE( 16,  20,  24,  28)
-     GET_SSE( 32,  36,  40,  44)
-     GET_SSE( 48,  52,  56,  60)
-     GET_SSE( 64,  68,  72,  76)
-     GET_SSE( 80,  84,  88,  92)
-     GET_SSE( 96, 100, 104, 108)
-     GET_SSE(112, 116, 120, 124)
+     GET_SSE( 0 * BPS, 4 +  0 * BPS, 8 +  0 * BPS, 12 +  0 * BPS)
+     GET_SSE( 1 * BPS, 4 +  1 * BPS, 8 +  1 * BPS, 12 +  1 * BPS)
+     GET_SSE( 2 * BPS, 4 +  2 * BPS, 8 +  2 * BPS, 12 +  2 * BPS)
+     GET_SSE( 3 * BPS, 4 +  3 * BPS, 8 +  3 * BPS, 12 +  3 * BPS)
+     GET_SSE( 4 * BPS, 4 +  4 * BPS, 8 +  4 * BPS, 12 +  4 * BPS)
+     GET_SSE( 5 * BPS, 4 +  5 * BPS, 8 +  5 * BPS, 12 +  5 * BPS)
+     GET_SSE( 6 * BPS, 4 +  6 * BPS, 8 +  6 * BPS, 12 +  6 * BPS)
+     GET_SSE( 7 * BPS, 4 +  7 * BPS, 8 +  7 * BPS, 12 +  7 * BPS)
 
     "mflo    %[count]                                   \n\t"
     : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
@@ -721,10 +722,10 @@ static int SSE8x8(const uint8_t* a, const uint8_t* b) {
   __asm__ volatile(
      "mult   $zero,    $zero                            \n\t"
 
-     GET_SSE( 0,   4,  16,  20)
-     GET_SSE(32,  36,  48,  52)
-     GET_SSE(64,  68,  80,  84)
-     GET_SSE(96, 100, 112, 116)
+     GET_SSE(0 * BPS, 4 + 0 * BPS, 1 * BPS, 4 + 1 * BPS)
+     GET_SSE(2 * BPS, 4 + 2 * BPS, 3 * BPS, 4 + 3 * BPS)
+     GET_SSE(4 * BPS, 4 + 4 * BPS, 5 * BPS, 4 + 5 * BPS)
+     GET_SSE(6 * BPS, 4 + 6 * BPS, 7 * BPS, 4 + 7 * BPS)
 
     "mflo    %[count]                                   \n\t"
     : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
@@ -743,7 +744,7 @@ static int SSE4x4(const uint8_t* a, const uint8_t* b) {
   __asm__ volatile(
      "mult   $zero,    $zero                            \n\t"
 
-     GET_SSE(0, 16, 32, 48)
+     GET_SSE(0 * BPS, 1 * BPS, 2 * BPS, 3 * BPS)
 
     "mflo    %[count]                                   \n\t"
     : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
@@ -769,26 +770,17 @@ extern WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitMIPS32(void);
 
 WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitMIPS32(void) {
 #if defined(WEBP_USE_MIPS32)
-  // TODO(djordje): fix these to use generic BPS instead of hardcoded value 16
-  (void)ITransform;
-  (void)FTransform;
-  (void)Disto4x4;
-  (void)Disto16x16;
-//  VP8ITransform = ITransform;
-//  VP8FTransform = FTransform;
+  VP8ITransform = ITransform;
+  VP8FTransform = FTransform;
   VP8EncQuantizeBlock = QuantizeBlock;
   VP8EncQuantize2Blocks = Quantize2Blocks;
-//  VP8TDisto4x4 = Disto4x4;
-//  VP8TDisto16x16 = Disto16x16;
+  VP8TDisto4x4 = Disto4x4;
+  VP8TDisto16x16 = Disto16x16;
 #if !defined(WORK_AROUND_GCC)
-  (void)SSE16x16;
-  (void)SSE8x8;
-  (void)SSE16x8;
-  (void)SSE4x4;
-//  VP8SSE16x16 = SSE16x16;
-//  VP8SSE8x8 = SSE8x8;
-//  VP8SSE16x8 = SSE16x8;
-//  VP8SSE4x4 = SSE4x4;
+  VP8SSE16x16 = SSE16x16;
+  VP8SSE8x8 = SSE8x8;
+  VP8SSE16x8 = SSE16x8;
+  VP8SSE4x4 = SSE4x4;
 #endif
 #endif  // WEBP_USE_MIPS32
 }
diff --git a/src/dsp/enc_mips_dsp_r2.c b/src/dsp/enc_mips_dsp_r2.c
index eeac7be4..bc015a9d 100644
--- a/src/dsp/enc_mips_dsp_r2.c
+++ b/src/dsp/enc_mips_dsp_r2.c
@@ -77,37 +77,37 @@ static const int kC2 = 35468;
 // A - offset in bytes to load from src and ref buffers
 // TEMP0..TEMP3 - registers for corresponding tmp elements
 #define HORIZONTAL_PASS(A, TEMP0, TEMP1, TEMP2, TEMP3)                         \
-  "lw              %["#TEMP0"],   0(%[args])                  \n\t"            \
-  "lw              %["#TEMP1"],   4(%[args])                  \n\t"            \
-  "lw              %["#TEMP2"],   "#A"(%["#TEMP0"])           \n\t"            \
-  "lw              %["#TEMP3"],   "#A"(%["#TEMP1"])           \n\t"            \
-  "preceu.ph.qbl   %["#TEMP0"],   %["#TEMP2"]                 \n\t"            \
-  "preceu.ph.qbl   %["#TEMP1"],   %["#TEMP3"]                 \n\t"            \
-  "preceu.ph.qbr   %["#TEMP2"],   %["#TEMP2"]                 \n\t"            \
-  "preceu.ph.qbr   %["#TEMP3"],   %["#TEMP3"]                 \n\t"            \
-  "subq.ph         %["#TEMP0"],   %["#TEMP0"],   %["#TEMP1"]  \n\t"            \
-  "subq.ph         %["#TEMP2"],   %["#TEMP2"],   %["#TEMP3"]  \n\t"            \
-  "rotr            %["#TEMP0"],   %["#TEMP0"],   16           \n\t"            \
-  "addq.ph         %["#TEMP1"],   %["#TEMP2"],   %["#TEMP0"]  \n\t"            \
-  "subq.ph         %["#TEMP3"],   %["#TEMP2"],   %["#TEMP0"]  \n\t"            \
-  "seh             %["#TEMP0"],   %["#TEMP1"]                 \n\t"            \
-  "sra             %[temp16],     %["#TEMP1"],   16           \n\t"            \
-  "seh             %[temp19],     %["#TEMP3"]                 \n\t"            \
-  "sra             %["#TEMP3"],   %["#TEMP3"],   16           \n\t"            \
-  "subu            %["#TEMP2"],   %["#TEMP0"],   %[temp16]    \n\t"            \
-  "addu            %["#TEMP0"],   %["#TEMP0"],   %[temp16]    \n\t"            \
-  "mul             %[temp17],     %[temp19],     %[c2217]     \n\t"            \
-  "mul             %[temp18],     %["#TEMP3"],   %[c5352]     \n\t"            \
-  "mul             %["#TEMP1"],   %[temp19],     %[c5352]     \n\t"            \
-  "mul             %[temp16],     %["#TEMP3"],   %[c2217]     \n\t"            \
-  "sll             %["#TEMP2"],   %["#TEMP2"],   3            \n\t"            \
-  "sll             %["#TEMP0"],   %["#TEMP0"],   3            \n\t"            \
-  "subu            %["#TEMP3"],   %[temp17],     %[temp18]    \n\t"            \
-  "addu            %["#TEMP1"],   %[temp16],     %["#TEMP1"]  \n\t"            \
-  "addiu           %["#TEMP3"],   %["#TEMP3"],   937          \n\t"            \
-  "addiu           %["#TEMP1"],   %["#TEMP1"],   1812         \n\t"            \
-  "sra             %["#TEMP3"],   %["#TEMP3"],   9            \n\t"            \
-  "sra             %["#TEMP1"],   %["#TEMP1"],   9            \n\t"
+  "lw              %["#TEMP0"],   0(%[args])                        \n\t"      \
+  "lw              %["#TEMP1"],   4(%[args])                        \n\t"      \
+  "lw              %["#TEMP2"],   "XSTR(BPS)"*"#A"(%["#TEMP0"])     \n\t"      \
+  "lw              %["#TEMP3"],   "XSTR(BPS)"*"#A"(%["#TEMP1"])     \n\t"      \
+  "preceu.ph.qbl   %["#TEMP0"],   %["#TEMP2"]                       \n\t"      \
+  "preceu.ph.qbl   %["#TEMP1"],   %["#TEMP3"]                       \n\t"      \
+  "preceu.ph.qbr   %["#TEMP2"],   %["#TEMP2"]                       \n\t"      \
+  "preceu.ph.qbr   %["#TEMP3"],   %["#TEMP3"]                       \n\t"      \
+  "subq.ph         %["#TEMP0"],   %["#TEMP0"],   %["#TEMP1"]        \n\t"      \
+  "subq.ph         %["#TEMP2"],   %["#TEMP2"],   %["#TEMP3"]        \n\t"      \
+  "rotr            %["#TEMP0"],   %["#TEMP0"],   16                 \n\t"      \
+  "addq.ph         %["#TEMP1"],   %["#TEMP2"],   %["#TEMP0"]        \n\t"      \
+  "subq.ph         %["#TEMP3"],   %["#TEMP2"],   %["#TEMP0"]        \n\t"      \
+  "seh             %["#TEMP0"],   %["#TEMP1"]                       \n\t"      \
+  "sra             %[temp16],     %["#TEMP1"],   16                 \n\t"      \
+  "seh             %[temp19],     %["#TEMP3"]                       \n\t"      \
+  "sra             %["#TEMP3"],   %["#TEMP3"],   16                 \n\t"      \
+  "subu            %["#TEMP2"],   %["#TEMP0"],   %[temp16]          \n\t"      \
+  "addu            %["#TEMP0"],   %["#TEMP0"],   %[temp16]          \n\t"      \
+  "mul             %[temp17],     %[temp19],     %[c2217]           \n\t"      \
+  "mul             %[temp18],     %["#TEMP3"],   %[c5352]           \n\t"      \
+  "mul             %["#TEMP1"],   %[temp19],     %[c5352]           \n\t"      \
+  "mul             %[temp16],     %["#TEMP3"],   %[c2217]           \n\t"      \
+  "sll             %["#TEMP2"],   %["#TEMP2"],   3                  \n\t"      \
+  "sll             %["#TEMP0"],   %["#TEMP0"],   3                  \n\t"      \
+  "subu            %["#TEMP3"],   %[temp17],     %[temp18]          \n\t"      \
+  "addu            %["#TEMP1"],   %[temp16],     %["#TEMP1"]        \n\t"      \
+  "addiu           %["#TEMP3"],   %["#TEMP3"],   937                \n\t"      \
+  "addiu           %["#TEMP1"],   %["#TEMP1"],   1812               \n\t"      \
+  "sra             %["#TEMP3"],   %["#TEMP3"],   9                  \n\t"      \
+  "sra             %["#TEMP1"],   %["#TEMP1"],   9                  \n\t"
 
 // macro for one vertical pass in FTransform
 // temp0..temp15 holds tmp[0]..tmp[15]
@@ -151,10 +151,10 @@ static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) {
       { (const int*)src, (const int*)ref, (const int*)out };
 
   __asm__ volatile (
-    HORIZONTAL_PASS( 0, temp0, temp1, temp2, temp3)
-    HORIZONTAL_PASS(16, temp4, temp5, temp6, temp7)
-    HORIZONTAL_PASS(32, temp8, temp9, temp10, temp11)
-    HORIZONTAL_PASS(48, temp12, temp13, temp14, temp15)
+    HORIZONTAL_PASS(0, temp0,  temp1,  temp2,  temp3)
+    HORIZONTAL_PASS(1, temp4,  temp5,  temp6,  temp7)
+    HORIZONTAL_PASS(2, temp8,  temp9,  temp10, temp11)
+    HORIZONTAL_PASS(3, temp12, temp13, temp14, temp15)
     "lw            %[temp20],     8(%[args])                  \n\t"
     VERTICAL_PASS(0,  8, 16, 24, temp0, temp4, temp8,  temp12)
     VERTICAL_PASS(2, 10, 18, 26, temp1, temp5, temp9,  temp13)
@@ -222,12 +222,15 @@ static WEBP_INLINE void ITransformOne(const uint8_t* ref, const int16_t* in,
                    temp6)
     PACK_2_HALVES_TO_WORD(temp1, temp2, temp3, temp4, temp9, temp12, temp13,
                           temp16, temp11, temp10, temp15, temp14)
-    LOAD_WITH_OFFSET_X4(temp10, temp11, temp14, temp15, ref, 0, 16, 32, 48)
+    LOAD_WITH_OFFSET_X4(temp10, temp11, temp14, temp15, ref,
+                        0, 0, 0, 0,
+                        0, 1, 2, 3,
+                        BPS)
     CONVERT_2_BYTES_TO_HALF(temp5, temp6, temp7, temp8, temp17, temp18, temp10,
                             temp11, temp10, temp11, temp14, temp15)
     STORE_SAT_SUM_X2(temp5, temp6, temp7, temp8, temp17, temp18, temp10, temp11,
                      temp9, temp12, temp1, temp2, temp13, temp16, temp3, temp4,
-                     dst, 0, 16, 32, 48)
+                     dst, 0, 1, 2, 3, BPS)
 
     OUTPUT_EARLY_CLOBBER_REGS_18()
     : [dst]"r"(dst), [in]"r"(in), [kC1]"r"(kC1), [kC2]"r"(kC2), [ref]"r"(ref)
@@ -249,7 +252,10 @@ static int Disto4x4(const uint8_t* const a, const uint8_t* const b,
   int temp10, temp11, temp12, temp13, temp14, temp15, temp16, temp17;
 
   __asm__ volatile (
-    LOAD_WITH_OFFSET_X4(temp1, temp2, temp3, temp4, a, 0, 16, 32, 48)
+    LOAD_WITH_OFFSET_X4(temp1, temp2, temp3, temp4, a,
+                        0, 0, 0, 0,
+                        0, 1, 2, 3,
+                        BPS)
     CONVERT_2_BYTES_TO_HALF(temp5, temp6, temp7, temp8, temp9,temp10, temp11,
                             temp12, temp1, temp2, temp3, temp4)
     ADD_SUB_HALVES_X4(temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8,
@@ -263,11 +269,20 @@ static int Disto4x4(const uint8_t* const a, const uint8_t* const b,
     ADD_SUB_HALVES_X4(temp1, temp4, temp10, temp8, temp7, temp11, temp5, temp2,
                       temp5, temp7, temp11, temp2, temp9, temp6, temp3, temp12)
     ABS_X8(temp1, temp4, temp10, temp8, temp7, temp11, temp5, temp2)
-    LOAD_WITH_OFFSET_X4(temp3, temp6, temp9, temp12, w, 0, 4, 8, 12)
-    LOAD_WITH_OFFSET_X4(temp13, temp14, temp15, temp16, w, 16, 20, 24, 28)
+    LOAD_WITH_OFFSET_X4(temp3, temp6, temp9, temp12, w,
+                        0, 4, 8, 12,
+                        0, 0, 0, 0,
+                        0)
+    LOAD_WITH_OFFSET_X4(temp13, temp14, temp15, temp16, w,
+                        0, 4, 8, 12,
+                        1, 1, 1, 1,
+                        16)
     MUL_HALF(temp17, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8,
              temp9, temp10, temp11, temp12, temp13, temp14, temp15, temp16)
-    LOAD_WITH_OFFSET_X4(temp1, temp2, temp3, temp4, b, 0, 16, 32, 48)
+    LOAD_WITH_OFFSET_X4(temp1, temp2, temp3, temp4, b,
+                        0, 0, 0, 0,
+                        0, 1, 2, 3,
+                        BPS)
     CONVERT_2_BYTES_TO_HALF(temp5,temp6, temp7, temp8, temp9,temp10, temp11,
                             temp12, temp1, temp2, temp3, temp4)
     ADD_SUB_HALVES_X4(temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8,
@@ -281,8 +296,14 @@ static int Disto4x4(const uint8_t* const a, const uint8_t* const b,
     ADD_SUB_HALVES_X4(temp1, temp4, temp10, temp8, temp7, temp11, temp5, temp2,
                       temp5, temp7, temp11, temp2, temp9, temp6, temp3, temp12)
     ABS_X8(temp1, temp4, temp10, temp8, temp7, temp11, temp5, temp2)
-    LOAD_WITH_OFFSET_X4(temp3, temp6, temp9, temp12, w, 0, 4, 8, 12)
-    LOAD_WITH_OFFSET_X4(temp13, temp14, temp15, temp16, w, 16, 20, 24, 28)
+    LOAD_WITH_OFFSET_X4(temp3, temp6, temp9, temp12, w,
+                        0, 4, 8, 12,
+                        0, 0, 0, 0,
+                        0)
+    LOAD_WITH_OFFSET_X4(temp13, temp14, temp15, temp16, w,
+                        0, 4, 8, 12,
+                        1, 1, 1, 1,
+                        16)
     MUL_HALF(temp3, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8,
              temp9, temp10, temp11, temp12, temp13, temp14, temp15, temp16)
     OUTPUT_EARLY_CLOBBER_REGS_17()
@@ -318,14 +339,9 @@ extern WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitMIPSdspR2(void);
 
 WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitMIPSdspR2(void) {
 #if defined(WEBP_USE_MIPS_DSP_R2)
-  // TODO(djordje): fix these to use generic BPS instead of hardcoded value
-  (void)ITransform;
-  (void)FTransform;
-  (void)Disto4x4;
-  (void)Disto16x16;
-//  VP8FTransform = FTransform;
-//  VP8ITransform = ITransform;
-//  VP8TDisto4x4 = Disto4x4;
-//  VP8TDisto16x16 = Disto16x16;
+  VP8FTransform = FTransform;
+  VP8ITransform = ITransform;
+  VP8TDisto4x4 = Disto4x4;
+  VP8TDisto16x16 = Disto16x16;
 #endif  // WEBP_USE_MIPS_DSP_R2
 }
diff --git a/src/dsp/mips_macro.h b/src/dsp/mips_macro.h
index 6d907afd..3ce4ac21 100644
--- a/src/dsp/mips_macro.h
+++ b/src/dsp/mips_macro.h
@@ -12,6 +12,9 @@
 #ifndef WEBP_DSP_MIPS_MACRO_H_
 #define WEBP_DSP_MIPS_MACRO_H_
 
+#define STR(s) #s
+#define XSTR(s) STR(s)
+
 // O0[31..16 | 15..0] = I0[31..16 | 15..0] + I1[31..16 | 15..0]
 // O1[31..16 | 15..0] = I0[31..16 | 15..0] - I1[31..16 | 15..0]
 // O - output
@@ -30,13 +33,13 @@
   "lh               %["#O1"],   "#I1"(%[in])                  \n\t"
 
 // I0 - location
-// I1..I4 - offsets in bytes
+// I1..I9 - offsets in bytes
 #define LOAD_WITH_OFFSET_X4(O0, O1, O2, O3,                                    \
-                            I0, I1, I2, I3, I4)                                \
-  "ulw    %["#O0"],    "#I1"(%["#I0"])                        \n\t"            \
-  "ulw    %["#O1"],    "#I2"(%["#I0"])                        \n\t"            \
-  "ulw    %["#O2"],    "#I3"(%["#I0"])                        \n\t"            \
-  "ulw    %["#O3"],    "#I4"(%["#I0"])                        \n\t"
+                            I0, I1, I2, I3, I4, I5, I6, I7, I8, I9)            \
+  "ulw    %["#O0"],    "#I1"+"XSTR(I9)"*"#I5"(%["#I0"])       \n\t"            \
+  "ulw    %["#O1"],    "#I2"+"XSTR(I9)"*"#I6"(%["#I0"])       \n\t"            \
+  "ulw    %["#O2"],    "#I3"+"XSTR(I9)"*"#I7"(%["#I0"])       \n\t"            \
+  "ulw    %["#O3"],    "#I4"+"XSTR(I9)"*"#I8"(%["#I0"])       \n\t"
 
 // O - output
 // IO - input/output
@@ -152,7 +155,7 @@
 // I - input (macro doesn't change it)
 #define STORE_SAT_SUM_X2(IO0, IO1, IO2, IO3, IO4, IO5, IO6, IO7,               \
                          I0, I1, I2, I3, I4, I5, I6, I7,                       \
-                         I8, I9, I10, I11, I12)                                \
+                         I8, I9, I10, I11, I12, I13)                           \
   "addq.ph          %["#IO0"],  %["#IO0"],  %["#I0"]          \n\t"            \
   "addq.ph          %["#IO1"],  %["#IO1"],  %["#I1"]          \n\t"            \
   "addq.ph          %["#IO2"],  %["#IO2"],  %["#I2"]          \n\t"            \
@@ -173,10 +176,10 @@
   "precrqu_s.qb.ph  %["#IO2"],  %["#IO3"],  %["#IO2"]         \n\t"            \
   "precrqu_s.qb.ph  %["#IO4"],  %["#IO5"],  %["#IO4"]         \n\t"            \
   "precrqu_s.qb.ph  %["#IO6"],  %["#IO7"],  %["#IO6"]         \n\t"            \
-  "usw              %["#IO0"],  "#I9"(%["#I8"])               \n\t"            \
-  "usw              %["#IO2"],  "#I10"(%["#I8"])              \n\t"            \
-  "usw              %["#IO4"],  "#I11"(%["#I8"])              \n\t"            \
-  "usw              %["#IO6"],  "#I12"(%["#I8"])              \n\t"
+  "usw              %["#IO0"],  "XSTR(I13)"*"#I9"(%["#I8"])   \n\t"            \
+  "usw              %["#IO2"],  "XSTR(I13)"*"#I10"(%["#I8"])  \n\t"            \
+  "usw              %["#IO4"],  "XSTR(I13)"*"#I11"(%["#I8"])  \n\t"            \
+  "usw              %["#IO6"],  "XSTR(I13)"*"#I12"(%["#I8"])  \n\t"
 
 #define OUTPUT_EARLY_CLOBBER_REGS_10()                                         \
   : [temp1]"=&r"(temp1), [temp2]"=&r"(temp2), [temp3]"=&r"(temp3),             \