Flip the result of the inv transform for FLIPADST.

This is a port of 4f5108090a6047d5d4d9ce1df302da23b2ef4bc5

This commit also fixes a bug where FLIPADST transforms, when combined
with a DST (that is, FLIPADST_DST and DST_FLIPADST), did not actually
perform a flipped transform but a straight ADST instead. This was
because the C implementation they fell back on did not implement
flipping. This is now fixed as well, and FLIPADST_DST and DST_FLIPADST
now do what they are supposed to do.
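
As a rough illustration (the helper name and the fixed 4x4 size below are
mine, not part of the patch): instead of flipping the destination pixels
before and after a straight transform, the fixed code runs the inverse
transform normally and applies the flip while adding its output to the
destination, e.g. by pointing dest at the last row of the block and
negating the stride, which is what the FLIPUD_PTR macro in the diff does.

#include <stdint.h>

/* Illustrative sketch only: add a 4x4 residual to dest with its rows
 * mirrored top-to-bottom, using the negative-stride trick. */
static void add_residual_flipud_4x4(const int16_t *res, uint8_t *dest,
                                    int stride) {
  int i, j;
  dest += 3 * stride;  /* point at the last row of the 4x4 block */
  stride = -stride;    /* walk upwards */
  for (i = 0; i < 4; ++i) {
    for (j = 0; j < 4; ++j) {
      int v = dest[i * stride + j] + res[i * 4 + j];
      dest[i * stride + j] = (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v);
    }
  }
}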

There are 3 functions in the SR_MODE experiment that should be updated,
but given that the build of SR_MODE is broken at the upstream tip of
nextgen, I could not test these, so I have put in assertions and FIXME
notes at the problematic places.

Change-Id: I5b8175b85f944f2369b183a26256e08d97f4bdef
Authored by Geza Lore on 2015-11-13 15:16:28 +00:00; committed by Debargha Mukherjee
parent f1f3a8ab14
commit 85ab9d56cc
2 changed files with 424 additions and 345 deletions

@ -16,6 +16,59 @@
#include "vp9/common/vp9_idct.h"
#if CONFIG_EXT_TX
#define FLIPUD_PTR(dest, stride, size) do { \
(dest) = (dest) + ((size) - 1) * (stride); \
(stride) = - (stride); \
} while (0)
static void maybe_flip_strides(uint8_t **dst, int *dstride,
tran_low_t **src, int *sstride,
int tx_type, int size) {
// Note that the transpose of src will be added to dst. In order to LR
// flip the addends (in dst coordinates), we UD flip the src. To UD flip
// the addends, we UD flip the dst.
switch (tx_type) {
case DCT_DCT:
case ADST_DCT:
case DCT_ADST:
case ADST_ADST:
break;
case FLIPADST_DCT:
case FLIPADST_ADST:
// flip UD
FLIPUD_PTR(*dst, *dstride, size);
break;
case DCT_FLIPADST:
case ADST_FLIPADST:
// flip LR
FLIPUD_PTR(*src, *sstride, size);
break;
case FLIPADST_FLIPADST:
// flip UD
FLIPUD_PTR(*dst, *dstride, size);
// flip LR
FLIPUD_PTR(*src, *sstride, size);
break;
case DST_DST:
case DCT_DST:
case DST_DCT:
case DST_ADST:
case ADST_DST:
break;
case DST_FLIPADST:
// flip LR
FLIPUD_PTR(*src, *sstride, size);
break;
case FLIPADST_DST:
// flip UD
FLIPUD_PTR(*dst, *dstride, size);
break;
default:
assert(0);
break;
}
}
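
A hedged aside on the comment above (the check below is illustrative, not
library code): the sum loops later in this file read the transform output
transposed, dst(i, j) += src(j, i), so flipping the source pointer and
stride makes that read pick up src(size - 1 - j, i), which mirrors the
addends left/right in destination coordinates, while flipping the
destination pointer mirrors them up/down. A small check of the index
identity:

#include <assert.h>

/* Illustrative check: after FLIPUD_PTR(p, sstride, n), the transposed
 * read p[j * sstride + i] sees src[n - 1 - j][i]. */
static void flip_index_identity_check(void) {
  enum { n = 4 };
  int src[n][n], *p = &src[0][0];
  int sstride = n, i, j;
  for (i = 0; i < n; ++i)
    for (j = 0; j < n; ++j)
      src[i][j] = i * n + j;
  p += (n - 1) * sstride;  /* FLIPUD_PTR(p, sstride, n) */
  sstride = -sstride;
  for (i = 0; i < n; ++i)
    for (j = 0; j < n; ++j)
      assert(p[j * sstride + i] == src[n - 1 - j][i]);
}
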
void idst4(const tran_low_t *input, tran_low_t *output) {
// {sin(pi/5), sin(pi*2/5)} * sqrt(2/5) * sqrt(2)
static const int32_t sinvalue_lookup[] = {
@ -635,25 +688,41 @@ void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride,
};
int i, j;
tran_low_t out[4 * 4];
tran_low_t *outptr = out;
tran_low_t temp_in[4], temp_out[4];
tran_low_t tmp;
tran_low_t out[4][4];
tran_low_t *outp = &out[0][0];
int outstride = 4;
// inverse transform row vectors
for (i = 0; i < 4; ++i) {
IHT_4[tx_type].rows(input, outptr);
IHT_4[tx_type].rows(input, out[i]);
input += 4;
outptr += 4;
}
// transpose
for (i = 1 ; i < 4; i++) {
for (j = 0; j < i; j++) {
tmp = out[i][j];
out[i][j] = out[j][i];
out[j][i] = tmp;
}
}
// inverse transform column vectors
for (i = 0; i < 4; ++i) {
for (j = 0; j < 4; ++j)
temp_in[j] = out[j * 4 + i];
IHT_4[tx_type].cols(temp_in, temp_out);
IHT_4[tx_type].cols(out[i], out[i]);
}
#if CONFIG_EXT_TX
maybe_flip_strides(&dest, &stride, &outp, &outstride, tx_type, 4);
#endif
// Sum with the destination
for (i = 0; i < 4; ++i) {
for (j = 0; j < 4; ++j) {
dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
ROUND_POWER_OF_TWO(temp_out[j], 4));
int d = i * stride + j;
int s = j * outstride + i;
dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 4));
}
}
}
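
The same row transform / transpose / column transform / optional flip /
clamped sum structure is repeated below for the 8x8, 16x16 and
high-bitdepth variants; only the block size and the final rounding shift
(4, 5 or 6 bits) change. For reference, the rounding and clamping helpers
behave roughly like this sketch (the real definitions live in the vpx/vp9
headers):

#include <stdint.h>

/* Approximate behaviour of ROUND_POWER_OF_TWO and clip_pixel_add as used
 * in the sum loops; illustrative only. */
static int round_power_of_two(int value, int n) {
  return (value + (1 << (n - 1))) >> n;
}

static uint8_t clip_pixel_add_sketch(uint8_t dest, int trans) {
  int v = dest + trans;
  return (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v);
}
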
@ -756,97 +825,44 @@ static const transform_2d IHT_8[] = {
#endif // CONFIG_EXT_TX
};
#if CONFIG_EXT_TX
void fliplr(uint8_t *dest, int stride, int l) {
int i, j;
for (i = 0; i < l; ++i) {
for (j = 0; j < l / 2; ++j) {
const uint8_t tmp = dest[i * stride + j];
dest[i * stride + j] = dest[i * stride + l - 1 - j];
dest[i * stride + l - 1 - j] = tmp;
}
}
}
void flipud(uint8_t *dest, int stride, int l) {
int i, j;
for (j = 0; j < l; ++j) {
for (i = 0; i < l / 2; ++i) {
const uint8_t tmp = dest[i * stride + j];
dest[i * stride + j] = dest[(l - 1 - i) * stride + j];
dest[(l - 1 - i) * stride + j] = tmp;
}
}
}
void fliplrud(uint8_t *dest, int stride, int l) {
int i, j;
for (i = 0; i < l / 2; ++i) {
for (j = 0; j < l; ++j) {
const uint8_t tmp = dest[i * stride + j];
dest[i * stride + j] = dest[(l - 1 - i) * stride + l - 1 - j];
dest[(l - 1 - i) * stride + l - 1 - j] = tmp;
}
}
}
void fliplr16(uint16_t *dest, int stride, int l) {
int i, j;
for (i = 0; i < l; ++i) {
for (j = 0; j < l / 2; ++j) {
const uint16_t tmp = dest[i * stride + j];
dest[i * stride + j] = dest[i * stride + l - 1 - j];
dest[i * stride + l - 1 - j] = tmp;
}
}
}
void flipud16(uint16_t *dest, int stride, int l) {
int i, j;
for (j = 0; j < l; ++j) {
for (i = 0; i < l / 2; ++i) {
const uint16_t tmp = dest[i * stride + j];
dest[i * stride + j] = dest[(l - 1 - i) * stride + j];
dest[(l - 1 - i) * stride + j] = tmp;
}
}
}
void fliplrud16(uint16_t *dest, int stride, int l) {
int i, j;
for (i = 0; i < l / 2; ++i) {
for (j = 0; j < l; ++j) {
const uint16_t tmp = dest[i * stride + j];
dest[i * stride + j] = dest[(l - 1 - i) * stride + l - 1 - j];
dest[(l - 1 - i) * stride + l - 1 - j] = tmp;
}
}
}
#endif // CONFIG_EXT_TX
void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride,
int tx_type) {
int i, j;
tran_low_t out[8 * 8];
tran_low_t *outptr = out;
tran_low_t temp_in[8], temp_out[8];
const transform_2d ht = IHT_8[tx_type];
tran_low_t tmp;
tran_low_t out[8][8];
tran_low_t *outp = &out[0][0];
int outstride = 8;
// inverse transform row vectors
for (i = 0; i < 8; ++i) {
ht.rows(input, outptr);
input += 8;
outptr += 8;
IHT_8[tx_type].rows(input, out[i]);
input += 8;
}
// transpose
for (i = 1 ; i < 8; i++) {
for (j = 0; j < i; j++) {
tmp = out[i][j];
out[i][j] = out[j][i];
out[j][i] = tmp;
}
}
// inverse transform column vectors
for (i = 0; i < 8; ++i) {
for (j = 0; j < 8; ++j)
temp_in[j] = out[j * 8 + i];
ht.cols(temp_in, temp_out);
IHT_8[tx_type].cols(out[i], out[i]);
}
#if CONFIG_EXT_TX
maybe_flip_strides(&dest, &stride, &outp, &outstride, tx_type, 8);
#endif
// Sum with the destination
for (i = 0; i < 8; ++i) {
for (j = 0; j < 8; ++j) {
dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
ROUND_POWER_OF_TWO(temp_out[j], 5));
int d = i * stride + j;
int s = j * outstride + i;
dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 5));
}
}
}
@ -1291,26 +1307,41 @@ static const transform_2d IHT_16[] = {
void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride,
int tx_type) {
int i, j;
tran_low_t out[16 * 16];
tran_low_t *outptr = out;
tran_low_t temp_in[16], temp_out[16];
const transform_2d ht = IHT_16[tx_type];
tran_low_t tmp;
tran_low_t out[16][16];
tran_low_t *outp = &out[0][0];
int outstride = 16;
// Rows
// inverse transform row vectors
for (i = 0; i < 16; ++i) {
ht.rows(input, outptr);
input += 16;
outptr += 16;
IHT_16[tx_type].rows(input, out[i]);
input += 16;
}
// Columns
// transpose
for (i = 1 ; i < 16; i++) {
for (j = 0; j < i; j++) {
tmp = out[i][j];
out[i][j] = out[j][i];
out[j][i] = tmp;
}
}
// inverse transform column vectors
for (i = 0; i < 16; ++i) {
IHT_16[tx_type].cols(out[i], out[i]);
}
#if CONFIG_EXT_TX
maybe_flip_strides(&dest, &stride, &outp, &outstride, tx_type, 16);
#endif
// Sum with the destination
for (i = 0; i < 16; ++i) {
for (j = 0; j < 16; ++j)
temp_in[j] = out[j * 16 + i];
ht.cols(temp_in, temp_out);
for (j = 0; j < 16; ++j) {
dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
ROUND_POWER_OF_TWO(temp_out[j], 6));
int d = i * stride + j;
int s = j * outstride + i;
dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 6));
}
}
}
@ -1911,26 +1942,6 @@ void vp9_iht4x4_add(TX_TYPE tx_type, const tran_low_t *input, uint8_t *dest,
#if CONFIG_EXT_TX
} else if (is_dst_used(tx_type)) {
vp9_iht4x4_16_add_c(input, dest, stride, tx_type);
} else if (tx_type == FLIPADST_DCT) {
flipud(dest, stride, 4);
vp9_iht4x4_16_add(input, dest, stride, ADST_DCT);
flipud(dest, stride, 4);
} else if (tx_type == DCT_FLIPADST) {
fliplr(dest, stride, 4);
vp9_iht4x4_16_add(input, dest, stride, DCT_ADST);
fliplr(dest, stride, 4);
} else if (tx_type == FLIPADST_FLIPADST) {
fliplrud(dest, stride, 4);
vp9_iht4x4_16_add(input, dest, stride, ADST_ADST);
fliplrud(dest, stride, 4);
} else if (tx_type == ADST_FLIPADST) {
fliplr(dest, stride, 4);
vp9_iht4x4_16_add(input, dest, stride, ADST_ADST);
fliplr(dest, stride, 4);
} else if (tx_type == FLIPADST_ADST) {
flipud(dest, stride, 4);
vp9_iht4x4_16_add(input, dest, stride, ADST_ADST);
flipud(dest, stride, 4);
#endif // CONFIG_EXT_TX
} else {
vp9_iht4x4_16_add(input, dest, stride, tx_type);
@ -1944,26 +1955,6 @@ void vp9_iht8x8_add(TX_TYPE tx_type, const tran_low_t *input, uint8_t *dest,
#if CONFIG_EXT_TX
} else if (is_dst_used(tx_type)) {
vp9_iht8x8_64_add_c(input, dest, stride, tx_type);
} else if (tx_type == FLIPADST_DCT) {
flipud(dest, stride, 8);
vp9_iht8x8_64_add(input, dest, stride, ADST_DCT);
flipud(dest, stride, 8);
} else if (tx_type == DCT_FLIPADST) {
fliplr(dest, stride, 8);
vp9_iht8x8_64_add(input, dest, stride, DCT_ADST);
fliplr(dest, stride, 8);
} else if (tx_type == FLIPADST_FLIPADST) {
fliplrud(dest, stride, 8);
vp9_iht8x8_64_add(input, dest, stride, ADST_ADST);
fliplrud(dest, stride, 8);
} else if (tx_type == ADST_FLIPADST) {
fliplr(dest, stride, 8);
vp9_iht8x8_64_add(input, dest, stride, ADST_ADST);
fliplr(dest, stride, 8);
} else if (tx_type == FLIPADST_ADST) {
flipud(dest, stride, 8);
vp9_iht8x8_64_add(input, dest, stride, ADST_ADST);
flipud(dest, stride, 8);
#endif // CONFIG_EXT_TX
} else {
vp9_iht8x8_64_add(input, dest, stride, tx_type);
@ -1977,26 +1968,6 @@ void vp9_iht16x16_add(TX_TYPE tx_type, const tran_low_t *input, uint8_t *dest,
#if CONFIG_EXT_TX
} else if (is_dst_used(tx_type)) {
vp9_iht16x16_256_add_c(input, dest, stride, tx_type);
} else if (tx_type == FLIPADST_DCT) {
flipud(dest, stride, 16);
vp9_iht16x16_256_add(input, dest, stride, ADST_DCT);
flipud(dest, stride, 16);
} else if (tx_type == DCT_FLIPADST) {
fliplr(dest, stride, 16);
vp9_iht16x16_256_add(input, dest, stride, DCT_ADST);
fliplr(dest, stride, 16);
} else if (tx_type == FLIPADST_FLIPADST) {
fliplrud(dest, stride, 16);
vp9_iht16x16_256_add(input, dest, stride, ADST_ADST);
fliplrud(dest, stride, 16);
} else if (tx_type == ADST_FLIPADST) {
fliplr(dest, stride, 16);
vp9_iht16x16_256_add(input, dest, stride, ADST_ADST);
fliplr(dest, stride, 16);
} else if (tx_type == FLIPADST_ADST) {
flipud(dest, stride, 16);
vp9_iht16x16_256_add(input, dest, stride, ADST_ADST);
flipud(dest, stride, 16);
#endif // CONFIG_EXT_TX
} else {
vp9_iht16x16_256_add(input, dest, stride, tx_type);
@ -2775,7 +2746,7 @@ static void highbd_iadst4(const tran_low_t *input, tran_low_t *output, int bd) {
void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
int stride, int tx_type, int bd) {
const highbd_transform_2d IHT_4[] = {
const highbd_transform_2d HIGH_IHT_4[] = {
{ vp9_highbd_idct4, vp9_highbd_idct4 }, // DCT_DCT = 0
{ highbd_iadst4, vp9_highbd_idct4 }, // ADST_DCT = 1
{ vp9_highbd_idct4, highbd_iadst4 }, // DCT_ADST = 2
@ -2798,25 +2769,43 @@ void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
int i, j;
tran_low_t out[4 * 4];
tran_low_t *outptr = out;
tran_low_t temp_in[4], temp_out[4];
tran_low_t tmp;
tran_low_t out[4][4];
tran_low_t *outp = &out[0][0];
int outstride = 4;
// Inverse transform row vectors.
// inverse transform row vectors
for (i = 0; i < 4; ++i) {
IHT_4[tx_type].rows(input, outptr, bd);
HIGH_IHT_4[tx_type].rows(input, out[i], bd);
input += 4;
outptr += 4;
}
// Inverse transform column vectors.
// transpose
for (i = 1 ; i < 4; i++) {
for (j = 0; j < i; j++) {
tmp = out[i][j];
out[i][j] = out[j][i];
out[j][i] = tmp;
}
}
// inverse transform column vectors
for (i = 0; i < 4; ++i) {
HIGH_IHT_4[tx_type].cols(out[i], out[i], bd);
}
#if CONFIG_EXT_TX
maybe_flip_strides((uint8_t**)&dest,
&stride, &outp, &outstride, tx_type, 4 * 2);
#endif
// Sum with the destination
for (i = 0; i < 4; ++i) {
for (j = 0; j < 4; ++j)
temp_in[j] = out[j * 4 + i];
IHT_4[tx_type].cols(temp_in, temp_out, bd);
for (j = 0; j < 4; ++j) {
dest[j * stride + i] = highbd_clip_pixel_add(
dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd);
int d = i * stride + j;
int s = j * outstride + i;
dest[d] = highbd_clip_pixel_add(dest[d],
ROUND_POWER_OF_TWO(outp[s], 4), bd);
}
}
}
@ -2921,28 +2910,46 @@ static const highbd_transform_2d HIGH_IHT_8[] = {
void vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest8,
int stride, int tx_type, int bd) {
int i, j;
tran_low_t out[8 * 8];
tran_low_t *outptr = out;
tran_low_t temp_in[8], temp_out[8];
const highbd_transform_2d ht = HIGH_IHT_8[tx_type];
uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
// Inverse transform row vectors.
int i, j;
tran_low_t tmp;
tran_low_t out[8][8];
tran_low_t *outp = &out[0][0];
int outstride = 8;
// inverse transform row vectors
for (i = 0; i < 8; ++i) {
ht.rows(input, outptr, bd);
input += 8;
outptr += 8;
HIGH_IHT_8[tx_type].rows(input, out[i], bd);
input += 8;
}
// Inverse transform column vectors.
// transpose
for (i = 1 ; i < 8; i++) {
for (j = 0; j < i; j++) {
tmp = out[i][j];
out[i][j] = out[j][i];
out[j][i] = tmp;
}
}
// inverse transform column vectors
for (i = 0; i < 8; ++i) {
HIGH_IHT_8[tx_type].cols(out[i], out[i], bd);
}
#if CONFIG_EXT_TX
maybe_flip_strides((uint8_t**)&dest,
&stride, &outp, &outstride, tx_type, 8 * 2);
#endif
// Sum with the destination
for (i = 0; i < 8; ++i) {
for (j = 0; j < 8; ++j)
temp_in[j] = out[j * 8 + i];
ht.cols(temp_in, temp_out, bd);
for (j = 0; j < 8; ++j) {
dest[j * stride + i] = highbd_clip_pixel_add(
dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
int d = i * stride + j;
int s = j * outstride + i;
dest[d] = highbd_clip_pixel_add(dest[d],
ROUND_POWER_OF_TWO(outp[s], 5), bd);
}
}
}
@ -3361,28 +3368,46 @@ static const highbd_transform_2d HIGH_IHT_16[] = {
void vp9_highbd_iht16x16_256_add_c(const tran_low_t *input, uint8_t *dest8,
int stride, int tx_type, int bd) {
int i, j;
tran_low_t out[16 * 16];
tran_low_t *outptr = out;
tran_low_t temp_in[16], temp_out[16];
const highbd_transform_2d ht = HIGH_IHT_16[tx_type];
uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
// Rows
int i, j;
tran_low_t tmp;
tran_low_t out[16][16];
tran_low_t *outp = &out[0][0];
int outstride = 16;
// inverse transform row vectors
for (i = 0; i < 16; ++i) {
ht.rows(input, outptr, bd);
input += 16;
outptr += 16;
HIGH_IHT_16[tx_type].rows(input, out[i], bd);
input += 16;
}
// Columns
// transpose
for (i = 1 ; i < 16; i++) {
for (j = 0; j < i; j++) {
tmp = out[i][j];
out[i][j] = out[j][i];
out[j][i] = tmp;
}
}
// inverse transform column vectors
for (i = 0; i < 16; ++i) {
HIGH_IHT_16[tx_type].cols(out[i], out[i], bd);
}
#if CONFIG_EXT_TX
maybe_flip_strides((uint8_t**)&dest, &stride,
&outp, &outstride, tx_type, 16 * 2);
#endif
// Sum with the destination
for (i = 0; i < 16; ++i) {
for (j = 0; j < 16; ++j)
temp_in[j] = out[j * 16 + i];
ht.cols(temp_in, temp_out, bd);
for (j = 0; j < 16; ++j) {
dest[j * stride + i] = highbd_clip_pixel_add(
dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
int d = i * stride + j;
int s = j * outstride + i;
dest[d] = highbd_clip_pixel_add(dest[d],
ROUND_POWER_OF_TWO(outp[s], 6), bd);
}
}
}
@ -3954,26 +3979,6 @@ void vp9_highbd_iht4x4_add(TX_TYPE tx_type, const tran_low_t *input,
#if CONFIG_EXT_TX
} else if (is_dst_used(tx_type)) {
vp9_highbd_iht4x4_16_add_c(input, dest, stride, tx_type, bd);
} else if (tx_type == FLIPADST_DCT) {
flipud16(CONVERT_TO_SHORTPTR(dest), stride, 4);
vp9_highbd_iht4x4_16_add(input, dest, stride, ADST_DCT, bd);
flipud16(CONVERT_TO_SHORTPTR(dest), stride, 4);
} else if (tx_type == DCT_FLIPADST) {
fliplr16(CONVERT_TO_SHORTPTR(dest), stride, 4);
vp9_highbd_iht4x4_16_add(input, dest, stride, DCT_ADST, bd);
fliplr16(CONVERT_TO_SHORTPTR(dest), stride, 4);
} else if (tx_type == FLIPADST_FLIPADST) {
fliplrud16(CONVERT_TO_SHORTPTR(dest), stride, 4);
vp9_highbd_iht4x4_16_add(input, dest, stride, ADST_ADST, bd);
fliplrud16(CONVERT_TO_SHORTPTR(dest), stride, 4);
} else if (tx_type == ADST_FLIPADST) {
fliplr16(CONVERT_TO_SHORTPTR(dest), stride, 4);
vp9_highbd_iht4x4_16_add(input, dest, stride, ADST_ADST, bd);
fliplr16(CONVERT_TO_SHORTPTR(dest), stride, 4);
} else if (tx_type == FLIPADST_ADST) {
flipud16(CONVERT_TO_SHORTPTR(dest), stride, 4);
vp9_highbd_iht4x4_16_add(input, dest, stride, ADST_ADST, bd);
flipud16(CONVERT_TO_SHORTPTR(dest), stride, 4);
#endif // CONFIG_EXT_TX
} else {
vp9_highbd_iht4x4_16_add(input, dest, stride, tx_type, bd);
@ -3987,26 +3992,6 @@ void vp9_highbd_iht8x8_add(TX_TYPE tx_type, const tran_low_t *input,
#if CONFIG_EXT_TX
} else if (is_dst_used(tx_type)) {
vp9_highbd_iht8x8_64_add_c(input, dest, stride, tx_type, bd);
} else if (tx_type == FLIPADST_DCT) {
flipud16(CONVERT_TO_SHORTPTR(dest), stride, 8);
vp9_highbd_iht8x8_64_add(input, dest, stride, ADST_DCT, bd);
flipud16(CONVERT_TO_SHORTPTR(dest), stride, 8);
} else if (tx_type == DCT_FLIPADST) {
fliplr16(CONVERT_TO_SHORTPTR(dest), stride, 8);
vp9_highbd_iht8x8_64_add(input, dest, stride, DCT_ADST, bd);
fliplr16(CONVERT_TO_SHORTPTR(dest), stride, 8);
} else if (tx_type == FLIPADST_FLIPADST) {
fliplrud16(CONVERT_TO_SHORTPTR(dest), stride, 8);
vp9_highbd_iht8x8_64_add(input, dest, stride, ADST_ADST, bd);
fliplrud16(CONVERT_TO_SHORTPTR(dest), stride, 8);
} else if (tx_type == ADST_FLIPADST) {
fliplr16(CONVERT_TO_SHORTPTR(dest), stride, 8);
vp9_highbd_iht8x8_64_add(input, dest, stride, ADST_ADST, bd);
fliplr16(CONVERT_TO_SHORTPTR(dest), stride, 8);
} else if (tx_type == FLIPADST_ADST) {
flipud16(CONVERT_TO_SHORTPTR(dest), stride, 8);
vp9_highbd_iht8x8_64_add(input, dest, stride, ADST_ADST, bd);
flipud16(CONVERT_TO_SHORTPTR(dest), stride, 8);
#endif // CONFIG_EXT_TX
} else {
vp9_highbd_iht8x8_64_add(input, dest, stride, tx_type, bd);
@ -4020,26 +4005,6 @@ void vp9_highbd_iht16x16_add(TX_TYPE tx_type, const tran_low_t *input,
#if CONFIG_EXT_TX
} else if (is_dst_used(tx_type)) {
vp9_highbd_iht16x16_256_add_c(input, dest, stride, tx_type, bd);
} else if (tx_type == FLIPADST_DCT) {
flipud16(CONVERT_TO_SHORTPTR(dest), stride, 16);
vp9_highbd_iht16x16_256_add(input, dest, stride, ADST_DCT, bd);
flipud16(CONVERT_TO_SHORTPTR(dest), stride, 16);
} else if (tx_type == DCT_FLIPADST) {
fliplr16(CONVERT_TO_SHORTPTR(dest), stride, 16);
vp9_highbd_iht16x16_256_add(input, dest, stride, DCT_ADST, bd);
fliplr16(CONVERT_TO_SHORTPTR(dest), stride, 16);
} else if (tx_type == FLIPADST_FLIPADST) {
fliplrud16(CONVERT_TO_SHORTPTR(dest), stride, 16);
vp9_highbd_iht16x16_256_add(input, dest, stride, ADST_ADST, bd);
fliplrud16(CONVERT_TO_SHORTPTR(dest), stride, 16);
} else if (tx_type == ADST_FLIPADST) {
fliplr16(CONVERT_TO_SHORTPTR(dest), stride, 16);
vp9_highbd_iht16x16_256_add(input, dest, stride, ADST_ADST, bd);
fliplr16(CONVERT_TO_SHORTPTR(dest), stride, 16);
} else if (tx_type == FLIPADST_ADST) {
flipud16(CONVERT_TO_SHORTPTR(dest), stride, 16);
vp9_highbd_iht16x16_256_add(input, dest, stride, ADST_ADST, bd);
flipud16(CONVERT_TO_SHORTPTR(dest), stride, 16);
#endif // CONFIG_EXT_TX
} else {
vp9_highbd_iht16x16_256_add(input, dest, stride, tx_type, bd);
@ -4276,6 +4241,19 @@ void vp9_iht4x4_16_c(const tran_low_t *input, int16_t *dest, int stride,
tran_low_t *outptr = out;
tran_low_t temp_in[4], temp_out[4];
// FIXME: If the SR_MODE experiment is resurrected, then this function must
// be fixed to handle the FLIPADST cases by actually flipping its output
// See the other vp9_iht*add_c functions
#if CONFIG_EXT_TX
assert(tx_type != FLIPADST_DCT);
assert(tx_type != DCT_FLIPADST);
assert(tx_type != FLIPADST_FLIPADST);
assert(tx_type != ADST_FLIPADST);
assert(tx_type != FLIPADST_ADST);
assert(tx_type != DST_FLIPADST);
assert(tx_type != FLIPADST_DST);
#endif // CONFIG_EXT_TX
// inverse transform row vectors
for (i = 0; i < 4; ++i) {
IHT_4[tx_type].rows(input, outptr);
@ -4302,6 +4280,19 @@ void vp9_iht8x8_64_c(const tran_low_t *input, int16_t *dest, int stride,
tran_low_t temp_in[8], temp_out[8];
const transform_2d ht = IHT_8[tx_type];
// FIXME: If the SR_MODE experiment is resurrected, then this function must
// be fixed to handle the FLIPADST cases by actually flipping its output
// See the other vp9_iht*add_c functions
#if CONFIG_EXT_TX
assert(tx_type != FLIPADST_DCT);
assert(tx_type != DCT_FLIPADST);
assert(tx_type != FLIPADST_FLIPADST);
assert(tx_type != ADST_FLIPADST);
assert(tx_type != FLIPADST_ADST);
assert(tx_type != DST_FLIPADST);
assert(tx_type != FLIPADST_DST);
#endif // CONFIG_EXT_TX
// inverse transform row vectors
for (i = 0; i < 8; ++i) {
ht.rows(input, outptr);
@ -4378,6 +4369,19 @@ void vp9_iht16x16_256_c(const tran_low_t *input, int16_t *dest, int stride,
tran_low_t temp_in[16], temp_out[16];
const transform_2d ht = IHT_16[tx_type];
// FIXME: If the SR_MODE experiment is resurrected, then this function must
// be fixed to handle the FLIPADST cases by actually flipping its output
// See the other vp9_iht*add_c functions
#if CONFIG_EXT_TX
assert(tx_type != FLIPADST_DCT);
assert(tx_type != DCT_FLIPADST);
assert(tx_type != FLIPADST_FLIPADST);
assert(tx_type != ADST_FLIPADST);
assert(tx_type != FLIPADST_ADST);
assert(tx_type != DST_FLIPADST);
assert(tx_type != FLIPADST_DST);
#endif // CONFIG_EXT_TX
// Rows
for (i = 0; i < 16; ++i) {
ht.rows(input, outptr);
@ -4582,26 +4586,6 @@ void vp9_iht4x4(TX_TYPE tx_type, const tran_low_t *input, int16_t *dest,
#if CONFIG_EXT_TX
} else if (is_dst_used(tx_type)) {
vp9_iht4x4_16_c(input, dest, stride, tx_type);
} else if (tx_type == FLIPADST_DCT) {
flipud(dest, stride, 4);
vp9_iht4x4_16(input, dest, stride, ADST_DCT);
flipud(dest, stride, 4);
} else if (tx_type == DCT_FLIPADST) {
fliplr(dest, stride, 4);
vp9_iht4x4_16(input, dest, stride, DCT_ADST);
fliplr(dest, stride, 4);
} else if (tx_type == FLIPADST_FLIPADST) {
fliplrud(dest, stride, 4);
vp9_iht4x4_16(input, dest, stride, ADST_ADST);
fliplrud(dest, stride, 4);
} else if (tx_type == ADST_FLIPADST) {
fliplr(dest, stride, 4);
vp9_iht4x4_16(input, dest, stride, ADST_ADST);
fliplr(dest, stride, 4);
} else if (tx_type == FLIPADST_ADST) {
flipud(dest, stride, 4);
vp9_iht4x4_16(input, dest, stride, ADST_ADST);
flipud(dest, stride, 4);
#endif // CONFIG_EXT_TX
} else {
vp9_iht4x4_16(input, dest, stride, tx_type);
@ -4615,26 +4599,6 @@ void vp9_iht8x8(TX_TYPE tx_type, const tran_low_t *input, int16_t *dest,
#if CONFIG_EXT_TX
} else if (is_dst_used(tx_type)) {
vp9_iht8x8_64_c(input, dest, stride, tx_type);
} else if (tx_type == FLIPADST_DCT) {
flipud(dest, stride, 8);
vp9_iht8x8_64(input, dest, stride, ADST_DCT);
flipud(dest, stride, 8);
} else if (tx_type == DCT_FLIPADST) {
fliplr(dest, stride, 8);
vp9_iht8x8_64(input, dest, stride, DCT_ADST);
fliplr(dest, stride, 8);
} else if (tx_type == FLIPADST_FLIPADST) {
fliplrud(dest, stride, 8);
vp9_iht8x8_64(input, dest, stride, ADST_ADST);
fliplrud(dest, stride, 8);
} else if (tx_type == ADST_FLIPADST) {
fliplr(dest, stride, 8);
vp9_iht8x8_64(input, dest, stride, ADST_ADST);
fliplr(dest, stride, 8);
} else if (tx_type == FLIPADST_ADST) {
flipud(dest, stride, 8);
vp9_iht8x8_64(input, dest, stride, ADST_ADST);
flipud(dest, stride, 8);
#endif // CONFIG_EXT_TX
} else {
vp9_iht8x8_64(input, dest, stride, tx_type);
@ -4648,26 +4612,6 @@ void vp9_iht16x16(TX_TYPE tx_type, const tran_low_t *input, int16_t *dest,
#if CONFIG_EXT_TX
} else if (is_dst_used(tx_type)) {
vp9_iht16x16_256_c(input, dest, stride, tx_type);
} else if (tx_type == FLIPADST_DCT) {
flipud(dest, stride, 16);
vp9_iht16x16_256(input, dest, stride, ADST_DCT);
flipud(dest, stride, 16);
} else if (tx_type == DCT_FLIPADST) {
fliplr(dest, stride, 16);
vp9_iht16x16_256(input, dest, stride, DCT_ADST);
fliplr(dest, stride, 16);
} else if (tx_type == FLIPADST_FLIPADST) {
fliplrud(dest, stride, 16);
vp9_iht16x16_256(input, dest, stride, ADST_ADST);
fliplrud(dest, stride, 16);
} else if (tx_type == ADST_FLIPADST) {
fliplr(dest, stride, 16);
vp9_iht16x16_256(input, dest, stride, ADST_ADST);
fliplr(dest, stride, 16);
} else if (tx_type == FLIPADST_ADST) {
flipud(dest, stride, 16);
vp9_iht16x16_256(input, dest, stride, ADST_ADST);
flipud(dest, stride, 16);
#endif // CONFIG_EXT_TX
} else {
vp9_iht16x16_256(input, dest, stride, tx_type);

@ -11,6 +11,55 @@
#include "vp9/common/x86/vp9_idct_intrin_sse2.h"
#include "vp9/common/vp9_idct.h"
#include "vp9/common/vp9_enums.h"
#if CONFIG_EXT_TX
// Reverse the 8 16 bit words in __m128i
static INLINE __m128i mm_reverse_epi16(const __m128i x) {
const __m128i a = _mm_shufflelo_epi16(x, 0x1b);
const __m128i b = _mm_shufflehi_epi16(a, 0x1b);
return _mm_shuffle_epi32(b, 0x4e);
}
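
A note on the shuffle immediates (my reading of the intrinsics, not text
from the patch): 0x1b encodes the permutation 3,2,1,0, so the
_mm_shufflelo_epi16 / _mm_shufflehi_epi16 pair reverses each group of
four 16-bit words, and 0x4e then selects the 32-bit lanes in the order
2,3,0,1, swapping the two 64-bit halves and leaving all eight words
reversed. A scalar equivalent:

#include <stdint.h>

/* Scalar sketch of mm_reverse_epi16: reverse eight 16-bit values. */
static void reverse8_u16(uint16_t w[8]) {
  int i;
  for (i = 0; i < 4; ++i) {
    uint16_t t = w[i];
    w[i] = w[7 - i];
    w[7 - i] = t;
  }
}
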
static INLINE void fliplr_4x4(__m128i in[2]) {
in[0] = _mm_shufflelo_epi16(in[0], 0x1b);
in[0] = _mm_shufflehi_epi16(in[0], 0x1b);
in[1] = _mm_shufflelo_epi16(in[1], 0x1b);
in[1] = _mm_shufflehi_epi16(in[1], 0x1b);
}
static INLINE void fliplr_8x8(__m128i in[8]) {
in[0] = mm_reverse_epi16(in[0]);
in[1] = mm_reverse_epi16(in[1]);
in[2] = mm_reverse_epi16(in[2]);
in[3] = mm_reverse_epi16(in[3]);
in[4] = mm_reverse_epi16(in[4]);
in[5] = mm_reverse_epi16(in[5]);
in[6] = mm_reverse_epi16(in[6]);
in[7] = mm_reverse_epi16(in[7]);
}
static INLINE void fliplr_16x8(__m128i in[16]) {
fliplr_8x8(&in[0]);
fliplr_8x8(&in[8]);
}
#define FLIPLR_16x16(in0, in1) do { \
__m128i *tmp; \
fliplr_16x8(in0); \
fliplr_16x8(in1); \
tmp = (in0); \
(in0) = (in1); \
(in1) = tmp; \
} while (0)
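
For context (a summary plus a scalar sketch, not code from the patch):
the 16x16 rows are held as two arrays of sixteen __m128i, in0 covering
the first eight 16-bit values of each row and in1 the last eight, so a
left/right flip has to reverse the words inside each half and then
exchange the halves, which is why FLIPLR_16x16 swaps the in0/in1
pointers after calling fliplr_16x8 on both.

#include <stdint.h>

/* Scalar sketch: left/right flip of one 16-wide row stored as two
 * 8-wide halves -- reverse each half, then swap the halves. */
static void fliplr_row16(uint16_t lo[8], uint16_t hi[8]) {
  uint16_t tmp[8];
  int i;
  for (i = 0; i < 8; ++i) tmp[i] = lo[7 - i];  /* reversed low half */
  for (i = 0; i < 8; ++i) lo[i] = hi[7 - i];   /* reversed high half */
  for (i = 0; i < 8; ++i) hi[i] = tmp[i];      /* into the high half */
}
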
#define FLIPUD_PTR(dest, stride, size) do { \
(dest) = (dest) + ((size) - 1) * (stride); \
(stride) = - (stride); \
} while (0)
#endif
#define RECON_AND_STORE4X4(dest, in_x) \
{ \
__m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest)); \
@ -126,12 +175,12 @@ void vp9_idct4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
// Reconstruction and Store
{
__m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest));
__m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 0));
__m128i d1 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 1));
__m128i d2 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 2));
d0 = _mm_unpacklo_epi32(d0,
_mm_cvtsi32_si128(*(const int *) (dest + stride)));
d2 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(
*(const int *) (dest + stride * 3)), d2);
__m128i d3 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 3));
d0 = _mm_unpacklo_epi32(d0, d1);
d2 = _mm_unpacklo_epi32(d3, d2);
d0 = _mm_unpacklo_epi8(d0, zero);
d2 = _mm_unpacklo_epi8(d2, zero);
d0 = _mm_add_epi16(d0, input2);
@ -271,22 +320,50 @@ void vp9_iht4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride,
in[1]= _mm_loadu_si128((const __m128i *)(input + 8));
switch (tx_type) {
case 0: // DCT_DCT
case DCT_DCT:
idct4_sse2(in);
idct4_sse2(in);
break;
case 1: // ADST_DCT
case ADST_DCT:
idct4_sse2(in);
iadst4_sse2(in);
break;
case 2: // DCT_ADST
case DCT_ADST:
iadst4_sse2(in);
idct4_sse2(in);
break;
case 3: // ADST_ADST
case ADST_ADST:
iadst4_sse2(in);
iadst4_sse2(in);
break;
#if CONFIG_EXT_TX
case FLIPADST_DCT:
idct4_sse2(in);
iadst4_sse2(in);
FLIPUD_PTR(dest, stride, 4);
break;
case DCT_FLIPADST:
iadst4_sse2(in);
idct4_sse2(in);
fliplr_4x4(in);
break;
case FLIPADST_FLIPADST:
iadst4_sse2(in);
iadst4_sse2(in);
FLIPUD_PTR(dest, stride, 4);
fliplr_4x4(in);
break;
case ADST_FLIPADST:
iadst4_sse2(in);
iadst4_sse2(in);
fliplr_4x4(in);
break;
case FLIPADST_ADST:
iadst4_sse2(in);
iadst4_sse2(in);
FLIPUD_PTR(dest, stride, 4);
break;
#endif // CONFIG_EXT_TX
default:
assert(0);
break;
@ -875,22 +952,50 @@ void vp9_iht8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride,
in[7] = _mm_load_si128((const __m128i *)(input + 8 * 7));
switch (tx_type) {
case 0: // DCT_DCT
case DCT_DCT:
idct8_sse2(in);
idct8_sse2(in);
break;
case 1: // ADST_DCT
case ADST_DCT:
idct8_sse2(in);
iadst8_sse2(in);
break;
case 2: // DCT_ADST
case DCT_ADST:
iadst8_sse2(in);
idct8_sse2(in);
break;
case 3: // ADST_ADST
case ADST_ADST:
iadst8_sse2(in);
iadst8_sse2(in);
break;
#if CONFIG_EXT_TX
case FLIPADST_DCT:
idct8_sse2(in);
iadst8_sse2(in);
FLIPUD_PTR(dest, stride, 8);
break;
case DCT_FLIPADST:
iadst8_sse2(in);
idct8_sse2(in);
fliplr_8x8(in);
break;
case FLIPADST_FLIPADST:
iadst8_sse2(in);
iadst8_sse2(in);
FLIPUD_PTR(dest, stride, 8);
fliplr_8x8(in);
break;
case ADST_FLIPADST:
iadst8_sse2(in);
iadst8_sse2(in);
fliplr_8x8(in);
break;
case FLIPADST_ADST:
iadst8_sse2(in);
iadst8_sse2(in);
FLIPUD_PTR(dest, stride, 8);
break;
#endif // CONFIG_EXT_TX
default:
assert(0);
break;
@ -2331,29 +2436,59 @@ static void iadst16_sse2(__m128i *in0, __m128i *in1) {
void vp9_iht16x16_256_add_sse2(const int16_t *input, uint8_t *dest, int stride,
int tx_type) {
__m128i in0[16], in1[16];
__m128i in[32];
__m128i *in0 = &in[0];
__m128i *in1 = &in[16];
load_buffer_8x16(input, in0);
input += 8;
load_buffer_8x16(input, in1);
switch (tx_type) {
case 0: // DCT_DCT
case DCT_DCT:
idct16_sse2(in0, in1);
idct16_sse2(in0, in1);
break;
case 1: // ADST_DCT
case ADST_DCT:
idct16_sse2(in0, in1);
iadst16_sse2(in0, in1);
break;
case 2: // DCT_ADST
case DCT_ADST:
iadst16_sse2(in0, in1);
idct16_sse2(in0, in1);
break;
case 3: // ADST_ADST
case ADST_ADST:
iadst16_sse2(in0, in1);
iadst16_sse2(in0, in1);
break;
#if CONFIG_EXT_TX
case FLIPADST_DCT:
idct16_sse2(in0, in1);
iadst16_sse2(in0, in1);
FLIPUD_PTR(dest, stride, 16);
break;
case DCT_FLIPADST:
iadst16_sse2(in0, in1);
idct16_sse2(in0, in1);
FLIPLR_16x16(in0, in1);
break;
case FLIPADST_FLIPADST:
iadst16_sse2(in0, in1);
iadst16_sse2(in0, in1);
FLIPUD_PTR(dest, stride, 16);
FLIPLR_16x16(in0, in1);
break;
case ADST_FLIPADST:
iadst16_sse2(in0, in1);
iadst16_sse2(in0, in1);
FLIPLR_16x16(in0, in1);
break;
case FLIPADST_ADST:
iadst16_sse2(in0, in1);
iadst16_sse2(in0, in1);
FLIPUD_PTR(dest, stride, 16);
break;
#endif // CONFIG_EXT_TX
default:
assert(0);
break;