Replacing the 8x8 DCT with 8x8 ADST/DCT for I8x8

Fixed the code review comments. Under the htrans8x8 experiment the 8X8 DCT in the I8X8 mode is replaced with a combination of 8X8 ADST and DCT. Overall coding gains with the htrans8x8 experiment are: derf: 0.486 std-hd: 1.040 hd: 1.063 yt: 0.506 Note that part of the gain comes from bigger transforms (8x8 instead of 4x4) and part comes from replacing the DCT wth the ADST. Change-Id: I92ca6bbfce11b4165d612b81d9adfad4d010c775
2012-08-02 09:07:33 -07:00 · 2012-08-02 09:07:33 -07:00 · fcbff9ee04
commit fcbff9ee04
parent e6de9c2e5d
12 changed files with 452 additions and 91 deletions
--- a/2
+++ b/2
@ -223,8 +223,8 @@ EXPERIMENT_LIST="
    pred_filter
    lossless
    hybridtransform
+    hybridtransform8x8
    switchable_interp
-    htrans8x8
    tx16x16
 "
 CONFIG_LIST="
--- a/vp8/common/blockd.h
+++ b/vp8/common/blockd.h
@ -54,7 +54,6 @@ typedef struct {
 #define PLANE_TYPE_UV         2
 #define PLANE_TYPE_Y_WITH_DC  3

-
 typedef char ENTROPY_CONTEXT;
 typedef struct {
  ENTROPY_CONTEXT y1[4];
@ -179,6 +178,50 @@ typedef enum {
  B_MODE_COUNT
 } B_PREDICTION_MODE;

+#if CONFIG_HYBRIDTRANSFORM8X8
+// convert MB_PREDICTION_MODE to B_PREDICTION_MODE
+static B_PREDICTION_MODE pred_mode_conv(MB_PREDICTION_MODE mode) {
+  B_PREDICTION_MODE b_mode;
+  switch (mode) {
+    case DC_PRED:
+      b_mode = B_DC_PRED;
+      break;
+    case V_PRED:
+      b_mode = B_VE_PRED;
+      break;
+    case H_PRED:
+      b_mode = B_HE_PRED;
+      break;
+    case TM_PRED:
+      b_mode = B_TM_PRED;
+      break;
+    case D45_PRED:
+      b_mode = B_LD_PRED;
+      break;
+    case D135_PRED:
+      b_mode = B_RD_PRED;
+      break;
+    case D117_PRED:
+      b_mode = B_VR_PRED;
+      break;
+    case D153_PRED:
+      b_mode = B_HD_PRED;
+      break;
+    case D27_PRED:
+      b_mode = B_VL_PRED;
+      break;
+    case D63_PRED:
+      b_mode = B_HU_PRED;
+      break;
+    default :
+      // for debug purpose, to be removed after full testing
+      assert(0);
+      break;
+  }
+  return b_mode;
+}
+#endif
+
 #define VP8_BINTRAMODES (B_HU_PRED + 1)  /* 10 */
 #define VP8_SUBMVREFS (1 + NEW4X4 - LEFT4X4)

@ -389,6 +432,32 @@ typedef struct MacroBlockD {

 } MACROBLOCKD;

+#if CONFIG_HYBRIDTRANSFORM8X8 || CONFIG_HYBRIDTRANSFORM
+// transform mapping
+static void txfm_map(BLOCKD *b, B_PREDICTION_MODE bmode) {
+  switch (bmode) {
+    case B_TM_PRED :
+    case B_RD_PRED :
+      b->bmi.as_mode.tx_type = ADST_ADST;
+      break;
+
+    case B_VE_PRED :
+    case B_VR_PRED :
+      b->bmi.as_mode.tx_type = ADST_DCT;
+      break;
+
+    case B_HE_PRED :
+    case B_HD_PRED :
+    case B_HU_PRED :
+      b->bmi.as_mode.tx_type = DCT_ADST;
+      break;
+
+    default :
+      b->bmi.as_mode.tx_type = DCT_DCT;
+      break;
+  }
+}
+#endif

 extern void vp8_build_block_doffsets(MACROBLOCKD *x);
 extern void vp8_setup_block_dptrs(MACROBLOCKD *x);
--- a/vp8/common/default_coef_probs.h
+++ b/vp8/common/default_coef_probs.h
@ -434,7 +434,7 @@ vp8_default_coef_probs_8x8[BLOCK_TYPES_8X8]
      { 6, 117, 180, 254, 199, 216, 255, 251, 128, 128, 128}
    }
  }
-#if CONFIG_HTRANS8X8
+#if CONFIG_HYBRIDTRANSFORM8X8
  ,
  { /* block Type 3 */
    { /* Coeff Band 0 */
--- a/vp8/common/entropy.h
+++ b/vp8/common/entropy.h
@ -60,9 +60,9 @@ extern vp8_extra_bit_struct vp8_extra_bits[12];    /* indexed by token value */
 /* Coefficients are predicted via a 3-dimensional probability table. */

 /* Outside dimension.  0 = Y no DC, 1 = Y2, 2 = UV, 3 = Y with DC */
-
 #define BLOCK_TYPES 4
-#if CONFIG_HTRANS8X8
+
+#if CONFIG_HYBRIDTRANSFORM8X8
 #define BLOCK_TYPES_8X8 4
 #else
 #define BLOCK_TYPES_8X8 3
--- a/vp8/common/idctllm.c
+++ b/vp8/common/idctllm.c
@ -35,6 +35,8 @@ static const int cospi8sqrt2minus1 = 20091;
 static const int sinpi8sqrt2      = 35468;
 static const int rounding = 0;

+// TODO: these transforms can be further converted into integer forms
+//       for complexity optimization
 #if CONFIG_HYBRIDTRANSFORM
 float idct_4[16] = {
  0.500000000000000,   0.653281482438188,   0.500000000000000,   0.270598050073099,
@ -51,11 +53,52 @@ float iadst_4[16] = {
 };
 #endif

+#if CONFIG_HYBRIDTRANSFORM8X8
+float idct_8[64] = {
+  0.353553390593274,   0.490392640201615,   0.461939766255643,   0.415734806151273,
+  0.353553390593274,   0.277785116509801,   0.191341716182545,   0.097545161008064,
+  0.353553390593274,   0.415734806151273,   0.191341716182545,  -0.097545161008064,
+ -0.353553390593274,  -0.490392640201615,  -0.461939766255643,  -0.277785116509801,
+  0.353553390593274,   0.277785116509801,  -0.191341716182545,  -0.490392640201615,
+ -0.353553390593274,   0.097545161008064,   0.461939766255643,   0.415734806151273,
+  0.353553390593274,   0.097545161008064,  -0.461939766255643,  -0.277785116509801,
+  0.353553390593274,   0.415734806151273,  -0.191341716182545,  -0.490392640201615,
+  0.353553390593274,  -0.097545161008064,  -0.461939766255643,   0.277785116509801,
+  0.353553390593274,  -0.415734806151273,  -0.191341716182545,   0.490392640201615,
+  0.353553390593274,  -0.277785116509801,  -0.191341716182545,   0.490392640201615,
+ -0.353553390593274,  -0.097545161008064,   0.461939766255643,  -0.415734806151273,
+  0.353553390593274,  -0.415734806151273,   0.191341716182545,   0.097545161008064,
+ -0.353553390593274,   0.490392640201615,  -0.461939766255643,   0.277785116509801,
+  0.353553390593274,  -0.490392640201615,   0.461939766255643,  -0.415734806151273,
+  0.353553390593274,  -0.277785116509801,   0.191341716182545,  -0.097545161008064
+};
+
+float iadst_8[64] = {
+  0.089131608307533,   0.255357107325376,   0.387095214016349,   0.466553967085785,
+  0.483002021635509,   0.434217976756762,   0.326790388032145,   0.175227946595735,
+  0.175227946595735,   0.434217976756762,   0.466553967085785,   0.255357107325376,
+ -0.089131608307533,  -0.387095214016348,  -0.483002021635509,  -0.326790388032145,
+  0.255357107325376,   0.483002021635509,   0.175227946595735,  -0.326790388032145,
+ -0.466553967085785,  -0.089131608307533,   0.387095214016349,   0.434217976756762,
+  0.326790388032145,   0.387095214016349,  -0.255357107325376,  -0.434217976756762,
+  0.175227946595735,   0.466553967085786,  -0.089131608307534,  -0.483002021635509,
+  0.387095214016349,   0.175227946595735,  -0.483002021635509,   0.089131608307533,
+  0.434217976756762,  -0.326790388032145,  -0.255357107325377,   0.466553967085785,
+  0.434217976756762,  -0.089131608307533,  -0.326790388032145,   0.483002021635509,
+ -0.255357107325376,  -0.175227946595735,   0.466553967085785,  -0.387095214016348,
+  0.466553967085785,  -0.326790388032145,   0.089131608307533,   0.175227946595735,
+ -0.387095214016348,   0.483002021635509,  -0.434217976756762,   0.255357107325376,
+  0.483002021635509,  -0.466553967085785,   0.434217976756762,  -0.387095214016348,
+  0.326790388032145,  -0.255357107325375,   0.175227946595736,  -0.089131608307532
+};
+#endif
+
 #if CONFIG_HYBRIDTRANSFORM
 void vp8_iht4x4llm_c(short *input, short *output, int pitch, TX_TYPE tx_type) {
  int i, j, k;
  float bufa[16], bufb[16]; // buffers are for floating-point test purpose
-                            // the implementation could be simplified in conjunction with integer transform
+                            // the implementation could be simplified in
+                            // conjunction with integer transform
  short *ip = input;
  short *op = output;
  int shortpitch = pitch >> 1;
@ -158,6 +201,113 @@ void vp8_iht4x4llm_c(short *input, short *output, int pitch, TX_TYPE tx_type) {
 }
 #endif

+#if CONFIG_HYBRIDTRANSFORM8X8
+void vp8_iht8x8llm_c(short *input, short *output, int pitch, TX_TYPE tx_type) {
+  int i, j, k;
+  float bufa[64], bufb[64]; // buffers are for floating-point test purpose
+                            // the implementation could be simplified in
+                            // conjunction with integer transform
+  short *ip = input;
+  short *op = output;
+  int shortpitch = pitch >> 1;
+
+  float *pfa = &bufa[0];
+  float *pfb = &bufb[0];
+
+  // pointers to vertical and horizontal transforms
+  float *ptv, *pth;
+
+  // load and convert residual array into floating-point
+  for(j = 0; j < 8; j++) {
+    for(i = 0; i < 8; i++) {
+      pfa[i] = (float)ip[i];
+    }
+    pfa += 8;
+    ip  += 8;
+  }
+
+  // vertical transformation
+  pfa = &bufa[0];
+  pfb = &bufb[0];
+
+  switch(tx_type) {
+    case ADST_ADST :
+    case ADST_DCT  :
+      ptv = &iadst_8[0];
+      break;
+
+    default :
+      ptv = &idct_8[0];
+      break;
+  }
+
+  for(j = 0; j < 8; j++) {
+    for(i = 0; i < 8; i++) {
+      pfb[i] = 0 ;
+      for(k = 0; k < 8; k++) {
+        pfb[i] += ptv[k] * pfa[(k<<3)];
+      }
+      pfa += 1;
+    }
+
+    pfb += 8;
+    ptv += 8;
+    pfa = &bufa[0];
+  }
+
+  // horizontal transformation
+  pfa = &bufa[0];
+  pfb = &bufb[0];
+
+  switch(tx_type) {
+    case ADST_ADST :
+    case  DCT_ADST :
+      pth = &iadst_8[0];
+      break;
+
+    default :
+      pth = &idct_8[0];
+      break;
+  }
+
+  for(j = 0; j < 8; j++) {
+    for(i = 0; i < 8; i++) {
+      pfa[i] = 0;
+      for(k = 0; k < 8; k++) {
+        pfa[i] += pfb[k] * pth[k];
+      }
+      pth += 8;
+     }
+
+    pfa += 8;
+    pfb += 8;
+
+    switch(tx_type) {
+      case ADST_ADST :
+      case  DCT_ADST :
+        pth = &iadst_8[0];
+        break;
+
+      default :
+        pth = &idct_8[0];
+        break;
+    }
+  }
+
+  // convert to short integer format and load BLOCKD buffer
+  op  = output;
+  pfa = &bufa[0];
+
+  for(j = 0; j < 8; j++) {
+    for(i = 0; i < 8; i++) {
+      op[i] = (pfa[i] > 0 ) ? (short)( pfa[i] / 8 + 0.49) :
+                             -(short)( - pfa[i] / 8 + 0.49);
+    }
+    op  += shortpitch;
+    pfa += 8;
+  }
+}
+#endif

 void vp8_short_idct4x4llm_c(short *input, short *output, int pitch) {
  int i;
--- a/vp8/decoder/decodframe.c
+++ b/vp8/decoder/decodframe.c
@ -46,7 +46,6 @@ int dec_debug = 0;

 #define COEFCOUNT_TESTING

-
 static int merge_index(int v, int n, int modulus) {
  int max1 = (n - 1 - modulus / 2) / modulus + 1;
  if (v < max1) v = v * modulus + modulus / 2;
@ -260,7 +259,7 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd,
    }
  }

-#if CONFIG_HTRANS8X8
+#if CONFIG_HYBRIDTRANSFORM8X8
  if (xd->mode_info_context->mbmi.mode == I8X8_PRED) {
    xd->mode_info_context->mbmi.txfm_size = TX_8X8;
  }
@ -336,29 +335,8 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd,
    for (i = 0; i < 16; i++) {
      BLOCKD *b = &xd->block[i];
      int b_mode = xd->mode_info_context->bmi[i].as_mode.first;
-      if(active_ht) {
-        switch(b_mode) {
-          case B_TM_PRED :
-          case B_RD_PRED :
-            b->bmi.as_mode.tx_type = ADST_ADST;
-            break;
-
-          case B_VE_PRED :
-          case B_VR_PRED :
-            b->bmi.as_mode.tx_type = ADST_DCT;
-            break ;
-
-          case B_HE_PRED :
-          case B_HD_PRED :
-          case B_HU_PRED :
-            b->bmi.as_mode.tx_type = DCT_ADST;
-            break;
-
-          default :
-            b->bmi.as_mode.tx_type = DCT_DCT;
-            break;
-        }
-      }
+      if(active_ht)
+        txfm_map(b, b_mode);
    } // loop over 4x4 blocks
  }
 #endif
@ -392,7 +370,7 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd,
      int i8x8mode;
      BLOCKD *b;

-#if CONFIG_HTRANS8X8
+#if CONFIG_HYBRIDTRANSFORM8X8
      int idx = (ib & 0x02) ? (ib + 2) : ib;

      short *q  = xd->block[idx].qcoeff;
@ -410,8 +388,11 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd,
      RECON_INVOKE(RTCD_VTABLE(recon), intra8x8_predict)
      (b, i8x8mode, b->predictor);

-#if CONFIG_HTRANS8X8
-      vp8_dequant_idct_add_8x8_c(q, dq, pre, dst, 16, stride);
+#if CONFIG_HYBRIDTRANSFORM8X8
+      txfm_map(b, pred_mode_conv(i8x8mode));
+      vp8_ht_dequant_idct_add_8x8_c(b->bmi.as_mode.tx_type,
+                                    q, dq, pre, dst, 16, stride);
+      // vp8_dequant_idct_add_8x8_c(q, dq, pre, dst, 16, stride);
      q += 64;
 #else
      for (j = 0; j < 4; j++) {
--- a/vp8/decoder/dequantize.c
+++ b/vp8/decoder/dequantize.c
@ -79,6 +79,51 @@ void vp8_ht_dequant_idct_add_c(TX_TYPE tx_type, short *input, short *dq,
 }
 #endif

+#if CONFIG_HYBRIDTRANSFORM8X8
+void vp8_ht_dequant_idct_add_8x8_c(TX_TYPE tx_type, short *input, short *dq,
+                                   unsigned char *pred, unsigned char *dest,
+                                   int pitch, int stride) {
+  short output[64];
+  short *diff_ptr = output;
+  int b, r, c;
+  int i;
+  unsigned char *origdest = dest;
+  unsigned char *origpred = pred;
+
+  input[0] = dq[0] * input[0];
+  for (i = 1; i < 64; i++) {
+    input[i] = dq[1] * input[i];
+  }
+
+  vp8_iht8x8llm_c(input, output, 16, tx_type);
+
+  vpx_memset(input, 0, 128);
+
+  for (b = 0; b < 4; b++) {
+    for (r = 0; r < 4; r++) {
+      for (c = 0; c < 4; c++) {
+        int a = diff_ptr[c] + pred[c];
+
+        if (a < 0)
+          a = 0;
+
+        if (a > 255)
+          a = 255;
+
+        dest[c] = (unsigned char) a;
+      }
+
+      dest += stride;
+      diff_ptr += 8;
+      pred += pitch;
+    }
+    diff_ptr = output + (b + 1) / 2 * 4 * 8 + (b + 1) % 2 * 4;
+    dest = origdest + (b + 1) / 2 * 4 * stride + (b + 1) % 2 * 4;
+    pred = origpred + (b + 1) / 2 * 4 * pitch + (b + 1) % 2 * 4;
+  }
+}
+#endif
+
 void vp8_dequant_idct_add_c(short *input, short *dq, unsigned char *pred,
                            unsigned char *dest, int pitch, int stride) {
  short output[16];
--- a/vp8/decoder/detokenize.c
+++ b/vp8/decoder/detokenize.c
@ -473,7 +473,7 @@ int vp8_decode_mb_tokens_8x8(VP8D_COMP *pbi, MACROBLOCKD *xd) {
  const int seg_active = segfeature_active(xd, segment_id, SEG_LVL_EOB);
  INT16 *qcoeff_ptr = &xd->qcoeff[0];

-#if CONFIG_HTRANS8X8
+#if CONFIG_HYBRIDTRANSFORM8X8
  int bufthred = (xd->mode_info_context->mbmi.mode == I8X8_PRED) ? 16 : 24;
  if (xd->mode_info_context->mbmi.mode != B_PRED &&
      xd->mode_info_context->mbmi.mode != SPLITMV &&
@ -506,7 +506,7 @@ int vp8_decode_mb_tokens_8x8(VP8D_COMP *pbi, MACROBLOCKD *xd) {
  else
    seg_eob = 64;

-#if CONFIG_HTRANS8X8
+#if CONFIG_HYBRIDTRANSFORM8X8
  for (i = 0; i < bufthred ; i += 4) {
 #else
  for (i = 0; i < 24; i += 4) {
@ -528,7 +528,7 @@ int vp8_decode_mb_tokens_8x8(VP8D_COMP *pbi, MACROBLOCKD *xd) {
    qcoeff_ptr += 64;
  }

-#if CONFIG_HTRANS8X8
+#if CONFIG_HYBRIDTRANSFORM8X8
  if (xd->mode_info_context->mbmi.mode == I8X8_PRED) {
    type = PLANE_TYPE_UV;
    seg_eob = 16;
--- a/vp8/encoder/dct.c
+++ b/vp8/encoder/dct.c
@ -17,6 +17,8 @@

 #include "vp8/common/blockd.h"

+// TODO: these transforms can be converted into integer forms to reduce
+//       the complexity
 float dct_4[16] = {
  0.500000000000000,  0.500000000000000,  0.500000000000000,  0.500000000000000,
  0.653281482438188,  0.270598050073099, -0.270598050073099, -0.653281482438188,
@ -32,6 +34,45 @@ float adst_4[16] = {
 };
 #endif

+#if CONFIG_HYBRIDTRANSFORM8X8
+float dct_8[64] = {
+  0.353553390593274,   0.353553390593274,   0.353553390593274,   0.353553390593274,
+  0.353553390593274,   0.353553390593274,   0.353553390593274,   0.353553390593274,
+  0.490392640201615,   0.415734806151273,   0.277785116509801,   0.097545161008064,
+ -0.097545161008064,  -0.277785116509801,  -0.415734806151273,  -0.490392640201615,
+  0.461939766255643,   0.191341716182545,  -0.191341716182545,  -0.461939766255643,
+ -0.461939766255643,  -0.191341716182545,   0.191341716182545,   0.461939766255643,
+  0.415734806151273,  -0.097545161008064,  -0.490392640201615,  -0.277785116509801,
+  0.277785116509801,   0.490392640201615,   0.097545161008064,  -0.415734806151273,
+  0.353553390593274,  -0.353553390593274,  -0.353553390593274,   0.353553390593274,
+  0.353553390593274,  -0.353553390593274,  -0.353553390593274,   0.353553390593274,
+  0.277785116509801,  -0.490392640201615,   0.097545161008064,   0.415734806151273,
+ -0.415734806151273,  -0.097545161008064,   0.490392640201615,  -0.277785116509801,
+  0.191341716182545,  -0.461939766255643,   0.461939766255643,  -0.191341716182545,
+ -0.191341716182545,   0.461939766255643,  -0.461939766255643,   0.191341716182545,
+  0.097545161008064,  -0.277785116509801,   0.415734806151273,  -0.490392640201615,
+  0.490392640201615,  -0.415734806151273,   0.277785116509801,  -0.097545161008064
+};
+
+float adst_8[64] = {
+  0.089131608307533,   0.175227946595735,   0.255357107325376,   0.326790388032145,
+  0.387095214016349,   0.434217976756762,   0.466553967085785,   0.483002021635509,
+  0.255357107325376,   0.434217976756762,   0.483002021635509,   0.387095214016349,
+  0.175227946595735,  -0.089131608307533,  -0.326790388032145,  -0.466553967085785,
+  0.387095214016349,   0.466553967085785,   0.175227946595735,  -0.255357107325376,
+ -0.483002021635509,  -0.326790388032145,   0.089131608307533,   0.434217976756762,
+  0.466553967085785,   0.255357107325376,  -0.326790388032145,  -0.434217976756762,
+  0.089131608307533,   0.483002021635509,   0.175227946595735,  -0.387095214016348,
+  0.483002021635509,  -0.089131608307533,  -0.466553967085785,   0.175227946595735,
+  0.434217976756762,  -0.255357107325376,  -0.387095214016348,   0.326790388032145,
+  0.434217976756762,  -0.387095214016348,  -0.089131608307533,   0.466553967085786,
+ -0.326790388032145,  -0.175227946595735,   0.483002021635509,  -0.255357107325375,
+  0.326790388032145,  -0.483002021635509,   0.387095214016349,  -0.089131608307534,
+ -0.255357107325377,   0.466553967085785,  -0.434217976756762,   0.175227946595736,
+  0.175227946595735,  -0.326790388032145,   0.434217976756762,  -0.483002021635509,
+  0.466553967085785,  -0.387095214016348,   0.255357107325376,  -0.089131608307532
+};
+#endif

 static const int xC1S7 = 16069;
 static const int xC2S6 = 15137;
@ -394,6 +435,112 @@ void vp8_fht4x4_c(short *input, short *output, int pitch, TX_TYPE tx_type) {
 }
 #endif

+#if CONFIG_HYBRIDTRANSFORM8X8
+void vp8_fht8x8_c(short *input, short *output, int pitch, TX_TYPE tx_type) {
+  int i, j, k;
+  float bufa[64], bufb[64]; // buffers are for floating-point test purpose
+                             // the implementation could be simplified in
+                             // conjunction with integer transform
+  short *ip = input;
+  short *op = output;
+
+  float *pfa = &bufa[0];
+  float *pfb = &bufb[0];
+
+  // pointers to vertical and horizontal transforms
+  float *ptv, *pth;
+
+  // load and convert residual array into floating-point
+  for(j = 0; j < 8; j++) {
+    for(i = 0; i < 8; i++) {
+      pfa[i] = (float)ip[i];
+    }
+    pfa += 8;
+    ip  += pitch / 2;
+  }
+
+  // vertical transformation
+  pfa = &bufa[0];
+  pfb = &bufb[0];
+
+  switch(tx_type) {
+    case ADST_ADST :
+    case ADST_DCT  :
+      ptv = &adst_8[0];
+      break;
+
+    default :
+      ptv = &dct_8[0];
+      break;
+  }
+
+  for(j = 0; j < 8; j++) {
+    for(i = 0; i < 8; i++) {
+      pfb[i] = 0;
+      for(k = 0; k < 8; k++) {
+        pfb[i] += ptv[k] * pfa[(k<<3)];
+      }
+      pfa += 1;
+    }
+    pfb += 8;
+    ptv += 8;
+    pfa = &bufa[0];
+  }
+
+  // horizontal transformation
+  pfa = &bufa[0];
+  pfb = &bufb[0];
+
+  switch(tx_type) {
+    case ADST_ADST :
+    case  DCT_ADST :
+      pth = &adst_8[0];
+      break;
+
+    default :
+      pth = &dct_8[0];
+      break;
+  }
+
+  for(j = 0; j < 8; j++) {
+    for(i = 0; i < 8; i++) {
+      pfa[i] = 0;
+      for(k = 0; k < 8; k++) {
+        pfa[i] += pfb[k] * pth[k];
+      }
+      pth += 8;
+     }
+
+    pfa += 8;
+    pfb += 8;
+
+    switch(tx_type) {
+      case ADST_ADST :
+      case  DCT_ADST :
+        pth = &adst_8[0];
+        break;
+
+      default :
+        pth = &dct_8[0];
+        break;
+    }
+  }
+
+  // convert to short integer format and load BLOCKD buffer
+  op  = output ;
+  pfa = &bufa[0] ;
+
+  for(j = 0; j < 8; j++) {
+    for(i = 0; i < 8; i++) {
+      op[i] = (pfa[i] > 0 ) ? (short)( 8 * pfa[i] + 0.49) :
+                                   -(short)(- 8 * pfa[i] + 0.49);
+    }
+    op  += 8;
+    pfa += 8;
+  }
+}
+#endif
+
 void vp8_short_fdct4x4_c(short *input, short *output, int pitch) {
  int i;
  int a1, b1, c1, d1;
--- a/vp8/encoder/encodeintra.c
+++ b/vp8/encoder/encodeintra.c
@ -90,28 +90,7 @@ void vp8_encode_intra4x4block(const VP8_ENCODER_RTCD *rtcd,
 #if CONFIG_HYBRIDTRANSFORM
    if(active_ht) {
      b->bmi.as_mode.test = b->bmi.as_mode.first;
-      switch(b->bmi.as_mode.first) {
-        // case B_DC_PRED :
-        case B_TM_PRED :
-        case B_RD_PRED :
-          b->bmi.as_mode.tx_type = ADST_ADST;
-          break;
-
-        case B_VE_PRED :
-        case B_VR_PRED :
-          b->bmi.as_mode.tx_type = ADST_DCT;
-          break;
-
-        case B_HE_PRED :
-        case B_HD_PRED :
-        case B_HU_PRED :
-          b->bmi.as_mode.tx_type = DCT_ADST;
-          break;
-
-        default :
-          b->bmi.as_mode.tx_type = DCT_DCT;
-          break;
-      }
+      txfm_map(b, b->bmi.as_mode.first);

      vp8_fht4x4_c(be->src_diff, be->coeff, 32, b->bmi.as_mode.tx_type);
      vp8_ht_quantize_b(be, b);
@ -329,16 +308,25 @@ void vp8_encode_intra8x8(const VP8_ENCODER_RTCD *rtcd,
  }
 #endif

-#if CONFIG_HTRANS8X8
+#if CONFIG_HYBRIDTRANSFORM8X8
  {
    MACROBLOCKD *xd = &x->e_mbd;
    int idx = (ib & 0x02) ? (ib + 2) : ib;

    // generate residual blocks
    vp8_subtract_4b_c(be, b, 16);
-    x->vp8_short_fdct8x8(be->src_diff, (x->block + idx)->coeff, 32);
+
+    txfm_map(b, pred_mode_conv(b->bmi.as_mode.first));
+
+    vp8_fht8x8_c(be->src_diff, (x->block + idx)->coeff, 32,
+                 b->bmi.as_mode.tx_type);
    x->quantize_b_8x8(x->block + idx, xd->block + idx);
-    vp8_short_idct8x8_c(xd->block[idx].dqcoeff, xd->block[ib].diff, 32);
+    vp8_iht8x8llm_c(xd->block[idx].dqcoeff, xd->block[ib].diff, 32,
+                    b->bmi.as_mode.tx_type);
+
+//    x->vp8_short_fdct8x8(be->src_diff, (x->block + idx)->coeff, 32);
+//    x->quantize_b_8x8(x->block + idx, xd->block + idx);
+//    vp8_short_idct8x8_c(xd->block[idx].dqcoeff, xd->block[ib].diff, 32);

    // reconstruct submacroblock
    for (i = 0; i < 4; i++) {
--- a/vp8/encoder/rdopt.c
+++ b/vp8/encoder/rdopt.c
@ -454,7 +454,7 @@ int vp8_block_error_c(short *coeff, short *dqcoeff) {
  return error;
 }

-#if CONFIG_HTRANS8X8
+#if CONFIG_HYBRIDTRANSFORM8X8
 int vp8_submb_error_c(short *coeff, short *dqcoeff) {
  int i;
  int error = 0;
@ -985,28 +985,7 @@ static int64_t rd_pick_intra4x4block(
 #if CONFIG_HYBRIDTRANSFORM
      if(active_ht) {
        b->bmi.as_mode.test = mode;
-        switch(mode) {
-          // case B_DC_PRED :
-          case B_TM_PRED :
-          case B_RD_PRED :
-            b->bmi.as_mode.tx_type = ADST_ADST;
-            break;
-
-          case B_VE_PRED :
-          case B_VR_PRED :
-            b->bmi.as_mode.tx_type = ADST_DCT;
-            break;
-
-          case B_HE_PRED :
-          case B_HD_PRED :
-          case B_HU_PRED :
-            b->bmi.as_mode.tx_type = DCT_ADST;
-            break;
-
-          default :
-            b->bmi.as_mode.tx_type = DCT_DCT;
-            break;
-        }
+        txfm_map(b, mode);

        vp8_fht4x4_c(be->src_diff, be->coeff, 32, b->bmi.as_mode.tx_type);
        vp8_ht_quantize_b(be, b);
@ -1267,7 +1246,7 @@ static int64_t rd_pick_intra8x8block(
  DECLARE_ALIGNED_ARRAY(16, unsigned char,  best_predictor, 16 * 8);
  DECLARE_ALIGNED_ARRAY(16, short, best_dqcoeff, 16 * 4);

-#if CONFIG_HTRANS8X8
+#if CONFIG_HYBRIDTRANSFORM8X8
  // perform transformation of dimension 8x8
  // note the input and output index mapping
  int idx = (ib & 0x02) ? (ib + 2) : ib;
@ -1298,8 +1277,10 @@ static int64_t rd_pick_intra8x8block(

      vp8_subtract_4b_c(be, b, 16);

-#if CONFIG_HTRANS8X8
-      x->vp8_short_fdct8x8(be->src_diff, (x->block + idx)->coeff, 32);
+#if CONFIG_HYBRIDTRANSFORM8X8
+      txfm_map(b, pred_mode_conv(mode));
+      vp8_fht8x8_c(be->src_diff, (x->block + idx)->coeff, 32, b->bmi.as_mode.tx_type);
+//    x->vp8_short_fdct8x8(be->src_diff, (x->block + idx)->coeff, 32);
      x->quantize_b_8x8(x->block + idx, xd->block + idx);

      // compute quantization mse of 8x8 block
@ -1376,7 +1357,7 @@ static int64_t rd_pick_intra8x8block(
 #endif
  vp8_encode_intra8x8(IF_RTCD(&cpi->rtcd), x, ib);

-#if CONFIG_HTRANS8X8
+#if CONFIG_HYBRIDTRANSFORM8X8
  *(a + vp8_block2above_8x8[idx])     = besta0;
  *(a + vp8_block2above_8x8[idx] + 1) = besta1;
  *(l + vp8_block2left_8x8 [idx])     = bestl0;
--- a/vp8/encoder/tokenize.c
+++ b/vp8/encoder/tokenize.c
@ -504,7 +504,7 @@ static void tokenize1st_order_ht(   MACROBLOCKD *xd,
 #endif


-#if CONFIG_HTRANS8X8
+#if CONFIG_HYBRIDTRANSFORM8X8
 static void tokenize1st_order_chroma
 (
  MACROBLOCKD *xd,
@ -886,7 +886,7 @@ void vp8_tokenize_mb(VP8_COMP *cpi, MACROBLOCKD *x, TOKENEXTRA **t) {
      tokenize1st_order_ht(x, t, plane_type, cpi);
    } else {

-#if CONFIG_HTRANS8X8
+#if CONFIG_HYBRIDTRANSFORM8X8
      if (x->mode_info_context->mbmi.mode == I8X8_PRED) {
        ENTROPY_CONTEXT *A = (ENTROPY_CONTEXT *)x->above_context;
        ENTROPY_CONTEXT *L = (ENTROPY_CONTEXT *)x->left_context;