WIP: 32x32 idct/recon merge

This patch eliminates the intermediate diff buffer by combining the 32x32 short inverse DCT and the add-residual step into a single function that writes the reconstructed pixels directly to the destination. The encoder can use the same code as well.

Change-Id: I4ea09df0e162591e420d869b7431c2e7f89a8c1a
2013-05-14 11:58:13 -04:00 · 2013-05-14 11:58:13 -04:00 · 2cf0d4be12
commit 2cf0d4be12
parent 1f26840fbf
7 changed files with 115 additions and 141 deletions
--- a/test/dct32x32_test.cc
+++ b/test/dct32x32_test.cc
@ -18,7 +18,7 @@ extern "C" {
 #include "vp9/common/vp9_entropy.h"
 #include "./vp9_rtcd.h"
  void vp9_short_fdct32x32_c(int16_t *input, int16_t *out, int pitch);
-  void vp9_short_idct32x32_c(short *input, short *output, int pitch);
+  void vp9_short_idct32x32_add_c(short *input, uint8_t *output, int pitch);
 }

 #include "test/acm_random.h"
@ -91,28 +91,31 @@ static void reference_32x32_dct_2d(int16_t input[32*32], double output[32*32]) {
  }
 }

-
 TEST(VP9Idct32x32Test, AccuracyCheck) {
  ACMRandom rnd(ACMRandom::DeterministicSeed());
  const int count_test_block = 1000;
  for (int i = 0; i < count_test_block; ++i) {
    int16_t in[1024], coeff[1024];
-    int16_t out_c[1024];
+    uint8_t dst[1024], src[1024];
    double out_r[1024];

+    for (int j = 0; j < 1024; ++j) {
+      src[j] = rnd.Rand8();
+      dst[j] = rnd.Rand8();
+    }
    // Initialize a test block with input range [-255, 255].
    for (int j = 0; j < 1024; ++j)
-      in[j] = rnd.Rand8() - rnd.Rand8();
+      in[j] = src[j] - dst[j];

    reference_32x32_dct_2d(in, out_r);
    for (int j = 0; j < 1024; j++)
      coeff[j] = round(out_r[j]);
-    vp9_short_idct32x32_c(coeff, out_c, 64);
+    vp9_short_idct32x32_add_c(coeff, dst, 32);
    for (int j = 0; j < 1024; ++j) {
-      const int diff = out_c[j] - in[j];
+      const int diff = dst[j] - src[j];
      const int error = diff * diff;
      EXPECT_GE(1, error)
-          << "Error: 3x32 IDCT has error " << error
+          << "Error: 32x32 IDCT has error " << error
          << " at index " << j;
    }
  }
@ -126,18 +129,22 @@ TEST(VP9Fdct32x32Test, AccuracyCheck) {
  for (int i = 0; i < count_test_block; ++i) {
    int16_t test_input_block[1024];
    int16_t test_temp_block[1024];
-    int16_t test_output_block[1024];
+    uint8_t dst[1024], src[1024];

+    for (int j = 0; j < 1024; ++j) {
+      src[j] = rnd.Rand8();
+      dst[j] = rnd.Rand8();
+    }
    // Initialize a test block with input range [-255, 255].
    for (int j = 0; j < 1024; ++j)
-      test_input_block[j] = rnd.Rand8() - rnd.Rand8();
+      test_input_block[j] = src[j] - dst[j];

    const int pitch = 64;
    vp9_short_fdct32x32_c(test_input_block, test_temp_block, pitch);
-    vp9_short_idct32x32_c(test_temp_block, test_output_block, pitch);
+    vp9_short_idct32x32_add_c(test_temp_block, dst, 32);

    for (int j = 0; j < 1024; ++j) {
-      const unsigned diff = test_input_block[j] - test_output_block[j];
+      const unsigned diff = dst[j] - src[j];
      const unsigned error = diff * diff;
      if (max_error < error)
        max_error = error;
--- a/vp9/common/vp9_idct.c
+++ b/vp9/common/vp9_idct.c
@ -1249,10 +1249,9 @@ static void idct32_1d(int16_t *input, int16_t *output) {
  output[31] = step1[0] - step1[31];
 }

-void vp9_short_idct32x32_c(int16_t *input, int16_t *output, int pitch) {
+void vp9_short_idct32x32_add_c(int16_t *input, uint8_t *dest, int dest_stride) {
  int16_t out[32 * 32];
  int16_t *outptr = out;
-  const int half_pitch = pitch >> 1;
  int i, j;
  int16_t temp_in[32], temp_out[32];

@ -1269,7 +1268,8 @@ void vp9_short_idct32x32_c(int16_t *input, int16_t *output, int pitch) {
      temp_in[j] = out[j * 32 + i];
    idct32_1d(temp_in, temp_out);
    for (j = 0; j < 32; ++j)
-      output[j * half_pitch + i] = ROUND_POWER_OF_TWO(temp_out[j], 6);
+      dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
+                                  + dest[j * dest_stride + i]);
  }
 }

@ -1279,10 +1279,10 @@ void vp9_short_idct1_32x32_c(int16_t *input, int16_t *output) {
  output[0] = ROUND_POWER_OF_TWO(out, 6);
 }

-void vp9_short_idct10_32x32_c(int16_t *input, int16_t *output, int pitch) {
+void vp9_short_idct10_32x32_add_c(int16_t *input, uint8_t *dest,
+                                  int dest_stride) {
  int16_t out[32 * 32];
  int16_t *outptr = out;
-  const int half_pitch = pitch >> 1;
  int i, j;
  int16_t temp_in[32], temp_out[32];

@ -1302,6 +1302,7 @@ void vp9_short_idct10_32x32_c(int16_t *input, int16_t *output, int pitch) {
      temp_in[j] = out[j * 32 + i];
    idct32_1d(temp_in, temp_out);
    for (j = 0; j < 32; ++j)
-      output[j * half_pitch + i] = ROUND_POWER_OF_TWO(temp_out[j], 6);
+      dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
+                                  + dest[j * dest_stride + i]);
  }
 }
--- a/vp9/common/vp9_rtcd_defs.sh
+++ b/vp9/common/vp9_rtcd_defs.sh
@ -94,9 +94,6 @@ specialize vp9_add_residual_8x8 sse2
 prototype void vp9_add_residual_16x16 "const int16_t *diff, uint8_t *dest, int stride"
 specialize vp9_add_residual_16x16 sse2

-prototype void vp9_add_residual_32x32 "const int16_t *diff, uint8_t *dest, int stride"
-specialize vp9_add_residual_32x32 sse2
-
 prototype void vp9_add_constant_residual_8x8 "const int16_t diff, uint8_t *dest, int stride"
 specialize vp9_add_constant_residual_8x8 sse2

@ -212,15 +209,14 @@ specialize vp9_short_idct10_16x16 sse2
 prototype void vp9_short_idct1_16x16 "int16_t *input, int16_t *output"
 specialize vp9_short_idct1_16x16

-
-prototype void vp9_short_idct32x32 "int16_t *input, int16_t *output, int pitch"
-specialize vp9_short_idct32x32 sse2
+prototype void vp9_short_idct32x32_add "int16_t *input, uint8_t *dest, int dest_stride"
+specialize vp9_short_idct32x32_add sse2

 prototype void vp9_short_idct1_32x32 "int16_t *input, int16_t *output"
 specialize vp9_short_idct1_32x32

-prototype void vp9_short_idct10_32x32 "int16_t *input, int16_t *output, int pitch"
-specialize vp9_short_idct10_32x32
+prototype void vp9_short_idct10_32x32_add "int16_t *input, uint8_t *dest, int dest_stride"
+specialize vp9_short_idct10_32x32_add

 prototype void vp9_short_iht8x8 "int16_t *input, int16_t *output, int pitch, int tx_type"
 specialize vp9_short_iht8x8
--- a/vp9/common/x86/vp9_idct_intrin_sse2.c
+++ b/vp9/common/x86/vp9_idct_intrin_sse2.c
@ -1319,8 +1319,7 @@ void vp9_short_idct10_16x16_sse2(int16_t *input, int16_t *output, int pitch) {
  }
 }

-void vp9_short_idct32x32_sse2(int16_t *input, int16_t *output, int pitch) {
-  const int half_pitch = pitch >> 1;
+void vp9_short_idct32x32_add_sse2(int16_t *input, uint8_t *dest, int stride) {
  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
  const __m128i final_rounding = _mm_set1_epi16(1<<5);

@ -1832,6 +1831,8 @@ void vp9_short_idct32x32_sse2(int16_t *input, int16_t *output, int pitch) {
      col[i * 32 + 30] = _mm_sub_epi16(stp1_1, stp1_30);
      col[i * 32 + 31] = _mm_sub_epi16(stp1_0, stp1_31);
    } else {
+      const __m128i zero = _mm_setzero_si128();
+
      // 2_D: Calculate the results and store them to destination.
      in0 = _mm_add_epi16(stp1_0, stp1_31);
      in1 = _mm_add_epi16(stp1_1, stp1_30);
@ -1933,41 +1934,50 @@ void vp9_short_idct32x32_sse2(int16_t *input, int16_t *output, int pitch) {
      in30 = _mm_srai_epi16(in30, 6);
      in31 = _mm_srai_epi16(in31, 6);

-      // Store results
-      _mm_store_si128((__m128i *)output, in0);
-      _mm_store_si128((__m128i *)(output + half_pitch * 1), in1);
-      _mm_store_si128((__m128i *)(output + half_pitch * 2), in2);
-      _mm_store_si128((__m128i *)(output + half_pitch * 3), in3);
-      _mm_store_si128((__m128i *)(output + half_pitch * 4), in4);
-      _mm_store_si128((__m128i *)(output + half_pitch * 5), in5);
-      _mm_store_si128((__m128i *)(output + half_pitch * 6), in6);
-      _mm_store_si128((__m128i *)(output + half_pitch * 7), in7);
-      _mm_store_si128((__m128i *)(output + half_pitch * 8), in8);
-      _mm_store_si128((__m128i *)(output + half_pitch * 9), in9);
-      _mm_store_si128((__m128i *)(output + half_pitch * 10), in10);
-      _mm_store_si128((__m128i *)(output + half_pitch * 11), in11);
-      _mm_store_si128((__m128i *)(output + half_pitch * 12), in12);
-      _mm_store_si128((__m128i *)(output + half_pitch * 13), in13);
-      _mm_store_si128((__m128i *)(output + half_pitch * 14), in14);
-      _mm_store_si128((__m128i *)(output + half_pitch * 15), in15);
-      _mm_store_si128((__m128i *)(output + half_pitch * 16), in16);
-      _mm_store_si128((__m128i *)(output + half_pitch * 17), in17);
-      _mm_store_si128((__m128i *)(output + half_pitch * 18), in18);
-      _mm_store_si128((__m128i *)(output + half_pitch * 19), in19);
-      _mm_store_si128((__m128i *)(output + half_pitch * 20), in20);
-      _mm_store_si128((__m128i *)(output + half_pitch * 21), in21);
-      _mm_store_si128((__m128i *)(output + half_pitch * 22), in22);
-      _mm_store_si128((__m128i *)(output + half_pitch * 23), in23);
-      _mm_store_si128((__m128i *)(output + half_pitch * 24), in24);
-      _mm_store_si128((__m128i *)(output + half_pitch * 25), in25);
-      _mm_store_si128((__m128i *)(output + half_pitch * 26), in26);
-      _mm_store_si128((__m128i *)(output + half_pitch * 27), in27);
-      _mm_store_si128((__m128i *)(output + half_pitch * 28), in28);
-      _mm_store_si128((__m128i *)(output + half_pitch * 29), in29);
-      _mm_store_si128((__m128i *)(output + half_pitch * 30), in30);
-      _mm_store_si128((__m128i *)(output + half_pitch * 31), in31);
+#define RECON_AND_STORE(dest, in_x) \
+  {                                                     \
+     __m128i d0 = _mm_loadl_epi64((__m128i *)(dest)); \
+      d0 = _mm_unpacklo_epi8(d0, zero); \
+      in_x = _mm_add_epi16(in_x, d0); \
+      in_x = _mm_packus_epi16(in_x, in_x); \
+      _mm_storel_epi64((__m128i *)(dest), in_x); \
+      dest += stride; \
+  }

-      output += 8;
+      RECON_AND_STORE(dest, in0);
+      RECON_AND_STORE(dest, in1);
+      RECON_AND_STORE(dest, in2);
+      RECON_AND_STORE(dest, in3);
+      RECON_AND_STORE(dest, in4);
+      RECON_AND_STORE(dest, in5);
+      RECON_AND_STORE(dest, in6);
+      RECON_AND_STORE(dest, in7);
+      RECON_AND_STORE(dest, in8);
+      RECON_AND_STORE(dest, in9);
+      RECON_AND_STORE(dest, in10);
+      RECON_AND_STORE(dest, in11);
+      RECON_AND_STORE(dest, in12);
+      RECON_AND_STORE(dest, in13);
+      RECON_AND_STORE(dest, in14);
+      RECON_AND_STORE(dest, in15);
+      RECON_AND_STORE(dest, in16);
+      RECON_AND_STORE(dest, in17);
+      RECON_AND_STORE(dest, in18);
+      RECON_AND_STORE(dest, in19);
+      RECON_AND_STORE(dest, in20);
+      RECON_AND_STORE(dest, in21);
+      RECON_AND_STORE(dest, in22);
+      RECON_AND_STORE(dest, in23);
+      RECON_AND_STORE(dest, in24);
+      RECON_AND_STORE(dest, in25);
+      RECON_AND_STORE(dest, in26);
+      RECON_AND_STORE(dest, in27);
+      RECON_AND_STORE(dest, in28);
+      RECON_AND_STORE(dest, in29);
+      RECON_AND_STORE(dest, in30);
+      RECON_AND_STORE(dest, in31);
+
+      dest += 8 - (stride * 32);
    }
  }
 }
--- a/vp9/decoder/vp9_idct_blk.c
+++ b/vp9/decoder/vp9_idct_blk.c
@ -109,10 +109,6 @@ void vp9_add_residual_16x16_c(const int16_t *diff, uint8_t *dest, int stride) {
  add_residual(diff, dest, stride, 16, 16);
 }

-void vp9_add_residual_32x32_c(const int16_t *diff, uint8_t *dest, int stride) {
-  add_residual(diff, dest, stride, 32, 32);
-}
-
 static void add_constant_residual(const int16_t diff, uint8_t *dest, int stride,
                                  int width, int height) {
  int r, c;
@ -321,20 +317,16 @@ void vp9_idct_add_32x32_c(int16_t *input, uint8_t *dest, int stride, int eob) {
      input[0] = 0;
 #if !CONFIG_SCATTERSCAN
    } else if (eob <= 10) {
-      // the idct halves ( >> 1) the pitch
-      vp9_short_idct10_32x32(input, output, 64);
-
+      vp9_short_idct10_32x32_add_c(input, dest, stride);
      input[0] = input[1] = input[2] = input[3] = 0;
      input[32] = input[33] = input[34] = 0;
      input[64] = input[65] = 0;
      input[96] = 0;

-      vp9_add_residual_32x32(output, dest, stride);
 #endif
    } else {
-      vp9_short_idct32x32(input, output, 64);
+      vp9_short_idct32x32_add(input, dest, stride);
      vpx_memset(input, 0, 2048);
-      vp9_add_residual_32x32(output, dest, stride);
    }
  }
 }
--- a/vp9/decoder/x86/vp9_dequantize_sse2.c
+++ b/vp9/decoder/x86/vp9_dequantize_sse2.c
@ -181,65 +181,6 @@ void vp9_add_residual_16x16_sse2(const int16_t *diff, uint8_t *dest,
  } while (--i);
 }

-void vp9_add_residual_32x32_sse2(const int16_t *diff, uint8_t *dest,
-                                 int stride) {
-  const int width = 32;
-  int i = 16;
-  const __m128i zero = _mm_setzero_si128();
-
-  // Diff data
-  __m128i d0, d1, d2, d3, d4, d5, d6, d7;
-  __m128i p0, p1, p2, p3, p4, p5, p6, p7;
-
-  do {
-    d0 = _mm_load_si128((const __m128i *)(diff + 0 * width));
-    d1 = _mm_load_si128((const __m128i *)(diff + 0 * width + 8));
-    d2 = _mm_load_si128((const __m128i *)(diff + 0 * width + 16));
-    d3 = _mm_load_si128((const __m128i *)(diff + 0 * width + 24));
-    d4 = _mm_load_si128((const __m128i *)(diff + 1 * width));
-    d5 = _mm_load_si128((const __m128i *)(diff + 1 * width + 8));
-    d6 = _mm_load_si128((const __m128i *)(diff + 1 * width + 16));
-    d7 = _mm_load_si128((const __m128i *)(diff + 1 * width + 24));
-
-    // Prediction data.
-    p1 = _mm_load_si128((const __m128i *)(dest + 0 * stride));
-    p3 = _mm_load_si128((const __m128i *)(dest + 0 * stride + 16));
-    p5 = _mm_load_si128((const __m128i *)(dest + 1 * stride));
-    p7 = _mm_load_si128((const __m128i *)(dest + 1 * stride + 16));
-
-    p0 = _mm_unpacklo_epi8(p1, zero);
-    p1 = _mm_unpackhi_epi8(p1, zero);
-    p2 = _mm_unpacklo_epi8(p3, zero);
-    p3 = _mm_unpackhi_epi8(p3, zero);
-    p4 = _mm_unpacklo_epi8(p5, zero);
-    p5 = _mm_unpackhi_epi8(p5, zero);
-    p6 = _mm_unpacklo_epi8(p7, zero);
-    p7 = _mm_unpackhi_epi8(p7, zero);
-
-    p0 = _mm_add_epi16(p0, d0);
-    p1 = _mm_add_epi16(p1, d1);
-    p2 = _mm_add_epi16(p2, d2);
-    p3 = _mm_add_epi16(p3, d3);
-    p4 = _mm_add_epi16(p4, d4);
-    p5 = _mm_add_epi16(p5, d5);
-    p6 = _mm_add_epi16(p6, d6);
-    p7 = _mm_add_epi16(p7, d7);
-
-    p0 = _mm_packus_epi16(p0, p1);
-    p1 = _mm_packus_epi16(p2, p3);
-    p2 = _mm_packus_epi16(p4, p5);
-    p3 = _mm_packus_epi16(p6, p7);
-
-    _mm_store_si128((__m128i *)(dest + 0 * stride), p0);
-    _mm_store_si128((__m128i *)(dest + 0 * stride + 16), p1);
-    _mm_store_si128((__m128i *)(dest + 1 * stride), p2);
-    _mm_store_si128((__m128i *)(dest + 1 * stride + 16), p3);
-
-    diff += 2 * width;
-    dest += 2 * stride;
-  } while (--i);
-}
-
 void vp9_add_constant_residual_8x8_sse2(const int16_t diff, uint8_t *dest,
                                        int stride) {
  uint8_t abs_diff;
--- a/vp9/encoder/vp9_encodemb.c
+++ b/vp9/encoder/vp9_encodemb.c
@ -425,6 +425,7 @@ struct encode_b_args {
  VP9_COMMON *cm;
  MACROBLOCK *x;
  struct optimize_ctx *ctx;
+  int *wip_txfrm_size;  // for "work in progress" only... will remove once done
 };

 static void xform_quant(int plane, int block, BLOCK_SIZE_TYPE bsize,
@ -493,6 +494,7 @@ static void encode_block(int plane, int block, BLOCK_SIZE_TYPE bsize,
                         int ss_txfrm_size, void *arg) {
  struct encode_b_args* const args = arg;
  MACROBLOCK* const x = args->x;
+  int *wip_txfrm_size = args->wip_txfrm_size;
  MACROBLOCKD* const xd = &x->e_mbd;
  const int bw = 4 << (b_width_log2(bsize) - xd->plane[plane].subsampling_x);
  const int raster_block = txfrm_block_to_raster_block(xd, bsize, plane,
@ -500,6 +502,10 @@ static void encode_block(int plane, int block, BLOCK_SIZE_TYPE bsize,
  int16_t* const diff = raster_block_offset_int16(xd, bsize, plane,
                                                  raster_block,
                                                  xd->plane[plane].diff);
+  uint8_t* const dst = raster_block_offset_uint8(xd, bsize, plane,
+                                                 raster_block,
+                                                 xd->plane[plane].dst.buf,
+                                                 xd->plane[plane].dst.stride);
  TX_TYPE tx_type = DCT_DCT;

  xform_quant(plane, block, bsize, ss_txfrm_size, arg);
@ -509,8 +515,9 @@ static void encode_block(int plane, int block, BLOCK_SIZE_TYPE bsize,

  switch (ss_txfrm_size / 2) {
    case TX_32X32:
-      vp9_short_idct32x32(BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16),
-                          diff, bw * 2);
+        vp9_short_idct32x32_add(BLOCK_OFFSET(xd->plane[plane].dqcoeff,
+                                block, 16), dst, xd->plane[plane].dst.stride);
+        *wip_txfrm_size = 32;
      break;
    case TX_16X16:
      tx_type = plane == 0 ? get_tx_type_16x16(xd, raster_block) : DCT_DCT;
@ -521,6 +528,7 @@ static void encode_block(int plane, int block, BLOCK_SIZE_TYPE bsize,
        vp9_short_iht16x16(BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16),
                           diff, bw, tx_type);
      }
+      *wip_txfrm_size = 16;
      break;
    case TX_8X8:
      tx_type = plane == 0 ? get_tx_type_8x8(xd, raster_block) : DCT_DCT;
@ -531,6 +539,7 @@ static void encode_block(int plane, int block, BLOCK_SIZE_TYPE bsize,
        vp9_short_iht8x8(BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16),
                         diff, bw, tx_type);
      }
+      *wip_txfrm_size = 8;
      break;
    case TX_4X4:
      tx_type = plane == 0 ? get_tx_type_4x4(xd, raster_block) : DCT_DCT;
@ -544,6 +553,7 @@ static void encode_block(int plane, int block, BLOCK_SIZE_TYPE bsize,
        vp9_short_iht4x4(BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16),
                         diff, bw, tx_type);
      }
+      *wip_txfrm_size = 4;
      break;
  }
 }
@ -551,7 +561,7 @@ static void encode_block(int plane, int block, BLOCK_SIZE_TYPE bsize,
 void vp9_xform_quant_sby(VP9_COMMON *const cm, MACROBLOCK *x,
                         BLOCK_SIZE_TYPE bsize) {
  MACROBLOCKD* const xd = &x->e_mbd;
-  struct encode_b_args arg = {cm, x, NULL};
+  struct encode_b_args arg = {cm, x, NULL, NULL};

  foreach_transformed_block_in_plane(xd, bsize, 0,
                                     xform_quant, &arg);
@ -560,7 +570,7 @@ void vp9_xform_quant_sby(VP9_COMMON *const cm, MACROBLOCK *x,
 void vp9_xform_quant_sbuv(VP9_COMMON *const cm, MACROBLOCK *x,
                         BLOCK_SIZE_TYPE bsize) {
  MACROBLOCKD* const xd = &x->e_mbd;
-  struct encode_b_args arg = {cm, x, NULL};
+  struct encode_b_args arg = {cm, x, NULL, NULL};

  foreach_transformed_block_uv(xd, bsize, xform_quant, &arg);
 }
@ -569,7 +579,8 @@ void vp9_encode_sby(VP9_COMMON *const cm, MACROBLOCK *x,
                    BLOCK_SIZE_TYPE bsize) {
  MACROBLOCKD* const xd = &x->e_mbd;
  struct optimize_ctx ctx;
-  struct encode_b_args arg = {cm, x, &ctx};
+  int wip_txfrm_size = 0;
+  struct encode_b_args arg = {cm, x, &ctx, &wip_txfrm_size};

  vp9_subtract_sby(x, bsize);
  if (x->optimize)
@ -577,15 +588,16 @@ void vp9_encode_sby(VP9_COMMON *const cm, MACROBLOCK *x,

  foreach_transformed_block_in_plane(xd, bsize, 0,
                                     encode_block, &arg);
-
-  vp9_recon_sby(xd, bsize);
+  if (wip_txfrm_size < 32)
+    vp9_recon_sby(xd, bsize);
 }

 void vp9_encode_sbuv(VP9_COMMON *const cm, MACROBLOCK *x,
                     BLOCK_SIZE_TYPE bsize) {
  MACROBLOCKD* const xd = &x->e_mbd;
  struct optimize_ctx ctx;
-  struct encode_b_args arg = {cm, x, &ctx};
+  int wip_txfrm_size = 0;
+  struct encode_b_args arg = {cm, x, &ctx, &wip_txfrm_size};

  vp9_subtract_sbuv(x, bsize);
  if (x->optimize)
@ -593,20 +605,35 @@ void vp9_encode_sbuv(VP9_COMMON *const cm, MACROBLOCK *x,

  foreach_transformed_block_uv(xd, bsize, encode_block, &arg);

-  vp9_recon_sbuv(xd, bsize);
+  if (wip_txfrm_size < 32)
+    vp9_recon_sbuv(xd, bsize);
 }

 void vp9_encode_sb(VP9_COMMON *const cm, MACROBLOCK *x,
                   BLOCK_SIZE_TYPE bsize) {
  MACROBLOCKD* const xd = &x->e_mbd;
  struct optimize_ctx ctx;
-  struct encode_b_args arg = {cm, x, &ctx};
+  int wip_txfrm_size = 0;
+  struct encode_b_args arg = {cm, x, &ctx, &wip_txfrm_size};

  vp9_subtract_sb(x, bsize);
  if (x->optimize)
    vp9_optimize_init(xd, bsize, &ctx);
-
+#if 0
  foreach_transformed_block(xd, bsize, encode_block, &arg);

  vp9_recon_sb(xd, bsize);
+#else
+  // WIP: per-plane recon for now; use foreach_transformed_block when done
+  foreach_transformed_block_in_plane(xd, bsize, 0,
+                                     encode_block, &arg);
+  if (wip_txfrm_size < 32)
+    vp9_recon_sby(xd, bsize);
+  wip_txfrm_size = 0;
+
+  foreach_transformed_block_uv(xd, bsize, encode_block, &arg);
+
+  if (wip_txfrm_size < 32)
+    vp9_recon_sbuv(xd, bsize);
+#endif
 }