WIP: 4x4 idct/recon merge

This patch eliminates the intermediate diff buffer by combining the
4x4 short idct and the add-residual step into a single function: the
inverse transform now adds its clipped result directly into the
destination frame. The encoder can use the same code as well.

Change-Id: I296604bf73579c45105de0dd1adbcc91bcc53c22
commit ba48a11130
parent 9aa37a51b2
Author: Scott LaVarnway
Date:   2013-05-20 13:03:17 -04:00

15 changed files with 124 additions and 246 deletions
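For context, here is the shape of the merge as a minimal sketch (illustrative only, not code from the patch): idct4x4_add_sketch and its helpers are simplified stand-ins for the vp9_short_idct4x4_add_c introduced below, assuming the standard 14-bit libvpx cosine constants. The column pass of the inverse transform rounds, adds the prediction already sitting in the destination, clips, and stores, so callers need neither a 16-bit diff plane nor a second add-residual pass over memory.

#include <stdint.h>

/* Constants and helpers mirroring vp9_idct.h (standard 14-bit values). */
#define DCT_CONST_BITS 14
static const int cospi_8_64  = 15137;
static const int cospi_16_64 = 11585;
static const int cospi_24_64 = 6270;

static int16_t dct_round_shift(int x) {
  return (int16_t)((x + (1 << (DCT_CONST_BITS - 1))) >> DCT_CONST_BITS);
}

static uint8_t clip_u8(int v) {          /* same role as vp9's clip_pixel() */
  return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
}

/* 4-point 1-D IDCT with the same structure as vp9_idct4_1d_c(). */
static void idct4_1d(const int16_t *in, int16_t *out) {
  int16_t step[4];
  step[0] = dct_round_shift((in[0] + in[2]) * cospi_16_64);
  step[1] = dct_round_shift((in[0] - in[2]) * cospi_16_64);
  step[2] = dct_round_shift(in[1] * cospi_24_64 - in[3] * cospi_8_64);
  step[3] = dct_round_shift(in[1] * cospi_8_64 + in[3] * cospi_24_64);
  out[0] = step[0] + step[3];
  out[1] = step[1] + step[2];
  out[2] = step[1] - step[2];
  out[3] = step[0] - step[3];
}

/* Fused inverse transform + reconstruction (hypothetical name): the
 * column pass adds straight into the destination frame and clips, so
 * the intermediate diff buffer this patch removes is never needed. */
void idct4x4_add_sketch(const int16_t *input, uint8_t *dest, int stride) {
  int16_t out[4 * 4], temp_in[4], temp_out[4];
  int i, j;
  for (i = 0; i < 4; ++i)                    /* row pass into scratch */
    idct4_1d(input + 4 * i, out + 4 * i);
  for (i = 0; i < 4; ++i) {                  /* column pass into dest */
    for (j = 0; j < 4; ++j)
      temp_in[j] = out[j * 4 + i];
    idct4_1d(temp_in, temp_out);
    for (j = 0; j < 4; ++j)                  /* (x + 8) >> 4 is ROUND_POWER_OF_TWO(x, 4) */
      dest[j * stride + i] =
          clip_u8(dest[j * stride + i] + ((temp_out[j] + 8) >> 4));
  }
}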


@@ -96,11 +96,15 @@ TEST(Vp9Fdct4x4Test, RoundTripErrorCheck) {
   for (int i = 0; i < count_test_block; ++i) {
     int16_t test_input_block[16];
     int16_t test_temp_block[16];
-    int16_t test_output_block[16];
+    uint8_t dst[16], src[16];
+
+    for (int j = 0; j < 16; ++j) {
+      src[j] = rnd.Rand8();
+      dst[j] = rnd.Rand8();
+    }
     // Initialize a test block with input range [-255, 255].
     for (int j = 0; j < 16; ++j)
-      test_input_block[j] = rnd.Rand8() - rnd.Rand8();
+      test_input_block[j] = src[j] - dst[j];

     // TODO(Yaowu): this should be converted to a parameterized test
     // to test optimized versions of this function.

@@ -120,10 +124,10 @@ TEST(Vp9Fdct4x4Test, RoundTripErrorCheck) {
     }

     // Because the bitstream is not frozen yet, use the idct in the codebase.
-    vp9_short_idct4x4_c(test_temp_block, test_output_block, pitch);
+    vp9_short_idct4x4_add_c(test_temp_block, dst, 4);

    for (int j = 0; j < 16; ++j) {
-      const int diff = test_input_block[j] - test_output_block[j];
+      const int diff = dst[j] - src[j];
      const int error = diff * diff;
      if (max_error < error)
        max_error = error;


@@ -393,8 +393,8 @@ typedef struct macroblockd {
   int lossless;
   /* Inverse transform function pointers. */
-  void (*inv_txm4x4_1)(int16_t *input, int16_t *output, int pitch);
-  void (*inv_txm4x4)(int16_t *input, int16_t *output, int pitch);
+  void (*inv_txm4x4_1_add)(int16_t *input, uint8_t *dest, int stride);
+  void (*inv_txm4x4_add)(int16_t *input, uint8_t *dest, int stride);
   void (*itxm_add)(int16_t *input, uint8_t *dest, int stride, int eob);
   void (*itxm_add_y_block)(int16_t *q, uint8_t *dst, int stride,
                            struct macroblockd *xd);


@@ -18,12 +18,12 @@
 #include "vp9/common/vp9_common.h"
 #include "vp9/common/vp9_idct.h"

-void vp9_short_iwalsh4x4_c(int16_t *input, int16_t *output, int pitch) {
+void vp9_short_iwalsh4x4_add_c(int16_t *input, uint8_t *dest, int dest_stride) {
   int i;
+  int16_t output[16];
   int a1, b1, c1, d1;
   int16_t *ip = input;
   int16_t *op = output;
-  const int half_pitch = pitch >> 1;

   for (i = 0; i < 4; i++) {
     a1 = (ip[0] + ip[3]) >> WHT_UPSCALE_FACTOR;

@@ -37,63 +37,60 @@ void vp9_short_iwalsh4x4_c(int16_t *input, int16_t *output, int pitch) {
     op[3] = (d1 - c1) >> 1;

     ip += 4;
-    op += half_pitch;
+    op += 4;
   }

   ip = output;
-  op = output;
   for (i = 0; i < 4; i++) {
-    a1 = ip[half_pitch * 0] + ip[half_pitch * 3];
-    b1 = ip[half_pitch * 1] + ip[half_pitch * 2];
-    c1 = ip[half_pitch * 1] - ip[half_pitch * 2];
-    d1 = ip[half_pitch * 0] - ip[half_pitch * 3];
+    a1 = ip[4 * 0] + ip[4 * 3];
+    b1 = ip[4 * 1] + ip[4 * 2];
+    c1 = ip[4 * 1] - ip[4 * 2];
+    d1 = ip[4 * 0] - ip[4 * 3];

-    op[half_pitch * 0] = (a1 + b1 + 1) >> 1;
-    op[half_pitch * 1] = (c1 + d1) >> 1;
-    op[half_pitch * 2] = (a1 - b1) >> 1;
-    op[half_pitch * 3] = (d1 - c1) >> 1;
+    dest[dest_stride * 0] = clip_pixel(dest[dest_stride * 0] +
+                                       ((a1 + b1 + 1) >> 1));
+    dest[dest_stride * 1] = clip_pixel(dest[dest_stride * 1] +
+                                       ((c1 + d1) >> 1));
+    dest[dest_stride * 2] = clip_pixel(dest[dest_stride * 2] +
+                                       ((a1 - b1) >> 1));
+    dest[dest_stride * 3] = clip_pixel(dest[dest_stride * 3] +
+                                       ((d1 - c1) >> 1));

     ip++;
-    op++;
+    dest++;
   }
 }

-void vp9_short_iwalsh4x4_1_c(int16_t *in, int16_t *out, int pitch) {
+void vp9_short_iwalsh4x4_1_add_c(int16_t *in, uint8_t *dest, int dest_stride) {
   int i;
   int16_t tmp[4];
   int16_t *ip = in;
   int16_t *op = tmp;
-  const int half_pitch = pitch >> 1;

   op[0] = ((ip[0] >> WHT_UPSCALE_FACTOR) + 1) >> 1;
   op[1] = op[2] = op[3] = (ip[0] >> WHT_UPSCALE_FACTOR) >> 1;

   ip = tmp;
-  op = out;
   for (i = 0; i < 4; i++) {
-    op[half_pitch * 0] = (ip[0] + 1) >> 1;
-    op[half_pitch * 1] = op[half_pitch * 2] = op[half_pitch * 3] = ip[0] >> 1;
+    dest[dest_stride * 0] = clip_pixel(dest[dest_stride * 0] +
+                                       ((ip[0] + 1) >> 1));
+    dest[dest_stride * 1] = clip_pixel(dest[dest_stride * 1] +
+                                       (ip[0] >> 1));
+    dest[dest_stride * 2] = clip_pixel(dest[dest_stride * 2] +
+                                       (ip[0] >> 1));
+    dest[dest_stride * 3] = clip_pixel(dest[dest_stride * 3] +
+                                       (ip[0] >> 1));
     ip++;
-    op++;
+    dest++;
   }
 }

 void vp9_dc_only_inv_walsh_add_c(int input_dc, uint8_t *pred_ptr,
                                  uint8_t *dst_ptr,
                                  int pitch, int stride) {
-  int r, c;
   int16_t dc = input_dc;
-  int16_t tmp[4 * 4];
-  vp9_short_iwalsh4x4_1_c(&dc, tmp, 4 << 1);
-
-  for (r = 0; r < 4; r++) {
-    for (c = 0; c < 4; c++)
-      dst_ptr[c] = clip_pixel(tmp[r * 4 + c] + pred_ptr[c]);
-
-    dst_ptr += stride;
-    pred_ptr += pitch;
-  }
+  vp9_short_iwalsh4x4_1_add_c(&dc, dst_ptr, stride);
 }

 void vp9_idct4_1d_c(int16_t *input, int16_t *output) {

@@ -116,10 +113,9 @@ void vp9_idct4_1d_c(int16_t *input, int16_t *output) {
   output[3] = step[0] - step[3];
 }

-void vp9_short_idct4x4_c(int16_t *input, int16_t *output, int pitch) {
+void vp9_short_idct4x4_add_c(int16_t *input, uint8_t *dest, int dest_stride) {
   int16_t out[4 * 4];
   int16_t *outptr = out;
-  const int half_pitch = pitch >> 1;
   int i, j;
   int16_t temp_in[4], temp_out[4];

@@ -138,22 +134,24 @@ void vp9_short_idct4x4_c(int16_t *input, int16_t *output, int pitch) {
       temp_in[j] = out[j * 4 + i];
     vp9_idct4_1d(temp_in, temp_out);
     for (j = 0; j < 4; ++j)
-      output[j * half_pitch + i] = ROUND_POWER_OF_TWO(temp_out[j], 4);
+      dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 4)
+                                             + dest[j * dest_stride + i]);
   }
 }

-void vp9_short_idct4x4_1_c(int16_t *input, int16_t *output, int pitch) {
+void vp9_short_idct4x4_1_add_c(int16_t *input, uint8_t *dest, int dest_stride) {
   int i;
   int a1;
-  int16_t *op = output;
-  const int half_pitch = pitch >> 1;
   int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
   out = dct_const_round_shift(out * cospi_16_64);
   a1 = ROUND_POWER_OF_TWO(out, 4);

   for (i = 0; i < 4; i++) {
-    op[0] = op[1] = op[2] = op[3] = a1;
-    op += half_pitch;
+    dest[0] = clip_pixel(dest[0] + a1);
+    dest[1] = clip_pixel(dest[1] + a1);
+    dest[2] = clip_pixel(dest[2] + a1);
+    dest[3] = clip_pixel(dest[3] + a1);
+    dest += dest_stride;
   }
 }

@@ -285,8 +283,8 @@ static void iadst4_1d(int16_t *input, int16_t *output) {
   output[3] = dct_const_round_shift(s3);
 }

-void vp9_short_iht4x4_c(int16_t *input, int16_t *output,
-                        int pitch, int tx_type) {
+void vp9_short_iht4x4_add_c(int16_t *input, uint8_t *dest, int dest_stride,
+                            int tx_type) {
   const transform_2d IHT_4[] = {
     { vp9_idct4_1d, vp9_idct4_1d },  // DCT_DCT  = 0
     { iadst4_1d, vp9_idct4_1d },     // ADST_DCT = 1

@@ -312,10 +310,10 @@ void vp9_short_iht4x4_c(int16_t *input, int16_t *output,
       temp_in[j] = out[j * 4 + i];
     IHT_4[tx_type].cols(temp_in, temp_out);
     for (j = 0; j < 4; ++j)
-      output[j * pitch + i] = ROUND_POWER_OF_TWO(temp_out[j], 4);
+      dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 4)
+                                             + dest[j * dest_stride + i]);
   }
 }

 static void iadst8_1d(int16_t *input, int16_t *output) {
   int s0, s1, s2, s3, s4, s5, s6, s7;


@@ -11,11 +11,10 @@
 #include "vp9/common/vp9_invtrans.h"
 #include "./vp9_rtcd.h"

-void vp9_inverse_transform_b_4x4(MACROBLOCKD *xd, int eob,
-                                 int16_t *dqcoeff, int16_t *diff,
-                                 int pitch) {
+void vp9_inverse_transform_b_4x4_add(MACROBLOCKD *xd, int eob, int16_t *dqcoeff,
+                                     uint8_t *dest, int stride) {
   if (eob <= 1)
-    xd->inv_txm4x4_1(dqcoeff, diff, pitch);
+    xd->inv_txm4x4_1_add(dqcoeff, dest, stride);
   else
-    xd->inv_txm4x4(dqcoeff, diff, pitch);
+    xd->inv_txm4x4_add(dqcoeff, dest, stride);
 }


@@ -15,7 +15,6 @@
 #include "vpx/vpx_integer.h"
 #include "vp9/common/vp9_blockd.h"

-void vp9_inverse_transform_b_4x4(MACROBLOCKD *xd, int eob,
-                                 int16_t *dqcoeff, int16_t *diff,
-                                 int pitch);
+void vp9_inverse_transform_b_4x4_add(MACROBLOCKD *xd, int eob, int16_t *dqcoeff,
+                                     uint8_t *dest, int stride);
 #endif  // VP9_COMMON_VP9_INVTRANS_H_


@@ -85,9 +85,6 @@ prototype void vp9_intra4x4_predict "struct macroblockd *xd, int block, enum BLO
specialize vp9_intra4x4_predict;

if [ "$CONFIG_VP9_DECODER" = "yes" ]; then
-prototype void vp9_add_residual_4x4 "const int16_t *diff, uint8_t *dest, int stride"
-specialize vp9_add_residual_4x4 sse2
-
prototype void vp9_add_constant_residual_8x8 "const int16_t diff, uint8_t *dest, int stride"
specialize vp9_add_constant_residual_8x8 sse2

@@ -179,11 +176,11 @@ specialize vp9_convolve8_avg_vert ssse3
#
# dct
#
-prototype void vp9_short_idct4x4_1 "int16_t *input, int16_t *output, int pitch"
-specialize vp9_short_idct4x4_1
+prototype void vp9_short_idct4x4_1_add "int16_t *input, uint8_t *dest, int dest_stride"
+specialize vp9_short_idct4x4_1_add

-prototype void vp9_short_idct4x4 "int16_t *input, int16_t *output, int pitch"
-specialize vp9_short_idct4x4 sse2
+prototype void vp9_short_idct4x4_add "int16_t *input, uint8_t *dest, int dest_stride"
+specialize vp9_short_idct4x4_add sse2

prototype void vp9_short_idct8x8_add "int16_t *input, uint8_t *dest, int dest_stride"
specialize vp9_short_idct8x8_add sse2

@@ -212,12 +209,12 @@ specialize vp9_short_idct1_32x32
prototype void vp9_short_idct10_32x32_add "int16_t *input, uint8_t *dest, int dest_stride"
specialize vp9_short_idct10_32x32_add

+prototype void vp9_short_iht4x4_add "int16_t *input, uint8_t *dest, int dest_stride, int tx_type"
+specialize vp9_short_iht4x4_add
+
prototype void vp9_short_iht8x8_add "int16_t *input, uint8_t *dest, int dest_stride, int tx_type"
specialize vp9_short_iht8x8_add

-prototype void vp9_short_iht4x4 "int16_t *input, int16_t *output, int pitch, int tx_type"
-specialize vp9_short_iht4x4
-
prototype void vp9_short_iht16x16_add "int16_t *input, uint8_t *output, int pitch, int tx_type"
specialize vp9_short_iht16x16_add

@@ -229,12 +226,11 @@ specialize vp9_idct4_1d sse2
prototype void vp9_dc_only_idct_add "int input_dc, uint8_t *pred_ptr, uint8_t *dst_ptr, int pitch, int stride"
specialize vp9_dc_only_idct_add sse2

-prototype void vp9_short_iwalsh4x4_1 "int16_t *input, int16_t *output, int pitch"
-specialize vp9_short_iwalsh4x4_1
-prototype void vp9_short_iwalsh4x4 "int16_t *input, int16_t *output, int pitch"
-specialize vp9_short_iwalsh4x4
-prototype void vp9_dc_only_inv_walsh_add "int input_dc, uint8_t *pred_ptr, uint8_t *dst_ptr, int pitch, int stride"
-specialize vp9_dc_only_inv_walsh_add
+prototype void vp9_short_iwalsh4x4_1_add "int16_t *input, uint8_t *dest, int dest_stride"
+specialize vp9_short_iwalsh4x4_1_add
+prototype void vp9_short_iwalsh4x4_add "int16_t *input, uint8_t *dest, int dest_stride"
+specialize vp9_short_iwalsh4x4_add

prototype unsigned int vp9_sad32x3 "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, int max_sad"
specialize vp9_sad32x3


@@ -73,7 +73,7 @@ void vp9_dc_only_idct_add_sse2(int input_dc, uint8_t *pred_ptr,
   *(int *)dst_ptr = _mm_cvtsi128_si32(p1);
 }

-void vp9_short_idct4x4_sse2(int16_t *input, int16_t *output, int pitch) {
+void vp9_short_idct4x4_add_sse2(int16_t *input, uint8_t *dest, int stride) {
   const __m128i zero = _mm_setzero_si128();
   const __m128i eight = _mm_set1_epi16(8);
   const __m128i cst = _mm_setr_epi16((int16_t)cospi_16_64, (int16_t)cospi_16_64,

@@ -81,7 +81,6 @@ void vp9_short_idct4x4_sse2(int16_t *input, int16_t *output, int pitch) {
                                      (int16_t)cospi_24_64, (int16_t)-cospi_8_64,
                                      (int16_t)cospi_8_64, (int16_t)cospi_24_64);
   const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
-  const int half_pitch = pitch >> 1;
   __m128i input0, input1, input2, input3;

   // Rows

@@ -188,14 +187,23 @@ void vp9_short_idct4x4_sse2(int16_t *input, int16_t *output, int pitch) {
   input2 = _mm_srai_epi16(input2, 4);
   input3 = _mm_srai_epi16(input3, 4);

-  // Store results
-  _mm_storel_epi64((__m128i *)output, input2);
-  input2 = _mm_srli_si128(input2, 8);
-  _mm_storel_epi64((__m128i *)(output + half_pitch), input2);
-
-  _mm_storel_epi64((__m128i *)(output + 3 * half_pitch), input3);
-  input3 = _mm_srli_si128(input3, 8);
-  _mm_storel_epi64((__m128i *)(output + 2 * half_pitch), input3);
+#define RECON_AND_STORE4X4(dest, in_x) \
+  { \
+    __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest)); \
+    d0 = _mm_unpacklo_epi8(d0, zero); \
+    d0 = _mm_add_epi16(in_x, d0); \
+    d0 = _mm_packus_epi16(d0, d0); \
+    *(int *)dest = _mm_cvtsi128_si32(d0); \
+    dest += stride; \
+  }
+
+  input0 = _mm_srli_si128(input2, 8);
+  input1 = _mm_srli_si128(input3, 8);
+
+  RECON_AND_STORE4X4(dest, input2);
+  RECON_AND_STORE4X4(dest, input0);
+  RECON_AND_STORE4X4(dest, input1);
+  RECON_AND_STORE4X4(dest, input3);
 }

 void vp9_idct4_1d_sse2(int16_t *input, int16_t *output) {
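A note on the SSE2 path above: _mm_packus_epi16 narrows the 16-bit sums to bytes with unsigned saturation, so the [0, 255] clamp that clip_pixel() provides in the C code falls out of the pack step for free, and no explicit clamp is needed. A minimal sketch of what one RECON_AND_STORE4X4 expansion does for a 4-pixel row (recon_store_row is a hypothetical name, not part of the patch):

#include <emmintrin.h>  /* SSE2 */
#include <stdint.h>

/* Reconstruct one 4-pixel row: load the prediction, widen to 16 bits,
 * add the residual row (in the low four words of `res`), then narrow
 * with unsigned saturation and store the four reconstructed pixels. */
static void recon_store_row(uint8_t *dest, __m128i res) {
  const __m128i zero = _mm_setzero_si128();
  __m128i d0 = _mm_cvtsi32_si128(*(const int *)dest);  /* 4 predicted pixels */
  d0 = _mm_unpacklo_epi8(d0, zero);                    /* u8  -> s16 */
  d0 = _mm_add_epi16(res, d0);                         /* add residual */
  d0 = _mm_packus_epi16(d0, d0);                       /* saturate s16 -> u8 */
  *(int *)dest = _mm_cvtsi128_si32(d0);                /* store 4 pixels */
}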


@@ -998,14 +998,10 @@ int vp9_decode_frame(VP9D_COMP *pbi, const uint8_t **p_data_end) {
                  pc->uv_dc_delta_q == 0 &&
                  pc->uv_ac_delta_q == 0;
   if (xd->lossless) {
-    xd->inv_txm4x4_1 = vp9_short_iwalsh4x4_1;
-    xd->inv_txm4x4 = vp9_short_iwalsh4x4;
     xd->itxm_add = vp9_idct_add_lossless_c;
     xd->itxm_add_y_block = vp9_idct_add_y_block_lossless_c;
     xd->itxm_add_uv_block = vp9_idct_add_uv_block_lossless_c;
   } else {
-    xd->inv_txm4x4_1 = vp9_short_idct4x4_1;
-    xd->inv_txm4x4 = vp9_short_idct4x4;
     xd->itxm_add = vp9_idct_add;
     xd->itxm_add_y_block = vp9_idct_add_y_block;
     xd->itxm_add_uv_block = vp9_idct_add_uv_block;


@@ -84,23 +84,6 @@ void vp9_idct_add_uv_block_lossless_c(int16_t *q, uint8_t *dst, int stride,
   }
 }

-static void add_residual(const int16_t *diff, uint8_t *dest, int stride,
-                         int width, int height) {
-  int r, c;
-
-  for (r = 0; r < height; r++) {
-    for (c = 0; c < width; c++)
-      dest[c] = clip_pixel(diff[c] + dest[c]);
-
-    dest += stride;
-    diff += width;
-  }
-}
-
-void vp9_add_residual_4x4_c(const int16_t *diff, uint8_t *dest, int stride) {
-  add_residual(diff, dest, stride, 4, 4);
-}
-
 static void add_constant_residual(const int16_t diff, uint8_t *dest, int stride,
                                   int width, int height) {
   int r, c;

@@ -133,11 +116,8 @@ void vp9_iht_add_c(TX_TYPE tx_type, int16_t *input, uint8_t *dest, int stride,
   if (tx_type == DCT_DCT) {
     vp9_idct_add(input, dest, stride, eob);
   } else {
-    DECLARE_ALIGNED_ARRAY(16, int16_t, output, 16);
-
-    vp9_short_iht4x4(input, output, 4, tx_type);
+    vp9_short_iht4x4_add(input, dest, stride, tx_type);
     vpx_memset(input, 0, 32);
-    vp9_add_residual_4x4(output, dest, stride);
   }
 }

@@ -154,13 +134,9 @@ void vp9_iht_add_8x8_c(TX_TYPE tx_type, int16_t *input, uint8_t *dest,
 }

 void vp9_idct_add_c(int16_t *input, uint8_t *dest, int stride, int eob) {
-  DECLARE_ALIGNED_ARRAY(16, int16_t, output, 16);
-
   if (eob > 1) {
-    // the idct halves ( >> 1) the pitch
-    vp9_short_idct4x4(input, output, 4 << 1);
+    vp9_short_idct4x4_add(input, dest, stride);
     vpx_memset(input, 0, 32);
-    vp9_add_residual_4x4(output, dest, stride);
   } else {
     vp9_dc_only_idct_add(input[0], dest, dest, stride, stride);
     ((int *)input)[0] = 0;

@@ -168,38 +144,27 @@ void vp9_idct_add_c(int16_t *input, uint8_t *dest, int stride, int eob) {
 }

 void vp9_dc_idct_add_c(int16_t *input, uint8_t *dest, int stride, int dc) {
-  DECLARE_ALIGNED_ARRAY(16, int16_t, output, 16);
-
   input[0] = dc;
-  // the idct halves ( >> 1) the pitch
-  vp9_short_idct4x4(input, output, 4 << 1);
+  vp9_short_idct4x4_add(input, dest, stride);
   vpx_memset(input, 0, 32);
-  vp9_add_residual_4x4(output, dest, stride);
 }

 void vp9_idct_add_lossless_c(int16_t *input, uint8_t *dest, int stride,
                              int eob) {
-  DECLARE_ALIGNED_ARRAY(16, int16_t, output, 16);
-
   if (eob > 1) {
-    vp9_short_iwalsh4x4_c(input, output, 4 << 1);
+    vp9_short_iwalsh4x4_add(input, dest, stride);
     vpx_memset(input, 0, 32);
-    vp9_add_residual_4x4(output, dest, stride);
   } else {
-    vp9_dc_only_inv_walsh_add(input[0], dest, dest, stride, stride);
+    vp9_short_iwalsh4x4_1_add_c(input, dest, stride);
     ((int *)input)[0] = 0;
   }
 }

 void vp9_dc_idct_add_lossless_c(int16_t *input, uint8_t *dest,
                                 int stride, int dc) {
-  DECLARE_ALIGNED_ARRAY(16, int16_t, output, 16);
-
   input[0] = dc;
-  vp9_short_iwalsh4x4_c(input, output, 4 << 1);
+  vp9_short_iwalsh4x4_add(input, dest, stride);
   vpx_memset(input, 0, 32);
-  vp9_add_residual_4x4(output, dest, stride);
 }

 void vp9_idct_add_8x8_c(int16_t *input, uint8_t *dest, int stride, int eob) {


@@ -15,49 +15,6 @@
 #include "vp9/common/vp9_common.h"
 #include "vp9/common/vp9_idct.h"

-void vp9_add_residual_4x4_sse2(const int16_t *diff, uint8_t *dest, int stride) {
-  const int width = 4;
-  const __m128i zero = _mm_setzero_si128();
-
-  // Diff data
-  const __m128i d0 = _mm_loadl_epi64((const __m128i *)(diff + 0 * width));
-  const __m128i d1 = _mm_loadl_epi64((const __m128i *)(diff + 1 * width));
-  const __m128i d2 = _mm_loadl_epi64((const __m128i *)(diff + 2 * width));
-  const __m128i d3 = _mm_loadl_epi64((const __m128i *)(diff + 3 * width));
-
-  // Prediction data.
-  __m128i p0 = _mm_cvtsi32_si128(*(const int *)(dest + 0 * stride));
-  __m128i p1 = _mm_cvtsi32_si128(*(const int *)(dest + 1 * stride));
-  __m128i p2 = _mm_cvtsi32_si128(*(const int *)(dest + 2 * stride));
-  __m128i p3 = _mm_cvtsi32_si128(*(const int *)(dest + 3 * stride));
-
-  p0 = _mm_unpacklo_epi8(p0, zero);
-  p1 = _mm_unpacklo_epi8(p1, zero);
-  p2 = _mm_unpacklo_epi8(p2, zero);
-  p3 = _mm_unpacklo_epi8(p3, zero);
-
-  p0 = _mm_add_epi16(p0, d0);
-  p1 = _mm_add_epi16(p1, d1);
-  p2 = _mm_add_epi16(p2, d2);
-  p3 = _mm_add_epi16(p3, d3);
-
-  p0 = _mm_packus_epi16(p0, p1);
-  p2 = _mm_packus_epi16(p2, p3);
-
-  *(int *)dest = _mm_cvtsi128_si32(p0);
-  dest += stride;
-
-  p0 = _mm_srli_si128(p0, 8);
-  *(int *)dest = _mm_cvtsi128_si32(p0);
-  dest += stride;
-
-  *(int *)dest = _mm_cvtsi128_si32(p2);
-  dest += stride;
-
-  p2 = _mm_srli_si128(p2, 8);
-  *(int *)dest = _mm_cvtsi128_si32(p2);
-}
-
 void vp9_add_constant_residual_8x8_sse2(const int16_t diff, uint8_t *dest,
                                         int stride) {
   uint8_t abs_diff;


@@ -1207,8 +1207,8 @@ static void switch_lossless_mode(VP9_COMP *cpi, int lossless) {
   if (lossless) {
     cpi->mb.fwd_txm8x4 = vp9_short_walsh8x4;
     cpi->mb.fwd_txm4x4 = vp9_short_walsh4x4;
-    cpi->mb.e_mbd.inv_txm4x4_1 = vp9_short_iwalsh4x4_1;
-    cpi->mb.e_mbd.inv_txm4x4 = vp9_short_iwalsh4x4;
+    cpi->mb.e_mbd.inv_txm4x4_1_add = vp9_short_iwalsh4x4_1_add;
+    cpi->mb.e_mbd.inv_txm4x4_add = vp9_short_iwalsh4x4_add;
     cpi->mb.optimize = 0;
     cpi->common.filter_level = 0;
     cpi->zbin_mode_boost_enabled = 0;

@@ -1216,8 +1216,8 @@ static void switch_lossless_mode(VP9_COMP *cpi, int lossless) {
   } else {
     cpi->mb.fwd_txm8x4 = vp9_short_fdct8x4;
     cpi->mb.fwd_txm4x4 = vp9_short_fdct4x4;
-    cpi->mb.e_mbd.inv_txm4x4_1 = vp9_short_idct4x4_1;
-    cpi->mb.e_mbd.inv_txm4x4 = vp9_short_idct4x4;
+    cpi->mb.e_mbd.inv_txm4x4_1_add = vp9_short_idct4x4_1_add;
+    cpi->mb.e_mbd.inv_txm4x4_add = vp9_short_idct4x4_add;
   }
 }


@@ -53,9 +53,6 @@ static void encode_intra4x4block(MACROBLOCK *x, int ib,
   int16_t* const src_diff =
       raster_block_offset_int16(xd, bsize, 0, ib,
                                 x->plane[0].src_diff);
-  int16_t* const diff =
-      raster_block_offset_int16(xd, bsize, 0, ib,
-                                xd->plane[0].diff);
   int16_t* const coeff = BLOCK_OFFSET(x->plane[0].coeff, ib, 16);
   const int bwl = b_width_log2(bsize), bhl = b_height_log2(bsize);

@@ -72,17 +69,15 @@ static void encode_intra4x4block(MACROBLOCK *x, int ib,
   if (tx_type != DCT_DCT) {
     vp9_short_fht4x4(src_diff, coeff, 4 << bwl, tx_type);
     x->quantize_b_4x4(x, ib, tx_type, 16);
-    vp9_short_iht4x4(BLOCK_OFFSET(xd->plane[0].dqcoeff, ib, 16),
-                     diff, 4 << bwl, tx_type);
+    vp9_short_iht4x4_add(BLOCK_OFFSET(xd->plane[0].dqcoeff, ib, 16), dst,
+                         xd->plane[0].dst.stride, tx_type);
   } else {
     x->fwd_txm4x4(src_diff, coeff, 8 << bwl);
     x->quantize_b_4x4(x, ib, tx_type, 16);
-    vp9_inverse_transform_b_4x4(&x->e_mbd, xd->plane[0].eobs[ib],
+    vp9_inverse_transform_b_4x4_add(&x->e_mbd, xd->plane[0].eobs[ib],
                                 BLOCK_OFFSET(xd->plane[0].dqcoeff, ib, 16),
-                                diff, 8 << bwl);
+                                dst, xd->plane[0].dst.stride);
   }
-
-  vp9_recon_b(dst, diff, 4 << bwl, dst, xd->plane[0].dst.stride);
 }

 void vp9_encode_intra4x4mby(MACROBLOCK *mb, BLOCK_SIZE_TYPE bsize) {


@@ -425,7 +425,6 @@ struct encode_b_args {
   VP9_COMMON *cm;
   MACROBLOCK *x;
   struct optimize_ctx *ctx;
-  int *wip_txfrm_size;  // for "work in progress" only... will remove once done
 };

 static void xform_quant(int plane, int block, BLOCK_SIZE_TYPE bsize,

@@ -494,14 +493,9 @@ static void encode_block(int plane, int block, BLOCK_SIZE_TYPE bsize,
                          int ss_txfrm_size, void *arg) {
   struct encode_b_args* const args = arg;
   MACROBLOCK* const x = args->x;
-  int *wip_txfrm_size = args->wip_txfrm_size;
   MACROBLOCKD* const xd = &x->e_mbd;
-  const int bw = 4 << (b_width_log2(bsize) - xd->plane[plane].subsampling_x);
   const int raster_block = txfrm_block_to_raster_block(xd, bsize, plane,
                                                        block, ss_txfrm_size);
-  int16_t* const diff = raster_block_offset_int16(xd, bsize, plane,
-                                                  raster_block,
-                                                  xd->plane[plane].diff);
   uint8_t* const dst = raster_block_offset_uint8(xd, bsize, plane,
                                                  raster_block,
                                                  xd->plane[plane].dst.buf,

@@ -517,7 +511,6 @@ static void encode_block(int plane, int block, BLOCK_SIZE_TYPE bsize,
     case TX_32X32:
       vp9_short_idct32x32_add(BLOCK_OFFSET(xd->plane[plane].dqcoeff,
                               block, 16), dst, xd->plane[plane].dst.stride);
-      *wip_txfrm_size = 32;
      break;
    case TX_16X16:
      tx_type = plane == 0 ? get_tx_type_16x16(xd, raster_block) : DCT_DCT;

@@ -529,7 +522,6 @@ static void encode_block(int plane, int block, BLOCK_SIZE_TYPE bsize,
                                block, 16), dst, xd->plane[plane].dst.stride,
                                tx_type);
       }
-      *wip_txfrm_size = 16;
      break;
    case TX_8X8:
      tx_type = plane == 0 ? get_tx_type_8x8(xd, raster_block) : DCT_DCT;

@@ -541,7 +533,6 @@ static void encode_block(int plane, int block, BLOCK_SIZE_TYPE bsize,
                               block, 16), dst, xd->plane[plane].dst.stride,
                               tx_type);
       }
-      *wip_txfrm_size = 8;
      break;
    case TX_4X4:
      tx_type = plane == 0 ? get_tx_type_4x4(xd, raster_block) : DCT_DCT;

@@ -549,13 +540,13 @@ static void encode_block(int plane, int block, BLOCK_SIZE_TYPE bsize,
         // this is like vp9_short_idct4x4 but has a special case around eob<=1
         // which is significant (not just an optimization) for the lossless
         // case.
-        vp9_inverse_transform_b_4x4(xd, xd->plane[plane].eobs[block],
-            BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16), diff, bw * 2);
+        vp9_inverse_transform_b_4x4_add(xd, xd->plane[plane].eobs[block],
+            BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16), dst,
+            xd->plane[plane].dst.stride);
       } else {
-        vp9_short_iht4x4(BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16),
-                         diff, bw, tx_type);
+        vp9_short_iht4x4_add(BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16),
+                             dst, xd->plane[plane].dst.stride, tx_type);
       }
-      *wip_txfrm_size = 4;
       break;
   }
 }

@@ -563,16 +554,15 @@ static void encode_block(int plane, int block, BLOCK_SIZE_TYPE bsize,
 void vp9_xform_quant_sby(VP9_COMMON *const cm, MACROBLOCK *x,
                          BLOCK_SIZE_TYPE bsize) {
   MACROBLOCKD* const xd = &x->e_mbd;
-  struct encode_b_args arg = {cm, x, NULL, NULL};
+  struct encode_b_args arg = {cm, x, NULL};

-  foreach_transformed_block_in_plane(xd, bsize, 0,
-                                     xform_quant, &arg);
+  foreach_transformed_block_in_plane(xd, bsize, 0, xform_quant, &arg);
 }

 void vp9_xform_quant_sbuv(VP9_COMMON *const cm, MACROBLOCK *x,
                           BLOCK_SIZE_TYPE bsize) {
   MACROBLOCKD* const xd = &x->e_mbd;
-  struct encode_b_args arg = {cm, x, NULL, NULL};
+  struct encode_b_args arg = {cm, x, NULL};

   foreach_transformed_block_uv(xd, bsize, xform_quant, &arg);
 }

@@ -581,61 +571,37 @@ void vp9_encode_sby(VP9_COMMON *const cm, MACROBLOCK *x,
                     BLOCK_SIZE_TYPE bsize) {
   MACROBLOCKD* const xd = &x->e_mbd;
   struct optimize_ctx ctx;
-  int wip_txfrm_size = 0;
-  struct encode_b_args arg = {cm, x, &ctx, &wip_txfrm_size};
+  struct encode_b_args arg = {cm, x, &ctx};

   vp9_subtract_sby(x, bsize);
   if (x->optimize)
     vp9_optimize_init(xd, bsize, &ctx);

-  foreach_transformed_block_in_plane(xd, bsize, 0,
-                                     encode_block, &arg);
-
-  if (wip_txfrm_size < 8)
-    vp9_recon_sby(xd, bsize);
+  foreach_transformed_block_in_plane(xd, bsize, 0, encode_block, &arg);
 }

 void vp9_encode_sbuv(VP9_COMMON *const cm, MACROBLOCK *x,
                      BLOCK_SIZE_TYPE bsize) {
   MACROBLOCKD* const xd = &x->e_mbd;
   struct optimize_ctx ctx;
-  int wip_txfrm_size = 0;
-  struct encode_b_args arg = {cm, x, &ctx, &wip_txfrm_size};
+  struct encode_b_args arg = {cm, x, &ctx};

   vp9_subtract_sbuv(x, bsize);
   if (x->optimize)
     vp9_optimize_init(xd, bsize, &ctx);

   foreach_transformed_block_uv(xd, bsize, encode_block, &arg);
-
-  if (wip_txfrm_size < 8)
-    vp9_recon_sbuv(xd, bsize);
 }

 void vp9_encode_sb(VP9_COMMON *const cm, MACROBLOCK *x,
                    BLOCK_SIZE_TYPE bsize) {
   MACROBLOCKD* const xd = &x->e_mbd;
   struct optimize_ctx ctx;
-  int wip_txfrm_size = 0;
-  struct encode_b_args arg = {cm, x, &ctx, &wip_txfrm_size};
+  struct encode_b_args arg = {cm, x, &ctx};

   vp9_subtract_sb(x, bsize);
   if (x->optimize)
     vp9_optimize_init(xd, bsize, &ctx);

-#if 0
   foreach_transformed_block(xd, bsize, encode_block, &arg);
-  vp9_recon_sb(xd, bsize);
-#else
-  // wip version... will use foreach_transformed_block when done
-  foreach_transformed_block_in_plane(xd, bsize, 0,
-                                     encode_block, &arg);
-  if (wip_txfrm_size < 8)
-    vp9_recon_sby(xd, bsize);
-  wip_txfrm_size = 0;
-  foreach_transformed_block_uv(xd, bsize, encode_block, &arg);
-  if (wip_txfrm_size < 8)
-    vp9_recon_sbuv(xd, bsize);
-#endif
 }


@@ -1129,11 +1129,11 @@ void vp9_change_config(VP9_PTR ptr, VP9_CONFIG *oxcf) {
   cpi->oxcf.lossless = oxcf->lossless;
   if (cpi->oxcf.lossless) {
-    cpi->mb.e_mbd.inv_txm4x4_1 = vp9_short_iwalsh4x4_1;
-    cpi->mb.e_mbd.inv_txm4x4 = vp9_short_iwalsh4x4;
+    cpi->mb.e_mbd.inv_txm4x4_1_add = vp9_short_iwalsh4x4_1_add;
+    cpi->mb.e_mbd.inv_txm4x4_add = vp9_short_iwalsh4x4_add;
   } else {
-    cpi->mb.e_mbd.inv_txm4x4_1 = vp9_short_idct4x4_1;
-    cpi->mb.e_mbd.inv_txm4x4 = vp9_short_idct4x4;
+    cpi->mb.e_mbd.inv_txm4x4_1_add = vp9_short_idct4x4_1_add;
+    cpi->mb.e_mbd.inv_txm4x4_add = vp9_short_idct4x4_add;
   }

   cpi->baseline_gf_interval = DEFAULT_GF_INTERVAL;


@@ -592,11 +592,6 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
                                 BLOCK_SIZE_SB8X8,
                                 0, ib,
                                 x->plane[0].src_diff);
-  int16_t* const diff =
-      raster_block_offset_int16(xd,
-                                BLOCK_SIZE_SB8X8,
-                                0, ib,
-                                xd->plane[0].diff);
   int16_t* const coeff = BLOCK_OFFSET(x->plane[0].coeff, ib, 16);
   uint8_t* const dst =
       raster_block_offset_uint8(xd,

@@ -668,18 +663,18 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
   xd->mode_info_context->bmi[ib].as_mode.first =
     (B_PREDICTION_MODE)(*best_mode);

-  // inverse transform
-  if (best_tx_type != DCT_DCT)
-    vp9_short_iht4x4(best_dqcoeff, diff, 8, best_tx_type);
-  else
-    xd->inv_txm4x4(best_dqcoeff, diff, 16);
-
   vp9_intra4x4_predict(xd, ib,
                        BLOCK_SIZE_SB8X8,
                        *best_mode,
                        dst, xd->plane[0].dst.stride);

-  vp9_recon_b(dst, diff, 8,
-              dst, xd->plane[0].dst.stride);
+  // inverse transform
+  if (best_tx_type != DCT_DCT) {
+    vp9_short_iht4x4_add(best_dqcoeff, dst, xd->plane[0].dst.stride,
+                         best_tx_type);
+  } else {
+    xd->inv_txm4x4_add(best_dqcoeff, dst, xd->plane[0].dst.stride);
+  }

   return best_rd;
 }