From 96e41cb461893e240e24950242879a538aee7728 Mon Sep 17 00:00:00 2001
From: Scott LaVarnway <slavarnway@google.com>
Date: Thu, 14 Mar 2013 09:49:38 -0700
Subject: [PATCH 01/19] Removed shadow warnings : bitstream.c encodeframe.c
 onyx_if.c

ratectrl.c and quantize.c

Adding -Wshadow to CFLAGS generated a bunch of warnings.  This patch
removes these warnings.

Change-Id: I8c8faa9fde57c1c49662d332a90bc8d9a0f4a2ce
---
 vp8/encoder/bitstream.c   | 22 +++++++++++-----------
 vp8/encoder/encodeframe.c |  3 +--
 vp8/encoder/onyx_if.c     |  2 +-
 vp8/encoder/quantize.c    |  6 +++---
 vp8/encoder/ratectrl.c    |  4 ++--
 5 files changed, 18 insertions(+), 19 deletions(-)

diff --git a/vp8/encoder/bitstream.c b/vp8/encoder/bitstream.c
index 8f94171e6..5c41ec8a1 100644
--- a/vp8/encoder/bitstream.c
+++ b/vp8/encoder/bitstream.c
@@ -90,17 +90,17 @@ static void update_mode(
 
     if (new_b + (n << 8) < old_b)
     {
-        int i = 0;
+        int j = 0;
 
         vp8_write_bit(w, 1);
 
         do
         {
-            const vp8_prob p = Pnew[i];
+            const vp8_prob p = Pnew[j];
 
-            vp8_write_literal(w, Pcur[i] = p ? p : 1, 8);
+            vp8_write_literal(w, Pcur[j] = p ? p : 1, 8);
         }
-        while (++i < n);
+        while (++j < n);
     }
     else
         vp8_write_bit(w, 0);
@@ -245,15 +245,15 @@ void vp8_pack_tokens_c(vp8_writer *w, const TOKENEXTRA *p, int xcount)
 
             if (L)
             {
-                const unsigned char *pp = b->prob;
-                int v = e >> 1;
-                int n = L;              /* number of bits in v, assumed nonzero */
-                int i = 0;
+                const unsigned char *proba = b->prob;
+                const int v2 = e >> 1;
+                int n2 = L;              /* number of bits in v2, assumed nonzero */
+                i = 0;
 
                 do
                 {
-                    const int bb = (v >> --n) & 1;
-                    split = 1 + (((range - 1) * pp[i>>1]) >> 8);
+                    const int bb = (v2 >> --n2) & 1;
+                    split = 1 + (((range - 1) * proba[i>>1]) >> 8);
                     i = b->tree[i+bb];
 
                     if (bb)
@@ -301,7 +301,7 @@ void vp8_pack_tokens_c(vp8_writer *w, const TOKENEXTRA *p, int xcount)
 
                     lowvalue <<= shift;
                 }
-                while (n);
+                while (n2);
             }
 
 
diff --git a/vp8/encoder/encodeframe.c b/vp8/encoder/encodeframe.c
index d1b647be9..07bc33ced 100644
--- a/vp8/encoder/encodeframe.c
+++ b/vp8/encoder/encodeframe.c
@@ -852,11 +852,10 @@ void vp8_encode_frame(VP8_COMP *cpi)
 
             if (xd->segmentation_enabled)
             {
-                int i, j;
+                int j;
 
                 if (xd->segmentation_enabled)
                 {
-
                     for (i = 0; i < cpi->encoding_thread_count; i++)
                     {
                         for (j = 0; j < 4; j++)
diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c
index 555a9e4bc..60dd856f4 100644
--- a/vp8/encoder/onyx_if.c
+++ b/vp8/encoder/onyx_if.c
@@ -826,7 +826,7 @@ void vp8_set_speed_features(VP8_COMP *cpi)
         {
             unsigned int sum = 0;
             unsigned int total_mbs = cm->MBs;
-            int i, thresh;
+            int thresh;
             unsigned int total_skip;
 
             int min = 2000;
diff --git a/vp8/encoder/quantize.c b/vp8/encoder/quantize.c
index 4e2fef793..fda997ff6 100644
--- a/vp8/encoder/quantize.c
+++ b/vp8/encoder/quantize.c
@@ -184,17 +184,17 @@ void vp8_strict_quantize_b_c(BLOCK *b, BLOCKD *d)
     for (i = 0; i < 16; i++)
     {
         int dq;
-        int round;
+        int rounding;
 
         /*TODO: These arrays should be stored in zig-zag order.*/
         rc = vp8_default_zig_zag1d[i];
         z = coeff_ptr[rc];
         dq = dequant_ptr[rc];
-        round = dq >> 1;
+        rounding = dq >> 1;
         /* Sign of z. */
         sz = -(z < 0);
         x = (z + sz) ^ sz;
-        x += round;
+        x += rounding;
         if (x >= dq)
         {
             /* Quantize x. */
diff --git a/vp8/encoder/ratectrl.c b/vp8/encoder/ratectrl.c
index 65fd0c5be..8e3c01d93 100644
--- a/vp8/encoder/ratectrl.c
+++ b/vp8/encoder/ratectrl.c
@@ -614,7 +614,6 @@ static void calc_gf_params(VP8_COMP *cpi)
 static void calc_pframe_target_size(VP8_COMP *cpi)
 {
     int min_frame_target;
-    int Adjustment;
     int old_per_frame_bandwidth = cpi->per_frame_bandwidth;
 
     if ( cpi->current_layer > 0)
@@ -658,6 +657,7 @@ static void calc_pframe_target_size(VP8_COMP *cpi)
         /* 1 pass */
         else
         {
+            int Adjustment;
             /* Make rate adjustment to recover bits spent in key frame
              * Test to see if the key frame inter data rate correction
              * should still be in force
@@ -688,7 +688,7 @@ static void calc_pframe_target_size(VP8_COMP *cpi)
              */
             if ((cpi->gf_overspend_bits > 0) && (cpi->this_frame_target > min_frame_target))
             {
-                int Adjustment = (cpi->non_gf_bitrate_adjustment <= cpi->gf_overspend_bits) ? cpi->non_gf_bitrate_adjustment : cpi->gf_overspend_bits;
+                Adjustment = (cpi->non_gf_bitrate_adjustment <= cpi->gf_overspend_bits) ? cpi->non_gf_bitrate_adjustment : cpi->gf_overspend_bits;
 
                 if (Adjustment > (cpi->this_frame_target - min_frame_target))
                     Adjustment = (cpi->this_frame_target - min_frame_target);

From 5d79720d57a4356e5dacfdb66b4afce17dcd34f5 Mon Sep 17 00:00:00 2001
From: Scott LaVarnway <slavarnway@google.com>
Date: Thu, 14 Mar 2013 14:23:13 -0700
Subject: [PATCH 02/19] Removed shadow warnings : mcomp.c rdopt.c

Adding -Wshadow to CFLAGS generated a bunch of warnings.  This patch
removes these warnings.

Change-Id: Ib498de4b8652051d257cf86dcb40d2968a5013ae
---
 vp8/encoder/mcomp.c | 12 ++++++------
 vp8/encoder/rdopt.c | 19 +++++++++----------
 2 files changed, 15 insertions(+), 16 deletions(-)

diff --git a/vp8/encoder/mcomp.c b/vp8/encoder/mcomp.c
index a34af6428..038213007 100644
--- a/vp8/encoder/mcomp.c
+++ b/vp8/encoder/mcomp.c
@@ -233,7 +233,7 @@ int vp8_find_best_sub_pixel_step_iteratively(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
 
 #if ARCH_X86 || ARCH_X86_64
     MACROBLOCKD *xd = &x->e_mbd;
-    unsigned char *y0 = base_pre + d->offset + (bestmv->as_mv.row) * pre_stride + bestmv->as_mv.col;
+    unsigned char *y_0 = base_pre + d->offset + (bestmv->as_mv.row) * pre_stride + bestmv->as_mv.col;
     unsigned char *y;
     int buf_r1, buf_r2, buf_c1;
 
@@ -244,7 +244,7 @@ int vp8_find_best_sub_pixel_step_iteratively(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
     y_stride = 32;
 
     /* Copy to intermediate buffer before searching. */
-    vfp->copymem(y0 - buf_c1 - pre_stride*buf_r1, pre_stride, xd->y_buf, y_stride, 16+buf_r1+buf_r2);
+    vfp->copymem(y_0 - buf_c1 - pre_stride*buf_r1, pre_stride, xd->y_buf, y_stride, 16+buf_r1+buf_r2);
     y = xd->y_buf + y_stride*buf_r1 +buf_c1;
 #else
     unsigned char *y = base_pre + d->offset + (bestmv->as_mv.row) * pre_stride + bestmv->as_mv.col;
@@ -375,12 +375,12 @@ int vp8_find_best_sub_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
 
 #if ARCH_X86 || ARCH_X86_64
     MACROBLOCKD *xd = &x->e_mbd;
-    unsigned char *y0 = base_pre + d->offset + (bestmv->as_mv.row) * pre_stride + bestmv->as_mv.col;
+    unsigned char *y_0 = base_pre + d->offset + (bestmv->as_mv.row) * pre_stride + bestmv->as_mv.col;
     unsigned char *y;
 
     y_stride = 32;
     /* Copy 18 rows x 32 cols area to intermediate buffer before searching. */
-     vfp->copymem(y0 - 1 - pre_stride, pre_stride, xd->y_buf, y_stride, 18);
+     vfp->copymem(y_0 - 1 - pre_stride, pre_stride, xd->y_buf, y_stride, 18);
      y = xd->y_buf + y_stride + 1;
 #else
      unsigned char *y = base_pre + d->offset + (bestmv->as_mv.row) * pre_stride + bestmv->as_mv.col;
@@ -686,12 +686,12 @@ int vp8_find_best_half_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
 
 #if ARCH_X86 || ARCH_X86_64
     MACROBLOCKD *xd = &x->e_mbd;
-    unsigned char *y0 = base_pre + d->offset + (bestmv->as_mv.row) * pre_stride + bestmv->as_mv.col;
+    unsigned char *y_0 = base_pre + d->offset + (bestmv->as_mv.row) * pre_stride + bestmv->as_mv.col;
     unsigned char *y;
 
     y_stride = 32;
     /* Copy 18 rows x 32 cols area to intermediate buffer before searching. */
-    vfp->copymem(y0 - 1 - pre_stride, pre_stride, xd->y_buf, y_stride, 18);
+    vfp->copymem(y_0 - 1 - pre_stride, pre_stride, xd->y_buf, y_stride, 18);
     y = xd->y_buf + y_stride + 1;
 #else
     unsigned char *y = base_pre + d->offset + (bestmv->as_mv.row) * pre_stride + bestmv->as_mv.col;
diff --git a/vp8/encoder/rdopt.c b/vp8/encoder/rdopt.c
index 3d60bebda..57114fb64 100644
--- a/vp8/encoder/rdopt.c
+++ b/vp8/encoder/rdopt.c
@@ -884,8 +884,8 @@ static void rd_pick_intra_mbuv_mode(MACROBLOCK *x, int *rate,
 
     for (mode = DC_PRED; mode <= TM_PRED; mode++)
     {
-        int rate;
-        int distortion;
+        int this_rate;
+        int this_distortion;
         int this_rd;
 
         xd->mode_info_context->mbmi.uv_mode = mode;
@@ -907,17 +907,17 @@ static void rd_pick_intra_mbuv_mode(MACROBLOCK *x, int *rate,
         vp8_quantize_mbuv(x);
 
         rate_to = rd_cost_mbuv(x);
-        rate = rate_to + x->intra_uv_mode_cost[xd->frame_type][xd->mode_info_context->mbmi.uv_mode];
+        this_rate = rate_to + x->intra_uv_mode_cost[xd->frame_type][xd->mode_info_context->mbmi.uv_mode];
 
-        distortion = vp8_mbuverror(x) / 4;
+        this_distortion = vp8_mbuverror(x) / 4;
 
-        this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion);
+        this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_distortion);
 
         if (this_rd < best_rd)
         {
             best_rd = this_rd;
-            d = distortion;
-            r = rate;
+            d = this_distortion;
+            r = this_rate;
             *rate_tokenonly = rate_to;
             mode_selected = mode;
         }
@@ -1294,12 +1294,11 @@ static void rd_check_segment(VP8_COMP *cpi, MACROBLOCK *x,
 
                 if (bestsme < INT_MAX)
                 {
-                    int distortion;
+                    int disto;
                     unsigned int sse;
                     cpi->find_fractional_mv_step(x, c, e, &mode_mv[NEW4X4],
                         bsi->ref_mv, x->errorperbit, v_fn_ptr, x->mvcost,
-                        &distortion, &sse);
-
+                        &disto, &sse);
                 }
             } /* NEW4X4 */
 

From ae64e7b408b32a8c6c3ce94c1811b6c47d884a48 Mon Sep 17 00:00:00 2001
From: Scott LaVarnway <slavarnway@google.com>
Date: Thu, 14 Mar 2013 14:45:23 -0700
Subject: [PATCH 03/19] Removed shadow warnings : postproc.c decodframe.c
 threading.c

and denoising.c
Adding -Wshadow to CFLAGS generated a bunch of warnings.  This patch
removes these warnings.

Change-Id: I434a9f4bfac9ad4ab7d2a67a35ef21e6636280da
---
 vp8/common/postproc.c    | 77 ++++++++++++++++++++--------------------
 vp8/decoder/decodframe.c |  6 ++--
 vp8/decoder/threading.c  |  1 -
 vp8/encoder/denoising.c  |  2 --
 4 files changed, 41 insertions(+), 45 deletions(-)

diff --git a/vp8/common/postproc.c b/vp8/common/postproc.c
index e40fb111c..0266f4c09 100644
--- a/vp8/common/postproc.c
+++ b/vp8/common/postproc.c
@@ -439,29 +439,28 @@ static void fillrd(struct postproc_state *state, int q, int a)
     char char_dist[300];
 
     double sigma;
-    int ai = a, qi = q, i;
+    int i;
 
     vp8_clear_system_state();
 
 
-    sigma = ai + .5 + .6 * (63 - qi) / 63.0;
+    sigma = a + .5 + .6 * (63 - q) / 63.0;
 
     /* set up a lookup table of 256 entries that matches
      * a gaussian distribution with sigma determined by q.
      */
     {
-        double i;
         int next, j;
 
         next = 0;
 
         for (i = -32; i < 32; i++)
         {
-            int a = (int)(.5 + 256 * vp8_gaussian(sigma, 0, i));
+            const int v = (int)(.5 + 256 * vp8_gaussian(sigma, 0, i));
 
-            if (a)
+            if (v)
             {
-                for (j = 0; j < a; j++)
+                for (j = 0; j < v; j++)
                 {
                     char_dist[next+j] = (char) i;
                 }
@@ -544,12 +543,12 @@ void vp8_plane_add_noise_c(unsigned char *Start, char *noise,
  * filled with the same color block.
  */
 void vp8_blend_mb_inner_c (unsigned char *y, unsigned char *u, unsigned char *v,
-                        int y1, int u1, int v1, int alpha, int stride)
+                        int y_1, int u_1, int v_1, int alpha, int stride)
 {
     int i, j;
-    int y1_const = y1*((1<<16)-alpha);
-    int u1_const = u1*((1<<16)-alpha);
-    int v1_const = v1*((1<<16)-alpha);
+    int y1_const = y_1*((1<<16)-alpha);
+    int u1_const = u_1*((1<<16)-alpha);
+    int v1_const = v_1*((1<<16)-alpha);
 
     y += 2*stride + 2;
     for (i = 0; i < 12; i++)
@@ -582,12 +581,12 @@ void vp8_blend_mb_inner_c (unsigned char *y, unsigned char *u, unsigned char *v,
  * unblended to allow for other visualizations to be layered.
  */
 void vp8_blend_mb_outer_c (unsigned char *y, unsigned char *u, unsigned char *v,
-                        int y1, int u1, int v1, int alpha, int stride)
+                        int y_1, int u_1, int v_1, int alpha, int stride)
 {
     int i, j;
-    int y1_const = y1*((1<<16)-alpha);
-    int u1_const = u1*((1<<16)-alpha);
-    int v1_const = v1*((1<<16)-alpha);
+    int y1_const = y_1*((1<<16)-alpha);
+    int u1_const = u_1*((1<<16)-alpha);
+    int v1_const = v_1*((1<<16)-alpha);
 
     for (i = 0; i < 2; i++)
     {
@@ -646,12 +645,12 @@ void vp8_blend_mb_outer_c (unsigned char *y, unsigned char *u, unsigned char *v,
 }
 
 void vp8_blend_b_c (unsigned char *y, unsigned char *u, unsigned char *v,
-                        int y1, int u1, int v1, int alpha, int stride)
+                        int y_1, int u_1, int v_1, int alpha, int stride)
 {
     int i, j;
-    int y1_const = y1*((1<<16)-alpha);
-    int u1_const = u1*((1<<16)-alpha);
-    int v1_const = v1*((1<<16)-alpha);
+    int y1_const = y_1*((1<<16)-alpha);
+    int u1_const = u_1*((1<<16)-alpha);
+    int v1_const = v_1*((1<<16)-alpha);
 
     for (i = 0; i < 4; i++)
     {
@@ -676,46 +675,46 @@ void vp8_blend_b_c (unsigned char *y, unsigned char *u, unsigned char *v,
     }
 }
 
-static void constrain_line (int x0, int *x1, int y0, int *y1, int width, int height)
+static void constrain_line (int x_0, int *x_1, int y_0, int *y_1, int width, int height)
 {
     int dx;
     int dy;
 
-    if (*x1 > width)
+    if (*x_1 > width)
     {
-        dx = *x1 - x0;
-        dy = *y1 - y0;
+        dx = *x_1 - x_0;
+        dy = *y_1 - y_0;
 
-        *x1 = width;
+        *x_1 = width;
         if (dx)
-            *y1 = ((width-x0)*dy)/dx + y0;
+            *y_1 = ((width-x_0)*dy)/dx + y_0;
     }
-    if (*x1 < 0)
+    if (*x_1 < 0)
     {
-        dx = *x1 - x0;
-        dy = *y1 - y0;
+        dx = *x_1 - x_0;
+        dy = *y_1 - y_0;
 
-        *x1 = 0;
+        *x_1 = 0;
         if (dx)
-            *y1 = ((0-x0)*dy)/dx + y0;
+            *y_1 = ((0-x_0)*dy)/dx + y_0;
     }
-    if (*y1 > height)
+    if (*y_1 > height)
     {
-        dx = *x1 - x0;
-        dy = *y1 - y0;
+        dx = *x_1 - x_0;
+        dy = *y_1 - y_0;
 
-        *y1 = height;
+        *y_1 = height;
         if (dy)
-            *x1 = ((height-y0)*dx)/dy + x0;
+            *x_1 = ((height-y_0)*dx)/dy + x_0;
     }
-    if (*y1 < 0)
+    if (*y_1 < 0)
     {
-        dx = *x1 - x0;
-        dy = *y1 - y0;
+        dx = *x_1 - x_0;
+        dy = *y_1 - y_0;
 
-        *y1 = 0;
+        *y_1 = 0;
         if (dy)
-            *x1 = ((0-y0)*dx)/dy + x0;
+            *x_1 = ((0-y_0)*dx)/dy + x_0;
     }
 }
 
diff --git a/vp8/decoder/decodframe.c b/vp8/decoder/decodframe.c
index 6f8282a64..a6b193b07 100644
--- a/vp8/decoder/decodframe.c
+++ b/vp8/decoder/decodframe.c
@@ -1335,11 +1335,11 @@ int vp8_decode_frame(VP8D_COMP *pbi)
 #if CONFIG_MULTITHREAD
     if (pbi->b_multithreaded_rd && pc->multi_token_partition != ONE_PARTITION)
     {
-        unsigned int i;
+        unsigned int thread;
         vp8mt_decode_mb_rows(pbi, xd);
         vp8_yv12_extend_frame_borders(yv12_fb_new);
-        for (i = 0; i < pbi->decoding_thread_count; ++i)
-            corrupt_tokens |= pbi->mb_row_di[i].mbd.corrupted;
+        for (thread = 0; thread < pbi->decoding_thread_count; ++thread)
+            corrupt_tokens |= pbi->mb_row_di[thread].mbd.corrupted;
     }
     else
 #endif
diff --git a/vp8/decoder/threading.c b/vp8/decoder/threading.c
index 73f9a8356..73031898e 100644
--- a/vp8/decoder/threading.c
+++ b/vp8/decoder/threading.c
@@ -343,7 +343,6 @@ static void mt_decode_mb_rows(VP8D_COMP *pbi, MACROBLOCKD *xd, int start_mb_row)
 
     for (mb_row = start_mb_row; mb_row < pc->mb_rows; mb_row += (pbi->decoding_thread_count + 1))
     {
-       int i;
        int recon_yoffset, recon_uvoffset;
        int mb_col;
        int filter_level;
diff --git a/vp8/encoder/denoising.c b/vp8/encoder/denoising.c
index 1ee1cb59f..781926547 100644
--- a/vp8/encoder/denoising.c
+++ b/vp8/encoder/denoising.c
@@ -206,8 +206,6 @@ void vp8_denoiser_denoise_mb(VP8_DENOISER *denoiser,
         MB_MODE_INFO saved_mbmi;
         MACROBLOCKD *filter_xd = &x->e_mbd;
         MB_MODE_INFO *mbmi = &filter_xd->mode_info_context->mbmi;
-        int mv_col;
-        int mv_row;
         int sse_diff = zero_mv_sse - best_sse;
 
         saved_mbmi = *mbmi;

From c4195e0eb8671fcbc88be59c117e74dfe0dac402 Mon Sep 17 00:00:00 2001
From: James Zern <jzern@google.com>
Date: Thu, 4 Apr 2013 19:00:31 -0700
Subject: [PATCH 04/19] tests: use a portable rand() implementation

the one from gtest in this case: testing::internal::Random.
this will make the tests deterministic between platforms. addresses
issue #568.

Change-Id: I5a8a92f5c33f52cb0a219c1dd3d02335acbbf163
---
 test/acm_random.h    | 22 ++++++++++++----------
 test/fdct8x8_test.cc | 28 ++++++++++++++++++----------
 2 files changed, 30 insertions(+), 20 deletions(-)

diff --git a/test/acm_random.h b/test/acm_random.h
index 514894eda..13903c66a 100644
--- a/test/acm_random.h
+++ b/test/acm_random.h
@@ -11,7 +11,7 @@
 #ifndef LIBVPX_TEST_ACM_RANDOM_H_
 #define LIBVPX_TEST_ACM_RANDOM_H_
 
-#include <stdlib.h>
+#include "third_party/googletest/src/include/gtest/gtest.h"
 
 #include "vpx/vpx_integer.h"
 
@@ -19,24 +19,23 @@ namespace libvpx_test {
 
 class ACMRandom {
  public:
-  ACMRandom() {
-    Reset(DeterministicSeed());
-  }
+  ACMRandom() : random_(DeterministicSeed()) {}
 
-  explicit ACMRandom(int seed) {
-    Reset(seed);
-  }
+  explicit ACMRandom(int seed) : random_(seed) {}
 
   void Reset(int seed) {
-    srand(seed);
+    random_.Reseed(seed);
   }
 
   uint8_t Rand8(void) {
-    return (rand() >> 8) & 0xff;
+    const uint32_t value =
+        random_.Generate(testing::internal::Random::kMaxRange);
+    // There's a bit more entropy in the upper bits of this implementation.
+    return (value >> 24) & 0xff;
   }
 
   int PseudoUniform(int range) {
-    return (rand() >> 8) % range;
+    return random_.Generate(range);
   }
 
   int operator()(int n) {
@@ -46,6 +45,9 @@ class ACMRandom {
   static int DeterministicSeed(void) {
     return 0xbaba;
   }
+
+ private:
+  testing::internal::Random random_;
 };
 
 }  // namespace libvpx_test
diff --git a/test/fdct8x8_test.cc b/test/fdct8x8_test.cc
index d82f7c3bd..5967d36c4 100644
--- a/test/fdct8x8_test.cc
+++ b/test/fdct8x8_test.cc
@@ -51,11 +51,15 @@ TEST(VP9Fdct8x8Test, SignBiasCheck) {
   }
 
   for (int j = 0; j < 64; ++j) {
-    const bool bias_acceptable = (abs(count_sign_block[j][0] -
-                                      count_sign_block[j][1]) < 1000);
-    EXPECT_TRUE(bias_acceptable)
-        << "Error: 8x8 FDCT has a sign bias > 1%"
-        << " for input range [-255, 255] at index " << j;
+    const int diff = abs(count_sign_block[j][0] - count_sign_block[j][1]);
+    const int max_diff = 1125;
+    EXPECT_LT(diff, max_diff)
+        << "Error: 8x8 FDCT has a sign bias > "
+        << 1. * max_diff / count_test_block * 100 << "%"
+        << " for input range [-255, 255] at index " << j
+        << " count0: " << count_sign_block[j][0]
+        << " count1: " << count_sign_block[j][1]
+        << " diff: " << diff;
   }
 
   memset(count_sign_block, 0, sizeof(count_sign_block));
@@ -76,11 +80,15 @@ TEST(VP9Fdct8x8Test, SignBiasCheck) {
   }
 
   for (int j = 0; j < 64; ++j) {
-    const bool bias_acceptable = (abs(count_sign_block[j][0] -
-                                      count_sign_block[j][1]) < 10000);
-    EXPECT_TRUE(bias_acceptable)
-        << "Error: 8x8 FDCT has a sign bias > 10%"
-        << " for input range [-15, 15] at index " << j;
+    const int diff = abs(count_sign_block[j][0] - count_sign_block[j][1]);
+    const int max_diff = 10000;
+    EXPECT_LT(diff, max_diff)
+        << "Error: 4x4 FDCT has a sign bias > "
+        << 1. * max_diff / count_test_block * 100 << "%"
+        << " for input range [-15, 15] at index " << j
+        << " count0: " << count_sign_block[j][0]
+        << " count1: " << count_sign_block[j][1]
+        << " diff: " << diff;
   }
 };
 

From 8b4b28a5ea877a2d5c3b818f614b6b0d5205cea5 Mon Sep 17 00:00:00 2001
From: James Zern <jzern@google.com>
Date: Fri, 5 Apr 2013 11:56:54 -0700
Subject: [PATCH 05/19] fix make test invocation for msvc win64

Change-Id: If5d4b7ffa67223ed72b53a6c9b9e42b4de5718f2
---
 build/make/configure.sh | 2 ++
 libs.mk                 | 2 +-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/build/make/configure.sh b/build/make/configure.sh
index 1ac303525..050ae57a7 100755
--- a/build/make/configure.sh
+++ b/build/make/configure.sh
@@ -1087,10 +1087,12 @@ EOF
             win32)
                 add_asflags -f win32
                 enabled debug && add_asflags -g cv8
+                EXE_SFX=.exe
             ;;
             win64)
                 add_asflags -f x64
                 enabled debug && add_asflags -g cv8
+                EXE_SFX=.exe
             ;;
             linux*|solaris*|android*)
                 add_asflags -f elf${bits}
diff --git a/libs.mk b/libs.mk
index 872a16bae..2281bd059 100644
--- a/libs.mk
+++ b/libs.mk
@@ -436,7 +436,7 @@ test_libvpx.vcproj: $(LIBVPX_TEST_SRCS)
 PROJECTS-$(CONFIG_MSVS) += test_libvpx.vcproj
 
 test:: testdata
-	@set -e; for t in $(addprefix Win32/Release/,$(notdir $(LIBVPX_TEST_BINS:.cc=.exe))); do $$t; done
+	@set -e; for t in $(addprefix $(TGT_OS:win64=x64)/Release/,$(notdir $(LIBVPX_TEST_BINS:.cc=.exe))); do $$t; done
 endif
 else
 

From 282c963923eb969c146d63e934bbece433a95282 Mon Sep 17 00:00:00 2001
From: Marco Paniconi <marpan@google.com>
Date: Thu, 11 Apr 2013 18:19:18 -0700
Subject: [PATCH 06/19] Fix for multi-res-encoding:

Use local variable for setting the improved prediction mode.
cpi->sf.improved_mv_pred is set/fixed at the frame level
and should not be changed inside pick_inter_mode.

Change-Id: Ie28d9171ac000e631af0e30204970e3d4fff3078
---
 vp8/encoder/pickinter.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/vp8/encoder/pickinter.c b/vp8/encoder/pickinter.c
index 4c2527d68..c5279fed2 100644
--- a/vp8/encoder/pickinter.c
+++ b/vp8/encoder/pickinter.c
@@ -594,6 +594,7 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
     unsigned int zero_mv_sse = INT_MAX, best_sse = INT_MAX;
 #endif
 
+    int sf_improved_mv_pred = cpi->sf.improved_mv_pred;
     int_mv mvp;
 
     int near_sadidx[8] = {0, 1, 2, 3, 4, 5, 6, 7};
@@ -882,7 +883,7 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
                last frame motion info is not stored, then we can not
                use improved_mv_pred. */
             if (cpi->oxcf.mr_encoder_id && !parent_ref_valid)
-                cpi->sf.improved_mv_pred = 0;
+                sf_improved_mv_pred = 0;
 
             if (parent_ref_valid && parent_ref_frame)
             {
@@ -899,7 +900,7 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
             }else
 #endif
             {
-                if(cpi->sf.improved_mv_pred)
+                if(sf_improved_mv_pred)
                 {
                     if(!saddone)
                     {

From 6c3f06a4d78973dafbdfe6effe7f3fd3fdc8c643 Mon Sep 17 00:00:00 2001
From: Johann <johann.koenig@duck.com>
Date: Mon, 15 Apr 2013 12:19:06 -0700
Subject: [PATCH 07/19] Include RTCD header in encodeframe.c

The file uses functions defined in vp8_rtcd.h but did not include the
header.

Change-Id: I110196ddc9181e533be1fe656e21c1791cabe226
---
 vp8/encoder/encodeframe.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/vp8/encoder/encodeframe.c b/vp8/encoder/encodeframe.c
index d1b647be9..d17ed370b 100644
--- a/vp8/encoder/encodeframe.c
+++ b/vp8/encoder/encodeframe.c
@@ -10,6 +10,7 @@
 
 
 #include "vpx_config.h"
+#include "vp8_rtcd.h"
 #include "encodemb.h"
 #include "encodemv.h"
 #include "vp8/common/common.h"

From 94649bc0ef85813dd42834a3df985753864b9be6 Mon Sep 17 00:00:00 2001
From: Jim Bankoski <jimbankoski@google.com>
Date: Tue, 16 Apr 2013 14:49:30 -0700
Subject: [PATCH 08/19] set up a speed 1

slightly worse results for faster encodes

Change-Id: Ic5b38fcde7a2e334c4724e125b558bcb97783af6
---
 vp9/encoder/vp9_rdopt.c | 136 ++++++++++++++++++++++++++--------------
 1 file changed, 89 insertions(+), 47 deletions(-)

diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index 0083e8ae1..7434e5cdd 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -290,7 +290,6 @@ void vp9_initialize_rd_consts(VP9_COMP *cpi, int qindex) {
       } else {
         cpi->rd_threshes[i] = INT_MAX;
       }
-
       cpi->rd_baseline_thresh[i] = cpi->rd_threshes[i];
     }
   } else {
@@ -302,7 +301,6 @@ void vp9_initialize_rd_consts(VP9_COMP *cpi, int qindex) {
       } else {
         cpi->rd_threshes[i] = INT_MAX;
       }
-
       cpi->rd_baseline_thresh[i] = cpi->rd_threshes[i];
     }
   }
@@ -4319,6 +4317,7 @@ static void rd_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
            yv12_mb[mbmi->ref_frame].y_height != cm->mb_rows * 16) &&
         this_mode != ZEROMV)
       continue;
+
     if (mbmi->second_ref_frame > 0 &&
           (yv12_mb[mbmi->second_ref_frame].y_width != cm->mb_cols * 16 ||
            yv12_mb[mbmi->second_ref_frame].y_height != cm->mb_rows * 16) &&
@@ -5204,6 +5203,7 @@ static int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
   int64_t best_pred_diff[NB_PREDICTION_TYPES];
   int64_t best_pred_rd[NB_PREDICTION_TYPES];
   MB_MODE_INFO best_mbmode;
+  int j;
   int mode_index, best_mode_index = 0;
   unsigned int ref_costs[MAX_REF_FRAMES];
 #if CONFIG_COMP_INTERINTRA_PRED
@@ -5225,6 +5225,8 @@ static int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
   int dist_uv_16x16 = 0, uv_skip_16x16 = 0;
   MB_PREDICTION_MODE mode_uv_16x16 = NEARESTMV;
   struct scale_factors scale_factor[4];
+  unsigned int ref_frame_mask = 0;
+  unsigned int mode_mask = 0;
 
   xd->mode_info_context->mbmi.segment_id = segment_id;
   estimate_ref_frame_costs(cpi, segment_id, ref_costs);
@@ -5235,58 +5237,87 @@ static int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
   for (i = 0; i < NB_TXFM_MODES; i++)
     best_txfm_rd[i] = INT64_MAX;
 
+  // Create a mask set to 1 for each frame used by a smaller resolution.p
+  if (cpi->Speed > 0) {
+    switch (block_size) {
+      case BLOCK_64X64:
+        for (i = 0; i < 4; i++) {
+          for (j = 0; j < 4; j++) {
+            ref_frame_mask |= (1 << x->mb_context[i][j].mic.mbmi.ref_frame);
+            mode_mask |= (1 << x->mb_context[i][j].mic.mbmi.mode);
+          }
+        }
+        for (i = 0; i < 4; i++) {
+          ref_frame_mask |= (1 << x->sb32_context[i].mic.mbmi.ref_frame);
+          mode_mask |= (1 << x->sb32_context[i].mic.mbmi.mode);
+        }
+        break;
+      case BLOCK_32X32:
+        for (i = 0; i < 4; i++) {
+          ref_frame_mask |= (1
+              << x->mb_context[xd->sb_index][i].mic.mbmi.ref_frame);
+          mode_mask |= (1 << x->mb_context[xd->sb_index][i].mic.mbmi.mode);
+        }
+        break;
+    }
+  }
+
   for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) {
     if (cpi->ref_frame_flags & flag_list[ref_frame]) {
       setup_buffer_inter(cpi, x, idx_list[ref_frame], ref_frame, block_size,
-                         mb_row, mb_col, frame_mv[NEARESTMV],
-                         frame_mv[NEARMV], frame_mdcounts,
-                         yv12_mb, scale_factor);
+                         mb_row, mb_col, frame_mv[NEARESTMV], frame_mv[NEARMV],
+                         frame_mdcounts, yv12_mb, scale_factor);
     }
     frame_mv[NEWMV][ref_frame].as_int = INVALID_MV;
     frame_mv[ZEROMV][ref_frame].as_int = 0;
   }
-
-  if (block_size == BLOCK_64X64) {
-    mbmi->mode = DC_PRED;
-    if (cm->txfm_mode == ONLY_4X4 || cm->txfm_mode == TX_MODE_SELECT) {
-      mbmi->txfm_size = TX_4X4;
-      rd_pick_intra_sb64uv_mode(cpi, x, &rate_uv_4x4, &rate_uv_tokenonly_4x4,
+  // Disallow intra if none of the smaller prediction sizes used intra and
+  // speed > 0 ;
+  if (cpi->Speed == 0
+      || ( cpi->Speed > 0 && (ref_frame_mask & (1 << INTRA_FRAME)))) {
+    if (block_size == BLOCK_64X64) {
+      mbmi->mode = DC_PRED;
+      if (cm->txfm_mode == ONLY_4X4 || cm->txfm_mode == TX_MODE_SELECT) {
+        mbmi->txfm_size = TX_4X4;
+        rd_pick_intra_sb64uv_mode(cpi, x, &rate_uv_4x4, &rate_uv_tokenonly_4x4,
+                                  &dist_uv_4x4, &uv_skip_4x4);
+        mode_uv_4x4 = mbmi->uv_mode;
+      }
+      if (cm->txfm_mode != ONLY_4X4) {
+        mbmi->txfm_size = TX_8X8;
+        rd_pick_intra_sb64uv_mode(cpi, x, &rate_uv_8x8, &rate_uv_tokenonly_8x8,
+                                  &dist_uv_8x8, &uv_skip_8x8);
+        mode_uv_8x8 = mbmi->uv_mode;
+      }
+      if (cm->txfm_mode >= ALLOW_32X32) {
+        mbmi->txfm_size = TX_32X32;
+        rd_pick_intra_sb64uv_mode(cpi, x, &rate_uv_16x16,
+                                  &rate_uv_tokenonly_16x16, &dist_uv_16x16,
+                                  &uv_skip_16x16);
+        mode_uv_16x16 = mbmi->uv_mode;
+      }
+    } else {
+      assert(block_size == BLOCK_32X32);
+      mbmi->mode = DC_PRED;
+      if (cm->txfm_mode == ONLY_4X4 || cm->txfm_mode == TX_MODE_SELECT) {
+        mbmi->txfm_size = TX_4X4;
+        rd_pick_intra_sbuv_mode(cpi, x, &rate_uv_4x4, &rate_uv_tokenonly_4x4,
                                 &dist_uv_4x4, &uv_skip_4x4);
-      mode_uv_4x4 = mbmi->uv_mode;
-    }
-    if (cm->txfm_mode != ONLY_4X4) {
-      mbmi->txfm_size = TX_8X8;
-      rd_pick_intra_sb64uv_mode(cpi, x, &rate_uv_8x8, &rate_uv_tokenonly_8x8,
+        mode_uv_4x4 = mbmi->uv_mode;
+      }
+      if (cm->txfm_mode != ONLY_4X4) {
+        mbmi->txfm_size = TX_8X8;
+        rd_pick_intra_sbuv_mode(cpi, x, &rate_uv_8x8, &rate_uv_tokenonly_8x8,
                                 &dist_uv_8x8, &uv_skip_8x8);
-      mode_uv_8x8 = mbmi->uv_mode;
-    }
-    if (cm->txfm_mode >= ALLOW_32X32) {
-      mbmi->txfm_size = TX_32X32;
-      rd_pick_intra_sb64uv_mode(cpi, x, &rate_uv_16x16,
-                                &rate_uv_tokenonly_16x16,
-                                &dist_uv_16x16, &uv_skip_16x16);
-      mode_uv_16x16 = mbmi->uv_mode;
-    }
-  } else {
-    assert(block_size == BLOCK_32X32);
-    mbmi->mode = DC_PRED;
-    if (cm->txfm_mode == ONLY_4X4 || cm->txfm_mode == TX_MODE_SELECT) {
-      mbmi->txfm_size = TX_4X4;
-      rd_pick_intra_sbuv_mode(cpi, x, &rate_uv_4x4, &rate_uv_tokenonly_4x4,
-                              &dist_uv_4x4, &uv_skip_4x4);
-      mode_uv_4x4 = mbmi->uv_mode;
-    }
-    if (cm->txfm_mode != ONLY_4X4) {
-      mbmi->txfm_size = TX_8X8;
-      rd_pick_intra_sbuv_mode(cpi, x, &rate_uv_8x8, &rate_uv_tokenonly_8x8,
-                              &dist_uv_8x8, &uv_skip_8x8);
-      mode_uv_8x8 = mbmi->uv_mode;
-    }
-    if (cm->txfm_mode >= ALLOW_32X32) {
-      mbmi->txfm_size = TX_32X32;
-      rd_pick_intra_sbuv_mode(cpi, x, &rate_uv_16x16, &rate_uv_tokenonly_16x16,
-                              &dist_uv_16x16, &uv_skip_16x16);
-      mode_uv_16x16 = mbmi->uv_mode;
+        mode_uv_8x8 = mbmi->uv_mode;
+      }
+      if (cm->txfm_mode >= ALLOW_32X32) {
+        mbmi->txfm_size = TX_32X32;
+        rd_pick_intra_sbuv_mode(cpi, x, &rate_uv_16x16,
+                                &rate_uv_tokenonly_16x16, &dist_uv_16x16,
+                                &uv_skip_16x16);
+        mode_uv_16x16 = mbmi->uv_mode;
+      }
     }
   }
 
@@ -5313,10 +5344,21 @@ static int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
     x->skip = 0;
     this_mode = vp9_mode_order[mode_index].mode;
     ref_frame = vp9_mode_order[mode_index].ref_frame;
-    if (!(ref_frame == INTRA_FRAME ||
-          (cpi->ref_frame_flags & flag_list[ref_frame]))) {
+    if (!(ref_frame == INTRA_FRAME
+        || (cpi->ref_frame_flags & flag_list[ref_frame]))) {
       continue;
     }
+    if (cpi->Speed > 0) {
+      if (!(ref_frame_mask & (1 << ref_frame))) {
+        continue;
+      }
+      if (vp9_mode_order[mode_index].second_ref_frame != NONE
+          && !(ref_frame_mask
+              & (1 << vp9_mode_order[mode_index].second_ref_frame))) {
+        continue;
+      }
+    }
+
     mbmi->ref_frame = ref_frame;
     mbmi->second_ref_frame = vp9_mode_order[mode_index].second_ref_frame;
     set_scale_factors(xd, mbmi->ref_frame, mbmi->second_ref_frame,

From 5b6d33f9afee6da7a370cdd94aae4ac85c8588c9 Mon Sep 17 00:00:00 2001
From: Christian Duvivier <cduvivier@google.com>
Date: Mon, 25 Mar 2013 16:18:38 -0700
Subject: [PATCH 09/19] Faster vp9_short_fdct4x4 and vp9_short_fdct8x4.

Scalar path is about 1.3x faster (2.1% overall encoder speedup).
SSE2 path is about 5.0x faster (8.4% overall encoder speedup).

Change-Id: I360d167b5ad6f387bba00406129323e2fe6e7dda
---
 vp9/common/vp9_rtcd_defs.sh               |   4 +-
 vp9/encoder/vp9_dct.c                     |  82 ++--
 vp9/encoder/x86/vp9_dct_sse2.asm          | 432 ----------------------
 vp9/encoder/x86/vp9_dct_sse2_intrinsics.c | 105 ++++++
 vp9/vp9cx.mk                              |   1 -
 5 files changed, 167 insertions(+), 457 deletions(-)
 delete mode 100644 vp9/encoder/x86/vp9_dct_sse2.asm

diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh
index 8b6efc384..46495cb11 100644
--- a/vp9/common/vp9_rtcd_defs.sh
+++ b/vp9/common/vp9_rtcd_defs.sh
@@ -645,10 +645,10 @@ prototype void vp9_short_fdct8x8 "int16_t *InputData, int16_t *OutputData, int p
 specialize vp9_short_fdct8x8 sse2
 
 prototype void vp9_short_fdct4x4 "int16_t *InputData, int16_t *OutputData, int pitch"
-specialize vp9_short_fdct4x4
+specialize vp9_short_fdct4x4 sse2
 
 prototype void vp9_short_fdct8x4 "int16_t *InputData, int16_t *OutputData, int pitch"
-specialize vp9_short_fdct8x4
+specialize vp9_short_fdct8x4 sse2
 
 prototype void vp9_short_fdct32x32 "int16_t *InputData, int16_t *OutputData, int pitch"
 specialize vp9_short_fdct32x32
diff --git a/vp9/encoder/vp9_dct.c b/vp9/encoder/vp9_dct.c
index aeef9c6df..ebf40e4e6 100644
--- a/vp9/encoder/vp9_dct.c
+++ b/vp9/encoder/vp9_dct.c
@@ -37,30 +37,68 @@ static void fdct4_1d(int16_t *input, int16_t *output) {
 }
 
 void vp9_short_fdct4x4_c(int16_t *input, int16_t *output, int pitch) {
-  int16_t out[4 * 4];
-  int16_t *outptr = &out[0];
-  const int short_pitch = pitch >> 1;
-  int i, j;
-  int16_t temp_in[4], temp_out[4];
-
-  // Columns
-  for (i = 0; i < 4; ++i) {
-    for (j = 0; j < 4; ++j)
-      temp_in[j] = input[j * short_pitch + i] << 4;
-    if (i == 0 && temp_in[0])
-      temp_in[0] += 1;
-    fdct4_1d(temp_in, temp_out);
-    for (j = 0; j < 4; ++j)
-      outptr[j * 4 + i] = temp_out[j];
+  // The 2D transform is done with two passes which are actually pretty
+  // similar. In the first one, we transform the columns and transpose
+  // the results. In the second one, we transform the rows. To achieve that,
+  // as the first pass results are transposed, we tranpose the columns (that
+  // is the transposed rows) and transpose the results (so that it goes back
+  // in normal/row positions).
+  const int stride = pitch >> 1;
+  int pass;
+  // We need an intermediate buffer between passes.
+  int16_t intermediate[4 * 4];
+  int16_t *in = input;
+  int16_t *out = intermediate;
+  // Do the two transform/transpose passes
+  for (pass = 0; pass < 2; ++pass) {
+    /*canbe16*/ int input[4];
+    /*canbe16*/ int step[4];
+    /*needs32*/ int temp1, temp2;
+    int i;
+    for (i = 0; i < 4; ++i) {
+      // Load inputs.
+      if (0 == pass) {
+        input[0] = in[0 * stride] << 4;
+        input[1] = in[1 * stride] << 4;
+        input[2] = in[2 * stride] << 4;
+        input[3] = in[3 * stride] << 4;
+        if (i == 0 && input[0]) {
+          input[0] += 1;
+        }
+      } else {
+        input[0] = in[0 * 4];
+        input[1] = in[1 * 4];
+        input[2] = in[2 * 4];
+        input[3] = in[3 * 4];
+      }
+      // Transform.
+      step[0] = input[0] + input[3];
+      step[1] = input[1] + input[2];
+      step[2] = input[1] - input[2];
+      step[3] = input[0] - input[3];
+      temp1 = (step[0] + step[1]) * cospi_16_64;
+      temp2 = (step[0] - step[1]) * cospi_16_64;
+      out[0] = dct_const_round_shift(temp1);
+      out[2] = dct_const_round_shift(temp2);
+      temp1 = step[2] * cospi_24_64 + step[3] * cospi_8_64;
+      temp2 = -step[2] * cospi_8_64 + step[3] * cospi_24_64;
+      out[1] = dct_const_round_shift(temp1);
+      out[3] = dct_const_round_shift(temp2);
+      // Do next column (which is a transposed row in second/horizontal pass)
+      in++;
+      out += 4;
+    }
+    // Setup in/out for next pass.
+    in = intermediate;
+    out = output;
   }
 
-  // Rows
-  for (i = 0; i < 4; ++i) {
-    for (j = 0; j < 4; ++j)
-      temp_in[j] = out[j + i * 4];
-    fdct4_1d(temp_in, temp_out);
-    for (j = 0; j < 4; ++j)
-        output[j + i * 4] = (temp_out[j] + 1) >> 2;
+  {
+    int i, j;
+    for (i = 0; i < 4; ++i) {
+      for (j = 0; j < 4; ++j)
+        output[j + i * 4] = (output[j + i * 4] + 1) >> 2;
+    }
   }
 }
 
diff --git a/vp9/encoder/x86/vp9_dct_sse2.asm b/vp9/encoder/x86/vp9_dct_sse2.asm
deleted file mode 100644
index bbd6086da..000000000
--- a/vp9/encoder/x86/vp9_dct_sse2.asm
+++ /dev/null
@@ -1,432 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-%macro STACK_FRAME_CREATE 0
-%if ABI_IS_32BIT
-  %define       input       rsi
-  %define       output      rdi
-  %define       pitch       rax
-    push        rbp
-    mov         rbp, rsp
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    mov         rsi, arg(0)
-    mov         rdi, arg(1)
-
-    movsxd      rax, dword ptr arg(2)
-    lea         rcx, [rsi + rax*2]
-%else
-  %if LIBVPX_YASM_WIN64
-    %define     input       rcx
-    %define     output      rdx
-    %define     pitch       r8
-    SAVE_XMM 7, u
-  %else
-    %define     input       rdi
-    %define     output      rsi
-    %define     pitch       rdx
-  %endif
-%endif
-%endmacro
-
-%macro STACK_FRAME_DESTROY 0
-  %define     input
-  %define     output
-  %define     pitch
-
-%if ABI_IS_32BIT
-    pop         rdi
-    pop         rsi
-    RESTORE_GOT
-    pop         rbp
-%else
-  %if LIBVPX_YASM_WIN64
-    RESTORE_XMM
-  %endif
-%endif
-    ret
-%endmacro
-
-;void vp9_short_fdct4x4_sse2(short *input, short *output, int pitch)
-global sym(vp9_short_fdct4x4_sse2) PRIVATE
-sym(vp9_short_fdct4x4_sse2):
-
-    STACK_FRAME_CREATE
-
-    movq        xmm0, MMWORD PTR[input        ] ;03 02 01 00
-    movq        xmm2, MMWORD PTR[input+  pitch] ;13 12 11 10
-    lea         input,          [input+2*pitch]
-    movq        xmm1, MMWORD PTR[input        ] ;23 22 21 20
-    movq        xmm3, MMWORD PTR[input+  pitch] ;33 32 31 30
-
-    punpcklqdq  xmm0, xmm2                      ;13 12 11 10 03 02 01 00
-    punpcklqdq  xmm1, xmm3                      ;33 32 31 30 23 22 21 20
-
-    movdqa      xmm2, xmm0
-    punpckldq   xmm0, xmm1                      ;23 22 03 02 21 20 01 00
-    punpckhdq   xmm2, xmm1                      ;33 32 13 12 31 30 11 10
-    movdqa      xmm1, xmm0
-    punpckldq   xmm0, xmm2                      ;31 21 30 20 11 10 01 00
-    pshufhw     xmm1, xmm1, 0b1h                ;22 23 02 03 xx xx xx xx
-    pshufhw     xmm2, xmm2, 0b1h                ;32 33 12 13 xx xx xx xx
-
-    punpckhdq   xmm1, xmm2                      ;32 33 22 23 12 13 02 03
-    movdqa      xmm3, xmm0
-    paddw       xmm0, xmm1                      ;b1 a1 b1 a1 b1 a1 b1 a1
-    psubw       xmm3, xmm1                      ;c1 d1 c1 d1 c1 d1 c1 d1
-    psllw       xmm0, 3                         ;b1 <<= 3 a1 <<= 3
-    psllw       xmm3, 3                         ;c1 <<= 3 d1 <<= 3
-
-    movdqa      xmm1, xmm0
-    pmaddwd     xmm0, XMMWORD PTR[GLOBAL(_mult_add)]    ;a1 + b1
-    pmaddwd     xmm1, XMMWORD PTR[GLOBAL(_mult_sub)]    ;a1 - b1
-    movdqa      xmm4, xmm3
-    pmaddwd     xmm3, XMMWORD PTR[GLOBAL(_5352_2217)]   ;c1*2217 + d1*5352
-    pmaddwd     xmm4, XMMWORD PTR[GLOBAL(_2217_neg5352)];d1*2217 - c1*5352
-
-    paddd       xmm3, XMMWORD PTR[GLOBAL(_14500)]
-    paddd       xmm4, XMMWORD PTR[GLOBAL(_7500)]
-    psrad       xmm3, 12            ;(c1 * 2217 + d1 * 5352 +  14500)>>12
-    psrad       xmm4, 12            ;(d1 * 2217 - c1 * 5352 +   7500)>>12
-
-    packssdw    xmm0, xmm1                      ;op[2] op[0]
-    packssdw    xmm3, xmm4                      ;op[3] op[1]
-    ; 23 22 21 20 03 02 01 00
-    ;
-    ; 33 32 31 30 13 12 11 10
-    ;
-    movdqa      xmm2, xmm0
-    punpcklqdq  xmm0, xmm3                      ;13 12 11 10 03 02 01 00
-    punpckhqdq  xmm2, xmm3                      ;23 22 21 20 33 32 31 30
-
-    movdqa      xmm3, xmm0
-    punpcklwd   xmm0, xmm2                      ;32 30 22 20 12 10 02 00
-    punpckhwd   xmm3, xmm2                      ;33 31 23 21 13 11 03 01
-    movdqa      xmm2, xmm0
-    punpcklwd   xmm0, xmm3                      ;13 12 11 10 03 02 01 00
-    punpckhwd   xmm2, xmm3                      ;33 32 31 30 23 22 21 20
-
-    movdqa      xmm5, XMMWORD PTR[GLOBAL(_7)]
-    pshufd      xmm2, xmm2, 04eh
-    movdqa      xmm3, xmm0
-    paddw       xmm0, xmm2                      ;b1 b1 b1 b1 a1 a1 a1 a1
-    psubw       xmm3, xmm2                      ;c1 c1 c1 c1 d1 d1 d1 d1
-
-    pshufd      xmm0, xmm0, 0d8h                ;b1 b1 a1 a1 b1 b1 a1 a1
-    movdqa      xmm2, xmm3                      ;save d1 for compare
-    pshufd      xmm3, xmm3, 0d8h                ;c1 c1 d1 d1 c1 c1 d1 d1
-    pshuflw     xmm0, xmm0, 0d8h                ;b1 b1 a1 a1 b1 a1 b1 a1
-    pshuflw     xmm3, xmm3, 0d8h                ;c1 c1 d1 d1 c1 d1 c1 d1
-    pshufhw     xmm0, xmm0, 0d8h                ;b1 a1 b1 a1 b1 a1 b1 a1
-    pshufhw     xmm3, xmm3, 0d8h                ;c1 d1 c1 d1 c1 d1 c1 d1
-    movdqa      xmm1, xmm0
-    pmaddwd     xmm0, XMMWORD PTR[GLOBAL(_mult_add)] ;a1 + b1
-    pmaddwd     xmm1, XMMWORD PTR[GLOBAL(_mult_sub)] ;a1 - b1
-
-    pxor        xmm4, xmm4                      ;zero out for compare
-    paddd       xmm0, xmm5
-    paddd       xmm1, xmm5
-    pcmpeqw     xmm2, xmm4
-    psrad       xmm0, 4                         ;(a1 + b1 + 7)>>4
-    psrad       xmm1, 4                         ;(a1 - b1 + 7)>>4
-    pandn       xmm2, XMMWORD PTR[GLOBAL(_cmp_mask)] ;clear upper,
-                                                     ;and keep bit 0 of lower
-
-    movdqa      xmm4, xmm3
-    pmaddwd     xmm3, XMMWORD PTR[GLOBAL(_5352_2217)]    ;c1*2217 + d1*5352
-    pmaddwd     xmm4, XMMWORD PTR[GLOBAL(_2217_neg5352)] ;d1*2217 - c1*5352
-    paddd       xmm3, XMMWORD PTR[GLOBAL(_12000)]
-    paddd       xmm4, XMMWORD PTR[GLOBAL(_51000)]
-    packssdw    xmm0, xmm1                      ;op[8] op[0]
-    psrad       xmm3, 16                ;(c1 * 2217 + d1 * 5352 +  12000)>>16
-    psrad       xmm4, 16                ;(d1 * 2217 - c1 * 5352 +  51000)>>16
-
-    packssdw    xmm3, xmm4                      ;op[12] op[4]
-    movdqa      xmm1, xmm0
-    paddw       xmm3, xmm2                      ;op[4] += (d1!=0)
-    punpcklqdq  xmm0, xmm3                      ;op[4] op[0]
-    punpckhqdq  xmm1, xmm3                      ;op[12] op[8]
-
-    movdqa      XMMWORD PTR[output +  0], xmm0
-    movdqa      XMMWORD PTR[output + 16], xmm1
-
-    STACK_FRAME_DESTROY
-
-;void vp9_short_fdct8x4_sse2(short *input, short *output, int pitch)
-global sym(vp9_short_fdct8x4_sse2) PRIVATE
-sym(vp9_short_fdct8x4_sse2):
-
-    STACK_FRAME_CREATE
-
-        ; read the input data
-        movdqa      xmm0,       [input        ]
-        movdqa      xmm2,       [input+  pitch]
-        lea         input,      [input+2*pitch]
-        movdqa      xmm4,       [input        ]
-        movdqa      xmm3,       [input+  pitch]
-
-        ; transpose for the first stage
-        movdqa      xmm1,       xmm0        ; 00 01 02 03 04 05 06 07
-        movdqa      xmm5,       xmm4        ; 20 21 22 23 24 25 26 27
-
-        punpcklwd   xmm0,       xmm2        ; 00 10 01 11 02 12 03 13
-        punpckhwd   xmm1,       xmm2        ; 04 14 05 15 06 16 07 17
-
-        punpcklwd   xmm4,       xmm3        ; 20 30 21 31 22 32 23 33
-        punpckhwd   xmm5,       xmm3        ; 24 34 25 35 26 36 27 37
-
-        movdqa      xmm2,       xmm0        ; 00 10 01 11 02 12 03 13
-        punpckldq   xmm0,       xmm4        ; 00 10 20 30 01 11 21 31
-
-        punpckhdq   xmm2,       xmm4        ; 02 12 22 32 03 13 23 33
-
-        movdqa      xmm4,       xmm1        ; 04 14 05 15 06 16 07 17
-        punpckldq   xmm4,       xmm5        ; 04 14 24 34 05 15 25 35
-
-        punpckhdq   xmm1,       xmm5        ; 06 16 26 36 07 17 27 37
-        movdqa      xmm3,       xmm2        ; 02 12 22 32 03 13 23 33
-
-        punpckhqdq  xmm3,       xmm1        ; 03 13 23 33 07 17 27 37
-        punpcklqdq  xmm2,       xmm1        ; 02 12 22 32 06 16 26 36
-
-        movdqa      xmm1,       xmm0        ; 00 10 20 30 01 11 21 31
-        punpcklqdq  xmm0,       xmm4        ; 00 10 20 30 04 14 24 34
-
-        punpckhqdq  xmm1,       xmm4        ; 01 11 21 32 05 15 25 35
-
-        ; xmm0 0
-        ; xmm1 1
-        ; xmm2 2
-        ; xmm3 3
-
-        ; first stage
-        movdqa      xmm5,       xmm0
-        movdqa      xmm4,       xmm1
-
-        paddw       xmm0,       xmm3        ; a1 = 0 + 3
-        paddw       xmm1,       xmm2        ; b1 = 1 + 2
-
-        psubw       xmm4,       xmm2        ; c1 = 1 - 2
-        psubw       xmm5,       xmm3        ; d1 = 0 - 3
-
-        psllw       xmm5,        3
-        psllw       xmm4,        3
-
-        psllw       xmm0,        3
-        psllw       xmm1,        3
-
-        ; output 0 and 2
-        movdqa      xmm2,       xmm0        ; a1
-
-        paddw       xmm0,       xmm1        ; op[0] = a1 + b1
-        psubw       xmm2,       xmm1        ; op[2] = a1 - b1
-
-        ; output 1 and 3
-        ; interleave c1, d1
-        movdqa      xmm1,       xmm5        ; d1
-        punpcklwd   xmm1,       xmm4        ; c1 d1
-        punpckhwd   xmm5,       xmm4        ; c1 d1
-
-        movdqa      xmm3,       xmm1
-        movdqa      xmm4,       xmm5
-
-        pmaddwd     xmm1,       XMMWORD PTR[GLOBAL (_5352_2217)]    ; c1*2217 + d1*5352
-        pmaddwd     xmm4,       XMMWORD PTR[GLOBAL (_5352_2217)]    ; c1*2217 + d1*5352
-
-        pmaddwd     xmm3,       XMMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352
-        pmaddwd     xmm5,       XMMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352
-
-        paddd       xmm1,       XMMWORD PTR[GLOBAL(_14500)]
-        paddd       xmm4,       XMMWORD PTR[GLOBAL(_14500)]
-        paddd       xmm3,       XMMWORD PTR[GLOBAL(_7500)]
-        paddd       xmm5,       XMMWORD PTR[GLOBAL(_7500)]
-
-        psrad       xmm1,       12          ; (c1 * 2217 + d1 * 5352 +  14500)>>12
-        psrad       xmm4,       12          ; (c1 * 2217 + d1 * 5352 +  14500)>>12
-        psrad       xmm3,       12          ; (d1 * 2217 - c1 * 5352 +   7500)>>12
-        psrad       xmm5,       12          ; (d1 * 2217 - c1 * 5352 +   7500)>>12
-
-        packssdw    xmm1,       xmm4        ; op[1]
-        packssdw    xmm3,       xmm5        ; op[3]
-
-        ; done with vertical
-        ; transpose for the second stage
-        movdqa      xmm4,       xmm0         ; 00 10 20 30 04 14 24 34
-        movdqa      xmm5,       xmm2         ; 02 12 22 32 06 16 26 36
-
-        punpcklwd   xmm0,       xmm1         ; 00 01 10 11 20 21 30 31
-        punpckhwd   xmm4,       xmm1         ; 04 05 14 15 24 25 34 35
-
-        punpcklwd   xmm2,       xmm3         ; 02 03 12 13 22 23 32 33
-        punpckhwd   xmm5,       xmm3         ; 06 07 16 17 26 27 36 37
-
-        movdqa      xmm1,       xmm0         ; 00 01 10 11 20 21 30 31
-        punpckldq   xmm0,       xmm2         ; 00 01 02 03 10 11 12 13
-
-        punpckhdq   xmm1,       xmm2         ; 20 21 22 23 30 31 32 33
-
-        movdqa      xmm2,       xmm4         ; 04 05 14 15 24 25 34 35
-        punpckldq   xmm2,       xmm5         ; 04 05 06 07 14 15 16 17
-
-        punpckhdq   xmm4,       xmm5         ; 24 25 26 27 34 35 36 37
-        movdqa      xmm3,       xmm1         ; 20 21 22 23 30 31 32 33
-
-        punpckhqdq  xmm3,       xmm4         ; 30 31 32 33 34 35 36 37
-        punpcklqdq  xmm1,       xmm4         ; 20 21 22 23 24 25 26 27
-
-        movdqa      xmm4,       xmm0         ; 00 01 02 03 10 11 12 13
-        punpcklqdq  xmm0,       xmm2         ; 00 01 02 03 04 05 06 07
-
-        punpckhqdq  xmm4,       xmm2         ; 10 11 12 13 14 15 16 17
-
-        ; xmm0 0
-        ; xmm1 4
-        ; xmm2 1
-        ; xmm3 3
-
-        movdqa      xmm5,       xmm0
-        movdqa      xmm2,       xmm1
-
-        paddw       xmm0,       xmm3        ; a1 = 0 + 3
-        paddw       xmm1,       xmm4        ; b1 = 1 + 2
-
-        psubw       xmm4,       xmm2        ; c1 = 1 - 2
-        psubw       xmm5,       xmm3        ; d1 = 0 - 3
-
-        pxor        xmm6,       xmm6        ; zero out for compare
-
-        pcmpeqw     xmm6,       xmm5        ; d1 != 0
-
-        pandn       xmm6,       XMMWORD PTR[GLOBAL(_cmp_mask8x4)]   ; clear upper,
-                                                                    ; and keep bit 0 of lower
-
-        ; output 0 and 2
-        movdqa      xmm2,       xmm0        ; a1
-
-        paddw       xmm0,       xmm1        ; a1 + b1
-        psubw       xmm2,       xmm1        ; a1 - b1
-
-        paddw       xmm0,       XMMWORD PTR[GLOBAL(_7w)]
-        paddw       xmm2,       XMMWORD PTR[GLOBAL(_7w)]
-
-        psraw       xmm0,       4           ; op[0] = (a1 + b1 + 7)>>4
-        psraw       xmm2,       4           ; op[8] = (a1 - b1 + 7)>>4
-
-        ; output 1 and 3
-        ; interleave c1, d1
-        movdqa      xmm1,       xmm5        ; d1
-        punpcklwd   xmm1,       xmm4        ; c1 d1
-        punpckhwd   xmm5,       xmm4        ; c1 d1
-
-        movdqa      xmm3,       xmm1
-        movdqa      xmm4,       xmm5
-
-        pmaddwd     xmm1,       XMMWORD PTR[GLOBAL (_5352_2217)]    ; c1*2217 + d1*5352
-        pmaddwd     xmm4,       XMMWORD PTR[GLOBAL (_5352_2217)]    ; c1*2217 + d1*5352
-
-        pmaddwd     xmm3,       XMMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352
-        pmaddwd     xmm5,       XMMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352
-
-        paddd       xmm1,       XMMWORD PTR[GLOBAL(_12000)]
-        paddd       xmm4,       XMMWORD PTR[GLOBAL(_12000)]
-        paddd       xmm3,       XMMWORD PTR[GLOBAL(_51000)]
-        paddd       xmm5,       XMMWORD PTR[GLOBAL(_51000)]
-
-        psrad       xmm1,       16          ; (c1 * 2217 + d1 * 5352 +  14500)>>16
-        psrad       xmm4,       16          ; (c1 * 2217 + d1 * 5352 +  14500)>>16
-        psrad       xmm3,       16          ; (d1 * 2217 - c1 * 5352 +   7500)>>16
-        psrad       xmm5,       16          ; (d1 * 2217 - c1 * 5352 +   7500)>>16
-
-        packssdw    xmm1,       xmm4        ; op[4]
-        packssdw    xmm3,       xmm5        ; op[12]
-
-        paddw       xmm1,       xmm6        ; op[4] += (d1!=0)
-
-        movdqa      xmm4,       xmm0
-        movdqa      xmm5,       xmm2
-
-        punpcklqdq  xmm0,       xmm1
-        punpckhqdq  xmm4,       xmm1
-
-        punpcklqdq  xmm2,       xmm3
-        punpckhqdq  xmm5,       xmm3
-
-        movdqa      XMMWORD PTR[output + 0 ],  xmm0
-        movdqa      XMMWORD PTR[output + 16],  xmm2
-        movdqa      XMMWORD PTR[output + 32],  xmm4
-        movdqa      XMMWORD PTR[output + 48],  xmm5
-
-    STACK_FRAME_DESTROY
-
-SECTION_RODATA
-align 16
-_5352_2217:
-    dw 5352
-    dw 2217
-    dw 5352
-    dw 2217
-    dw 5352
-    dw 2217
-    dw 5352
-    dw 2217
-align 16
-_2217_neg5352:
-    dw 2217
-    dw -5352
-    dw 2217
-    dw -5352
-    dw 2217
-    dw -5352
-    dw 2217
-    dw -5352
-align 16
-_mult_add:
-    times 8 dw 1
-align 16
-_cmp_mask:
-    times 4 dw 1
-    times 4 dw 0
-align 16
-_cmp_mask8x4:
-    times 8 dw 1
-align 16
-_mult_sub:
-    dw 1
-    dw -1
-    dw 1
-    dw -1
-    dw 1
-    dw -1
-    dw 1
-    dw -1
-align 16
-_7:
-    times 4 dd 7
-align 16
-_7w:
-    times 8 dw 7
-align 16
-_14500:
-    times 4 dd 14500
-align 16
-_7500:
-    times 4 dd 7500
-align 16
-_12000:
-    times 4 dd 12000
-align 16
-_51000:
-    times 4 dd 51000
diff --git a/vp9/encoder/x86/vp9_dct_sse2_intrinsics.c b/vp9/encoder/x86/vp9_dct_sse2_intrinsics.c
index 358d979eb..49cb837e0 100644
--- a/vp9/encoder/x86/vp9_dct_sse2_intrinsics.c
+++ b/vp9/encoder/x86/vp9_dct_sse2_intrinsics.c
@@ -11,6 +11,111 @@
 #include <emmintrin.h>  // SSE2
 #include "vp9/common/vp9_idct.h"  // for cospi constants
 
+void vp9_short_fdct4x4_sse2(int16_t *input, int16_t *output, int pitch) {
+  // The 2D transform is done with two passes which are actually pretty
+  // similar. In the first one, we transform the columns and transpose
+  // the results. In the second one, we transform the rows. To achieve that,
+  // as the first pass results are transposed, we tranpose the columns (that
+  // is the transposed rows) and transpose the results (so that it goes back
+  // in normal/row positions).
+  const int stride = pitch >> 1;
+  int pass;
+  // Constants
+  //    When we use them, in one case, they are all the same. In all others
+  //    it's a pair of them that we need to repeat four times. This is done
+  //    by constructing the 32 bit constant corresponding to that pair.
+  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
+  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+  const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
+  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+  const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1);
+  const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0);
+  const __m128i kOne = _mm_set1_epi16(1);
+  __m128i in0, in1, in2, in3;
+  // Load inputs.
+  {
+    in0  = _mm_loadl_epi64((const __m128i *)(input +  0 * stride));
+    in1  = _mm_loadl_epi64((const __m128i *)(input +  1 * stride));
+    in2  = _mm_loadl_epi64((const __m128i *)(input +  2 * stride));
+    in3  = _mm_loadl_epi64((const __m128i *)(input +  3 * stride));
+    // x = x << 4
+    in0 = _mm_slli_epi16(in0, 4);
+    in1 = _mm_slli_epi16(in1, 4);
+    in2 = _mm_slli_epi16(in2, 4);
+    in3 = _mm_slli_epi16(in3, 4);
+    // if (i == 0 && input[0]) input[0] += 1;
+    {
+      // The mask will only contain wether the first value is zero, all
+      // other comparison will fail as something shifted by 4 (above << 4)
+      // can never be equal to one. To increment in the non-zero case, we
+      // add the mask and one for the first element:
+      //   - if zero, mask = -1, v = v - 1 + 1 = v
+      //   - if non-zero, mask = 0, v = v + 0 + 1 = v + 1
+      __m128i mask = _mm_cmpeq_epi16(in0, k__nonzero_bias_a);
+      in0 = _mm_add_epi16(in0, mask);
+      in0 = _mm_add_epi16(in0, k__nonzero_bias_b);
+    }
+  }
+  // Do the two transform/transpose passes
+  for (pass = 0; pass < 2; ++pass) {
+    // Transform 1/2: Add/substract
+    const __m128i r0 = _mm_add_epi16(in0, in3);
+    const __m128i r1 = _mm_add_epi16(in1, in2);
+    const __m128i r2 = _mm_sub_epi16(in1, in2);
+    const __m128i r3 = _mm_sub_epi16(in0, in3);
+    // Transform 1/2: Interleave to do the multiply by constants which gets us
+    //                into 32 bits.
+    const __m128i t0 = _mm_unpacklo_epi16(r0, r1);
+    const __m128i t2 = _mm_unpacklo_epi16(r2, r3);
+    const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16);
+    const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16);
+    const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08);
+    const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24);
+    const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+    const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+    const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
+    const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
+    const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+    const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+    const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
+    const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
+    // Combine and transpose
+    const __m128i res0 = _mm_packs_epi32(w0, w2);
+    const __m128i res1 = _mm_packs_epi32(w4, w6);
+    // 00 01 02 03 20 21 22 23
+    // 10 11 12 13 30 31 32 33
+    const __m128i tr0_0 = _mm_unpacklo_epi16(res0, res1);
+    const __m128i tr0_1 = _mm_unpackhi_epi16(res0, res1);
+    // 00 10 01 11 02 12 03 13
+    // 20 30 21 31 22 32 23 33
+    in0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
+    in2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
+    // 00 10 20 30 01 11 21 31      in0 contains 0 followed by 1
+    // 02 12 22 32 03 13 23 33      in2 contains 2 followed by 3
+    if (0 == pass) {
+      // Extract values in the high part for second pass as transform code
+      // only uses the first four values.
+      in1 = _mm_unpackhi_epi64(in0, in0);
+      in3 = _mm_unpackhi_epi64(in2, in2);
+    } else {
+      // Post-condition output and store it (v + 1) >> 2, taking advantage
+      // of the fact 1/3 are stored just after 0/2.
+      __m128i out01 = _mm_add_epi16(in0, kOne);
+      __m128i out23 = _mm_add_epi16(in2, kOne);
+      out01 = _mm_srai_epi16(out01, 2);
+      out23 = _mm_srai_epi16(out23, 2);
+      _mm_storeu_si128((__m128i *)(output + 0 * 4), out01);
+      _mm_storeu_si128((__m128i *)(output + 2 * 4), out23);
+    }
+  }
+}
+
+void vp9_short_fdct8x4_sse2(int16_t *input, int16_t *output, int pitch) {
+  vp9_short_fdct4x4_sse2(input, output, pitch);
+  vp9_short_fdct4x4_sse2(input + 4, output + 16, pitch);
+}
+
 void vp9_short_fdct8x8_sse2(int16_t *input, int16_t *output, int pitch) {
   const int stride = pitch >> 1;
   int pass;
diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk
index 43dba1373..13785f71b 100644
--- a/vp9/vp9cx.mk
+++ b/vp9/vp9cx.mk
@@ -90,7 +90,6 @@ VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_sad_mmx.asm
 VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_dct_mmx.asm
 VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_dct_mmx.h
 VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_subtract_mmx.asm
-VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct_sse2.asm
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_variance_sse2.c
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_variance_impl_sse2.asm
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_sad_sse2.asm

From 3810bca9a9cfac4e3f3beb3d4a26a065102e3593 Mon Sep 17 00:00:00 2001
From: Johann <johann.koenig@duck.com>
Date: Wed, 17 Apr 2013 10:52:50 -0700
Subject: [PATCH 10/19] Fix Android ndk-build

Add the config directory to the rtcd generation script. libvpx is
configured in the jni directory but ndk-build is intended to be run from
the next directory up. Currently it needs to be run from the jni
directory but this is being looked in to.

Add a trailing slash to allow the variable to be empty.

Reduce offset generation to the files which are actually used.

Change-Id: Ia84fac37e8998ba647423d0ee45fc66a891ce10c
---
 build/make/Android.mk | 38 ++++++++++++++++++++------------------
 libs.mk               |  2 +-
 2 files changed, 21 insertions(+), 19 deletions(-)

diff --git a/build/make/Android.mk b/build/make/Android.mk
index cf6221017..1ff0884fc 100644
--- a/build/make/Android.mk
+++ b/build/make/Android.mk
@@ -48,7 +48,7 @@
 # Running ndk-build will build libvpx and include it in your project.
 #
 
-CONFIG_DIR := $(LOCAL_PATH)
+CONFIG_DIR := $(LOCAL_PATH)/
 LIBVPX_PATH := $(LOCAL_PATH)/libvpx
 ASM_CNV_PATH_LOCAL := $(TARGET_ARCH_ABI)/ads2gas
 ASM_CNV_PATH := $(LOCAL_PATH)/$(ASM_CNV_PATH_LOCAL)
@@ -56,9 +56,9 @@ ASM_CNV_PATH := $(LOCAL_PATH)/$(ASM_CNV_PATH_LOCAL)
 # Makefiles created by the libvpx configure process
 # This will need to be fixed to handle x86.
 ifeq ($(TARGET_ARCH_ABI),armeabi-v7a)
-  include $(CONFIG_DIR)/libs-armv7-android-gcc.mk
+  include $(CONFIG_DIR)libs-armv7-android-gcc.mk
 else
-  include $(CONFIG_DIR)/libs-armv5te-android-gcc.mk
+  include $(CONFIG_DIR)libs-armv5te-android-gcc.mk
 endif
 
 # Rule that is normally in Makefile created by libvpx
@@ -106,26 +106,25 @@ $$(eval $$(call ev-build-file))
 
 $(1) : $$(_OBJ) $(2)
 	@mkdir -p $$(dir $$@)
-	@grep $(OFFSET_PATTERN) $$< | tr -d '\#' | $(CONFIG_DIR)/$(ASM_CONVERSION) > $$@
+	@grep $(OFFSET_PATTERN) $$< | tr -d '\#' | $(CONFIG_DIR)$(ASM_CONVERSION) > $$@
 endef
 
 # Use ads2gas script to convert from RVCT format to GAS format.  This passes
 #  puts the processed file under $(ASM_CNV_PATH).  Local clean rule
 #  to handle removing these
-ASM_CNV_OFFSETS_DEPEND = $(ASM_CNV_PATH)/vp8_asm_com_offsets.asm
-ifeq ($(CONFIG_VP8_DECODER), yes)
-  ASM_CNV_OFFSETS_DEPEND += $(ASM_CNV_PATH)/vp8_asm_dec_offsets.asm
-endif
 ifeq ($(CONFIG_VP8_ENCODER), yes)
   ASM_CNV_OFFSETS_DEPEND += $(ASM_CNV_PATH)/vp8_asm_enc_offsets.asm
 endif
+ifeq ($(HAVE_NEON), yes)
+  ASM_CNV_OFFSETS_DEPEND += $(ASM_CNV_PATH)/vpx_scale_asm_offsets.asm
+endif
 
 .PRECIOUS: %.asm.s
 $(ASM_CNV_PATH)/libvpx/%.asm.s: $(LIBVPX_PATH)/%.asm $(ASM_CNV_OFFSETS_DEPEND)
 	@mkdir -p $(dir $@)
-	@$(CONFIG_DIR)/$(ASM_CONVERSION) <$< > $@
+	@$(CONFIG_DIR)$(ASM_CONVERSION) <$< > $@
 
-# For building vpx_rtcd.h, which has a rule in libs.mk
+# For building *_rtcd.h, which have rules in libs.mk
 TGT_ISA:=$(word 1, $(subst -, ,$(TOOLCHAIN)))
 target := libs
 
@@ -177,7 +176,14 @@ ifeq ($(CONFIG_RUNTIME_CPU_DETECT),yes)
   LOCAL_STATIC_LIBRARIES := cpufeatures
 endif
 
-$(foreach file, $(LOCAL_SRC_FILES), $(LOCAL_PATH)/$(file)): vpx_rtcd.h
+# Add a dependency to force generation of the RTCD files.
+ifeq ($(CONFIG_VP8), yes)
+$(foreach file, $(LOCAL_SRC_FILES), $(LOCAL_PATH)/$(file)): vp8_rtcd.h
+endif
+ifeq ($(CONFIG_VP9), yes)
+$(foreach file, $(LOCAL_SRC_FILES), $(LOCAL_PATH)/$(file)): vp9_rtcd.h
+endif
+$(foreach file, $(LOCAL_SRC_FILES), $(LOCAL_PATH)/$(file)): vpx_scale_rtcd.h
 
 .PHONY: clean
 clean:
@@ -189,14 +195,10 @@ clean:
 
 include $(BUILD_SHARED_LIBRARY)
 
-$(eval $(call asm_offsets_template,\
-    $(ASM_CNV_PATH)/vp8_asm_com_offsets.asm, \
-    $(LIBVPX_PATH)/vp8/common/vp8_asm_com_offsets.c))
-
-ifeq ($(CONFIG_VP8_DECODER), yes)
+ifeq ($(HAVE_NEON), yes)
   $(eval $(call asm_offsets_template,\
-    $(ASM_CNV_PATH)/vp8_asm_dec_offsets.asm, \
-    $(LIBVPX_PATH)/vp8/decoder/vp8_asm_dec_offsets.c))
+    $(ASM_CNV_PATH)/vpx_scale_asm_offsets.asm, \
+    $(LIBVPX_PATH)/vpx_scale/vpx_scale_asm_offsets.c))
 endif
 
 ifeq ($(CONFIG_VP8_ENCODER), yes)
diff --git a/libs.mk b/libs.mk
index 2281bd059..adcde33f7 100644
--- a/libs.mk
+++ b/libs.mk
@@ -51,7 +51,7 @@ $$(BUILD_PFX)$(1).h: $$(SRC_PATH_BARE)/$(2)
 	@echo "    [CREATE] $$@"
 	$$(qexec)$$(SRC_PATH_BARE)/build/make/rtcd.sh --arch=$$(TGT_ISA) \
           --sym=$(1) \
-          --config=$$(target)$$(if $$(FAT_ARCHS),,-$$(TOOLCHAIN)).mk \
+          --config=$$(CONFIG_DIR)$$(target)$$(if $$(FAT_ARCHS),,-$$(TOOLCHAIN)).mk \
           $$(RTCD_OPTIONS) $$^ > $$@
 CLEAN-OBJS += $$(BUILD_PFX)$(1).h
 RTCD += $$(BUILD_PFX)$(1).h

From 2bb8ecad02e2dd1edd6495b2eb8162eb591ba171 Mon Sep 17 00:00:00 2001
From: Frank Galligan <fgalligan@google.com>
Date: Wed, 17 Apr 2013 15:46:12 -0700
Subject: [PATCH 11/19] libvpx: Fix vp9 clang build.

- UNINITIALIZED_IS_SAFE Macro triggers a warning in Clang for
  structs.

Change-Id: Ib02c82f1fede7826564e17ccb7171c6fb18b8e44
---
 vp9/decoder/vp9_decodframe.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vp9/decoder/vp9_decodframe.c b/vp9/decoder/vp9_decodframe.c
index fea6433b2..eb1b4896e 100644
--- a/vp9/decoder/vp9_decodframe.c
+++ b/vp9/decoder/vp9_decodframe.c
@@ -1550,7 +1550,7 @@ static void decode_tiles(VP9D_COMP *pbi,
   if (pbi->oxcf.inv_tile_order) {
     const int n_cols = pc->tile_columns;
     const uint8_t *data_ptr2[4][1 << 6];
-    BOOL_DECODER UNINITIALIZED_IS_SAFE(bc_bak);
+    BOOL_DECODER bc_bak = {0};
 
     // pre-initialize the offsets, we're going to read in inverse order
     data_ptr2[0][0] = data_ptr;

From dbd050c59f1969405958d00421af42fb0ad33a2a Mon Sep 17 00:00:00 2001
From: James Zern <jzern@google.com>
Date: Thu, 18 Apr 2013 13:01:57 -0700
Subject: [PATCH 12/19] vpxdec: correct VP[89] fourccs

should have no effect as they are used in nestegg mappings, but aligns
the defines with vpxenc.c

Change-Id: Ic2295cd63701894c2963274239602b54cbb58631
---
 vpxdec.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/vpxdec.c b/vpxdec.c
index 41c654fae..df0b8199c 100644
--- a/vpxdec.c
+++ b/vpxdec.c
@@ -49,8 +49,8 @@
 
 static const char *exec_name;
 
-#define VP8_FOURCC (0x00385056)
-#define VP9_FOURCC (0x00395056)
+#define VP8_FOURCC (0x30385056)
+#define VP9_FOURCC (0x30395056)
 static const struct {
   char const *name;
   const vpx_codec_iface_t *(*iface)(void);

From ac980b71cff73a80c2ea020aeed3520731cb37cf Mon Sep 17 00:00:00 2001
From: Johann <johann.koenig@duck.com>
Date: Tue, 23 Apr 2013 09:55:03 -0700
Subject: [PATCH 13/19] Improve sign consistency.

Fix warning on windows: signed/unsigned mismatch on lines 415, 454

Comparison was between size_t data_sz >= int index_sz on 415 and
unsigned int data_sz >= int index_sz on 454. Both might be changed to
size_t but that would be tracing and replacing all comparisons is
outside the scope of this change.

In the rest of these two functions ensure unsigned values are used
consistently.

Change-Id: I922b399ceca612a92f44b9d1d331c1c6bae9d768
---
 vp9/vp9_dx_iface.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/vp9/vp9_dx_iface.c b/vp9/vp9_dx_iface.c
index d0c23f07a..69f08a7af 100644
--- a/vp9/vp9_dx_iface.c
+++ b/vp9/vp9_dx_iface.c
@@ -408,9 +408,9 @@ static void parse_superframe_index(const uint8_t *data,
   *count = 0;
 
   if ((marker & 0xe0) == 0xc0) {
-    const int frames = (marker & 0x7) + 1;
-    const int mag = ((marker >> 3) & 3) + 1;
-    const int index_sz = 2 + mag  * frames;
+    const uint32_t frames = (marker & 0x7) + 1;
+    const uint32_t mag = ((marker >> 3) & 0x3) + 1;
+    const size_t index_sz = 2 + mag * frames;
 
     if (data_sz >= index_sz && data[data_sz - index_sz] == marker) {
       // found a valid superframe index
@@ -418,7 +418,7 @@ static void parse_superframe_index(const uint8_t *data,
       const uint8_t *x = data + data_sz - index_sz + 1;
 
       for (i = 0; i < frames; i++) {
-        int this_sz = 0;
+        uint32_t this_sz = 0;
 
         for (j = 0; j < mag; j++)
           this_sz |= (*x++) << (j * 8);
@@ -447,9 +447,9 @@ static vpx_codec_err_t vp9_decode(vpx_codec_alg_priv_t  *ctx,
     // Skip over the superframe index, if present
     if (data_sz && (*data_start & 0xe0) == 0xc0) {
       const uint8_t marker = *data_start;
-      const int frames = (marker & 0x7) + 1;
-      const int mag = ((marker >> 3) & 3) + 1;
-      const int index_sz = 2 + mag  * frames;
+      const uint32_t frames = (marker & 0x7) + 1;
+      const uint32_t mag = ((marker >> 3) & 0x3) + 1;
+      const uint32_t index_sz = 2 + mag * frames;
 
       if (data_sz >= index_sz && data_start[index_sz - 1] == marker) {
         data_start += index_sz;

From 7af58d43389a3ef1b5c900dd8848c824c57b666f Mon Sep 17 00:00:00 2001
From: Johann <johann.koenig@duck.com>
Date: Tue, 23 Apr 2013 10:10:10 -0700
Subject: [PATCH 14/19] Resolve declaration and implementation.

Clean Windows build warnings:
warning C4028: formal parameter <N> different from declaration

This was fixed independently in master and experimental but the fixes
were in opposite directions. One added const to the declaration and the
other removed it from the implementation.

Also update the variable names. This doesn't modify the data so call it
ref, matching the functions in the vicinity, rather than dst.

Change-Id: I2ffc6b4a874cb98c26487b909d20a5e099b5582c
---
 vp9/common/vp9_findnearmv.c | 20 ++++++++++----------
 vp9/common/vp9_rtcd_defs.sh |  2 +-
 2 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/vp9/common/vp9_findnearmv.c b/vp9/common/vp9_findnearmv.c
index f6d6932cc..a063ca477 100644
--- a/vp9/common/vp9_findnearmv.c
+++ b/vp9/common/vp9_findnearmv.c
@@ -77,11 +77,11 @@ unsigned int vp9_variance16x2_c(const uint8_t *src_ptr,
 }
 
 unsigned int vp9_sub_pixel_variance16x2_c(const uint8_t *src_ptr,
-                                          int  src_pixels_per_line,
+                                          int  source_stride,
                                           int  xoffset,
                                           int  yoffset,
-                                          const uint8_t *dst_ptr,
-                                          int dst_pixels_per_line,
+                                          const uint8_t *ref_ptr,
+                                          int ref_stride,
                                           unsigned int *sse) {
   uint16_t FData3[16 * 3];  // Temp data buffer used in filtering
   uint8_t temp2[2 * 16];
@@ -91,18 +91,18 @@ unsigned int vp9_sub_pixel_variance16x2_c(const uint8_t *src_ptr,
   VFilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
 
   var_filter_block2d_bil_first_pass(src_ptr, FData3,
-                                    src_pixels_per_line, 1, 3, 16, HFilter);
+                                    source_stride, 1, 3, 16, HFilter);
   var_filter_block2d_bil_second_pass(FData3, temp2, 16, 16, 2, 16, VFilter);
 
-  return vp9_variance16x2_c(temp2, 16, dst_ptr, dst_pixels_per_line, sse);
+  return vp9_variance16x2_c(temp2, 16, ref_ptr, ref_stride, sse);
 }
 
 unsigned int vp9_sub_pixel_variance2x16_c(const uint8_t *src_ptr,
-                                          int  src_pixels_per_line,
+                                          int  source_stride,
                                           int  xoffset,
                                           int  yoffset,
-                                          const uint8_t *dst_ptr,
-                                          int dst_pixels_per_line,
+                                          const uint8_t *ref_ptr,
+                                          int ref_stride,
                                           unsigned int *sse) {
   uint16_t FData3[2 * 17];  // Temp data buffer used in filtering
   uint8_t temp2[2 * 16];
@@ -112,10 +112,10 @@ unsigned int vp9_sub_pixel_variance2x16_c(const uint8_t *src_ptr,
   VFilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
 
   var_filter_block2d_bil_first_pass(src_ptr, FData3,
-                                    src_pixels_per_line, 1, 17, 2, HFilter);
+                                    source_stride, 1, 17, 2, HFilter);
   var_filter_block2d_bil_second_pass(FData3, temp2, 2, 2, 16, 2, VFilter);
 
-  return vp9_variance2x16_c(temp2, 2, dst_ptr, dst_pixels_per_line, sse);
+  return vp9_variance2x16_c(temp2, 2, ref_ptr, ref_stride, sse);
 }
 
 #if CONFIG_USESELECTREFMV
diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh
index 46495cb11..43bc3cb1f 100644
--- a/vp9/common/vp9_rtcd_defs.sh
+++ b/vp9/common/vp9_rtcd_defs.sh
@@ -252,7 +252,7 @@ specialize vp9_sad16x3 sse2
 prototype unsigned int vp9_sad3x16 "const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int ref_stride"
 specialize vp9_sad3x16 sse2
 
-prototype unsigned int vp9_sub_pixel_variance16x2 "const uint8_t *src_ptr, const int source_stride, const int xoffset, const int  yoffset, const uint8_t *ref_ptr, const int ref_stride, unsigned int *sse"
+prototype unsigned int vp9_sub_pixel_variance16x2 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
 specialize vp9_sub_pixel_variance16x2 sse2
 
 #

From fe74c4286aca842aee5f89c76385e7d16018443e Mon Sep 17 00:00:00 2001
From: Johann <johann.koenig@duck.com>
Date: Wed, 24 Apr 2013 09:08:56 -0700
Subject: [PATCH 15/19] Rename quantize_sse2_intrinsics.c

The only reason for the _intrinsics part of the file name was for the
interim period where only one of the functions was redone and the base
file name was the same.

Change-Id: I7851154f1633d48821bee885b1cadb2148e65a23
---
 .../x86/{quantize_sse2_intrinsics.c => quantize_sse2.c}     | 0
 vp8/vp8cx.mk                                                | 6 +++---
 2 files changed, 3 insertions(+), 3 deletions(-)
 rename vp8/encoder/x86/{quantize_sse2_intrinsics.c => quantize_sse2.c} (100%)

diff --git a/vp8/encoder/x86/quantize_sse2_intrinsics.c b/vp8/encoder/x86/quantize_sse2.c
similarity index 100%
rename from vp8/encoder/x86/quantize_sse2_intrinsics.c
rename to vp8/encoder/x86/quantize_sse2.c
diff --git a/vp8/vp8cx.mk b/vp8/vp8cx.mk
index ca9f6a62e..7d1904aaf 100644
--- a/vp8/vp8cx.mk
+++ b/vp8/vp8cx.mk
@@ -89,12 +89,12 @@ VP8_CX_SRCS-$(HAVE_MMX) += encoder/x86/subtract_mmx.asm
 VP8_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp8_enc_stubs_mmx.c
 VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/dct_sse2.asm
 VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/fwalsh_sse2.asm
-VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/quantize_sse2_intrinsics.c
+VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/quantize_sse2.c
 
 # TODO(johann) make this generic
 ifeq ($(HAVE_SSE2),yes)
-vp8/encoder/x86/quantize_sse2_intrinsics.c.o: CFLAGS += -msse2
-vp8/encoder/x86/quantize_sse2_intrinsics.c.d: CFLAGS += -msse2
+vp8/encoder/x86/quantize_sse2.c.o: CFLAGS += -msse2
+vp8/encoder/x86/quantize_sse2.c.d: CFLAGS += -msse2
 endif
 
 ifeq ($(CONFIG_TEMPORAL_DENOISING),yes)

From 53a4620271975ef41b0d71181849f7291bd5a587 Mon Sep 17 00:00:00 2001
From: Johann <johannkoenig@google.com>
Date: Wed, 24 Apr 2013 14:26:35 -0700
Subject: [PATCH 16/19] Change default iOS dev path

This can be manually overridden with --libc=

Change-Id: I0b857c751d5dc5423f79785e934bc8a714758e75
---
 build/make/configure.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/build/make/configure.sh b/build/make/configure.sh
index 4d0cad23e..23dc87f31 100755
--- a/build/make/configure.sh
+++ b/build/make/configure.sh
@@ -918,7 +918,7 @@ process_common_toolchain() {
             add_ldflags -arch_only ${tgt_isa}
 
             if [ -z "${alt_libc}" ]; then
-                alt_libc=${SDK_PATH}/SDKs/iPhoneOS5.1.sdk
+                alt_libc=${SDK_PATH}/SDKs/iPhoneOS6.0.sdk
             fi
 
             add_cflags  "-isysroot ${alt_libc}"

From c5b127afea85176e4b43c525eac238924e4d2116 Mon Sep 17 00:00:00 2001
From: Johann <johann.koenig@duck.com>
Date: Thu, 25 Apr 2013 11:13:02 -0700
Subject: [PATCH 17/19] Rename vp9_idct_x86.c

Remove similarly named header file. It is obsolete.

Move file to match naming style.

Adjust make file to include the file correctly and remove extra
unnecessary #if guard.

Change-Id: Ifba07ba9938a5df08a9f4eda54a3ac4d6983f7bf
---
 ...{vp9_idct_x86.c => vp9_idct_intrin_sse2.c} |  2 -
 vp9/common/x86/vp9_idct_x86.h                 | 51 -------------------
 vp9/vp9_common.mk                             |  7 ++-
 3 files changed, 3 insertions(+), 57 deletions(-)
 rename vp9/common/x86/{vp9_idct_x86.c => vp9_idct_intrin_sse2.c} (99%)
 delete mode 100644 vp9/common/x86/vp9_idct_x86.h

diff --git a/vp9/common/x86/vp9_idct_x86.c b/vp9/common/x86/vp9_idct_intrin_sse2.c
similarity index 99%
rename from vp9/common/x86/vp9_idct_x86.c
rename to vp9/common/x86/vp9_idct_intrin_sse2.c
index 811ed9899..dd7e68aa3 100644
--- a/vp9/common/x86/vp9_idct_x86.c
+++ b/vp9/common/x86/vp9_idct_intrin_sse2.c
@@ -15,7 +15,6 @@
 #include "vp9/common/vp9_common.h"
 #include "vp9/common/vp9_idct.h"
 
-#if HAVE_SSE2
 // In order to improve performance, clip absolute diff values to [0, 255],
 // which allows to keep the additions/subtractions in 8 bits.
 void vp9_dc_only_idct_add_sse2(int input_dc, uint8_t *pred_ptr,
@@ -1972,4 +1971,3 @@ void vp9_short_idct32x32_sse2(int16_t *input, int16_t *output, int pitch) {
     }
   }
 }
-#endif
diff --git a/vp9/common/x86/vp9_idct_x86.h b/vp9/common/x86/vp9_idct_x86.h
deleted file mode 100644
index bd66d8c72..000000000
--- a/vp9/common/x86/vp9_idct_x86.h
+++ /dev/null
@@ -1,51 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef VP9_COMMON_X86_VP9_IDCT_X86_H_
-#define VP9_COMMON_X86_VP9_IDCT_X86_H_
-
-/* Note:
- *
- * This platform is commonly built for runtime CPU detection. If you modify
- * any of the function mappings present in this file, be sure to also update
- * them in the function pointer initialization code
- */
-
-#if HAVE_MMX
-extern prototype_second_order(vp9_short_inv_walsh4x4_mmx);
-extern prototype_second_order(vp9_short_inv_walsh4x4_1_mmx);
-
-#if !CONFIG_RUNTIME_CPU_DETECT
-#undef vp9_idct_iwalsh16
-#define vp9_idct_iwalsh16 vp9_short_inv_walsh4x4_mmx
-
-#undef vp9_idct_iwalsh1
-#define vp9_idct_iwalsh1 vp9_short_inv_walsh4x4_1_mmx
-
-#endif
-#endif
-
-#if HAVE_SSE2
-
-extern prototype_second_order(vp9_short_inv_walsh4x4_sse2);
-
-#if !CONFIG_RUNTIME_CPU_DETECT
-
-#undef vp9_idct_iwalsh16
-#define vp9_idct_iwalsh16 vp9_short_inv_walsh4x4_sse2
-
-#endif
-
-#endif
-
-
-
-#endif
diff --git a/vp9/vp9_common.mk b/vp9/vp9_common.mk
index 5e1ff62f7..f5a4103f6 100644
--- a/vp9/vp9_common.mk
+++ b/vp9/vp9_common.mk
@@ -82,7 +82,6 @@ VP9_COMMON_SRCS-$(CONFIG_POSTPROC_VISUALIZER) += common/vp9_textblit.c
 VP9_COMMON_SRCS-yes += common/vp9_treecoder.c
 VP9_COMMON_SRCS-$(CONFIG_IMPLICIT_SEGMENTATION) += common/vp9_implicit_segmentation.c
 
-VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_idct_x86.h
 VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_loopfilter_x86.h
 VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_postproc_x86.h
 VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_asm_stubs.c
@@ -112,13 +111,13 @@ VP9_COMMON_SRCS-yes += common/vp9_maskingmv.c
 VP9_COMMON_SRCS-$(HAVE_SSE3) += common/x86/vp9_mask_sse3.asm
 endif
 
-VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_idct_x86.c
+VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_idct_intrin_sse2.c
 VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_sadmxn_sse2.c
 ifeq ($(HAVE_SSE2),yes)
-vp9/common/x86/vp9_idct_x86.c.o: CFLAGS += -msse2
+vp9/common/x86/vp9_idct_intrin_sse2.c.o: CFLAGS += -msse2
 vp9/common/x86/vp9_loopfilter_intrin_sse2.c.o: CFLAGS += -msse2
 vp9/common/x86/vp9_sadmxn_sse2.c.o: CFLAGS += -msse2
-vp9/common/x86/vp9_idct_x86.c.d: CFLAGS += -msse2
+vp9/common/x86/vp9_idct_intrin_sse2.c.d: CFLAGS += -msse2
 vp9/common/x86/vp9_loopfilter_intrin_sse2.c.d: CFLAGS += -msse2
 vp9/common/x86/vp9_sadmxn_sse2.c.d: CFLAGS += -msse2
 endif

From 863601c58968a5816ff17722432b5eedc6e13eae Mon Sep 17 00:00:00 2001
From: Johann <johann.koenig@duck.com>
Date: Thu, 25 Apr 2013 23:26:20 -0700
Subject: [PATCH 18/19] Normalize more intrinsic filenames

vp9_dequantize_x86 has only sse2 functions.

vp9_dct_sse2_intrinsics has no namespace collision and can drop
_intrinsics.

vp9_idct_mmx.h is unused.

Change-Id: Ic16e31fb372a1d1e841a62ecb4189fe8f95808ec
---
 ...dequantize_x86.c => vp9_dequantize_sse2.c} |  3 ---
 vp9/decoder/x86/vp9_idct_mmx.h                | 22 -------------------
 ...9_dct_sse2_intrinsics.c => vp9_dct_sse2.c} |  0
 vp9/vp9cx.mk                                  |  6 ++---
 vp9/vp9dx.mk                                  |  6 ++---
 5 files changed, 6 insertions(+), 31 deletions(-)
 rename vp9/decoder/x86/{vp9_dequantize_x86.c => vp9_dequantize_sse2.c} (99%)
 delete mode 100644 vp9/decoder/x86/vp9_idct_mmx.h
 rename vp9/encoder/x86/{vp9_dct_sse2_intrinsics.c => vp9_dct_sse2.c} (100%)

diff --git a/vp9/decoder/x86/vp9_dequantize_x86.c b/vp9/decoder/x86/vp9_dequantize_sse2.c
similarity index 99%
rename from vp9/decoder/x86/vp9_dequantize_x86.c
rename to vp9/decoder/x86/vp9_dequantize_sse2.c
index acfae2a27..1dfb8e08f 100644
--- a/vp9/decoder/x86/vp9_dequantize_x86.c
+++ b/vp9/decoder/x86/vp9_dequantize_sse2.c
@@ -15,8 +15,6 @@
 #include "vp9/common/vp9_common.h"
 #include "vp9/common/vp9_idct.h"
 
-#if HAVE_SSE2
-
 void vp9_add_residual_4x4_sse2(const int16_t *diff, const uint8_t *pred,
                                int pitch, uint8_t *dest, int stride) {
   const int width = 4;
@@ -452,4 +450,3 @@ void vp9_add_constant_residual_32x32_sse2(const int16_t diff,
     dest += 4 * stride;
   } while (--i);
 }
-#endif
diff --git a/vp9/decoder/x86/vp9_idct_mmx.h b/vp9/decoder/x86/vp9_idct_mmx.h
deleted file mode 100644
index 7d9829175..000000000
--- a/vp9/decoder/x86/vp9_idct_mmx.h
+++ /dev/null
@@ -1,22 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef VP9_DECODER_X86_VP9_IDCT_MMX_H_
-#define VP9_DECODER_X86_VP9_IDCT_MMX_H_
-
-
-void vp9_dequant_dc_idct_add_mmx(short *input, const short *dq,
-                                 unsigned char *pred, unsigned char *dest,
-                                 int pitch, int stride, int Dc);
-
-void vp9_dequant_idct_add_mmx(short *input, const short *dq, unsigned char *pred,
-                              unsigned char *dest, int pitch, int stride);
-
-#endif /* VP9_DECODER_X86_VP9_IDCT_MMX_H_ */
diff --git a/vp9/encoder/x86/vp9_dct_sse2_intrinsics.c b/vp9/encoder/x86/vp9_dct_sse2.c
similarity index 100%
rename from vp9/encoder/x86/vp9_dct_sse2_intrinsics.c
rename to vp9/encoder/x86/vp9_dct_sse2.c
diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk
index 13785f71b..51e24b846 100644
--- a/vp9/vp9cx.mk
+++ b/vp9/vp9cx.mk
@@ -109,10 +109,10 @@ VP9_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/vp9_quantize_mmx.asm
 VP9_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/vp9_encodeopt.asm
 VP9_CX_SRCS-$(ARCH_X86_64) += encoder/x86/vp9_ssim_opt.asm
 
-VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct_sse2_intrinsics.c
+VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct_sse2.c
 ifeq ($(HAVE_SSE2),yes)
-vp9/encoder/x86/vp9_dct_sse2_intrinsics.c.d: CFLAGS += -msse2
-vp9/encoder/x86/vp9_dct_sse2_intrinsics.c.o: CFLAGS += -msse2
+vp9/encoder/x86/vp9_dct_sse2.c.d: CFLAGS += -msse2
+vp9/encoder/x86/vp9_dct_sse2.c.o: CFLAGS += -msse2
 endif
 
 
diff --git a/vp9/vp9dx.mk b/vp9/vp9dx.mk
index 239ae30b6..5cab6fc1f 100644
--- a/vp9/vp9dx.mk
+++ b/vp9/vp9dx.mk
@@ -38,10 +38,10 @@ VP9_DX_SRCS-yes := $(filter-out $(VP9_DX_SRCS_REMOVE-yes),$(VP9_DX_SRCS-yes))
 
 VP9_DX_SRCS-$(HAVE_SSE2) += decoder/x86/vp9_idct_blk_sse2.c
 
-VP9_DX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += decoder/x86/vp9_dequantize_x86.c
+VP9_DX_SRCS-$(HAVE_SSE2) += decoder/x86/vp9_dequantize_sse2.c
 ifeq ($(HAVE_SSE2),yes)
-vp9/decoder/x86/vp9_dequantize_x86.c.o: CFLAGS += -msse2
-vp9/decoder/x86/vp9_dequantize_x86.c.d: CFLAGS += -msse2
+vp9/decoder/x86/vp9_dequantize_sse2.c.o: CFLAGS += -msse2
+vp9/decoder/x86/vp9_dequantize_sse2.c.d: CFLAGS += -msse2
 endif
 
 $(eval $(call asm_offsets_template,\

From e58852582d9b785e09dd202c302881a5faf941d2 Mon Sep 17 00:00:00 2001
From: "changjun.yang" <changjun.yang@intel.com>
Date: Fri, 26 Apr 2013 15:16:42 +0800
Subject: [PATCH 19/19] code cleanup for arm_cpudetect.c

Change-Id: I5c49a983ced45197e1035fa5615d71b0bdad4109
---
 vpx_ports/arm_cpudetect.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/vpx_ports/arm_cpudetect.c b/vpx_ports/arm_cpudetect.c
index 3c916f247..542ff6786 100644
--- a/vpx_ports/arm_cpudetect.c
+++ b/vpx_ports/arm_cpudetect.c
@@ -53,8 +53,6 @@ int arm_cpu_caps(void) {
   return flags & mask;
 }
 
-#elif defined(_MSC_VER) /* end !CONFIG_RUNTIME_CPU_DETECT */
-
 #elif defined(_MSC_VER) /* end !CONFIG_RUNTIME_CPU_DETECT */
 /*For GetExceptionCode() and EXCEPTION_ILLEGAL_INSTRUCTION.*/
 #define WIN32_LEAN_AND_MEAN