Replace copy_memNxM functions with a generic copy/avg function.

Change-Id: I3ce849452ed4f08527de9565a9914d5ee36170aa
2013-07-10 11:17:19 -07:00 · 2013-07-10 11:17:19 -07:00 · decead7336
commit decead7336
parent c13e0bcb52
10 changed files with 220 additions and 570 deletions
--- a/test/convolve_test.cc
+++ b/test/convolve_test.cc
@ -22,8 +22,8 @@ extern "C" {
 }
 namespace {
-typedef void (*convolve_fn_t)(const uint8_t *src, int src_stride,
+typedef void (*convolve_fn_t)(const uint8_t *src, ptrdiff_t src_stride,
-                              uint8_t *dst, int dst_stride,
+                              uint8_t *dst, ptrdiff_t dst_stride,
                              const int16_t *filter_x, int filter_x_stride,
                              const int16_t *filter_y, int filter_y_stride,
                              int w, int h);
--- a/vp9/common/vp9_convolve.c
+++ b/vp9/common/vp9_convolve.c
@ -38,8 +38,8 @@
 */
 #define ALIGN_FILTERS_256 1
-static void convolve_horiz_c(const uint8_t *src, int src_stride,
+static void convolve_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
-                             uint8_t *dst, int dst_stride,
+                             uint8_t *dst, ptrdiff_t dst_stride,
                             const int16_t *filter_x0, int x_step_q4,
                             const int16_t *filter_y, int y_step_q4,
                             int w, int h, int taps) {
@ -80,8 +80,8 @@ static void convolve_horiz_c(const uint8_t *src, int src_stride,
  }
 }
-static void convolve_avg_horiz_c(const uint8_t *src, int src_stride,
+static void convolve_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
-                                 uint8_t *dst, int dst_stride,
+                                 uint8_t *dst, ptrdiff_t dst_stride,
                                 const int16_t *filter_x0, int x_step_q4,
                                 const int16_t *filter_y, int y_step_q4,
                                 int w, int h, int taps) {
@ -122,8 +122,8 @@ static void convolve_avg_horiz_c(const uint8_t *src, int src_stride,
  }
 }
-static void convolve_vert_c(const uint8_t *src, int src_stride,
+static void convolve_vert_c(const uint8_t *src, ptrdiff_t src_stride,
-                            uint8_t *dst, int dst_stride,
+                            uint8_t *dst, ptrdiff_t dst_stride,
                            const int16_t *filter_x, int x_step_q4,
                            const int16_t *filter_y0, int y_step_q4,
                            int w, int h, int taps) {
@ -164,8 +164,8 @@ static void convolve_vert_c(const uint8_t *src, int src_stride,
  }
 }
-static void convolve_avg_vert_c(const uint8_t *src, int src_stride,
+static void convolve_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
-                                uint8_t *dst, int dst_stride,
+                                uint8_t *dst, ptrdiff_t dst_stride,
                                const int16_t *filter_x, int x_step_q4,
                                const int16_t *filter_y0, int y_step_q4,
                                int w, int h, int taps) {
@ -207,8 +207,8 @@ static void convolve_avg_vert_c(const uint8_t *src, int src_stride,
  }
 }
-static void convolve_c(const uint8_t *src, int src_stride,
+static void convolve_c(const uint8_t *src, ptrdiff_t src_stride,
-                       uint8_t *dst, int dst_stride,
+                       uint8_t *dst, ptrdiff_t dst_stride,
                       const int16_t *filter_x, int x_step_q4,
                       const int16_t *filter_y, int y_step_q4,
                       int w, int h, int taps) {
@ -237,8 +237,8 @@ static void convolve_c(const uint8_t *src, int src_stride,
                  w, h, taps);
 }
-static void convolve_avg_c(const uint8_t *src, int src_stride,
+static void convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride,
-                           uint8_t *dst, int dst_stride,
+                           uint8_t *dst, ptrdiff_t dst_stride,
                           const int16_t *filter_x, int x_step_q4,
                           const int16_t *filter_y, int y_step_q4,
                           int w, int h, int taps) {
@ -267,8 +267,8 @@ static void convolve_avg_c(const uint8_t *src, int src_stride,
                      w, h, taps);
 }
-void vp9_convolve8_horiz_c(const uint8_t *src, int src_stride,
+void vp9_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
-                           uint8_t *dst, int dst_stride,
+                           uint8_t *dst, ptrdiff_t dst_stride,
                           const int16_t *filter_x, int x_step_q4,
                           const int16_t *filter_y, int y_step_q4,
                           int w, int h) {
@ -277,8 +277,8 @@ void vp9_convolve8_horiz_c(const uint8_t *src, int src_stride,
                   w, h, 8);
 }
-void vp9_convolve8_avg_horiz_c(const uint8_t *src, int src_stride,
+void vp9_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
-                               uint8_t *dst, int dst_stride,
+                               uint8_t *dst, ptrdiff_t dst_stride,
                               const int16_t *filter_x, int x_step_q4,
                               const int16_t *filter_y, int y_step_q4,
                               int w, int h) {
@ -287,8 +287,8 @@ void vp9_convolve8_avg_horiz_c(const uint8_t *src, int src_stride,
                       w, h, 8);
 }
-void vp9_convolve8_vert_c(const uint8_t *src, int src_stride,
+void vp9_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride,
-                          uint8_t *dst, int dst_stride,
+                          uint8_t *dst, ptrdiff_t dst_stride,
                          const int16_t *filter_x, int x_step_q4,
                          const int16_t *filter_y, int y_step_q4,
                          int w, int h) {
@ -297,8 +297,8 @@ void vp9_convolve8_vert_c(const uint8_t *src, int src_stride,
                  w, h, 8);
 }
-void vp9_convolve8_avg_vert_c(const uint8_t *src, int src_stride,
+void vp9_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
-                              uint8_t *dst, int dst_stride,
+                              uint8_t *dst, ptrdiff_t dst_stride,
                              const int16_t *filter_x, int x_step_q4,
                              const int16_t *filter_y, int y_step_q4,
                              int w, int h) {
@ -307,8 +307,8 @@ void vp9_convolve8_avg_vert_c(const uint8_t *src, int src_stride,
                      w, h, 8);
 }
-void vp9_convolve8_c(const uint8_t *src, int src_stride,
+void vp9_convolve8_c(const uint8_t *src, ptrdiff_t src_stride,
-                     uint8_t *dst, int dst_stride,
+                     uint8_t *dst, ptrdiff_t dst_stride,
                     const int16_t *filter_x, int x_step_q4,
                     const int16_t *filter_y, int y_step_q4,
                     int w, int h) {
@ -317,8 +317,8 @@ void vp9_convolve8_c(const uint8_t *src, int src_stride,
             w, h, 8);
 }
-void vp9_convolve8_avg_c(const uint8_t *src, int src_stride,
+void vp9_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride,
-                         uint8_t *dst, int dst_stride,
+                         uint8_t *dst, ptrdiff_t dst_stride,
                         const int16_t *filter_x, int x_step_q4,
                         const int16_t *filter_y, int y_step_q4,
                         int w, int h) {
@ -339,33 +339,25 @@ void vp9_convolve8_avg_c(const uint8_t *src, int src_stride,
                   w, h);
 }
-void vp9_convolve_copy(const uint8_t *src, int src_stride,
+void vp9_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride,
-                       uint8_t *dst, int dst_stride,
+                         uint8_t *dst, ptrdiff_t dst_stride,
-                       const int16_t *filter_x, int filter_x_stride,
+                         const int16_t *filter_x, int filter_x_stride,
-                       const int16_t *filter_y, int filter_y_stride,
+                         const int16_t *filter_y, int filter_y_stride,
-                       int w, int h) {
+                         int w, int h) {
-  if (w == 16 && h == 16) {
+  int r;
    vp9_copy_mem16x16(src, src_stride, dst, dst_stride);
  } else if (w == 8 && h == 8) {
    vp9_copy_mem8x8(src, src_stride, dst, dst_stride);
  } else if (w == 8 && h == 4) {
    vp9_copy_mem8x4(src, src_stride, dst, dst_stride);
  } else {
    int r;
-    for (r = h; r > 0; --r) {
+  for (r = h; r > 0; --r) {
-      memcpy(dst, src, w);
+    memcpy(dst, src, w);
-      src += src_stride;
+    src += src_stride;
-      dst += dst_stride;
+    dst += dst_stride;
    }
  }
 }
-void vp9_convolve_avg(const uint8_t *src, int src_stride,
+void vp9_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride,
-                      uint8_t *dst, int dst_stride,
+                        uint8_t *dst, ptrdiff_t dst_stride,
-                      const int16_t *filter_x, int filter_x_stride,
+                        const int16_t *filter_x, int filter_x_stride,
-                      const int16_t *filter_y, int filter_y_stride,
+                        const int16_t *filter_y, int filter_y_stride,
-                      int w, int h) {
+                        int w, int h) {
  int x, y;
  for (y = 0; y < h; ++y) {
--- a/vp9/common/vp9_convolve.h
+++ b/vp9/common/vp9_convolve.h
@ -13,26 +13,12 @@
 #include "./vpx_config.h"
 #include "vpx/vpx_integer.h"
-typedef void (*convolve_fn_t)(const uint8_t *src, int src_stride,
+typedef void (*convolve_fn_t)(const uint8_t *src, ptrdiff_t src_stride,
-                              uint8_t *dst, int dst_stride,
+                              uint8_t *dst, ptrdiff_t dst_stride,
                              const int16_t *filter_x, int x_step_q4,
                              const int16_t *filter_y, int y_step_q4,
                              int w, int h);
 // Not a convolution, a block copy conforming to the convolution prototype
 void vp9_convolve_copy(const uint8_t *src, int src_stride,
                       uint8_t *dst, int dst_stride,
                       const int16_t *filter_x, int x_step_q4,
                       const int16_t *filter_y, int y_step_q4,
                       int w, int h);
 // Not a convolution, a block average conforming to the convolution prototype
 void vp9_convolve_avg(const uint8_t *src, int src_stride,
                      uint8_t *dst, int dst_stride,
                      const int16_t *filter_x, int x_step_q4,
                      const int16_t *filter_y, int y_step_q4,
                      int w, int h);
 struct subpix_fn_table {
  const int16_t (*filter_x)[8];
  const int16_t (*filter_y)[8];
--- a/vp9/common/vp9_reconinter.c
+++ b/vp9/common/vp9_reconinter.c
@ -194,93 +194,6 @@ void vp9_setup_interp_filters(MACROBLOCKD *xd,
  assert(((intptr_t)xd->subpix.filter_x & 0xff) == 0);
 }
 /* Copies a 16x16 block of bytes from src to dst.
  *
  * src         top-left byte of the source block
  * src_stride  distance in bytes between the starts of consecutive src rows
  * dst         top-left byte of the destination block
  * dst_stride  distance in bytes between the starts of consecutive dst rows
  *
  * Every byte is copied individually through the uint8_t pointers, so the
  * routine makes no alignment assumption about src/dst and never accesses
  * the buffers through a wider (uint32_t) lvalue.
  */
 void vp9_copy_mem16x16_c(const uint8_t *src,
                         int src_stride,
                         uint8_t *dst,
                         int dst_stride) {
  int r, c;

  for (r = 0; r < 16; r++) {
    /* Constant trip count; bytes are written in ascending order. */
    for (c = 0; c < 16; c++) {
      dst[c] = src[c];
    }
    src += src_stride;
    dst += dst_stride;
  }
 }
 /* Copies an 8x8 block of bytes from src to dst.
  *
  * src         top-left byte of the source block
  * src_stride  distance in bytes between the starts of consecutive src rows
  * dst         top-left byte of the destination block
  * dst_stride  distance in bytes between the starts of consecutive dst rows
  *
  * Bytes are copied one at a time through the uint8_t pointers: no
  * alignment requirement and no access through a uint32_t lvalue.
  */
 void vp9_copy_mem8x8_c(const uint8_t *src,
                       int src_stride,
                       uint8_t *dst,
                       int dst_stride) {
  int r, c;

  for (r = 0; r < 8; r++) {
    /* Constant trip count; bytes are written in ascending order. */
    for (c = 0; c < 8; c++) {
      dst[c] = src[c];
    }
    src += src_stride;
    dst += dst_stride;
  }
 }
 /* Copies a block 8 bytes wide and 4 rows high from src to dst.
  *
  * src         top-left byte of the source block
  * src_stride  distance in bytes between the starts of consecutive src rows
  * dst         top-left byte of the destination block
  * dst_stride  distance in bytes between the starts of consecutive dst rows
  *
  * Bytes are copied one at a time through the uint8_t pointers: no
  * alignment requirement and no access through a uint32_t lvalue.
  */
 void vp9_copy_mem8x4_c(const uint8_t *src,
                       int src_stride,
                       uint8_t *dst,
                       int dst_stride) {
  int r, c;

  for (r = 0; r < 4; r++) {
    /* Constant trip count; bytes are written in ascending order. */
    for (c = 0; c < 8; c++) {
      dst[c] = src[c];
    }
    src += src_stride;
    dst += dst_stride;
  }
 }
 void vp9_build_inter_predictor(const uint8_t *src, int src_stride,
                               uint8_t *dst, int dst_stride,
                               const int_mv *src_mv,
--- a/vp9/common/vp9_rtcd_defs.sh
+++ b/vp9/common/vp9_rtcd_defs.sh
@ -43,17 +43,6 @@ specialize vp9_idct_add_32x32
 #
 # RECON
 #
 prototype void vp9_copy_mem16x16 "const uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch"
 specialize vp9_copy_mem16x16 mmx sse2 dspr2
 vp9_copy_mem16x16_dspr2=vp9_copy_mem16x16_dspr2
 prototype void vp9_copy_mem8x8 "const uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch"
 specialize vp9_copy_mem8x8 mmx dspr2
 vp9_copy_mem8x8_dspr2=vp9_copy_mem8x8_dspr2
 prototype void vp9_copy_mem8x4 "const uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch"
 specialize vp9_copy_mem8x4 mmx
 prototype void vp9_d27_predictor_4x4 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col"
 specialize vp9_d27_predictor_4x4
@ -275,22 +264,28 @@ specialize vp9_blend_b
 #
 # Sub Pixel Filters
 #
-prototype void vp9_convolve8 "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
+prototype void vp9_convolve_copy "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
 specialize vp9_convolve_copy sse2
 prototype void vp9_convolve_avg "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
 specialize vp9_convolve_avg sse2
 prototype void vp9_convolve8 "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
 specialize vp9_convolve8 ssse3
-prototype void vp9_convolve8_horiz "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
+prototype void vp9_convolve8_horiz "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
 specialize vp9_convolve8_horiz ssse3
-prototype void vp9_convolve8_vert "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
+prototype void vp9_convolve8_vert "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
 specialize vp9_convolve8_vert ssse3
-prototype void vp9_convolve8_avg "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
+prototype void vp9_convolve8_avg "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
 specialize vp9_convolve8_avg ssse3
-prototype void vp9_convolve8_avg_horiz "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
+prototype void vp9_convolve8_avg_horiz "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
 specialize vp9_convolve8_avg_horiz ssse3
-prototype void vp9_convolve8_avg_vert "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
+prototype void vp9_convolve8_avg_vert "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
 specialize vp9_convolve8_avg_vert ssse3
 #
--- a/vp9/common/x86/vp9_asm_stubs.c
+++ b/vp9/common/x86/vp9_asm_stubs.c
@ -121,8 +121,8 @@ void vp9_filter_block1d4_h8_avg_ssse3(const unsigned char *src_ptr,
                                     unsigned int output_height,
                                     const short *filter);
-void vp9_convolve8_horiz_ssse3(const uint8_t *src, int src_stride,
+void vp9_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
-                               uint8_t *dst, int dst_stride,
+                               uint8_t *dst, ptrdiff_t dst_stride,
                               const int16_t *filter_x, int x_step_q4,
                               const int16_t *filter_y, int y_step_q4,
                               int w, int h) {
@ -159,8 +159,8 @@ void vp9_convolve8_horiz_ssse3(const uint8_t *src, int src_stride,
  }
 }
-void vp9_convolve8_vert_ssse3(const uint8_t *src, int src_stride,
+void vp9_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride,
-                              uint8_t *dst, int dst_stride,
+                              uint8_t *dst, ptrdiff_t dst_stride,
                              const int16_t *filter_x, int x_step_q4,
                              const int16_t *filter_y, int y_step_q4,
                              int w, int h) {
@ -197,8 +197,8 @@ void vp9_convolve8_vert_ssse3(const uint8_t *src, int src_stride,
  }
 }
-void vp9_convolve8_avg_horiz_ssse3(const uint8_t *src, int src_stride,
+void vp9_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
-                               uint8_t *dst, int dst_stride,
+                               uint8_t *dst, ptrdiff_t dst_stride,
                               const int16_t *filter_x, int x_step_q4,
                               const int16_t *filter_y, int y_step_q4,
                               int w, int h) {
@ -235,8 +235,8 @@ void vp9_convolve8_avg_horiz_ssse3(const uint8_t *src, int src_stride,
  }
 }
-void vp9_convolve8_avg_vert_ssse3(const uint8_t *src, int src_stride,
+void vp9_convolve8_avg_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride,
-                              uint8_t *dst, int dst_stride,
+                              uint8_t *dst, ptrdiff_t dst_stride,
                              const int16_t *filter_x, int x_step_q4,
                              const int16_t *filter_y, int y_step_q4,
                              int w, int h) {
@ -273,8 +273,8 @@ void vp9_convolve8_avg_vert_ssse3(const uint8_t *src, int src_stride,
  }
 }
-void vp9_convolve8_ssse3(const uint8_t *src, int src_stride,
+void vp9_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride,
-                         uint8_t *dst, int dst_stride,
+                         uint8_t *dst, ptrdiff_t dst_stride,
                         const int16_t *filter_x, int x_step_q4,
                         const int16_t *filter_y, int y_step_q4,
                         int w, int h) {
@ -294,8 +294,8 @@ void vp9_convolve8_ssse3(const uint8_t *src, int src_stride,
  }
 }
-void vp9_convolve8_avg_ssse3(const uint8_t *src, int src_stride,
+void vp9_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride,
-                         uint8_t *dst, int dst_stride,
+                         uint8_t *dst, ptrdiff_t dst_stride,
                         const int16_t *filter_x, int x_step_q4,
                         const int16_t *filter_y, int y_step_q4,
                         int w, int h) {
--- a/vp9/common/x86/vp9_copy_sse2.asm
+++ b/vp9/common/x86/vp9_copy_sse2.asm
@ -0,0 +1,152 @@
 ;
 ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 ;
 ;  Use of this source code is governed by a BSD-style license
 ;  that can be found in the LICENSE file in the root of the source
 ;  tree. An additional intellectual property rights grant can be found
 ;  in the file PATENTS.  All contributing project authors may
 ;  be found in the AUTHORS file in the root of the source tree.
 ;
 %include "third_party/x86inc/x86inc.asm"
 SECTION .text
 ; convolve_fn <op>
 ;
 ; Emits one block copy/average routine per instantiation; it is expanded
 ; twice below, with op = copy and op = avg.  The named arguments follow the
 ; convolve prototype (src, src_stride, dst, dst_stride, filter_x, x_step_q4,
 ; filter_y, y_step_q4, w, h); the four filter arguments (fx, fxs, fy, fys)
 ; are named but never read.
 ;
 ; The routine dispatches on w: 4, 8, 16 and 32 have dedicated loops, and any
 ; other width falls through to the 64-byte-per-row loop.  r4d first holds w
 ; and is then reused as the remaining-row counter loaded from h.
 ;
 ; With op = avg, each source vector is combined with the bytes already in
 ; dst using pavgb (rounded unsigned byte average) before being stored.
 ;
 ; NOTE(review): the "4, 7, 4" after the name is presumably x86inc's
 ; (args loaded into registers, GPRs used, vector registers used) - confirm.
 ; NOTE(review): stores use mova and the avg loads use [dstq] memory operands,
 ; so dst rows presumably must be aligned to the vector width - confirm.
 %macro convolve_fn 1
 INIT_XMM sse2
 cglobal convolve_%1, 4, 7, 4, src, src_stride, dst, dst_stride, \
                              fx, fxs, fy, fys, w, h
  ; Select the loop for this block width.
  mov r4d, dword wm
  cmp r4d, 4
  je .w4
  cmp r4d, 8
  je .w8
  cmp r4d, 16
  je .w16
  cmp r4d, 32
  je .w32
  ; Fallthrough: 64 bytes per row, one row per iteration.
  mov                    r4d, dword hm
 .loop64:
  movu                    m0, [srcq]
  movu                    m1, [srcq+16]
  movu                    m2, [srcq+32]
  movu                    m3, [srcq+48]
  add                   srcq, src_strideq
 %ifidn %1, avg
  pavgb                   m0, [dstq]
  pavgb                   m1, [dstq+16]
  pavgb                   m2, [dstq+32]
  pavgb                   m3, [dstq+48]
 %endif
  mova             [dstq   ], m0
  mova             [dstq+16], m1
  mova             [dstq+32], m2
  mova             [dstq+48], m3
  add                   dstq, dst_strideq
  dec                    r4d
  jnz .loop64
  RET
 ; 32 bytes per row, two rows per iteration.  The counter drops by 2 and the
 ; loop exits only on exactly zero, so h must be a multiple of 2.
 .w32:
  mov                    r4d, dword hm
 .loop32:
  movu                    m0, [srcq]
  movu                    m1, [srcq+16]
  movu                    m2, [srcq+src_strideq]
  movu                    m3, [srcq+src_strideq+16]
  lea                   srcq, [srcq+src_strideq*2]
 %ifidn %1, avg
  pavgb                   m0, [dstq]
  pavgb                   m1, [dstq            +16]
  pavgb                   m2, [dstq+dst_strideq]
  pavgb                   m3, [dstq+dst_strideq+16]
 %endif
  mova [dstq               ], m0
  mova [dstq            +16], m1
  mova [dstq+dst_strideq   ], m2
  mova [dstq+dst_strideq+16], m3
  lea                   dstq, [dstq+dst_strideq*2]
  sub                    r4d, 2
  jnz .loop32
  RET
 ; 16 bytes per row, four rows per iteration (h must be a multiple of 4).
 ; r5q / r6q cache 3*src_stride / 3*dst_stride to address the fourth row.
 .w16:
  mov                    r4d, dword hm
  lea                    r5q, [src_strideq*3]
  lea                    r6q, [dst_strideq*3]
 .loop16:
  movu                    m0, [srcq]
  movu                    m1, [srcq+src_strideq]
  movu                    m2, [srcq+src_strideq*2]
  movu                    m3, [srcq+r5q]
  lea                   srcq, [srcq+src_strideq*4]
 %ifidn %1, avg
  pavgb                   m0, [dstq]
  pavgb                   m1, [dstq+dst_strideq]
  pavgb                   m2, [dstq+dst_strideq*2]
  pavgb                   m3, [dstq+r6q]
 %endif
  mova  [dstq              ], m0
  mova  [dstq+dst_strideq  ], m1
  mova  [dstq+dst_strideq*2], m2
  mova  [dstq+r6q          ], m3
  lea                   dstq, [dstq+dst_strideq*4]
  sub                    r4d, 4
  jnz .loop16
  RET
 ; NOTE(review): from here on m0-m3 presumably name 64-bit MMX registers
 ; (INIT_MMX), and no emms is issued before RET on these paths - presumably
 ; the caller is responsible for clearing MMX state; confirm.
 INIT_MMX sse
 ; 8 bytes per row, four rows per iteration (h must be a multiple of 4).
 .w8:
  mov                    r4d, dword hm
  lea                    r5q, [src_strideq*3]
  lea                    r6q, [dst_strideq*3]
 .loop8:
  movu                    m0, [srcq]
  movu                    m1, [srcq+src_strideq]
  movu                    m2, [srcq+src_strideq*2]
  movu                    m3, [srcq+r5q]
  lea                   srcq, [srcq+src_strideq*4]
 %ifidn %1, avg
  pavgb                   m0, [dstq]
  pavgb                   m1, [dstq+dst_strideq]
  pavgb                   m2, [dstq+dst_strideq*2]
  pavgb                   m3, [dstq+r6q]
 %endif
  mova  [dstq              ], m0
  mova  [dstq+dst_strideq  ], m1
  mova  [dstq+dst_strideq*2], m2
  mova  [dstq+r6q          ], m3
  lea                   dstq, [dstq+dst_strideq*4]
  sub                    r4d, 4
  jnz .loop8
  RET
 ; 4 bytes per row (movh loads/stores), four rows per iteration
 ; (h must be a multiple of 4).
 ; NOTE(review): in the avg variant pavgb takes [dstq...] as a full-register
 ; memory operand, which presumably reads 8 bytes from dst although only 4
 ; are stored back - confirm this over-read is acceptable for all callers.
 .w4:
  mov                    r4d, dword hm
  lea                    r5q, [src_strideq*3]
  lea                    r6q, [dst_strideq*3]
 .loop4:
  movh                    m0, [srcq]
  movh                    m1, [srcq+src_strideq]
  movh                    m2, [srcq+src_strideq*2]
  movh                    m3, [srcq+r5q]
  lea                   srcq, [srcq+src_strideq*4]
 %ifidn %1, avg
  pavgb                   m0, [dstq]
  pavgb                   m1, [dstq+dst_strideq]
  pavgb                   m2, [dstq+dst_strideq*2]
  pavgb                   m3, [dstq+r6q]
 %endif
  movh  [dstq              ], m0
  movh  [dstq+dst_strideq  ], m1
  movh  [dstq+dst_strideq*2], m2
  movh  [dstq+r6q          ], m3
  lea                   dstq, [dstq+dst_strideq*4]
  sub                    r4d, 4
  jnz .loop4
  RET
 %endmacro
 ; Instantiate the plain-copy and the averaging variants.
 convolve_fn copy
 convolve_fn avg
--- a/vp9/common/x86/vp9_recon_mmx.asm
+++ b/vp9/common/x86/vp9_recon_mmx.asm
@ -1,272 +0,0 @@
 ;
 ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 ;
 ;  Use of this source code is governed by a BSD-style license
 ;  that can be found in the LICENSE file in the root of the source
 ;  tree. An additional intellectual property rights grant can be found
 ;  in the file PATENTS.  All contributing project authors may
 ;  be found in the AUTHORS file in the root of the source tree.
 ;
 %include "vpx_ports/x86_abi_support.asm"
 ; Copies an 8x8 block of bytes from src to dst, one 8-byte MMX move per row.
 ; Both strides are read as 32-bit values and sign-extended (movsxd).
 ; NOTE(review): no emms is issued before ret - presumably the caller is
 ; responsible for clearing MMX state; confirm.
 ;void vp9_copy_mem8x8_mmx(
 ;    unsigned char *src,
 ;    int src_stride,
 ;    unsigned char *dst,
 ;    int dst_stride
 ;    )
 global sym(vp9_copy_mem8x8_mmx) PRIVATE
 sym(vp9_copy_mem8x8_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 4
    push        rsi
    push        rdi
    ; end prolog
        ; Load rows 0-2; rsi ends up pointing at row 3.
        mov         rsi,        arg(0) ;src;
        movq        mm0,        [rsi]
        movsxd      rax,        dword ptr arg(1) ;src_stride;
        mov         rdi,        arg(2) ;dst;
        movq        mm1,        [rsi+rax]
        movq        mm2,        [rsi+rax*2]
        movsxd      rcx,        dword ptr arg(3) ;dst_stride
        lea         rsi,        [rsi+rax*2]
        movq        [rdi],      mm0
        add         rsi,        rax
        ; Store rows 1-2, then load rows 3-5; rdi ends up pointing at row 3.
        movq        [rdi+rcx],      mm1
        movq        [rdi+rcx*2],    mm2
        lea         rdi,        [rdi+rcx*2]
        movq        mm3,        [rsi]
        add         rdi,        rcx
        movq        mm4,        [rsi+rax]
        movq        mm5,        [rsi+rax*2]
        ; Store rows 3-5; rsi and rdi both advance to row 5.
        movq        [rdi],      mm3
        lea         rsi,        [rsi+rax*2]
        movq        [rdi+rcx],  mm4
        movq        [rdi+rcx*2],    mm5
        lea         rdi,        [rdi+rcx*2]
        ; Rows 6 and 7, addressed relative to row 5.
        movq        mm0,        [rsi+rax]
        movq        mm1,        [rsi+rax*2]
        movq        [rdi+rcx],  mm0
        movq        [rdi+rcx*2],mm1
    ; begin epilog
    pop rdi
    pop rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
 ; Copies a block 8 bytes wide and 4 rows high from src to dst, one 8-byte
 ; MMX move per row.  Both strides are read as 32-bit values and
 ; sign-extended (movsxd).
 ; NOTE(review): no emms is issued before ret - presumably the caller is
 ; responsible for clearing MMX state; confirm.
 ;void vp9_copy_mem8x4_mmx(
 ;    unsigned char *src,
 ;    int src_stride,
 ;    unsigned char *dst,
 ;    int dst_stride
 ;    )
 global sym(vp9_copy_mem8x4_mmx) PRIVATE
 sym(vp9_copy_mem8x4_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 4
    push        rsi
    push        rdi
    ; end prolog
        ; Load rows 0-2; rsi advances to row 2.
        mov         rsi,        arg(0) ;src;
        movq        mm0,        [rsi]
        movsxd      rax,        dword ptr arg(1) ;src_stride;
        mov         rdi,        arg(2) ;dst;
        movq        mm1,        [rsi+rax]
        movq        mm2,        [rsi+rax*2]
        movsxd      rcx,        dword ptr arg(3) ;dst_stride
        lea         rsi,        [rsi+rax*2]
        ; Store rows 0-2; rdi advances to row 2.
        movq        [rdi],      mm0
        movq        [rdi+rcx],      mm1
        movq        [rdi+rcx*2],    mm2
        lea         rdi,        [rdi+rcx*2]
        ; Row 3, addressed relative to row 2.
        movq        mm3,        [rsi+rax]
        movq        [rdi+rcx],      mm3
    ; begin epilog
    pop rdi
    pop rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
 ; Copies a 16x16 block of bytes from src to dst.  Each row is moved as two
 ; 8-byte MMX halves (offsets 0 and 8).  The body is fully unrolled: five
 ; groups of three rows (rows 0-14) followed by the final row 15.  Both
 ; strides are read as 32-bit values and sign-extended (movsxd).
 ; NOTE(review): no emms is issued before ret - presumably the caller is
 ; responsible for clearing MMX state; confirm.
 ;void vp9_copy_mem16x16_mmx(
 ;    unsigned char *src,
 ;    int src_stride,
 ;    unsigned char *dst,
 ;    int dst_stride
 ;    )
 global sym(vp9_copy_mem16x16_mmx) PRIVATE
 sym(vp9_copy_mem16x16_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 4
    push        rsi
    push        rdi
    ; end prolog
        mov         rsi,        arg(0) ;src;
        movsxd      rax,        dword ptr arg(1) ;src_stride;
        mov         rdi,        arg(2) ;dst;
        movsxd      rcx,        dword ptr arg(3) ;dst_stride
        ; Rows 0-2: load six halves, advance rsi by 3 rows, store, advance
        ; rdi by 3 rows.  The next four groups repeat this pattern.
        movq        mm0,            [rsi]
        movq        mm3,            [rsi+8];
        movq        mm1,            [rsi+rax]
        movq        mm4,            [rsi+rax+8]
        movq        mm2,            [rsi+rax*2]
        movq        mm5,            [rsi+rax*2+8]
        lea         rsi,            [rsi+rax*2]
        add         rsi,            rax
        movq        [rdi],          mm0
        movq        [rdi+8],        mm3
        movq        [rdi+rcx],      mm1
        movq        [rdi+rcx+8],    mm4
        movq        [rdi+rcx*2],    mm2
        movq        [rdi+rcx*2+8],  mm5
        lea         rdi,            [rdi+rcx*2]
        add         rdi,            rcx
        ; Rows 3-5.
        movq        mm0,            [rsi]
        movq        mm3,            [rsi+8];
        movq        mm1,            [rsi+rax]
        movq        mm4,            [rsi+rax+8]
        movq        mm2,            [rsi+rax*2]
        movq        mm5,            [rsi+rax*2+8]
        lea         rsi,            [rsi+rax*2]
        add         rsi,            rax
        movq        [rdi],          mm0
        movq        [rdi+8],        mm3
        movq        [rdi+rcx],      mm1
        movq        [rdi+rcx+8],    mm4
        movq        [rdi+rcx*2],    mm2
        movq        [rdi+rcx*2+8],  mm5
        lea         rdi,            [rdi+rcx*2]
        add         rdi,            rcx
        ; Rows 6-8.
        movq        mm0,            [rsi]
        movq        mm3,            [rsi+8];
        movq        mm1,            [rsi+rax]
        movq        mm4,            [rsi+rax+8]
        movq        mm2,            [rsi+rax*2]
        movq        mm5,            [rsi+rax*2+8]
        lea         rsi,            [rsi+rax*2]
        add         rsi,            rax
        movq        [rdi],          mm0
        movq        [rdi+8],        mm3
        movq        [rdi+rcx],      mm1
        movq        [rdi+rcx+8],    mm4
        movq        [rdi+rcx*2],    mm2
        movq        [rdi+rcx*2+8],  mm5
        lea         rdi,            [rdi+rcx*2]
        add         rdi,            rcx
        ; Rows 9-11.
        movq        mm0,            [rsi]
        movq        mm3,            [rsi+8];
        movq        mm1,            [rsi+rax]
        movq        mm4,            [rsi+rax+8]
        movq        mm2,            [rsi+rax*2]
        movq        mm5,            [rsi+rax*2+8]
        lea         rsi,            [rsi+rax*2]
        add         rsi,            rax
        movq        [rdi],          mm0
        movq        [rdi+8],        mm3
        movq        [rdi+rcx],      mm1
        movq        [rdi+rcx+8],    mm4
        movq        [rdi+rcx*2],    mm2
        movq        [rdi+rcx*2+8],  mm5
        lea         rdi,            [rdi+rcx*2]
        add         rdi,            rcx
        ; Rows 12-14.
        movq        mm0,            [rsi]
        movq        mm3,            [rsi+8];
        movq        mm1,            [rsi+rax]
        movq        mm4,            [rsi+rax+8]
        movq        mm2,            [rsi+rax*2]
        movq        mm5,            [rsi+rax*2+8]
        lea         rsi,            [rsi+rax*2]
        add         rsi,            rax
        movq        [rdi],          mm0
        movq        [rdi+8],        mm3
        movq        [rdi+rcx],      mm1
        movq        [rdi+rcx+8],    mm4
        movq        [rdi+rcx*2],    mm2
        movq        [rdi+rcx*2+8],  mm5
        lea         rdi,            [rdi+rcx*2]
        add         rdi,            rcx
        ; Row 15 (last row).
        movq        mm0,            [rsi]
        movq        mm3,            [rsi+8];
        movq        [rdi],          mm0
        movq        [rdi+8],        mm3
    ; begin epilog
    pop rdi
    pop rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
--- a/vp9/common/x86/vp9_recon_sse2.asm
+++ b/vp9/common/x86/vp9_recon_sse2.asm
@ -1,115 +0,0 @@
 ;
 ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 ;
 ;  Use of this source code is governed by a BSD-style license
 ;  that can be found in the LICENSE file in the root of the source
 ;  tree. An additional intellectual property rights grant can be found
 ;  in the file PATENTS.  All contributing project authors may
 ;  be found in the AUTHORS file in the root of the source tree.
 ;
 %include "vpx_ports/x86_abi_support.asm"
 ;void vp9_copy_mem16x16_sse2(
 ;    unsigned char *src,
 ;    int src_stride,
 ;    unsigned char *dst,
 ;    int dst_stride
 ;    )
 ;
 ; Copies a 16x16 block of bytes: 16 rows of 16 bytes each, read from src
 ; with a row pitch of src_stride and written to dst with a row pitch of
 ; dst_stride. The routine is fully unrolled (no loop, no branches).
 ;
 ; Source rows are read with movdqu, so src may have any alignment.
 ; Destination rows are written with movdqa, which faults on an address that
 ; is not 16-byte aligned, so dst and dst_stride must keep every destination
 ; row 16-byte aligned.
 ;
 ; Register use: rsi = src cursor, rax = src_stride, rdi = dst cursor,
 ; rcx = dst_stride, xmm0-xmm5 = row data.
 ;
 ; Rows are handled in groups of three because one base pointer can address
 ; [p], [p+stride] and [p+stride*2] directly; each cursor is then advanced by
 ; three rows with a "lea p, [p+stride*2]" followed by "add p, stride".
 ; Loads for the next group are interleaved with stores and pointer updates
 ; of the current one, so the row comments below mark where a group starts
 ; rather than a strict boundary.
 global sym(vp9_copy_mem16x16_sse2) PRIVATE
 sym(vp9_copy_mem16x16_sse2):
    push        rbp
    mov         rbp, rsp
    ; NOTE(review): SHADOW_ARGS_TO_STACK and arg() are presumably the ABI
    ; helpers from vpx_ports/x86_abi_support.asm, with 4 being the number of
    ; arguments of this function -- confirm against that file.
    SHADOW_ARGS_TO_STACK 4
    ; rsi and rdi are used as cursors below and are restored in the epilog.
    push        rsi
    push        rdi
    ; end prolog
        ; Rows 0-2: load from src. Both strides are 32-bit ints and are
        ; sign-extended to 64 bits (movsxd) before being used in addressing.
        mov         rsi,        arg(0) ;src;
        movdqu      xmm0,       [rsi]
        movsxd      rax,        dword ptr arg(1) ;src_stride;
        mov         rdi,        arg(2) ;dst;
        movdqu      xmm1,       [rsi+rax]
        movdqu      xmm2,       [rsi+rax*2]
        movsxd      rcx,        dword ptr arg(3) ;dst_stride
        ; Store rows 0-2 while advancing src by three rows (lea + add).
        lea         rsi,        [rsi+rax*2]
        movdqa      [rdi],      xmm0
        add         rsi,        rax
        movdqa      [rdi+rcx],  xmm1
        movdqa      [rdi+rcx*2],xmm2
        ; Rows 3-5: advance dst by three rows, load into xmm3-xmm5, store.
        lea         rdi,        [rdi+rcx*2]
        movdqu      xmm3,       [rsi]
        add         rdi,        rcx
        movdqu      xmm4,       [rsi+rax]
        movdqu      xmm5,       [rsi+rax*2]
        lea         rsi,        [rsi+rax*2]
        movdqa      [rdi],  xmm3
        add         rsi,        rax
        movdqa      [rdi+rcx],  xmm4
        movdqa      [rdi+rcx*2],xmm5
        ; Rows 6-8: same pattern, reusing xmm0-xmm2.
        lea         rdi,        [rdi+rcx*2]
        movdqu      xmm0,       [rsi]
        add         rdi,        rcx
        movdqu      xmm1,       [rsi+rax]
        movdqu      xmm2,       [rsi+rax*2]
        lea         rsi,        [rsi+rax*2]
        movdqa      [rdi],      xmm0
        add         rsi,        rax
        movdqa      [rdi+rcx],  xmm1
        movdqa      [rdi+rcx*2],    xmm2
        ; Rows 9-11: the first two loads are issued before dst is advanced;
        ; rsi already points at row 9 here.
        movdqu      xmm3,       [rsi]
        movdqu      xmm4,       [rsi+rax]
        lea         rdi,        [rdi+rcx*2]
        add         rdi,        rcx
        movdqu      xmm5,       [rsi+rax*2]
        lea         rsi,        [rsi+rax*2]
        movdqa      [rdi],  xmm3
        add         rsi,        rax
        movdqa      [rdi+rcx],  xmm4
        movdqa      [rdi+rcx*2],xmm5
        ; Rows 12-14: rsi points at row 12; dst is advanced to row 12 by the
        ; lea/add pair interleaved with the loads.
        movdqu      xmm0,       [rsi]
        lea         rdi,        [rdi+rcx*2]
        movdqu      xmm1,       [rsi+rax]
        add         rdi,        rcx
        movdqu      xmm2,       [rsi+rax*2]
        ; Only one row is left after this group, so src is advanced by two
        ; rows only (no trailing add): rsi now points at row 14.
        lea         rsi,        [rsi+rax*2]
        movdqa      [rdi],      xmm0
        movdqa      [rdi+rcx],  xmm1
        movdqa      [rdi+rcx*2],xmm2
        ; Row 15: one row past row 14 for both cursors, i.e. [rsi+rax] for
        ; the load and, after the two-row lea on rdi, [rdi+rcx] for the store.
        movdqu      xmm3,       [rsi+rax]
        lea         rdi,        [rdi+rcx*2]
        movdqa      [rdi+rcx],  xmm3
    ; begin epilog
    pop rdi
    pop rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
--- a/vp9/vp9_common.mk
+++ b/vp9/vp9_common.mk
@ -75,10 +75,9 @@ VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_asm_stubs.c
 VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_loopfilter_intrin_sse2.c
 VP9_COMMON_SRCS-$(CONFIG_POSTPROC) += common/vp9_postproc.h
 VP9_COMMON_SRCS-$(CONFIG_POSTPROC) += common/vp9_postproc.c
 VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_recon_mmx.asm
 VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_loopfilter_mmx.asm
 VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_copy_sse2.asm
 VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_loopfilter_sse2.asm
 VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_recon_sse2.asm
 VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_intrapred_sse2.asm
 VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_intrapred_ssse3.asm
 VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_subpixel_8t_ssse3.asm