Restore SSSE3 subpixel filters in new convolve framework

This commit adds the 8 tap SSSE3 subpixel filters back into the code underneath the convolve API. The C code is still called for 4x4 blocks, as well as compound prediction modes. This restores the encode performance to be within about 8% of the baseline. Change-Id: Ife0d81477075ae33c05b53c65003951efdc8b09c
2013-02-07 17:00:37 -08:00
parent 7a07eea13f
commit 29d47ac80e
5 changed files with 723 additions and 6 deletions
--- a/test/convolve_test.cc
+++ b/test/convolve_test.cc
@@ -12,6 +12,7 @@
 extern "C" {
 #include "./vpx_config.h"
 #include "./vp9_rtcd.h"
 #include "vpx_mem/vpx_mem.h"
 }
 #include "third_party/googletest/src/include/gtest/gtest.h"
 #include "test/acm_random.h"
@@ -166,7 +167,25 @@ static void filter_average_block2d_8_c(const uint8_t *src_ptr,
 }
 class ConvolveTest : public PARAMS(int, int, const ConvolveFunctions*) {
 public:
  static void SetUpTestCase() {
    // Force input_ to be unaligned, output to be 16 byte aligned.
    input_ = reinterpret_cast<uint8_t*>(
        vpx_memalign(kDataAlignment, kOuterBlockSize * kOuterBlockSize + 1))
        + 1;
    output_ = reinterpret_cast<uint8_t*>(
        vpx_memalign(kDataAlignment, kOuterBlockSize * kOuterBlockSize));
  }
  static void TearDownTestCase() {
    // input_ points one byte into its allocation (see SetUpTestCase), so step
    // back to the address the allocator returned before freeing it.
    uint8_t *const input_base = input_ - 1;
    vpx_free(input_base);
    vpx_free(output_);
    input_ = NULL;
    output_ = NULL;
  }
  protected:
    static const int kDataAlignment = 16;
    static const int kOuterBlockSize = 32;
    static const int kInputStride = kOuterBlockSize;
    static const int kOutputStride = kOuterBlockSize;
@@ -174,7 +193,10 @@ class ConvolveTest : public PARAMS(int, int, const ConvolveFunctions*) {
    int Width() const { return GET_PARAM(0); }
    int Height() const { return GET_PARAM(1); }
-    int BorderLeft() const { return (kOuterBlockSize - Width()) / 2; }
+    int BorderLeft() const {
      // Start from the offset that would centre the block horizontally, then
      // round it up to the next multiple of kDataAlignment. The add-and-mask
      // form relies on kDataAlignment being a power of two.
      const int center = (kOuterBlockSize - Width()) / 2;
      return (center + (kDataAlignment - 1)) & ~(kDataAlignment - 1);
    }
    int BorderTop() const { return (kOuterBlockSize - Height()) / 2; }
    bool IsIndexInBorder(int i) {
@@ -216,9 +238,11 @@ class ConvolveTest : public PARAMS(int, int, const ConvolveFunctions*) {
    }
    const ConvolveFunctions* UUT_;
-    uint8_t input_[kOuterBlockSize * kOuterBlockSize];
+    static uint8_t* input_;
-    uint8_t output_[kOuterBlockSize * kOuterBlockSize];
+    static uint8_t* output_;
 };
 uint8_t* ConvolveTest::input_ = NULL;
 uint8_t* ConvolveTest::output_ = NULL;
 TEST_P(ConvolveTest, GuardBlocks) {
  CheckGuardBlocks();
@@ -488,3 +512,16 @@ INSTANTIATE_TEST_CASE_P(C, ConvolveTest, ::testing::Values(
    make_tuple(8, 8, &convolve8_c),
    make_tuple(16, 16, &convolve8_c)));
 }
 #if HAVE_SSSE3
 // Function table for the SSSE3 run of ConvolveTest. Only the non-averaging
 // horiz / vert / 2-D entries are SSSE3 functions; the three averaging entries
 // are still the C implementations.
 const ConvolveFunctions convolve8_ssse3(
    vp9_convolve8_horiz_ssse3, vp9_convolve8_avg_horiz_c,
    vp9_convolve8_vert_ssse3, vp9_convolve8_avg_vert_c,
    vp9_convolve8_ssse3, vp9_convolve8_avg_c);
 // Each tuple is (width, height, function table); see Width()/Height().
 INSTANTIATE_TEST_CASE_P(SSSE3, ConvolveTest, ::testing::Values(
    make_tuple(4, 4, &convolve8_ssse3),
    make_tuple(8, 4, &convolve8_ssse3),
    make_tuple(8, 8, &convolve8_ssse3),
    make_tuple(16, 16, &convolve8_ssse3)));
 #endif
--- a/vp9/common/vp9_rtcd_defs.sh
+++ b/vp9/common/vp9_rtcd_defs.sh
@@ -245,13 +245,13 @@ specialize vp9_sub_pixel_variance16x2 sse2
 # Sub Pixel Filters
 #
 prototype void vp9_convolve8 "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
-specialize vp9_convolve8
+specialize vp9_convolve8 ssse3
 prototype void vp9_convolve8_horiz "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
-specialize vp9_convolve8_horiz
+specialize vp9_convolve8_horiz ssse3
 prototype void vp9_convolve8_vert "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
-specialize vp9_convolve8_vert
+specialize vp9_convolve8_vert ssse3
 prototype void vp9_convolve8_avg "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
 specialize vp9_convolve8_avg
--- a/vp9/common/x86/vp9_asm_stubs.c
+++ b/vp9/common/x86/vp9_asm_stubs.c
@@ -8,8 +8,10 @@
 *  be found in the AUTHORS file in the root of the source tree.
 */
 #include <assert.h>
 #include "./vpx_config.h"
 #include "./vp9_rtcd.h"
 #include "vpx_ports/mem.h"
 ///////////////////////////////////////////////////////////////////////////
 // the mmx function that does the bilinear filtering and var calculation //
@@ -33,3 +35,130 @@ DECLARE_ALIGNED(16, const short, vp9_bilinear_filters_mmx[16][8]) = {
  {  16, 16, 16, 16, 112, 112, 112, 112 },
  {   8,  8,  8,  8, 120, 120, 120, 120 }
 };
 #if HAVE_SSSE3
 // Prototypes of the 8 tap SSSE3 kernels implemented in
 // vp9_subpixel_8t_ssse3.asm. The name gives the number of pixels handled per
 // row (8 or 16) and the direction (h8 = horizontal, v8 = vertical). Every
 // kernel writes output_height rows and takes a pointer to the 8 filter taps.
 // NOTE(review): the assembly loads the taps with movdqa, so filter is
 // expected to be 16 byte aligned - confirm for any new caller.
 void vp9_filter_block1d16_v8_ssse3(const unsigned char *src_ptr,
                                   const unsigned int src_pitch,
                                   unsigned char *output_ptr,
                                   unsigned int out_pitch,
                                   unsigned int output_height,
                                   const short *filter);
 void vp9_filter_block1d16_h8_ssse3(const unsigned char *src_ptr,
                                   const unsigned int src_pitch,
                                   unsigned char *output_ptr,
                                   unsigned int out_pitch,
                                   unsigned int output_height,
                                   const short *filter);
 void vp9_filter_block1d8_v8_ssse3(const unsigned char *src_ptr,
                                   const unsigned int src_pitch,
                                   unsigned char *output_ptr,
                                   unsigned int out_pitch,
                                   unsigned int output_height,
                                   const short *filter);
 void vp9_filter_block1d8_h8_ssse3(const unsigned char *src_ptr,
                                   const unsigned int src_pitch,
                                   unsigned char *output_ptr,
                                   unsigned int out_pitch,
                                   unsigned int output_height,
                                   const short *filter);
 // Horizontal 8 tap convolution of a w x h block.
 //
 // The block is cut into 16 pixel wide columns, then 8 pixel wide columns,
 // each handed to the matching SSSE3 kernel; whatever width is left over (or
 // the whole block when the SSSE3 path does not apply) goes to
 // vp9_convolve8_horiz_c. filter_y and y_step_q4 are only forwarded to the C
 // fallback.
 void vp9_convolve8_horiz_ssse3(const uint8_t *src, int src_stride,
                               uint8_t *dst, int dst_stride,
                               const int16_t *filter_x, int x_step_q4,
                               const int16_t *filter_y, int y_step_q4,
                               int w, int h) {
  // The kernels have no step argument, so only x_step_q4 == 16 is accepted.
  // A centre tap of 128 is excluded as well - presumably because the assembly
  // narrows the taps to signed bytes (packsswb), which cannot hold 128.
  const int use_ssse3 = (x_step_q4 == 16) && (filter_x[3] != 128);

  if (use_ssse3) {
    // 16 pixel wide columns first ...
    for (; w >= 16; w -= 16, src += 16, dst += 16) {
      vp9_filter_block1d16_h8_ssse3(src, src_stride, dst, dst_stride,
                                    h, filter_x);
    }
    // ... then any 8 pixel wide column that remains.
    for (; w >= 8; w -= 8, src += 8, dst += 8) {
      vp9_filter_block1d8_h8_ssse3(src, src_stride, dst, dst_stride,
                                   h, filter_x);
    }
  }

  // Nothing left over: the SSSE3 kernels covered the full width.
  if (w == 0) return;

  vp9_convolve8_horiz_c(src, src_stride, dst, dst_stride,
                        filter_x, x_step_q4, filter_y, y_step_q4,
                        w, h);
 }
 // Vertical 8 tap convolution of a w x h block.
 //
 // The block is cut into 16 pixel wide columns, then 8 pixel wide columns,
 // each handed to the matching SSSE3 kernel; whatever width is left over (or
 // the whole block when the SSSE3 path does not apply) goes to
 // vp9_convolve8_vert_c. filter_x and x_step_q4 are only forwarded to the C
 // fallback.
 void vp9_convolve8_vert_ssse3(const uint8_t *src, int src_stride,
                              uint8_t *dst, int dst_stride,
                              const int16_t *filter_x, int x_step_q4,
                              const int16_t *filter_y, int y_step_q4,
                              int w, int h) {
  // The kernels have no step argument, so only y_step_q4 == 16 is accepted.
  // A centre tap of 128 is excluded as well - presumably because the assembly
  // narrows the taps to signed bytes (packsswb), which cannot hold 128.
  const int use_ssse3 = (y_step_q4 == 16) && (filter_y[3] != 128);

  if (use_ssse3) {
    // The vertical kernels take the first of the 8 contributing rows, hence
    // the pointer is moved up three rows for each call.
    for (; w >= 16; w -= 16, src += 16, dst += 16) {
      vp9_filter_block1d16_v8_ssse3(src - src_stride * 3, src_stride,
                                    dst, dst_stride, h, filter_y);
    }
    for (; w >= 8; w -= 8, src += 8, dst += 8) {
      vp9_filter_block1d8_v8_ssse3(src - src_stride * 3, src_stride,
                                   dst, dst_stride, h, filter_y);
    }
  }

  // Nothing left over: the SSSE3 kernels covered the full width.
  if (w == 0) return;

  vp9_convolve8_vert_c(src, src_stride, dst, dst_stride,
                       filter_x, x_step_q4, filter_y, y_step_q4,
                       w, h);
 }
 // 2-D (horizontal then vertical) 8 tap convolution of a w x h block.
 //
 // When both steps are 16, neither centre tap is 128, and the block is exactly
 // 8 or 16 pixels wide and at most 16 rows tall, the work is done by the SSSE3
 // kernels in two passes through a stack buffer:
 //   1. the horizontal kernel filters h + 7 rows, starting three rows above
 //      src, into fdata2 (16 bytes per row);
 //   2. the vertical kernel filters fdata2 into dst, producing h rows.
 // Every other case is forwarded unchanged to vp9_convolve8_c.
 void vp9_convolve8_ssse3(const uint8_t *src, int src_stride,
                         uint8_t *dst, int dst_stride,
                         const int16_t *filter_x, int x_step_q4,
                         const int16_t *filter_y, int y_step_q4,
                         int w, int h) {
  // 16 bytes per row, (16 + 7) rows: the tallest supported block plus the
  // seven extra rows the vertical 8 tap pass needs.
  DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 16*23);

  // check w/h due to fixed size fdata2 array
  assert(w <= 16);
  assert(h <= 16);

  // h <= 16 is tested here as well as asserted above: the asserts disappear
  // when NDEBUG is defined, and the first pass writes (h + 7) * 16 bytes into
  // fdata2, so a taller block must never reach the SSSE3 path.
  if (x_step_q4 == 16 && y_step_q4 == 16 && h <= 16 &&
      filter_x[3] != 128 && filter_y[3] != 128) {
    if (w == 16) {
      vp9_filter_block1d16_h8_ssse3(src - 3 * src_stride, src_stride,
                                    fdata2, 16,
                                    h + 7, filter_x);
      vp9_filter_block1d16_v8_ssse3(fdata2, 16,
                                    dst, dst_stride,
                                    h, filter_y);
      return;
    }
    if (w == 8) {
      // Same two passes with the 8 pixel wide kernels; the intermediate rows
      // keep the 16 byte pitch and only their first 8 bytes are used.
      vp9_filter_block1d8_h8_ssse3(src - 3 * src_stride, src_stride,
                                   fdata2, 16,
                                   h + 7, filter_x);
      vp9_filter_block1d8_v8_ssse3(fdata2, 16,
                                   dst, dst_stride,
                                   h, filter_y);
      return;
    }
  }
  vp9_convolve8_c(src, src_stride, dst, dst_stride,
                  filter_x, x_step_q4, filter_y, y_step_q4,
                  w, h);
 }
 #endif
--- a/vp9/common/x86/vp9_subpixel_8t_ssse3.asm
+++ b/vp9/common/x86/vp9_subpixel_8t_ssse3.asm
@@ -0,0 +1,550 @@
 ;
 ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 ;
 ;  Use of this source code is governed by a BSD-style license
 ;  that can be found in the LICENSE file in the root of the source
 ;  tree. An additional intellectual property rights grant can be found
 ;  in the file PATENTS.  All contributing project authors may
 ;  be found in the AUTHORS file in the root of the source tree.
 ;
 %include "vpx_ports/x86_abi_support.asm"
 ;/************************************************************************************
 ; Notes: the routines in this file apply an 8 tap filter, either horizontally (h8) or
 ; vertically (v8), to the input pixels and produce output_height rows of output.
 ; output_height does not have to be an even number. Each routine handles 8 or 16 pixels
 ; in the horizontal direction, calculating ONE row each iteration to take advantage of
 ; the 128 bit operations.
 ;
 ; This is an implementation of some of the SSE optimizations first seen in ffvp8
 ;
 ;*************************************************************************************/
 ;void vp9_filter_block1d8_v8_ssse3
 ;(
 ;    unsigned char *src_ptr,
 ;    unsigned int   src_pitch,
 ;    unsigned char *output_ptr,
 ;    unsigned int   out_pitch,
 ;    unsigned int   output_height,
 ;    short *filter
 ;)
 ;
 ; Vertical 8 tap filter for a column 8 pixels wide. Each loop iteration reads
 ; 8 bytes from each of the 8 consecutive rows starting at the current source
 ; row (src_ptr is the FIRST of those rows), stores 8 output bytes, and then
 ; advances the source by src_pitch and the output by out_pitch. This is done
 ; output_height times.
 ; filter points at 8 16-bit taps and is read with an aligned load (movdqa).
 global sym(vp9_filter_block1d8_v8_ssse3) PRIVATE
 sym(vp9_filter_block1d8_v8_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rbx
    ; end prolog
    ; 5 x 16 bytes of aligned scratch: four tap-pair vectors plus the rounding
    ; vector, addressed through the k0k1..krd names defined below
    ALIGN_STACK 16, rax
    sub         rsp, 16*5
    %define k0k1 [rsp + 16*0]
    %define k2k3 [rsp + 16*1]
    %define k4k5 [rsp + 16*2]
    %define k6k7 [rsp + 16*3]
    %define krd [rsp + 16*4]
    mov         rdx, arg(5)                 ;filter ptr
    mov         rsi, arg(0)                 ;src_ptr
    mov         rdi, arg(2)                 ;output_ptr
    ; 0x0040 in each 16-bit half: the rounding term (64) that is added before
    ; the arithmetic shift right by 7 in the loop
    mov         rcx, 0x0400040
    movdqa      xmm4, [rdx]                 ;load filters
    movd        xmm5, rcx
    ; narrow the 16-bit taps to signed bytes (with signed saturation) so that
    ; each 16-bit word now holds one pair of adjacent taps
    packsswb    xmm4, xmm4
    pshuflw     xmm0, xmm4, 0b              ;k0_k1
    pshuflw     xmm1, xmm4, 01010101b       ;k2_k3
    pshuflw     xmm2, xmm4, 10101010b       ;k4_k5
    pshuflw     xmm3, xmm4, 11111111b       ;k6_k7
    ; copy the low quadword into the high one: the pair fills all 16 bytes
    punpcklqdq  xmm0, xmm0
    punpcklqdq  xmm1, xmm1
    punpcklqdq  xmm2, xmm2
    punpcklqdq  xmm3, xmm3
    movdqa      k0k1, xmm0
    movdqa      k2k3, xmm1
    ; broadcast the rounding term to every word of xmm5
    pshufd      xmm5, xmm5, 0
    movdqa      k4k5, xmm2
    movdqa      k6k7, xmm3
    movdqa      krd, xmm5
    movsxd      rdx, DWORD PTR arg(1)       ;pixels_per_line
 %if ABI_IS_32BIT=0
    movsxd      r8, DWORD PTR arg(3)        ;out_pitch
 %endif
    ; rax = src + pitch, so that together with rsi and the x2 / x4 / rbx
    ; offsets all eight rows (0..7) can be addressed
    mov         rax, rsi
    movsxd      rcx, DWORD PTR arg(4)       ;output_height
    add         rax, rdx
    lea         rbx, [rdx + rdx*4]
    add         rbx, rdx                    ;pitch * 6
 .vp9_filter_block1d8_v8_ssse3_loop:
    ; rows A..H are rows 0..7 below the current source row
    movq        xmm0, [rsi]                 ;A
    movq        xmm1, [rsi + rdx]           ;B
    movq        xmm2, [rsi + rdx * 2]       ;C
    movq        xmm3, [rax + rdx * 2]       ;D
    movq        xmm4, [rsi + rdx * 4]       ;E
    movq        xmm5, [rax + rdx * 4]       ;F
    punpcklbw   xmm0, xmm1                  ;A B
    punpcklbw   xmm2, xmm3                  ;C D
    punpcklbw   xmm4, xmm5                  ;E F
    movq        xmm6, [rsi + rbx]           ;G
    movq        xmm7, [rax + rbx]           ;H
    ; pmaddubsw: unsigned pixel bytes times signed tap bytes, adjacent
    ; products summed into 16-bit words (two taps per instruction)
    pmaddubsw   xmm0, k0k1
    pmaddubsw   xmm2, k2k3
    punpcklbw   xmm6, xmm7                  ;G H
    pmaddubsw   xmm4, k4k5
    pmaddubsw   xmm6, k6k7
    ; saturating 16-bit adds of the four partial sums and the rounding term
    paddsw      xmm0, xmm2
    paddsw      xmm0, krd
    paddsw      xmm4, xmm6
    paddsw      xmm0, xmm4
    ; scale back down and clamp to 0..255 while packing to bytes
    psraw       xmm0, 7
    packuswb    xmm0, xmm0
    add         rsi,  rdx
    add         rax,  rdx
    movq        [rdi], xmm0
 %if ABI_IS_32BIT
    add         rdi, DWORD PTR arg(3)       ;out_pitch
 %else
    add         rdi, r8
 %endif
    dec         rcx
    jnz         .vp9_filter_block1d8_v8_ssse3_loop
    ; release the scratch area; the pop rsp presumably restores the stack
    ; pointer saved by ALIGN_STACK (macro from x86_abi_support.asm) - confirm
    add rsp, 16*5
    pop rsp
    pop rbx
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
 ;void vp9_filter_block1d16_v8_ssse3
 ;(
 ;    unsigned char *src_ptr,
 ;    unsigned int   src_pitch,
 ;    unsigned char *output_ptr,
 ;    unsigned int   out_pitch,
 ;    unsigned int   output_height,
 ;    short *filter
 ;)
 ;
 ; Vertical 8 tap filter for a column 16 pixels wide. Each loop iteration
 ; filters the left 8 pixels and then the right 8 pixels (same code at a +8
 ; byte offset) from the 8 consecutive rows starting at the current source row
 ; (src_ptr is the FIRST of those rows), stores the two halves with separate
 ; 8 byte stores, and then advances the source by src_pitch and the output by
 ; out_pitch. This is done output_height times.
 ; filter points at 8 16-bit taps and is read with an aligned load (movdqa).
 global sym(vp9_filter_block1d16_v8_ssse3) PRIVATE
 sym(vp9_filter_block1d16_v8_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rbx
    ; end prolog
    ; 5 x 16 bytes of aligned scratch: four tap-pair vectors plus the rounding
    ; vector, addressed through the k0k1..krd names defined below
    ALIGN_STACK 16, rax
    sub         rsp, 16*5
    %define k0k1 [rsp + 16*0]
    %define k2k3 [rsp + 16*1]
    %define k4k5 [rsp + 16*2]
    %define k6k7 [rsp + 16*3]
    %define krd [rsp + 16*4]
    mov         rdx, arg(5)                 ;filter ptr
    mov         rsi, arg(0)                 ;src_ptr
    mov         rdi, arg(2)                 ;output_ptr
    ; 0x0040 in each 16-bit half: the rounding term (64) that is added before
    ; the arithmetic shift right by 7 in the loop
    mov         rcx, 0x0400040
    movdqa      xmm4, [rdx]                 ;load filters
    movd        xmm5, rcx
    ; narrow the 16-bit taps to signed bytes (with signed saturation) so that
    ; each 16-bit word now holds one pair of adjacent taps
    packsswb    xmm4, xmm4
    pshuflw     xmm0, xmm4, 0b              ;k0_k1
    pshuflw     xmm1, xmm4, 01010101b       ;k2_k3
    pshuflw     xmm2, xmm4, 10101010b       ;k4_k5
    pshuflw     xmm3, xmm4, 11111111b       ;k6_k7
    ; copy the low quadword into the high one: the pair fills all 16 bytes
    punpcklqdq  xmm0, xmm0
    punpcklqdq  xmm1, xmm1
    punpcklqdq  xmm2, xmm2
    punpcklqdq  xmm3, xmm3
    movdqa      k0k1, xmm0
    movdqa      k2k3, xmm1
    ; broadcast the rounding term to every word of xmm5
    pshufd      xmm5, xmm5, 0
    movdqa      k4k5, xmm2
    movdqa      k6k7, xmm3
    movdqa      krd, xmm5
    movsxd      rdx, DWORD PTR arg(1)       ;pixels_per_line
 %if ABI_IS_32BIT=0
    movsxd      r8, DWORD PTR arg(3)        ;out_pitch
 %endif
    ; rax = src + pitch, so that together with rsi and the x2 / x4 / rbx
    ; offsets all eight rows (0..7) can be addressed
    mov         rax, rsi
    movsxd      rcx, DWORD PTR arg(4)       ;output_height
    add         rax, rdx
    lea         rbx, [rdx + rdx*4]
    add         rbx, rdx                    ;pitch * 6
 .vp9_filter_block1d16_v8_ssse3_loop:
    ; ---- left 8 pixels: rows A..H are rows 0..7 below the current row ----
    movq        xmm0, [rsi]                 ;A
    movq        xmm1, [rsi + rdx]           ;B
    movq        xmm2, [rsi + rdx * 2]       ;C
    movq        xmm3, [rax + rdx * 2]       ;D
    movq        xmm4, [rsi + rdx * 4]       ;E
    movq        xmm5, [rax + rdx * 4]       ;F
    punpcklbw   xmm0, xmm1                  ;A B
    punpcklbw   xmm2, xmm3                  ;C D
    punpcklbw   xmm4, xmm5                  ;E F
    movq        xmm6, [rsi + rbx]           ;G
    movq        xmm7, [rax + rbx]           ;H
    ; pmaddubsw: unsigned pixel bytes times signed tap bytes, adjacent
    ; products summed into 16-bit words (two taps per instruction)
    pmaddubsw   xmm0, k0k1
    pmaddubsw   xmm2, k2k3
    punpcklbw   xmm6, xmm7                  ;G H
    pmaddubsw   xmm4, k4k5
    pmaddubsw   xmm6, k6k7
    ; saturating 16-bit adds of the four partial sums and the rounding term
    paddsw      xmm0, xmm2
    paddsw      xmm0, krd
    paddsw      xmm4, xmm6
    paddsw      xmm0, xmm4
    ; scale back down and clamp to 0..255 while packing to bytes
    psraw       xmm0, 7
    packuswb    xmm0, xmm0
    movq        [rdi], xmm0
    ; ---- right 8 pixels: identical computation at a +8 byte offset ----
    movq        xmm0, [rsi + 8]             ;A
    movq        xmm1, [rsi + rdx + 8]       ;B
    movq        xmm2, [rsi + rdx * 2 + 8]   ;C
    movq        xmm3, [rax + rdx * 2 + 8]   ;D
    movq        xmm4, [rsi + rdx * 4 + 8]   ;E
    movq        xmm5, [rax + rdx * 4 + 8]   ;F
    punpcklbw   xmm0, xmm1                  ;A B
    punpcklbw   xmm2, xmm3                  ;C D
    punpcklbw   xmm4, xmm5                  ;E F
    movq        xmm6, [rsi + rbx + 8]       ;G
    movq        xmm7, [rax + rbx + 8]       ;H
    punpcklbw   xmm6, xmm7                  ;G H
    pmaddubsw   xmm0, k0k1
    pmaddubsw   xmm2, k2k3
    pmaddubsw   xmm4, k4k5
    pmaddubsw   xmm6, k6k7
    paddsw      xmm0, xmm2
    paddsw      xmm4, xmm6
    paddsw      xmm0, krd
    paddsw      xmm0, xmm4
    psraw       xmm0, 7
    packuswb    xmm0, xmm0
    add         rsi,  rdx
    add         rax,  rdx
    movq        [rdi+8], xmm0
 %if ABI_IS_32BIT
    add         rdi, DWORD PTR arg(3)       ;out_pitch
 %else
    add         rdi, r8
 %endif
    dec         rcx
    jnz         .vp9_filter_block1d16_v8_ssse3_loop
    ; release the scratch area; the pop rsp presumably restores the stack
    ; pointer saved by ALIGN_STACK (macro from x86_abi_support.asm) - confirm
    add rsp, 16*5
    pop rsp
    pop rbx
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
 ;void vp9_filter_block1d8_h8_ssse3
 ;(
 ;    unsigned char  *src_ptr,
 ;    unsigned int    src_pixels_per_line,
 ;    unsigned char  *output_ptr,
 ;    unsigned int    output_pitch,
 ;    unsigned int    output_height,
 ;    short *filter
 ;)
 ;
 ; Horizontal 8 tap filter for a row segment 8 pixels wide. Each loop
 ; iteration reads the 16 source bytes src_ptr - 3 .. src_ptr + 12 (two 8 byte
 ; loads), stores 8 output bytes, and then advances the source by
 ; src_pixels_per_line and the output by output_pitch. This is done
 ; output_height times.
 ; filter points at 8 16-bit taps and is read with an aligned load (movdqa).
 global sym(vp9_filter_block1d8_h8_ssse3) PRIVATE
 sym(vp9_filter_block1d8_h8_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    ; NOTE(review): GET_GOT presumably prepares rbx for the GLOBAL() references
    ; to the shuffle tables in position independent builds - confirm against
    ; x86_abi_support.asm
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog
    ; 5 x 16 bytes of aligned scratch for the k0k1..krd names defined below
    ; (the krd slot is reserved but not written in this routine)
    ALIGN_STACK 16, rax
    sub         rsp, 16*5
    %define k0k1 [rsp + 16*0]
    %define k2k3 [rsp + 16*1]
    %define k4k5 [rsp + 16*2]
    %define k6k7 [rsp + 16*3]
    %define krd [rsp + 16*4]
    mov         rdx, arg(5)                 ;filter ptr
    mov         rsi, arg(0)                 ;src_ptr
    mov         rdi, arg(2)                 ;output_ptr
    ; 0x0040 in each 16-bit half: the rounding term (64) that is added before
    ; the arithmetic shift right by 7 in the loop
    mov         rcx, 0x0400040
    movdqa      xmm4, [rdx]                 ;load filters
    movd        xmm5, rcx
    ; narrow the 16-bit taps to signed bytes (with signed saturation) so that
    ; each 16-bit word now holds one pair of adjacent taps
    packsswb    xmm4, xmm4
    pshuflw     xmm0, xmm4, 0b              ;k0_k1
    pshuflw     xmm1, xmm4, 01010101b       ;k2_k3
    pshuflw     xmm2, xmm4, 10101010b       ;k4_k5
    pshuflw     xmm3, xmm4, 11111111b       ;k6_k7
    ; copy the low quadword into the high one: the pair fills all 16 bytes
    punpcklqdq  xmm0, xmm0
    punpcklqdq  xmm1, xmm1
    punpcklqdq  xmm2, xmm2
    punpcklqdq  xmm3, xmm3
    movdqa      k0k1, xmm0
    movdqa      k2k3, xmm1
    ; broadcast the rounding term to every word of xmm5; xmm5 is not written
    ; again below, so the loop adds it straight from the register
    pshufd      xmm5, xmm5, 0
    movdqa      k4k5, xmm2
    movdqa      k6k7, xmm3
 ;    movdqa      krd, xmm5
    movsxd      rax, dword ptr arg(1)       ;src_pixels_per_line
    movsxd      rdx, dword ptr arg(3)       ;output_pitch
    movsxd      rcx, dword ptr arg(4)       ;output_height
 .filter_block1d8_h8_rowloop_ssse3:
    movq        xmm0,   [rsi - 3]    ; -3 -2 -1  0  1  2  3  4
 ;    movq        xmm3,   [rsi + 4]    ; 4  5  6  7  8  9 10 11
    movq        xmm3,   [rsi + 5]    ; 5  6  7  8  9 10 11 12
 ;note: if we create a k0_k7 filter, we can save a pshufb
 ;    punpcklbw   xmm0,   xmm3         ; -3 4 -2 5 -1 6 0 7 1 8 2 9 3 10 4 11
    ; xmm0 = the 16 contiguous source bytes -3 .. 12
    punpcklqdq  xmm0,   xmm3
    ; for each tap pair: gather the matching pairs of neighbouring pixels with
    ; pshufb (one pair per output pixel), then multiply-add with the taps
    movdqa      xmm1,   xmm0
    pshufb      xmm0,   [GLOBAL(shuf_t0t1)]
    pmaddubsw   xmm0,   k0k1
    movdqa      xmm2,   xmm1
    pshufb      xmm1,   [GLOBAL(shuf_t2t3)]
    pmaddubsw   xmm1,   k2k3
    movdqa      xmm4,   xmm2
    pshufb      xmm2,   [GLOBAL(shuf_t4t5)]
    pmaddubsw   xmm2,   k4k5
    pshufb      xmm4,   [GLOBAL(shuf_t6t7)]
    pmaddubsw   xmm4,   k6k7
    ; saturating 16-bit adds of the four partial sums and the rounding term
    paddsw      xmm0,   xmm1
    paddsw      xmm0,   xmm2
    paddsw      xmm0,   xmm5
    paddsw      xmm0,   xmm4
    ; scale back down and clamp to 0..255 while packing to bytes
    psraw       xmm0,   7
    packuswb    xmm0,   xmm0
    lea         rsi,    [rsi + rax]
    movq        [rdi],  xmm0
    lea         rdi,    [rdi + rdx]
    dec         rcx
    jnz         .filter_block1d8_h8_rowloop_ssse3
    ; release the scratch area; the pop rsp presumably restores the stack
    ; pointer saved by ALIGN_STACK (macro from x86_abi_support.asm) - confirm
    add rsp, 16*5
    pop rsp
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
 ;void vp9_filter_block1d16_h8_ssse3
 ;(
 ;    unsigned char  *src_ptr,
 ;    unsigned int    src_pixels_per_line,
 ;    unsigned char  *output_ptr,
 ;    unsigned int    output_pitch,
 ;    unsigned int    output_height,
 ;    short *filter
 ;)
 ;
 ; Horizontal 8 tap filter for a row segment 16 pixels wide. Each loop
 ; iteration filters the left 8 pixels from source bytes src_ptr - 3 ..
 ; src_ptr + 12 and the right 8 pixels from src_ptr + 5 .. src_ptr + 20, joins
 ; the two halves and stores all 16 output bytes at once, then advances the
 ; source by src_pixels_per_line and the output by output_pitch. This is done
 ; output_height times.
 ; The store is an ALIGNED 16 byte store (movdqa), so every output row address
 ; (output_ptr and output_pitch) has to be 16 byte aligned.
 ; filter points at 8 16-bit taps and is read with an aligned load (movdqa).
 global sym(vp9_filter_block1d16_h8_ssse3) PRIVATE
 sym(vp9_filter_block1d16_h8_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    ; NOTE(review): GET_GOT presumably prepares rbx for the GLOBAL() references
    ; to the shuffle tables in position independent builds - confirm against
    ; x86_abi_support.asm
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog
    ; 5 x 16 bytes of aligned scratch: four tap-pair vectors plus the rounding
    ; vector, addressed through the k0k1..krd names defined below
    ALIGN_STACK 16, rax
    sub         rsp, 16*5
    %define k0k1 [rsp + 16*0]
    %define k2k3 [rsp + 16*1]
    %define k4k5 [rsp + 16*2]
    %define k6k7 [rsp + 16*3]
    %define krd [rsp + 16*4]
    mov         rdx, arg(5)                 ;filter ptr
    mov         rsi, arg(0)                 ;src_ptr
    mov         rdi, arg(2)                 ;output_ptr
    ; 0x0040 in each 16-bit half: the rounding term (64) that is added before
    ; the arithmetic shift right by 7 in the loop
    mov         rcx, 0x0400040
    movdqa      xmm4, [rdx]                 ;load filters
    movd        xmm5, rcx
    ; narrow the 16-bit taps to signed bytes (with signed saturation) so that
    ; each 16-bit word now holds one pair of adjacent taps
    packsswb    xmm4, xmm4
    pshuflw     xmm0, xmm4, 0b              ;k0_k1
    pshuflw     xmm1, xmm4, 01010101b       ;k2_k3
    pshuflw     xmm2, xmm4, 10101010b       ;k4_k5
    pshuflw     xmm3, xmm4, 11111111b       ;k6_k7
    ; copy the low quadword into the high one: the pair fills all 16 bytes
    punpcklqdq  xmm0, xmm0
    punpcklqdq  xmm1, xmm1
    punpcklqdq  xmm2, xmm2
    punpcklqdq  xmm3, xmm3
    movdqa      k0k1, xmm0
    movdqa      k2k3, xmm1
    ; broadcast the rounding term to every word of xmm5
    pshufd      xmm5, xmm5, 0
    movdqa      k4k5, xmm2
    movdqa      k6k7, xmm3
    movdqa      krd, xmm5
    movsxd      rax, dword ptr arg(1)       ;src_pixels_per_line
    movsxd      rdx, dword ptr arg(3)       ;output_pitch
    movsxd      rcx, dword ptr arg(4)       ;output_height
 .filter_block1d16_h8_rowloop_ssse3:
    ; ---- left 8 output pixels ----
    movq        xmm0,   [rsi - 3]    ; -3 -2 -1  0  1  2  3  4
 ;    movq        xmm3,   [rsi + 4]    ; 4  5  6  7  8  9 10 11
    movq        xmm3,   [rsi + 5]    ; 5  6  7  8  9 10 11 12
 ;note: if we create a k0_k7 filter, we can save a pshufb
 ;    punpcklbw   xmm0,   xmm3         ; -3 4 -2 5 -1 6 0 7 1 8 2 9 3 10 4 11
    ; xmm0 = the 16 contiguous source bytes -3 .. 12
    punpcklqdq  xmm0,   xmm3
    ; for each tap pair: gather the matching pairs of neighbouring pixels with
    ; pshufb (one pair per output pixel), then multiply-add with the taps
    movdqa      xmm1,   xmm0
    pshufb      xmm0,   [GLOBAL(shuf_t0t1)]
    pmaddubsw   xmm0,   k0k1
    movdqa      xmm2,   xmm1
    pshufb      xmm1,   [GLOBAL(shuf_t2t3)]
    pmaddubsw   xmm1,   k2k3
    movdqa      xmm4,   xmm2
    pshufb      xmm2,   [GLOBAL(shuf_t4t5)]
    pmaddubsw   xmm2,   k4k5
    pshufb      xmm4,   [GLOBAL(shuf_t6t7)]
    pmaddubsw   xmm4,   k6k7
    ; saturating 16-bit adds of the four partial sums and the rounding term
    paddsw      xmm0,   xmm1
    paddsw      xmm0,   xmm4
    paddsw      xmm0,   xmm2
    paddsw      xmm0,   krd
    ; scale back down and clamp to 0..255 while packing to bytes
    psraw       xmm0,   7
    packuswb    xmm0,   xmm0
    ; ---- right 8 output pixels: same steps on source bytes 5 .. 20 ----
    movq        xmm3,   [rsi +  5]
 ;    movq        xmm7,   [rsi + 12]
    movq        xmm7,   [rsi + 13]
 ;note: same as above
 ;    punpcklbw   xmm3,   xmm7
    punpcklqdq  xmm3,   xmm7
    movdqa      xmm1,   xmm3
    pshufb      xmm3,   [GLOBAL(shuf_t0t1)]
    pmaddubsw   xmm3,   k0k1
    movdqa      xmm2,   xmm1
    pshufb      xmm1,   [GLOBAL(shuf_t2t3)]
    pmaddubsw   xmm1,   k2k3
    movdqa      xmm4,   xmm2
    pshufb      xmm2,   [GLOBAL(shuf_t4t5)]
    pmaddubsw   xmm2,   k4k5
    pshufb      xmm4,   [GLOBAL(shuf_t6t7)]
    pmaddubsw   xmm4,   k6k7
    paddsw      xmm3,   xmm1
    paddsw      xmm3,   xmm2
    paddsw      xmm3,   krd
    paddsw      xmm3,   xmm4
    psraw       xmm3,   7
    packuswb    xmm3,   xmm3
    ; low quadword = left 8 pixels, high quadword = right 8 pixels
    punpcklqdq  xmm0,   xmm3
    lea         rsi,    [rsi + rax]
    ; aligned 16 byte store: rdi must be 16 byte aligned on every row
    movdqa      [rdi],  xmm0
    lea         rdi,    [rdi + rdx]
    dec         rcx
    jnz         .filter_block1d16_h8_rowloop_ssse3
    ; release the scratch area; the pop rsp presumably restores the stack
    ; pointer saved by ALIGN_STACK (macro from x86_abi_support.asm) - confirm
    add rsp, 16*5
    pop rsp
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
 SECTION_RODATA
 ; pshufb control masks used by the horizontal (h8) routines. Each mask picks,
 ; for the 8 output pixels i = 0..7, the two source bytes at i + k and
 ; i + k + 1, where k is the index of the first tap of the pair (0, 2, 4, 6).
 ; Byte index 0 of the shuffled register is the source pixel at offset -3.
 align 16
 shuf_t0t1:
    db  0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
 align 16
 shuf_t2t3:
    db  2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
 align 16
 shuf_t4t5:
    db  4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12
 align 16
 shuf_t6t7:
    db  6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14
--- a/vp9/vp9_common.mk
+++ b/vp9/vp9_common.mk
@@ -96,6 +96,7 @@ VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_recon_sse2.asm
 VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_recon_wrapper_sse2.c
 VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_subpel_variance_impl_sse2.asm
 VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_subpixel_variance_sse2.c
 VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_subpixel_8t_ssse3.asm
 ifeq ($(CONFIG_POSTPROC),yes)
 VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_postproc_mmx.asm
 VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_postproc_sse2.asm