WIP: ssse3 version of convolve avg functions

Add initial ssse3 convolve avg functions; this is also one step closer to using x86inc.asm. The decoder performance improved by 8% for the test clip used. This should be revisited later to see whether averaging outside the loop is better than having many similar filter functions. Change-Id: Ice3fafb423b02710b0448ffca18b296bcac649e9
2013-02-13 09:15:38 -08:00 · 2013-02-13 09:15:38 -08:00 · 30f866f44b
commit 30f866f44b
parent cb00be1fa2
3 changed files with 766 additions and 354 deletions
--- a/vp9/common/vp9_rtcd_defs.sh
+++ b/vp9/common/vp9_rtcd_defs.sh
@ -254,13 +254,13 @@ prototype void vp9_convolve8_vert "const uint8_t *src, int src_stride, uint8_t *
 specialize vp9_convolve8_vert ssse3

 prototype void vp9_convolve8_avg "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
-specialize vp9_convolve8_avg
+specialize vp9_convolve8_avg ssse3

 prototype void vp9_convolve8_avg_horiz "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
-specialize vp9_convolve8_avg_horiz
+specialize vp9_convolve8_avg_horiz ssse3

 prototype void vp9_convolve8_avg_vert "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
-specialize vp9_convolve8_avg_vert
+specialize vp9_convolve8_avg_vert ssse3

 #
 # dct
--- a/vp9/common/x86/vp9_asm_stubs.c
+++ b/vp9/common/x86/vp9_asm_stubs.c
@ -79,6 +79,48 @@ void vp9_filter_block1d4_h8_ssse3(const unsigned char *src_ptr,
                                   unsigned int output_height,
                                   const short *filter);

+void vp9_filter_block1d16_v8_avg_ssse3(const unsigned char *src_ptr,
+                                       const unsigned int src_pitch,
+                                       unsigned char *output_ptr,
+                                       unsigned int out_pitch,
+                                       unsigned int output_height,
+                                       const short *filter);  // <- filter_y
+
+void vp9_filter_block1d16_h8_avg_ssse3(const unsigned char *src_ptr,
+                                       const unsigned int src_pitch,
+                                       unsigned char *output_ptr,
+                                       unsigned int out_pitch,
+                                       unsigned int output_height,
+                                       const short *filter);  // <- filter_x
+
+void vp9_filter_block1d8_v8_avg_ssse3(const unsigned char *src_ptr,
+                                     const unsigned int src_pitch,
+                                     unsigned char *output_ptr,
+                                     unsigned int out_pitch,
+                                     unsigned int output_height,
+                                     const short *filter);  // <- filter_y
+
+void vp9_filter_block1d8_h8_avg_ssse3(const unsigned char *src_ptr,
+                                     const unsigned int src_pitch,
+                                     unsigned char *output_ptr,
+                                     unsigned int out_pitch,
+                                     unsigned int output_height,
+                                     const short *filter);  // <- filter_x
+
+void vp9_filter_block1d4_v8_avg_ssse3(const unsigned char *src_ptr,
+                                     const unsigned int src_pitch,
+                                     unsigned char *output_ptr,
+                                     unsigned int out_pitch,
+                                     unsigned int output_height,
+                                     const short *filter);  // <- filter_y
+
+void vp9_filter_block1d4_h8_avg_ssse3(const unsigned char *src_ptr,
+                                     const unsigned int src_pitch,
+                                     unsigned char *output_ptr,
+                                     unsigned int out_pitch,
+                                     unsigned int output_height,
+                                     const short *filter);  /* <- filter_x
+ *
+ * Notes on the six *_avg_ssse3 prototypes above, as they are used by the
+ * vp9_convolve8_avg*_ssse3 wrappers in this file:
+ *  - d16 / d8 / d4: the wrappers advance src and dst by 16 / 8 / 4 columns
+ *    after each call.
+ *  - h8 variants are called with filter_x and the caller's src pointer;
+ *    v8 variants are called with filter_y and either a source pointer three
+ *    rows above the block or the fdata2 scratch buffer.
+ *  - src_pitch / out_pitch receive the source and destination strides, and
+ *    output_height receives h.
+ *  - No C bodies appear here; presumably they are implemented in
+ *    vp9_subpixel_8t_ssse3.asm, which this commit also changes -- TODO
+ *    confirm.
+ */
+
 void vp9_convolve8_horiz_ssse3(const uint8_t *src, int src_stride,
                               uint8_t *dst, int dst_stride,
                               const int16_t *filter_x, int x_step_q4,
@ -155,6 +197,82 @@ void vp9_convolve8_vert_ssse3(const uint8_t *src, int src_stride,
  }
 }

+void vp9_convolve8_avg_horiz_ssse3(const uint8_t *src, int src_stride,
+                               uint8_t *dst, int dst_stride,
+                               const int16_t *filter_x, int x_step_q4,
+                               const int16_t *filter_y, int y_step_q4,
+                               int w, int h) {  /*
+   * Horizontal "avg" convolve entry point for SSSE3.
+   *
+   * When the gate below passes, the block is cut into 16-, then 8-, then
+   * 4-column strips, each handed to the matching *_h8_avg_ssse3 kernel.
+   * Whatever width is left afterwards goes to vp9_convolve8_avg_horiz_c.
+   *
+   *   src, src_stride  - source pointer and stride, forwarded as given
+   *   dst, dst_stride  - destination pointer and stride, forwarded as given
+   *   filter_x         - passed to the kernels; filter_x[3] is part of the gate
+   *   x_step_q4        - must be 16 for the SSSE3 path
+   *   filter_y, y_step_q4 - only forwarded to the C version
+   *   w, h             - width (consumed strip by strip) and row count
+   *
+   * NOTE(review): presumably x_step_q4 == 16 means "no horizontal scaling"
+   * and filter_x[3] == 128 marks a pass-through filter that the kernels do
+   * not handle; the "avg" kernels presumably blend their result with the
+   * existing dst contents. None of this is visible here -- confirm.
+   */
+  if (x_step_q4 == 16 && filter_x[3] != 128) {  // SSSE3 gate
+    while (w >= 16) {  // widest strips first
+      vp9_filter_block1d16_h8_avg_ssse3(src, src_stride,
+                                    dst, dst_stride,
+                                    h, filter_x);
+      src += 16;  // step to the next strip
+      dst += 16;
+      w -= 16;
+    }
+    while (w >= 8) {
+      vp9_filter_block1d8_h8_avg_ssse3(src, src_stride,
+                                   dst, dst_stride,
+                                   h, filter_x);
+      src += 8;
+      dst += 8;
+      w -= 8;
+    }
+    while (w >= 4) {
+      vp9_filter_block1d4_h8_avg_ssse3(src, src_stride,
+                                   dst, dst_stride,
+                                   h, filter_x);
+      src += 4;
+      dst += 4;
+      w -= 4;
+    }
+  }
+  if (w) {  // full w if the gate failed, else the < 4 column remainder
+    vp9_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride,
+                              filter_x, x_step_q4, filter_y, y_step_q4,
+                              w, h);
+  }
+}
+
+void vp9_convolve8_avg_vert_ssse3(const uint8_t *src, int src_stride,
+                              uint8_t *dst, int dst_stride,
+                              const int16_t *filter_x, int x_step_q4,
+                              const int16_t *filter_y, int y_step_q4,
+                              int w, int h) {  /*
+   * Vertical "avg" convolve entry point for SSSE3.
+   *
+   * When the gate below passes, the block is cut into 16-, then 8-, then
+   * 4-column strips, each handed to the matching *_v8_avg_ssse3 kernel.
+   * Each kernel receives a source pointer three rows above the strip.
+   * Whatever width is left afterwards goes to vp9_convolve8_avg_vert_c,
+   * which gets the unadjusted src.
+   *
+   *   src, src_stride  - source pointer and stride
+   *   dst, dst_stride  - destination pointer and stride, forwarded as given
+   *   filter_y         - passed to the kernels; filter_y[3] is part of the gate
+   *   y_step_q4        - must be 16 for the SSSE3 path
+   *   filter_x, x_step_q4 - only forwarded to the C version
+   *   w, h             - width (consumed strip by strip) and row count
+   *
+   * NOTE(review): presumably y_step_q4 == 16 means "no vertical scaling",
+   * filter_y[3] == 128 marks a pass-through filter, and the three-row
+   * offset is the top of the kernel's filter window -- confirm against the
+   * kernel implementation.
+   */
+  if (y_step_q4 == 16 && filter_y[3] != 128) {  // SSSE3 gate
+    while (w >= 16) {  // widest strips first
+      vp9_filter_block1d16_v8_avg_ssse3(src - src_stride * 3, src_stride,
+                                    dst, dst_stride,
+                                    h, filter_y);
+      src += 16;  // step to the next strip
+      dst += 16;
+      w -= 16;
+    }
+    while (w >= 8) {
+      vp9_filter_block1d8_v8_avg_ssse3(src - src_stride * 3, src_stride,
+                                   dst, dst_stride,
+                                   h, filter_y);
+      src += 8;
+      dst += 8;
+      w -= 8;
+    }
+    while (w >= 4) {
+      vp9_filter_block1d4_v8_avg_ssse3(src - src_stride * 3, src_stride,
+                                   dst, dst_stride,
+                                   h, filter_y);
+      src += 4;
+      dst += 4;
+      w -= 4;
+    }
+  }
+  if (w) {  // full w if the gate failed, else the < 4 column remainder
+    vp9_convolve8_avg_vert_c(src, src_stride, dst, dst_stride,
+                             filter_x, x_step_q4, filter_y, y_step_q4,
+                             w, h);
+  }
+}
+
 void vp9_convolve8_ssse3(const uint8_t *src, int src_stride,
                         uint8_t *dst, int dst_stride,
                         const int16_t *filter_x, int x_step_q4,
@ -200,4 +318,50 @@ void vp9_convolve8_ssse3(const uint8_t *src, int src_stride,
                  filter_x, x_step_q4, filter_y, y_step_q4,
                  w, h);
 }
+
+void vp9_convolve8_avg_ssse3(const uint8_t *src, int src_stride,
+                         uint8_t *dst, int dst_stride,
+                         const int16_t *filter_x, int x_step_q4,
+                         const int16_t *filter_y, int y_step_q4,
+                         int w, int h) {  /*
+   * 2-D "avg" convolve entry point for SSSE3, done as two 1-D passes
+   * through the fdata2 scratch buffer:
+   *   1. a plain (non-avg) *_h8_ssse3 kernel is given filter_x and a source
+   *      pointer three rows above src, and is asked for h + 7 rows written
+   *      to fdata2 with a row stride of 16;
+   *   2. the matching *_v8_avg_ssse3 kernel is given filter_y, reads fdata2
+   *      and is asked for h rows written to dst.
+   * Only w == 16, 8 or 4 with both steps equal to 16 and both filter[3]
+   * values != 128 take this route; every other case is forwarded with the
+   * original arguments to vp9_convolve8_avg_c.
+   *
+   * NOTE(review): the 7 extra rows presumably cover the vertical filter's
+   * support above and below the block, and the "avg" kernel presumably
+   * blends with the existing dst contents -- confirm against the kernels.
+   */
+  DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 16*23);  // 16 x (16 + 7)
+
+  // check w/h due to fixed size fdata2 array
+  assert(w <= 16);
+  assert(h <= 16);
+
+  if (x_step_q4 == 16 && y_step_q4 == 16 &&
+      filter_x[3] != 128 && filter_y[3] != 128) {  // SSSE3 gate, both axes
+    if (w == 16) {
+      vp9_filter_block1d16_h8_ssse3(src - 3 * src_stride, src_stride,
+                                    fdata2, 16,
+                                    h + 7, filter_x);  // pass 1 -> fdata2
+      vp9_filter_block1d16_v8_avg_ssse3(fdata2, 16,
+                                        dst, dst_stride,
+                                        h, filter_y);  // pass 2 -> dst
+      return;
+    }
+    if (w == 8) {
+      vp9_filter_block1d8_h8_ssse3(src - 3 * src_stride, src_stride,
+                                   fdata2, 16,
+                                   h + 7, filter_x);  // pass 1 -> fdata2
+      vp9_filter_block1d8_v8_avg_ssse3(fdata2, 16,
+                                       dst, dst_stride,
+                                       h, filter_y);  // pass 2 -> dst
+      return;
+    }
+    if (w == 4) {
+      vp9_filter_block1d4_h8_ssse3(src - 3 * src_stride, src_stride,
+                                   fdata2, 16,
+                                   h + 7, filter_x);  // pass 1 -> fdata2
+      vp9_filter_block1d4_v8_avg_ssse3(fdata2, 16,
+                                       dst, dst_stride,
+                                       h, filter_y);  // pass 2 -> dst
+      return;
+    }
+  }
+  vp9_convolve8_avg_c(src, src_stride, dst, dst_stride,  // C fallback
+                      filter_x, x_step_q4, filter_y, y_step_q4,
+                      w, h);
+}
 #endif
--- a/vp9/common/x86/vp9_subpixel_8t_ssse3.asm
+++ b/vp9/common/x86/vp9_subpixel_8t_ssse3.asm