Code refactor on InterpKernel
In essence, this change refactors the code for both interpolation filtering and convolution. It moves all the files and renames the code from the vp9_ prefix to the vpx_ prefix accordingly, for the following architectures: (1) x86; (2) arm/neon; and (3) mips/msa. The work on mips/dspr2 will be done in a separate change list. Change-Id: Ic3ce7fb7f81210db7628b373c73553db68793c46
This commit is contained in:
		| @@ -14,12 +14,15 @@ | |||||||
|  |  | ||||||
| #include "./vpx_config.h" | #include "./vpx_config.h" | ||||||
| #include "./vp9_rtcd.h" | #include "./vp9_rtcd.h" | ||||||
|  | #include "./vpx_dsp_rtcd.h" | ||||||
| #include "test/acm_random.h" | #include "test/acm_random.h" | ||||||
| #include "test/clear_system_state.h" | #include "test/clear_system_state.h" | ||||||
| #include "test/register_state_check.h" | #include "test/register_state_check.h" | ||||||
| #include "test/util.h" | #include "test/util.h" | ||||||
| #include "vp9/common/vp9_common.h" | #include "vp9/common/vp9_common.h" | ||||||
| #include "vp9/common/vp9_filter.h" | #include "vp9/common/vp9_filter.h" | ||||||
|  | #include "vpx_dsp/vpx_dsp_common.h" | ||||||
|  | #include "vpx_dsp/vpx_filter.h" | ||||||
| #include "vpx_mem/vpx_mem.h" | #include "vpx_mem/vpx_mem.h" | ||||||
| #include "vpx_ports/mem.h" | #include "vpx_ports/mem.h" | ||||||
|  |  | ||||||
| @@ -945,7 +948,7 @@ void wrap_convolve8_horiz_sse2_8(const uint8_t *src, ptrdiff_t src_stride, | |||||||
|                                  const int16_t *filter_y, |                                  const int16_t *filter_y, | ||||||
|                                  int filter_y_stride, |                                  int filter_y_stride, | ||||||
|                                  int w, int h) { |                                  int w, int h) { | ||||||
|   vp9_highbd_convolve8_horiz_sse2(src, src_stride, dst, dst_stride, filter_x, |   vpx_highbd_convolve8_horiz_sse2(src, src_stride, dst, dst_stride, filter_x, | ||||||
|                                   filter_x_stride, filter_y, filter_y_stride, |                                   filter_x_stride, filter_y, filter_y_stride, | ||||||
|                                   w, h, 8); |                                   w, h, 8); | ||||||
| } | } | ||||||
| @@ -957,7 +960,7 @@ void wrap_convolve8_avg_horiz_sse2_8(const uint8_t *src, ptrdiff_t src_stride, | |||||||
|                                      const int16_t *filter_y, |                                      const int16_t *filter_y, | ||||||
|                                      int filter_y_stride, |                                      int filter_y_stride, | ||||||
|                                      int w, int h) { |                                      int w, int h) { | ||||||
|   vp9_highbd_convolve8_avg_horiz_sse2(src, src_stride, dst, dst_stride, |   vpx_highbd_convolve8_avg_horiz_sse2(src, src_stride, dst, dst_stride, | ||||||
|                                       filter_x, filter_x_stride, |                                       filter_x, filter_x_stride, | ||||||
|                                       filter_y, filter_y_stride, w, h, 8); |                                       filter_y, filter_y_stride, w, h, 8); | ||||||
| } | } | ||||||
| @@ -969,7 +972,7 @@ void wrap_convolve8_vert_sse2_8(const uint8_t *src, ptrdiff_t src_stride, | |||||||
|                                 const int16_t *filter_y, |                                 const int16_t *filter_y, | ||||||
|                                 int filter_y_stride, |                                 int filter_y_stride, | ||||||
|                                 int w, int h) { |                                 int w, int h) { | ||||||
|   vp9_highbd_convolve8_vert_sse2(src, src_stride, dst, dst_stride, |   vpx_highbd_convolve8_vert_sse2(src, src_stride, dst, dst_stride, | ||||||
|                                  filter_x, filter_x_stride, |                                  filter_x, filter_x_stride, | ||||||
|                                  filter_y, filter_y_stride, w, h, 8); |                                  filter_y, filter_y_stride, w, h, 8); | ||||||
| } | } | ||||||
| @@ -981,7 +984,7 @@ void wrap_convolve8_avg_vert_sse2_8(const uint8_t *src, ptrdiff_t src_stride, | |||||||
|                                     const int16_t *filter_y, |                                     const int16_t *filter_y, | ||||||
|                                     int filter_y_stride, |                                     int filter_y_stride, | ||||||
|                                     int w, int h) { |                                     int w, int h) { | ||||||
|   vp9_highbd_convolve8_avg_vert_sse2(src, src_stride, dst, dst_stride, |   vpx_highbd_convolve8_avg_vert_sse2(src, src_stride, dst, dst_stride, | ||||||
|                                      filter_x, filter_x_stride, |                                      filter_x, filter_x_stride, | ||||||
|                                      filter_y, filter_y_stride, w, h, 8); |                                      filter_y, filter_y_stride, w, h, 8); | ||||||
| } | } | ||||||
| @@ -993,7 +996,7 @@ void wrap_convolve8_sse2_8(const uint8_t *src, ptrdiff_t src_stride, | |||||||
|                            const int16_t *filter_y, |                            const int16_t *filter_y, | ||||||
|                            int filter_y_stride, |                            int filter_y_stride, | ||||||
|                            int w, int h) { |                            int w, int h) { | ||||||
|   vp9_highbd_convolve8_sse2(src, src_stride, dst, dst_stride, |   vpx_highbd_convolve8_sse2(src, src_stride, dst, dst_stride, | ||||||
|                             filter_x, filter_x_stride, |                             filter_x, filter_x_stride, | ||||||
|                             filter_y, filter_y_stride, w, h, 8); |                             filter_y, filter_y_stride, w, h, 8); | ||||||
| } | } | ||||||
| @@ -1005,7 +1008,7 @@ void wrap_convolve8_avg_sse2_8(const uint8_t *src, ptrdiff_t src_stride, | |||||||
|                                const int16_t *filter_y, |                                const int16_t *filter_y, | ||||||
|                                int filter_y_stride, |                                int filter_y_stride, | ||||||
|                                int w, int h) { |                                int w, int h) { | ||||||
|   vp9_highbd_convolve8_avg_sse2(src, src_stride, dst, dst_stride, |   vpx_highbd_convolve8_avg_sse2(src, src_stride, dst, dst_stride, | ||||||
|                                 filter_x, filter_x_stride, |                                 filter_x, filter_x_stride, | ||||||
|                                 filter_y, filter_y_stride, w, h, 8); |                                 filter_y, filter_y_stride, w, h, 8); | ||||||
| } | } | ||||||
| @@ -1017,7 +1020,7 @@ void wrap_convolve8_horiz_sse2_10(const uint8_t *src, ptrdiff_t src_stride, | |||||||
|                                   const int16_t *filter_y, |                                   const int16_t *filter_y, | ||||||
|                                   int filter_y_stride, |                                   int filter_y_stride, | ||||||
|                                   int w, int h) { |                                   int w, int h) { | ||||||
|   vp9_highbd_convolve8_horiz_sse2(src, src_stride, dst, dst_stride, |   vpx_highbd_convolve8_horiz_sse2(src, src_stride, dst, dst_stride, | ||||||
|                                   filter_x, filter_x_stride, |                                   filter_x, filter_x_stride, | ||||||
|                                   filter_y, filter_y_stride, w, h, 10); |                                   filter_y, filter_y_stride, w, h, 10); | ||||||
| } | } | ||||||
| @@ -1029,7 +1032,7 @@ void wrap_convolve8_avg_horiz_sse2_10(const uint8_t *src, ptrdiff_t src_stride, | |||||||
|                                       const int16_t *filter_y, |                                       const int16_t *filter_y, | ||||||
|                                       int filter_y_stride, |                                       int filter_y_stride, | ||||||
|                                       int w, int h) { |                                       int w, int h) { | ||||||
|   vp9_highbd_convolve8_avg_horiz_sse2(src, src_stride, dst, dst_stride, |   vpx_highbd_convolve8_avg_horiz_sse2(src, src_stride, dst, dst_stride, | ||||||
|                                       filter_x, filter_x_stride, |                                       filter_x, filter_x_stride, | ||||||
|                                       filter_y, filter_y_stride, w, h, 10); |                                       filter_y, filter_y_stride, w, h, 10); | ||||||
| } | } | ||||||
| @@ -1041,7 +1044,7 @@ void wrap_convolve8_vert_sse2_10(const uint8_t *src, ptrdiff_t src_stride, | |||||||
|                                  const int16_t *filter_y, |                                  const int16_t *filter_y, | ||||||
|                                  int filter_y_stride, |                                  int filter_y_stride, | ||||||
|                                  int w, int h) { |                                  int w, int h) { | ||||||
|   vp9_highbd_convolve8_vert_sse2(src, src_stride, dst, dst_stride, |   vpx_highbd_convolve8_vert_sse2(src, src_stride, dst, dst_stride, | ||||||
|                                  filter_x, filter_x_stride, |                                  filter_x, filter_x_stride, | ||||||
|                                  filter_y, filter_y_stride, w, h, 10); |                                  filter_y, filter_y_stride, w, h, 10); | ||||||
| } | } | ||||||
| @@ -1053,7 +1056,7 @@ void wrap_convolve8_avg_vert_sse2_10(const uint8_t *src, ptrdiff_t src_stride, | |||||||
|                                      const int16_t *filter_y, |                                      const int16_t *filter_y, | ||||||
|                                      int filter_y_stride, |                                      int filter_y_stride, | ||||||
|                                      int w, int h) { |                                      int w, int h) { | ||||||
|   vp9_highbd_convolve8_avg_vert_sse2(src, src_stride, dst, dst_stride, |   vpx_highbd_convolve8_avg_vert_sse2(src, src_stride, dst, dst_stride, | ||||||
|                                      filter_x, filter_x_stride, |                                      filter_x, filter_x_stride, | ||||||
|                                      filter_y, filter_y_stride, w, h, 10); |                                      filter_y, filter_y_stride, w, h, 10); | ||||||
| } | } | ||||||
| @@ -1065,7 +1068,7 @@ void wrap_convolve8_sse2_10(const uint8_t *src, ptrdiff_t src_stride, | |||||||
|                             const int16_t *filter_y, |                             const int16_t *filter_y, | ||||||
|                             int filter_y_stride, |                             int filter_y_stride, | ||||||
|                             int w, int h) { |                             int w, int h) { | ||||||
|   vp9_highbd_convolve8_sse2(src, src_stride, dst, dst_stride, |   vpx_highbd_convolve8_sse2(src, src_stride, dst, dst_stride, | ||||||
|                             filter_x, filter_x_stride, |                             filter_x, filter_x_stride, | ||||||
|                             filter_y, filter_y_stride, w, h, 10); |                             filter_y, filter_y_stride, w, h, 10); | ||||||
| } | } | ||||||
| @@ -1077,7 +1080,7 @@ void wrap_convolve8_avg_sse2_10(const uint8_t *src, ptrdiff_t src_stride, | |||||||
|                                 const int16_t *filter_y, |                                 const int16_t *filter_y, | ||||||
|                                 int filter_y_stride, |                                 int filter_y_stride, | ||||||
|                                 int w, int h) { |                                 int w, int h) { | ||||||
|   vp9_highbd_convolve8_avg_sse2(src, src_stride, dst, dst_stride, |   vpx_highbd_convolve8_avg_sse2(src, src_stride, dst, dst_stride, | ||||||
|                                 filter_x, filter_x_stride, |                                 filter_x, filter_x_stride, | ||||||
|                                 filter_y, filter_y_stride, w, h, 10); |                                 filter_y, filter_y_stride, w, h, 10); | ||||||
| } | } | ||||||
| @@ -1089,7 +1092,7 @@ void wrap_convolve8_horiz_sse2_12(const uint8_t *src, ptrdiff_t src_stride, | |||||||
|                                   const int16_t *filter_y, |                                   const int16_t *filter_y, | ||||||
|                                   int filter_y_stride, |                                   int filter_y_stride, | ||||||
|                                   int w, int h) { |                                   int w, int h) { | ||||||
|   vp9_highbd_convolve8_horiz_sse2(src, src_stride, dst, dst_stride, |   vpx_highbd_convolve8_horiz_sse2(src, src_stride, dst, dst_stride, | ||||||
|                                   filter_x, filter_x_stride, |                                   filter_x, filter_x_stride, | ||||||
|                                   filter_y, filter_y_stride, w, h, 12); |                                   filter_y, filter_y_stride, w, h, 12); | ||||||
| } | } | ||||||
| @@ -1101,7 +1104,7 @@ void wrap_convolve8_avg_horiz_sse2_12(const uint8_t *src, ptrdiff_t src_stride, | |||||||
|                                       const int16_t *filter_y, |                                       const int16_t *filter_y, | ||||||
|                                       int filter_y_stride, |                                       int filter_y_stride, | ||||||
|                                       int w, int h) { |                                       int w, int h) { | ||||||
|   vp9_highbd_convolve8_avg_horiz_sse2(src, src_stride, dst, dst_stride, |   vpx_highbd_convolve8_avg_horiz_sse2(src, src_stride, dst, dst_stride, | ||||||
|                                       filter_x, filter_x_stride, |                                       filter_x, filter_x_stride, | ||||||
|                                       filter_y, filter_y_stride, w, h, 12); |                                       filter_y, filter_y_stride, w, h, 12); | ||||||
| } | } | ||||||
| @@ -1113,7 +1116,7 @@ void wrap_convolve8_vert_sse2_12(const uint8_t *src, ptrdiff_t src_stride, | |||||||
|                                  const int16_t *filter_y, |                                  const int16_t *filter_y, | ||||||
|                                  int filter_y_stride, |                                  int filter_y_stride, | ||||||
|                                  int w, int h) { |                                  int w, int h) { | ||||||
|   vp9_highbd_convolve8_vert_sse2(src, src_stride, dst, dst_stride, |   vpx_highbd_convolve8_vert_sse2(src, src_stride, dst, dst_stride, | ||||||
|                                  filter_x, filter_x_stride, |                                  filter_x, filter_x_stride, | ||||||
|                                  filter_y, filter_y_stride, w, h, 12); |                                  filter_y, filter_y_stride, w, h, 12); | ||||||
| } | } | ||||||
| @@ -1125,7 +1128,7 @@ void wrap_convolve8_avg_vert_sse2_12(const uint8_t *src, ptrdiff_t src_stride, | |||||||
|                                      const int16_t *filter_y, |                                      const int16_t *filter_y, | ||||||
|                                      int filter_y_stride, |                                      int filter_y_stride, | ||||||
|                                      int w, int h) { |                                      int w, int h) { | ||||||
|   vp9_highbd_convolve8_avg_vert_sse2(src, src_stride, dst, dst_stride, |   vpx_highbd_convolve8_avg_vert_sse2(src, src_stride, dst, dst_stride, | ||||||
|                                      filter_x, filter_x_stride, |                                      filter_x, filter_x_stride, | ||||||
|                                      filter_y, filter_y_stride, w, h, 12); |                                      filter_y, filter_y_stride, w, h, 12); | ||||||
| } | } | ||||||
| @@ -1137,7 +1140,7 @@ void wrap_convolve8_sse2_12(const uint8_t *src, ptrdiff_t src_stride, | |||||||
|                             const int16_t *filter_y, |                             const int16_t *filter_y, | ||||||
|                             int filter_y_stride, |                             int filter_y_stride, | ||||||
|                             int w, int h) { |                             int w, int h) { | ||||||
|   vp9_highbd_convolve8_sse2(src, src_stride, dst, dst_stride, |   vpx_highbd_convolve8_sse2(src, src_stride, dst, dst_stride, | ||||||
|                             filter_x, filter_x_stride, |                             filter_x, filter_x_stride, | ||||||
|                             filter_y, filter_y_stride, w, h, 12); |                             filter_y, filter_y_stride, w, h, 12); | ||||||
| } | } | ||||||
| @@ -1149,7 +1152,7 @@ void wrap_convolve8_avg_sse2_12(const uint8_t *src, ptrdiff_t src_stride, | |||||||
|                                 const int16_t *filter_y, |                                 const int16_t *filter_y, | ||||||
|                                 int filter_y_stride, |                                 int filter_y_stride, | ||||||
|                                 int w, int h) { |                                 int w, int h) { | ||||||
|   vp9_highbd_convolve8_avg_sse2(src, src_stride, dst, dst_stride, |   vpx_highbd_convolve8_avg_sse2(src, src_stride, dst, dst_stride, | ||||||
|                                 filter_x, filter_x_stride, |                                 filter_x, filter_x_stride, | ||||||
|                                 filter_y, filter_y_stride, w, h, 12); |                                 filter_y, filter_y_stride, w, h, 12); | ||||||
| } | } | ||||||
| @@ -1162,7 +1165,7 @@ void wrap_convolve_copy_c_8(const uint8_t *src, ptrdiff_t src_stride, | |||||||
|                             const int16_t *filter_y, |                             const int16_t *filter_y, | ||||||
|                             int filter_y_stride, |                             int filter_y_stride, | ||||||
|                             int w, int h) { |                             int w, int h) { | ||||||
|   vp9_highbd_convolve_copy_c(src, src_stride, dst, dst_stride, |   vpx_highbd_convolve_copy_c(src, src_stride, dst, dst_stride, | ||||||
|                              filter_x, filter_x_stride, |                              filter_x, filter_x_stride, | ||||||
|                              filter_y, filter_y_stride, w, h, 8); |                              filter_y, filter_y_stride, w, h, 8); | ||||||
| } | } | ||||||
| @@ -1174,7 +1177,7 @@ void wrap_convolve_avg_c_8(const uint8_t *src, ptrdiff_t src_stride, | |||||||
|                            const int16_t *filter_y, |                            const int16_t *filter_y, | ||||||
|                            int filter_y_stride, |                            int filter_y_stride, | ||||||
|                            int w, int h) { |                            int w, int h) { | ||||||
|   vp9_highbd_convolve_avg_c(src, src_stride, dst, dst_stride, |   vpx_highbd_convolve_avg_c(src, src_stride, dst, dst_stride, | ||||||
|                             filter_x, filter_x_stride, |                             filter_x, filter_x_stride, | ||||||
|                             filter_y, filter_y_stride, w, h, 8); |                             filter_y, filter_y_stride, w, h, 8); | ||||||
| } | } | ||||||
| @@ -1186,7 +1189,7 @@ void wrap_convolve8_horiz_c_8(const uint8_t *src, ptrdiff_t src_stride, | |||||||
|                               const int16_t *filter_y, |                               const int16_t *filter_y, | ||||||
|                               int filter_y_stride, |                               int filter_y_stride, | ||||||
|                               int w, int h) { |                               int w, int h) { | ||||||
|   vp9_highbd_convolve8_horiz_c(src, src_stride, dst, dst_stride, |   vpx_highbd_convolve8_horiz_c(src, src_stride, dst, dst_stride, | ||||||
|                                filter_x, filter_x_stride, |                                filter_x, filter_x_stride, | ||||||
|                                filter_y, filter_y_stride, w, h, 8); |                                filter_y, filter_y_stride, w, h, 8); | ||||||
| } | } | ||||||
| @@ -1198,7 +1201,7 @@ void wrap_convolve8_avg_horiz_c_8(const uint8_t *src, ptrdiff_t src_stride, | |||||||
|                                   const int16_t *filter_y, |                                   const int16_t *filter_y, | ||||||
|                                   int filter_y_stride, |                                   int filter_y_stride, | ||||||
|                                   int w, int h) { |                                   int w, int h) { | ||||||
|   vp9_highbd_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, |   vpx_highbd_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, | ||||||
|                                    filter_x, filter_x_stride, |                                    filter_x, filter_x_stride, | ||||||
|                                    filter_y, filter_y_stride, w, h, 8); |                                    filter_y, filter_y_stride, w, h, 8); | ||||||
| } | } | ||||||
| @@ -1210,7 +1213,7 @@ void wrap_convolve8_vert_c_8(const uint8_t *src, ptrdiff_t src_stride, | |||||||
|                              const int16_t *filter_y, |                              const int16_t *filter_y, | ||||||
|                              int filter_y_stride, |                              int filter_y_stride, | ||||||
|                              int w, int h) { |                              int w, int h) { | ||||||
|   vp9_highbd_convolve8_vert_c(src, src_stride, dst, dst_stride, |   vpx_highbd_convolve8_vert_c(src, src_stride, dst, dst_stride, | ||||||
|                               filter_x, filter_x_stride, |                               filter_x, filter_x_stride, | ||||||
|                               filter_y, filter_y_stride, w, h, 8); |                               filter_y, filter_y_stride, w, h, 8); | ||||||
| } | } | ||||||
| @@ -1222,7 +1225,7 @@ void wrap_convolve8_avg_vert_c_8(const uint8_t *src, ptrdiff_t src_stride, | |||||||
|                                  const int16_t *filter_y, |                                  const int16_t *filter_y, | ||||||
|                                  int filter_y_stride, |                                  int filter_y_stride, | ||||||
|                                  int w, int h) { |                                  int w, int h) { | ||||||
|   vp9_highbd_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, |   vpx_highbd_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, | ||||||
|                                   filter_x, filter_x_stride, |                                   filter_x, filter_x_stride, | ||||||
|                                   filter_y, filter_y_stride, w, h, 8); |                                   filter_y, filter_y_stride, w, h, 8); | ||||||
| } | } | ||||||
| @@ -1234,7 +1237,7 @@ void wrap_convolve8_c_8(const uint8_t *src, ptrdiff_t src_stride, | |||||||
|                         const int16_t *filter_y, |                         const int16_t *filter_y, | ||||||
|                         int filter_y_stride, |                         int filter_y_stride, | ||||||
|                         int w, int h) { |                         int w, int h) { | ||||||
|   vp9_highbd_convolve8_c(src, src_stride, dst, dst_stride, |   vpx_highbd_convolve8_c(src, src_stride, dst, dst_stride, | ||||||
|                          filter_x, filter_x_stride, |                          filter_x, filter_x_stride, | ||||||
|                          filter_y, filter_y_stride, w, h, 8); |                          filter_y, filter_y_stride, w, h, 8); | ||||||
| } | } | ||||||
| @@ -1246,7 +1249,7 @@ void wrap_convolve8_avg_c_8(const uint8_t *src, ptrdiff_t src_stride, | |||||||
|                             const int16_t *filter_y, |                             const int16_t *filter_y, | ||||||
|                             int filter_y_stride, |                             int filter_y_stride, | ||||||
|                             int w, int h) { |                             int w, int h) { | ||||||
|   vp9_highbd_convolve8_avg_c(src, src_stride, dst, dst_stride, |   vpx_highbd_convolve8_avg_c(src, src_stride, dst, dst_stride, | ||||||
|                              filter_x, filter_x_stride, |                              filter_x, filter_x_stride, | ||||||
|                              filter_y, filter_y_stride, w, h, 8); |                              filter_y, filter_y_stride, w, h, 8); | ||||||
| } | } | ||||||
| @@ -1258,7 +1261,7 @@ void wrap_convolve_copy_c_10(const uint8_t *src, ptrdiff_t src_stride, | |||||||
|                              const int16_t *filter_y, |                              const int16_t *filter_y, | ||||||
|                              int filter_y_stride, |                              int filter_y_stride, | ||||||
|                              int w, int h) { |                              int w, int h) { | ||||||
|   vp9_highbd_convolve_copy_c(src, src_stride, dst, dst_stride, |   vpx_highbd_convolve_copy_c(src, src_stride, dst, dst_stride, | ||||||
|                              filter_x, filter_x_stride, |                              filter_x, filter_x_stride, | ||||||
|                              filter_y, filter_y_stride, w, h, 10); |                              filter_y, filter_y_stride, w, h, 10); | ||||||
| } | } | ||||||
| @@ -1270,7 +1273,7 @@ void wrap_convolve_avg_c_10(const uint8_t *src, ptrdiff_t src_stride, | |||||||
|                             const int16_t *filter_y, |                             const int16_t *filter_y, | ||||||
|                             int filter_y_stride, |                             int filter_y_stride, | ||||||
|                             int w, int h) { |                             int w, int h) { | ||||||
|   vp9_highbd_convolve_avg_c(src, src_stride, dst, dst_stride, |   vpx_highbd_convolve_avg_c(src, src_stride, dst, dst_stride, | ||||||
|                             filter_x, filter_x_stride, |                             filter_x, filter_x_stride, | ||||||
|                             filter_y, filter_y_stride, w, h, 10); |                             filter_y, filter_y_stride, w, h, 10); | ||||||
| } | } | ||||||
| @@ -1282,7 +1285,7 @@ void wrap_convolve8_horiz_c_10(const uint8_t *src, ptrdiff_t src_stride, | |||||||
|                                const int16_t *filter_y, |                                const int16_t *filter_y, | ||||||
|                                int filter_y_stride, |                                int filter_y_stride, | ||||||
|                                int w, int h) { |                                int w, int h) { | ||||||
|   vp9_highbd_convolve8_horiz_c(src, src_stride, dst, dst_stride, |   vpx_highbd_convolve8_horiz_c(src, src_stride, dst, dst_stride, | ||||||
|                                filter_x, filter_x_stride, |                                filter_x, filter_x_stride, | ||||||
|                                filter_y, filter_y_stride, w, h, 10); |                                filter_y, filter_y_stride, w, h, 10); | ||||||
| } | } | ||||||
| @@ -1294,7 +1297,7 @@ void wrap_convolve8_avg_horiz_c_10(const uint8_t *src, ptrdiff_t src_stride, | |||||||
|                                    const int16_t *filter_y, |                                    const int16_t *filter_y, | ||||||
|                                    int filter_y_stride, |                                    int filter_y_stride, | ||||||
|                                    int w, int h) { |                                    int w, int h) { | ||||||
|   vp9_highbd_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, |   vpx_highbd_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, | ||||||
|                                    filter_x, filter_x_stride, |                                    filter_x, filter_x_stride, | ||||||
|                                    filter_y, filter_y_stride, w, h, 10); |                                    filter_y, filter_y_stride, w, h, 10); | ||||||
| } | } | ||||||
| @@ -1306,7 +1309,7 @@ void wrap_convolve8_vert_c_10(const uint8_t *src, ptrdiff_t src_stride, | |||||||
|                               const int16_t *filter_y, |                               const int16_t *filter_y, | ||||||
|                               int filter_y_stride, |                               int filter_y_stride, | ||||||
|                               int w, int h) { |                               int w, int h) { | ||||||
|   vp9_highbd_convolve8_vert_c(src, src_stride, dst, dst_stride, |   vpx_highbd_convolve8_vert_c(src, src_stride, dst, dst_stride, | ||||||
|                               filter_x, filter_x_stride, |                               filter_x, filter_x_stride, | ||||||
|                               filter_y, filter_y_stride, w, h, 10); |                               filter_y, filter_y_stride, w, h, 10); | ||||||
| } | } | ||||||
| @@ -1318,7 +1321,7 @@ void wrap_convolve8_avg_vert_c_10(const uint8_t *src, ptrdiff_t src_stride, | |||||||
|                                   const int16_t *filter_y, |                                   const int16_t *filter_y, | ||||||
|                                   int filter_y_stride, |                                   int filter_y_stride, | ||||||
|                                   int w, int h) { |                                   int w, int h) { | ||||||
|   vp9_highbd_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, |   vpx_highbd_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, | ||||||
|                                   filter_x, filter_x_stride, |                                   filter_x, filter_x_stride, | ||||||
|                                   filter_y, filter_y_stride, w, h, 10); |                                   filter_y, filter_y_stride, w, h, 10); | ||||||
| } | } | ||||||
| @@ -1330,7 +1333,7 @@ void wrap_convolve8_c_10(const uint8_t *src, ptrdiff_t src_stride, | |||||||
|                          const int16_t *filter_y, |                          const int16_t *filter_y, | ||||||
|                          int filter_y_stride, |                          int filter_y_stride, | ||||||
|                          int w, int h) { |                          int w, int h) { | ||||||
|   vp9_highbd_convolve8_c(src, src_stride, dst, dst_stride, |   vpx_highbd_convolve8_c(src, src_stride, dst, dst_stride, | ||||||
|                          filter_x, filter_x_stride, |                          filter_x, filter_x_stride, | ||||||
|                          filter_y, filter_y_stride, w, h, 10); |                          filter_y, filter_y_stride, w, h, 10); | ||||||
| } | } | ||||||
| @@ -1342,7 +1345,7 @@ void wrap_convolve8_avg_c_10(const uint8_t *src, ptrdiff_t src_stride, | |||||||
|                              const int16_t *filter_y, |                              const int16_t *filter_y, | ||||||
|                              int filter_y_stride, |                              int filter_y_stride, | ||||||
|                              int w, int h) { |                              int w, int h) { | ||||||
|   vp9_highbd_convolve8_avg_c(src, src_stride, dst, dst_stride, |   vpx_highbd_convolve8_avg_c(src, src_stride, dst, dst_stride, | ||||||
|                              filter_x, filter_x_stride, |                              filter_x, filter_x_stride, | ||||||
|                              filter_y, filter_y_stride, w, h, 10); |                              filter_y, filter_y_stride, w, h, 10); | ||||||
| } | } | ||||||
| @@ -1354,7 +1357,7 @@ void wrap_convolve_copy_c_12(const uint8_t *src, ptrdiff_t src_stride, | |||||||
|                              const int16_t *filter_y, |                              const int16_t *filter_y, | ||||||
|                              int filter_y_stride, |                              int filter_y_stride, | ||||||
|                              int w, int h) { |                              int w, int h) { | ||||||
|   vp9_highbd_convolve_copy_c(src, src_stride, dst, dst_stride, |   vpx_highbd_convolve_copy_c(src, src_stride, dst, dst_stride, | ||||||
|                              filter_x, filter_x_stride, |                              filter_x, filter_x_stride, | ||||||
|                              filter_y, filter_y_stride, w, h, 12); |                              filter_y, filter_y_stride, w, h, 12); | ||||||
| } | } | ||||||
| @@ -1366,7 +1369,7 @@ void wrap_convolve_avg_c_12(const uint8_t *src, ptrdiff_t src_stride, | |||||||
|                             const int16_t *filter_y, |                             const int16_t *filter_y, | ||||||
|                             int filter_y_stride, |                             int filter_y_stride, | ||||||
|                             int w, int h) { |                             int w, int h) { | ||||||
|   vp9_highbd_convolve_avg_c(src, src_stride, dst, dst_stride, |   vpx_highbd_convolve_avg_c(src, src_stride, dst, dst_stride, | ||||||
|                             filter_x, filter_x_stride, |                             filter_x, filter_x_stride, | ||||||
|                             filter_y, filter_y_stride, w, h, 12); |                             filter_y, filter_y_stride, w, h, 12); | ||||||
| } | } | ||||||
| @@ -1378,7 +1381,7 @@ void wrap_convolve8_horiz_c_12(const uint8_t *src, ptrdiff_t src_stride, | |||||||
|                                const int16_t *filter_y, |                                const int16_t *filter_y, | ||||||
|                                int filter_y_stride, |                                int filter_y_stride, | ||||||
|                                int w, int h) { |                                int w, int h) { | ||||||
|   vp9_highbd_convolve8_horiz_c(src, src_stride, dst, dst_stride, |   vpx_highbd_convolve8_horiz_c(src, src_stride, dst, dst_stride, | ||||||
|                                filter_x, filter_x_stride, |                                filter_x, filter_x_stride, | ||||||
|                                filter_y, filter_y_stride, w, h, 12); |                                filter_y, filter_y_stride, w, h, 12); | ||||||
| } | } | ||||||
| @@ -1390,7 +1393,7 @@ void wrap_convolve8_avg_horiz_c_12(const uint8_t *src, ptrdiff_t src_stride, | |||||||
|                                    const int16_t *filter_y, |                                    const int16_t *filter_y, | ||||||
|                                    int filter_y_stride, |                                    int filter_y_stride, | ||||||
|                                    int w, int h) { |                                    int w, int h) { | ||||||
|   vp9_highbd_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, |   vpx_highbd_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, | ||||||
|                                    filter_x, filter_x_stride, |                                    filter_x, filter_x_stride, | ||||||
|                                    filter_y, filter_y_stride, w, h, 12); |                                    filter_y, filter_y_stride, w, h, 12); | ||||||
| } | } | ||||||
| @@ -1402,7 +1405,7 @@ void wrap_convolve8_vert_c_12(const uint8_t *src, ptrdiff_t src_stride, | |||||||
|                               const int16_t *filter_y, |                               const int16_t *filter_y, | ||||||
|                               int filter_y_stride, |                               int filter_y_stride, | ||||||
|                               int w, int h) { |                               int w, int h) { | ||||||
|   vp9_highbd_convolve8_vert_c(src, src_stride, dst, dst_stride, |   vpx_highbd_convolve8_vert_c(src, src_stride, dst, dst_stride, | ||||||
|                               filter_x, filter_x_stride, |                               filter_x, filter_x_stride, | ||||||
|                               filter_y, filter_y_stride, w, h, 12); |                               filter_y, filter_y_stride, w, h, 12); | ||||||
| } | } | ||||||
| @@ -1414,7 +1417,7 @@ void wrap_convolve8_avg_vert_c_12(const uint8_t *src, ptrdiff_t src_stride, | |||||||
|                                   const int16_t *filter_y, |                                   const int16_t *filter_y, | ||||||
|                                   int filter_y_stride, |                                   int filter_y_stride, | ||||||
|                                   int w, int h) { |                                   int w, int h) { | ||||||
|   vp9_highbd_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, |   vpx_highbd_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, | ||||||
|                                   filter_x, filter_x_stride, |                                   filter_x, filter_x_stride, | ||||||
|                                   filter_y, filter_y_stride, w, h, 12); |                                   filter_y, filter_y_stride, w, h, 12); | ||||||
| } | } | ||||||
| @@ -1426,7 +1429,7 @@ void wrap_convolve8_c_12(const uint8_t *src, ptrdiff_t src_stride, | |||||||
|                          const int16_t *filter_y, |                          const int16_t *filter_y, | ||||||
|                          int filter_y_stride, |                          int filter_y_stride, | ||||||
|                          int w, int h) { |                          int w, int h) { | ||||||
|   vp9_highbd_convolve8_c(src, src_stride, dst, dst_stride, |   vpx_highbd_convolve8_c(src, src_stride, dst, dst_stride, | ||||||
|                          filter_x, filter_x_stride, |                          filter_x, filter_x_stride, | ||||||
|                          filter_y, filter_y_stride, w, h, 12); |                          filter_y, filter_y_stride, w, h, 12); | ||||||
| } | } | ||||||
| @@ -1438,7 +1441,7 @@ void wrap_convolve8_avg_c_12(const uint8_t *src, ptrdiff_t src_stride, | |||||||
|                              const int16_t *filter_y, |                              const int16_t *filter_y, | ||||||
|                              int filter_y_stride, |                              int filter_y_stride, | ||||||
|                              int w, int h) { |                              int w, int h) { | ||||||
|   vp9_highbd_convolve8_avg_c(src, src_stride, dst, dst_stride, |   vpx_highbd_convolve8_avg_c(src, src_stride, dst, dst_stride, | ||||||
|                              filter_x, filter_x_stride, |                              filter_x, filter_x_stride, | ||||||
|                              filter_y, filter_y_stride, w, h, 12); |                              filter_y, filter_y_stride, w, h, 12); | ||||||
| } | } | ||||||
| @@ -1504,10 +1507,10 @@ INSTANTIATE_TEST_CASE_P(C_12, ConvolveTest, ::testing::Values( | |||||||
| #else | #else | ||||||
|  |  | ||||||
| const ConvolveFunctions convolve8_c( | const ConvolveFunctions convolve8_c( | ||||||
|     vp9_convolve_copy_c, vp9_convolve_avg_c, |     vpx_convolve_copy_c, vpx_convolve_avg_c, | ||||||
|     vp9_convolve8_horiz_c, vp9_convolve8_avg_horiz_c, |     vpx_convolve8_horiz_c, vpx_convolve8_avg_horiz_c, | ||||||
|     vp9_convolve8_vert_c, vp9_convolve8_avg_vert_c, |     vpx_convolve8_vert_c, vpx_convolve8_avg_vert_c, | ||||||
|     vp9_convolve8_c, vp9_convolve8_avg_c, 0); |     vpx_convolve8_c, vpx_convolve8_avg_c, 0); | ||||||
|  |  | ||||||
| INSTANTIATE_TEST_CASE_P(C, ConvolveTest, ::testing::Values( | INSTANTIATE_TEST_CASE_P(C, ConvolveTest, ::testing::Values( | ||||||
|     make_tuple(4, 4, &convolve8_c), |     make_tuple(4, 4, &convolve8_c), | ||||||
| @@ -1585,13 +1588,13 @@ INSTANTIATE_TEST_CASE_P(SSE2, ConvolveTest, ::testing::Values( | |||||||
| #else | #else | ||||||
| const ConvolveFunctions convolve8_sse2( | const ConvolveFunctions convolve8_sse2( | ||||||
| #if CONFIG_USE_X86INC | #if CONFIG_USE_X86INC | ||||||
|     vp9_convolve_copy_sse2, vp9_convolve_avg_sse2, |     vpx_convolve_copy_sse2, vpx_convolve_avg_sse2, | ||||||
| #else | #else | ||||||
|     vp9_convolve_copy_c, vp9_convolve_avg_c, |     vpx_convolve_copy_c, vpx_convolve_avg_c, | ||||||
| #endif  // CONFIG_USE_X86INC | #endif  // CONFIG_USE_X86INC | ||||||
|     vp9_convolve8_horiz_sse2, vp9_convolve8_avg_horiz_sse2, |     vpx_convolve8_horiz_sse2, vpx_convolve8_avg_horiz_sse2, | ||||||
|     vp9_convolve8_vert_sse2, vp9_convolve8_avg_vert_sse2, |     vpx_convolve8_vert_sse2, vpx_convolve8_avg_vert_sse2, | ||||||
|     vp9_convolve8_sse2, vp9_convolve8_avg_sse2, 0); |     vpx_convolve8_sse2, vpx_convolve8_avg_sse2, 0); | ||||||
|  |  | ||||||
| INSTANTIATE_TEST_CASE_P(SSE2, ConvolveTest, ::testing::Values( | INSTANTIATE_TEST_CASE_P(SSE2, ConvolveTest, ::testing::Values( | ||||||
|     make_tuple(4, 4, &convolve8_sse2), |     make_tuple(4, 4, &convolve8_sse2), | ||||||
| @@ -1612,10 +1615,10 @@ INSTANTIATE_TEST_CASE_P(SSE2, ConvolveTest, ::testing::Values( | |||||||
|  |  | ||||||
| #if HAVE_SSSE3 | #if HAVE_SSSE3 | ||||||
| const ConvolveFunctions convolve8_ssse3( | const ConvolveFunctions convolve8_ssse3( | ||||||
|     vp9_convolve_copy_c, vp9_convolve_avg_c, |     vpx_convolve_copy_c, vpx_convolve_avg_c, | ||||||
|     vp9_convolve8_horiz_ssse3, vp9_convolve8_avg_horiz_ssse3, |     vpx_convolve8_horiz_ssse3, vpx_convolve8_avg_horiz_ssse3, | ||||||
|     vp9_convolve8_vert_ssse3, vp9_convolve8_avg_vert_ssse3, |     vpx_convolve8_vert_ssse3, vpx_convolve8_avg_vert_ssse3, | ||||||
|     vp9_convolve8_ssse3, vp9_convolve8_avg_ssse3, 0); |     vpx_convolve8_ssse3, vpx_convolve8_avg_ssse3, 0); | ||||||
|  |  | ||||||
| INSTANTIATE_TEST_CASE_P(SSSE3, ConvolveTest, ::testing::Values( | INSTANTIATE_TEST_CASE_P(SSSE3, ConvolveTest, ::testing::Values( | ||||||
|     make_tuple(4, 4, &convolve8_ssse3), |     make_tuple(4, 4, &convolve8_ssse3), | ||||||
| @@ -1635,10 +1638,10 @@ INSTANTIATE_TEST_CASE_P(SSSE3, ConvolveTest, ::testing::Values( | |||||||
|  |  | ||||||
| #if HAVE_AVX2 && HAVE_SSSE3 | #if HAVE_AVX2 && HAVE_SSSE3 | ||||||
| const ConvolveFunctions convolve8_avx2( | const ConvolveFunctions convolve8_avx2( | ||||||
|     vp9_convolve_copy_c, vp9_convolve_avg_c, |     vpx_convolve_copy_c, vpx_convolve_avg_c, | ||||||
|     vp9_convolve8_horiz_avx2, vp9_convolve8_avg_horiz_ssse3, |     vpx_convolve8_horiz_avx2, vpx_convolve8_avg_horiz_ssse3, | ||||||
|     vp9_convolve8_vert_avx2, vp9_convolve8_avg_vert_ssse3, |     vpx_convolve8_vert_avx2, vpx_convolve8_avg_vert_ssse3, | ||||||
|     vp9_convolve8_avx2, vp9_convolve8_avg_ssse3, 0); |     vpx_convolve8_avx2, vpx_convolve8_avg_ssse3, 0); | ||||||
|  |  | ||||||
| INSTANTIATE_TEST_CASE_P(AVX2, ConvolveTest, ::testing::Values( | INSTANTIATE_TEST_CASE_P(AVX2, ConvolveTest, ::testing::Values( | ||||||
|     make_tuple(4, 4, &convolve8_avx2), |     make_tuple(4, 4, &convolve8_avx2), | ||||||
| @@ -1659,16 +1662,16 @@ INSTANTIATE_TEST_CASE_P(AVX2, ConvolveTest, ::testing::Values( | |||||||
| #if HAVE_NEON | #if HAVE_NEON | ||||||
| #if HAVE_NEON_ASM | #if HAVE_NEON_ASM | ||||||
| const ConvolveFunctions convolve8_neon( | const ConvolveFunctions convolve8_neon( | ||||||
|     vp9_convolve_copy_neon, vp9_convolve_avg_neon, |     vpx_convolve_copy_neon, vpx_convolve_avg_neon, | ||||||
|     vp9_convolve8_horiz_neon, vp9_convolve8_avg_horiz_neon, |     vpx_convolve8_horiz_neon, vpx_convolve8_avg_horiz_neon, | ||||||
|     vp9_convolve8_vert_neon, vp9_convolve8_avg_vert_neon, |     vpx_convolve8_vert_neon, vpx_convolve8_avg_vert_neon, | ||||||
|     vp9_convolve8_neon, vp9_convolve8_avg_neon, 0); |     vpx_convolve8_neon, vpx_convolve8_avg_neon, 0); | ||||||
| #else  // HAVE_NEON_ASM | #else  // HAVE_NEON_ASM | ||||||
| const ConvolveFunctions convolve8_neon( | const ConvolveFunctions convolve8_neon( | ||||||
|     vp9_convolve_copy_neon, vp9_convolve_avg_neon, |     vpx_convolve_copy_neon, vpx_convolve_avg_neon, | ||||||
|     vp9_convolve8_horiz_neon, vp9_convolve8_avg_horiz_neon, |     vpx_convolve8_horiz_neon, vpx_convolve8_avg_horiz_neon, | ||||||
|     vp9_convolve8_vert_neon, vp9_convolve8_avg_vert_neon, |     vpx_convolve8_vert_neon, vpx_convolve8_avg_vert_neon, | ||||||
|     vp9_convolve8_neon, vp9_convolve8_avg_neon, 0); |     vpx_convolve8_neon, vpx_convolve8_avg_neon, 0); | ||||||
| #endif  // HAVE_NEON_ASM | #endif  // HAVE_NEON_ASM | ||||||
|  |  | ||||||
| INSTANTIATE_TEST_CASE_P(NEON, ConvolveTest, ::testing::Values( | INSTANTIATE_TEST_CASE_P(NEON, ConvolveTest, ::testing::Values( | ||||||
| @@ -1689,10 +1692,10 @@ INSTANTIATE_TEST_CASE_P(NEON, ConvolveTest, ::testing::Values( | |||||||
|  |  | ||||||
| #if HAVE_DSPR2 | #if HAVE_DSPR2 | ||||||
| const ConvolveFunctions convolve8_dspr2( | const ConvolveFunctions convolve8_dspr2( | ||||||
|     vp9_convolve_copy_dspr2, vp9_convolve_avg_dspr2, |     vpx_convolve_copy_dspr2, vpx_convolve_avg_dspr2, | ||||||
|     vp9_convolve8_horiz_dspr2, vp9_convolve8_avg_horiz_dspr2, |     vpx_convolve8_horiz_dspr2, vpx_convolve8_avg_horiz_dspr2, | ||||||
|     vp9_convolve8_vert_dspr2, vp9_convolve8_avg_vert_dspr2, |     vpx_convolve8_vert_dspr2, vpx_convolve8_avg_vert_dspr2, | ||||||
|     vp9_convolve8_dspr2, vp9_convolve8_avg_dspr2, 0); |     vpx_convolve8_dspr2, vpx_convolve8_avg_dspr2, 0); | ||||||
|  |  | ||||||
| INSTANTIATE_TEST_CASE_P(DSPR2, ConvolveTest, ::testing::Values( | INSTANTIATE_TEST_CASE_P(DSPR2, ConvolveTest, ::testing::Values( | ||||||
|     make_tuple(4, 4, &convolve8_dspr2), |     make_tuple(4, 4, &convolve8_dspr2), | ||||||
| @@ -1712,10 +1715,10 @@ INSTANTIATE_TEST_CASE_P(DSPR2, ConvolveTest, ::testing::Values( | |||||||
|  |  | ||||||
| #if HAVE_MSA | #if HAVE_MSA | ||||||
| const ConvolveFunctions convolve8_msa( | const ConvolveFunctions convolve8_msa( | ||||||
|     vp9_convolve_copy_msa, vp9_convolve_avg_msa, |     vpx_convolve_copy_msa, vpx_convolve_avg_msa, | ||||||
|     vp9_convolve8_horiz_msa, vp9_convolve8_avg_horiz_msa, |     vpx_convolve8_horiz_msa, vpx_convolve8_avg_horiz_msa, | ||||||
|     vp9_convolve8_vert_msa, vp9_convolve8_avg_vert_msa, |     vpx_convolve8_vert_msa, vpx_convolve8_avg_vert_msa, | ||||||
|     vp9_convolve8_msa, vp9_convolve8_avg_msa, 0); |     vpx_convolve8_msa, vpx_convolve8_avg_msa, 0); | ||||||
|  |  | ||||||
| INSTANTIATE_TEST_CASE_P(MSA, ConvolveTest, ::testing::Values( | INSTANTIATE_TEST_CASE_P(MSA, ConvolveTest, ::testing::Values( | ||||||
|     make_tuple(4, 4, &convolve8_msa), |     make_tuple(4, 4, &convolve8_msa), | ||||||
|   | |||||||
| @@ -1,390 +0,0 @@ | |||||||
| /* |  | ||||||
|  *  Copyright (c) 2014 The WebM project authors. All Rights Reserved. |  | ||||||
|  * |  | ||||||
|  *  Use of this source code is governed by a BSD-style license |  | ||||||
|  *  that can be found in the LICENSE file in the root of the source |  | ||||||
|  *  tree. An additional intellectual property rights grant can be found |  | ||||||
|  *  in the file PATENTS.  All contributing project authors may |  | ||||||
|  *  be found in the AUTHORS file in the root of the source tree. |  | ||||||
|  */ |  | ||||||
|  |  | ||||||
| #include <stddef.h> |  | ||||||
| #include <arm_neon.h> |  | ||||||
|  |  | ||||||
| #include "./vpx_config.h" |  | ||||||
| #include "vpx_ports/mem.h" |  | ||||||
|  |  | ||||||
| void vp9_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, |  | ||||||
|                                uint8_t *dst, ptrdiff_t dst_stride, |  | ||||||
|                                const int16_t *filter_x, int x_step_q4, |  | ||||||
|                                const int16_t *filter_y, int y_step_q4, |  | ||||||
|                                int w, int h); |  | ||||||
| void vp9_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, |  | ||||||
|                                uint8_t *dst, ptrdiff_t dst_stride, |  | ||||||
|                                const int16_t *filter_x, int x_step_q4, |  | ||||||
|                                const int16_t *filter_y, int y_step_q4, |  | ||||||
|                                int w, int h); |  | ||||||
|  |  | ||||||
| // Eight-tap multiply-accumulate helper. |  | ||||||
| // |  | ||||||
| // For each of the four 16-bit lanes, returns the 32-bit sum |  | ||||||
| //   dsrc0 * q0s16[0] + dsrc1 * q0s16[1] + ... + dsrc7 * q0s16[7] |  | ||||||
| // where q0s16 carries the eight filter coefficients. |  | ||||||
| static INLINE int32x4_t MULTIPLY_BY_Q0( |  | ||||||
|         int16x4_t dsrc0, int16x4_t dsrc1, int16x4_t dsrc2, int16x4_t dsrc3, |  | ||||||
|         int16x4_t dsrc4, int16x4_t dsrc5, int16x4_t dsrc6, int16x4_t dsrc7, |  | ||||||
|         int16x8_t q0s16) { |  | ||||||
|     const int16x4_t taps_lo = vget_low_s16(q0s16);   // coefficients 0..3 |  | ||||||
|     const int16x4_t taps_hi = vget_high_s16(q0s16);  // coefficients 4..7 |  | ||||||
|     // The first product seeds the accumulator; the rest are widening MACs. |  | ||||||
|     int32x4_t acc = vmull_lane_s16(dsrc0, taps_lo, 0); |  | ||||||
|  |  | ||||||
|     acc = vmlal_lane_s16(acc, dsrc1, taps_lo, 1); |  | ||||||
|     acc = vmlal_lane_s16(acc, dsrc2, taps_lo, 2); |  | ||||||
|     acc = vmlal_lane_s16(acc, dsrc3, taps_lo, 3); |  | ||||||
|     acc = vmlal_lane_s16(acc, dsrc4, taps_hi, 0); |  | ||||||
|     acc = vmlal_lane_s16(acc, dsrc5, taps_hi, 1); |  | ||||||
|     acc = vmlal_lane_s16(acc, dsrc6, taps_hi, 2); |  | ||||||
|     acc = vmlal_lane_s16(acc, dsrc7, taps_hi, 3); |  | ||||||
|     return acc; |  | ||||||
| } |  | ||||||
|  |  | ||||||
| // Horizontal 8-tap convolution whose result is averaged into dst. |  | ||||||
| // |  | ||||||
| //   src, src_stride : source pixels and row pitch (read only). |  | ||||||
| //   dst, dst_stride : destination pixels and row pitch; dst is read and |  | ||||||
| //                     then overwritten with the rounded average of the |  | ||||||
| //                     filtered value and the previous dst value. |  | ||||||
| //   filter_x        : eight 16-bit filter coefficients. |  | ||||||
| //   x_step_q4       : only a step of 16 is handled here; any other value |  | ||||||
| //                     is forwarded to vp9_convolve8_avg_horiz_c(). |  | ||||||
| //   filter_y, y_step_q4 : unused except when forwarded to the C fallback. |  | ||||||
| //   w, h            : block size.  The loops advance 4 columns and 4 rows |  | ||||||
| //                     at a time, so sizes that are not multiples of 4 are |  | ||||||
| //                     effectively rounded up. |  | ||||||
| void vp9_convolve8_avg_horiz_neon( |  | ||||||
|         const uint8_t *src, |  | ||||||
|         ptrdiff_t src_stride, |  | ||||||
|         uint8_t *dst, |  | ||||||
|         ptrdiff_t dst_stride, |  | ||||||
|         const int16_t *filter_x, |  | ||||||
|         int x_step_q4, |  | ||||||
|         const int16_t *filter_y,  // unused |  | ||||||
|         int y_step_q4,            // unused |  | ||||||
|         int w, |  | ||||||
|         int h) { |  | ||||||
|     int width; |  | ||||||
|     const uint8_t *s; |  | ||||||
|     uint8_t *d; |  | ||||||
|     uint8x8_t d2u8, d3u8, d24u8, d25u8, d26u8, d27u8, d28u8, d29u8; |  | ||||||
|     uint32x2_t d2u32, d3u32, d28u32, d29u32, d30u32, d31u32; |  | ||||||
|     // Zero-initialised because vld1_lane_u32() takes the previous vector |  | ||||||
|     // value as an input; both lanes are reloaded before every use. |  | ||||||
|     uint32x2_t d6u32 = vdup_n_u32(0), d7u32 = vdup_n_u32(0); |  | ||||||
|     uint8x16_t q1u8, q3u8, q12u8, q13u8, q14u8, q15u8; |  | ||||||
|     int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d22s16, d23s16; |  | ||||||
|     int16x4_t d24s16, d25s16, d26s16, d27s16; |  | ||||||
|     uint16x4_t d2u16, d3u16, d4u16, d5u16, d16u16, d17u16, d18u16, d19u16; |  | ||||||
|     int16x8_t q0s16; |  | ||||||
|     uint16x8_t q1u16, q2u16, q8u16, q9u16, q10u16, q11u16, q12u16, q13u16; |  | ||||||
|     int32x4_t q1s32, q2s32, q14s32, q15s32; |  | ||||||
|     uint16x8x2_t q0x2u16; |  | ||||||
|     uint8x8x2_t d0x2u8, d1x2u8; |  | ||||||
|     uint32x2x2_t d0x2u32; |  | ||||||
|     uint16x4x2_t d0x2u16, d1x2u16; |  | ||||||
|     uint32x4x2_t q0x2u32; |  | ||||||
|  |  | ||||||
|     // Only the unscaled case (step of 16) is vectorised. |  | ||||||
|     if (x_step_q4 != 16) { |  | ||||||
|         vp9_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, |  | ||||||
|                                   filter_x, x_step_q4, |  | ||||||
|                                   filter_y, y_step_q4, w, h); |  | ||||||
|         return; |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|     q0s16 = vld1q_s16(filter_x);  // all eight coefficients |  | ||||||
|  |  | ||||||
|     src -= 3;  // adjust for taps |  | ||||||
|     for (; h > 0; h -= 4) {  // loop_horiz_v |  | ||||||
|         // Prime the pipeline: first 8 source bytes of each of the 4 rows. |  | ||||||
|         s = src; |  | ||||||
|         d24u8 = vld1_u8(s); |  | ||||||
|         s += src_stride; |  | ||||||
|         d25u8 = vld1_u8(s); |  | ||||||
|         s += src_stride; |  | ||||||
|         d26u8 = vld1_u8(s); |  | ||||||
|         s += src_stride; |  | ||||||
|         d27u8 = vld1_u8(s); |  | ||||||
|  |  | ||||||
|         q12u8 = vcombine_u8(d24u8, d25u8); |  | ||||||
|         q13u8 = vcombine_u8(d26u8, d27u8); |  | ||||||
|  |  | ||||||
|         // vtrn-based shuffle of the four rows (16-bit pairs, then bytes). |  | ||||||
|         q0x2u16 = vtrnq_u16(vreinterpretq_u16_u8(q12u8), |  | ||||||
|                             vreinterpretq_u16_u8(q13u8)); |  | ||||||
|         d24u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[0])); |  | ||||||
|         d25u8 = vreinterpret_u8_u16(vget_high_u16(q0x2u16.val[0])); |  | ||||||
|         d26u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[1])); |  | ||||||
|         d27u8 = vreinterpret_u8_u16(vget_high_u16(q0x2u16.val[1])); |  | ||||||
|         d0x2u8 = vtrn_u8(d24u8, d25u8); |  | ||||||
|         d1x2u8 = vtrn_u8(d26u8, d27u8); |  | ||||||
|  |  | ||||||
|         __builtin_prefetch(src + src_stride * 4); |  | ||||||
|         __builtin_prefetch(src + src_stride * 5); |  | ||||||
|  |  | ||||||
|         // Widen to 16 bits for the signed multiply-accumulate. |  | ||||||
|         q8u16 = vmovl_u8(d0x2u8.val[0]); |  | ||||||
|         q9u16 = vmovl_u8(d0x2u8.val[1]); |  | ||||||
|         q10u16 = vmovl_u8(d1x2u8.val[0]); |  | ||||||
|         q11u16 = vmovl_u8(d1x2u8.val[1]); |  | ||||||
|  |  | ||||||
|         src += 7;  // undone at the bottom of the row loop |  | ||||||
|         d16u16 = vget_low_u16(q8u16); |  | ||||||
|         d17u16 = vget_high_u16(q8u16); |  | ||||||
|         d18u16 = vget_low_u16(q9u16); |  | ||||||
|         d19u16 = vget_high_u16(q9u16); |  | ||||||
|         q8u16 = vcombine_u16(d16u16, d18u16);  // vswp 17 18 |  | ||||||
|         q9u16 = vcombine_u16(d17u16, d19u16); |  | ||||||
|  |  | ||||||
|         d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16)); |  | ||||||
|         d23s16 = vreinterpret_s16_u16(vget_high_u16(q10u16));  // vmov 23 21 |  | ||||||
|         for (width = w; |  | ||||||
|              width > 0; |  | ||||||
|              width -= 4, src += 4, dst += 4) {  // loop_horiz |  | ||||||
|             // Next 4 source bytes from each of the 4 rows. |  | ||||||
|             s = src; |  | ||||||
|             d28u32 = vld1_dup_u32((const uint32_t *)s); |  | ||||||
|             s += src_stride; |  | ||||||
|             d29u32 = vld1_dup_u32((const uint32_t *)s); |  | ||||||
|             s += src_stride; |  | ||||||
|             d31u32 = vld1_dup_u32((const uint32_t *)s); |  | ||||||
|             s += src_stride; |  | ||||||
|             d30u32 = vld1_dup_u32((const uint32_t *)s); |  | ||||||
|  |  | ||||||
|             __builtin_prefetch(src + 64); |  | ||||||
|  |  | ||||||
|             d0x2u16 = vtrn_u16(vreinterpret_u16_u32(d28u32), |  | ||||||
|                                vreinterpret_u16_u32(d31u32)); |  | ||||||
|             d1x2u16 = vtrn_u16(vreinterpret_u16_u32(d29u32), |  | ||||||
|                                vreinterpret_u16_u32(d30u32)); |  | ||||||
|             d0x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[0]),   // d28 |  | ||||||
|                              vreinterpret_u8_u16(d1x2u16.val[0]));  // d29 |  | ||||||
|             d1x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[1]),   // d31 |  | ||||||
|                              vreinterpret_u8_u16(d1x2u16.val[1]));  // d30 |  | ||||||
|  |  | ||||||
|             __builtin_prefetch(src + 64 + src_stride); |  | ||||||
|  |  | ||||||
|             q14u8 = vcombine_u8(d0x2u8.val[0], d0x2u8.val[1]); |  | ||||||
|             q15u8 = vcombine_u8(d1x2u8.val[1], d1x2u8.val[0]); |  | ||||||
|             q0x2u32 = vtrnq_u32(vreinterpretq_u32_u8(q14u8), |  | ||||||
|                                 vreinterpretq_u32_u8(q15u8)); |  | ||||||
|  |  | ||||||
|             d28u8 = vreinterpret_u8_u32(vget_low_u32(q0x2u32.val[0])); |  | ||||||
|             d29u8 = vreinterpret_u8_u32(vget_high_u32(q0x2u32.val[0])); |  | ||||||
|             q12u16 = vmovl_u8(d28u8); |  | ||||||
|             q13u16 = vmovl_u8(d29u8); |  | ||||||
|  |  | ||||||
|             __builtin_prefetch(src + 64 + src_stride * 2); |  | ||||||
|  |  | ||||||
|             // Current dst pixels (4 bytes from each of 4 rows) to average with. |  | ||||||
|             d = dst; |  | ||||||
|             d6u32 = vld1_lane_u32((const uint32_t *)d, d6u32, 0); |  | ||||||
|             d += dst_stride; |  | ||||||
|             d7u32 = vld1_lane_u32((const uint32_t *)d, d7u32, 0); |  | ||||||
|             d += dst_stride; |  | ||||||
|             d6u32 = vld1_lane_u32((const uint32_t *)d, d6u32, 1); |  | ||||||
|             d += dst_stride; |  | ||||||
|             d7u32 = vld1_lane_u32((const uint32_t *)d, d7u32, 1); |  | ||||||
|  |  | ||||||
|             d16s16 = vreinterpret_s16_u16(vget_low_u16(q8u16)); |  | ||||||
|             d17s16 = vreinterpret_s16_u16(vget_high_u16(q8u16)); |  | ||||||
|             d18s16 = vreinterpret_s16_u16(vget_low_u16(q9u16)); |  | ||||||
|             d19s16 = vreinterpret_s16_u16(vget_high_u16(q9u16)); |  | ||||||
|             d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16)); |  | ||||||
|             d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16)); |  | ||||||
|             d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16)); |  | ||||||
|             d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16)); |  | ||||||
|             d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16)); |  | ||||||
|  |  | ||||||
|             // Four 8-tap sums; each call slides the input window by one. |  | ||||||
|             q1s32  = MULTIPLY_BY_Q0(d16s16, d17s16, d20s16, d22s16, |  | ||||||
|                                     d18s16, d19s16, d23s16, d24s16, q0s16); |  | ||||||
|             q2s32  = MULTIPLY_BY_Q0(d17s16, d20s16, d22s16, d18s16, |  | ||||||
|                                     d19s16, d23s16, d24s16, d26s16, q0s16); |  | ||||||
|             q14s32 = MULTIPLY_BY_Q0(d20s16, d22s16, d18s16, d19s16, |  | ||||||
|                                     d23s16, d24s16, d26s16, d27s16, q0s16); |  | ||||||
|             q15s32 = MULTIPLY_BY_Q0(d22s16, d18s16, d19s16, d23s16, |  | ||||||
|                                     d24s16, d26s16, d27s16, d25s16, q0s16); |  | ||||||
|  |  | ||||||
|             __builtin_prefetch(src + 64 + src_stride * 3); |  | ||||||
|  |  | ||||||
|             // Rounding shift right by 7 with unsigned saturation. |  | ||||||
|             d2u16 = vqrshrun_n_s32(q1s32, 7); |  | ||||||
|             d3u16 = vqrshrun_n_s32(q2s32, 7); |  | ||||||
|             d4u16 = vqrshrun_n_s32(q14s32, 7); |  | ||||||
|             d5u16 = vqrshrun_n_s32(q15s32, 7); |  | ||||||
|  |  | ||||||
|             q1u16 = vcombine_u16(d2u16, d3u16); |  | ||||||
|             q2u16 = vcombine_u16(d4u16, d5u16); |  | ||||||
|  |  | ||||||
|             // Saturating narrow back to 8-bit pixels. |  | ||||||
|             d2u8 = vqmovn_u16(q1u16); |  | ||||||
|             d3u8 = vqmovn_u16(q2u16); |  | ||||||
|  |  | ||||||
|             // Shuffle the results back into the lane order used for dst. |  | ||||||
|             d0x2u16 = vtrn_u16(vreinterpret_u16_u8(d2u8), |  | ||||||
|                                vreinterpret_u16_u8(d3u8)); |  | ||||||
|             d0x2u32 = vtrn_u32(vreinterpret_u32_u16(d0x2u16.val[0]), |  | ||||||
|                                vreinterpret_u32_u16(d0x2u16.val[1])); |  | ||||||
|             d0x2u8 = vtrn_u8(vreinterpret_u8_u32(d0x2u32.val[0]), |  | ||||||
|                              vreinterpret_u8_u32(d0x2u32.val[1])); |  | ||||||
|  |  | ||||||
|             q1u8 = vcombine_u8(d0x2u8.val[0], d0x2u8.val[1]); |  | ||||||
|             q3u8 = vreinterpretq_u8_u32(vcombine_u32(d6u32, d7u32)); |  | ||||||
|  |  | ||||||
|             // The "avg" step: rounding halving add with the old dst pixels. |  | ||||||
|             q1u8 = vrhaddq_u8(q1u8, q3u8); |  | ||||||
|  |  | ||||||
|             d2u32 = vreinterpret_u32_u8(vget_low_u8(q1u8)); |  | ||||||
|             d3u32 = vreinterpret_u32_u8(vget_high_u8(q1u8)); |  | ||||||
|  |  | ||||||
|             // Store in the same lane order the dst pixels were loaded in. |  | ||||||
|             d = dst; |  | ||||||
|             vst1_lane_u32((uint32_t *)d, d2u32, 0); |  | ||||||
|             d += dst_stride; |  | ||||||
|             vst1_lane_u32((uint32_t *)d, d3u32, 0); |  | ||||||
|             d += dst_stride; |  | ||||||
|             vst1_lane_u32((uint32_t *)d, d2u32, 1); |  | ||||||
|             d += dst_stride; |  | ||||||
|             vst1_lane_u32((uint32_t *)d, d3u32, 1); |  | ||||||
|  |  | ||||||
|             // Carry the already-widened inputs over to the next 4 columns. |  | ||||||
|             q8u16 = q9u16; |  | ||||||
|             d20s16 = d23s16; |  | ||||||
|             q11u16 = q12u16; |  | ||||||
|             q9u16 = q13u16; |  | ||||||
|             d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16)); |  | ||||||
|         } |  | ||||||
|         // Rewind the column advance (w plus the 7 added above), move 4 rows. |  | ||||||
|         src += src_stride * 4 - w - 7; |  | ||||||
|         dst += dst_stride * 4 - w; |  | ||||||
|     } |  | ||||||
| } |  | ||||||
|  |  | ||||||
| void vp9_convolve8_avg_vert_neon( |  | ||||||
|         uint8_t *src, |  | ||||||
|         ptrdiff_t src_stride, |  | ||||||
|         uint8_t *dst, |  | ||||||
|         ptrdiff_t dst_stride, |  | ||||||
|         const int16_t *filter_x,  // unused |  | ||||||
|         int x_step_q4,            // unused |  | ||||||
|         const int16_t *filter_y, |  | ||||||
|         int y_step_q4, |  | ||||||
|         int w, |  | ||||||
|         int h) { |  | ||||||
|     int height; |  | ||||||
|     uint8_t *s, *d; |  | ||||||
|     uint8x8_t d2u8, d3u8; |  | ||||||
|     uint32x2_t d2u32, d3u32, d6u32, d7u32; |  | ||||||
|     uint32x2_t d16u32, d18u32, d20u32, d22u32, d24u32, d26u32; |  | ||||||
|     uint8x16_t q1u8, q3u8; |  | ||||||
|     int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16; |  | ||||||
|     int16x4_t d24s16, d25s16, d26s16, d27s16; |  | ||||||
|     uint16x4_t d2u16, d3u16, d4u16, d5u16; |  | ||||||
|     int16x8_t q0s16; |  | ||||||
|     uint16x8_t q1u16, q2u16, q8u16, q9u16, q10u16, q11u16, q12u16, q13u16; |  | ||||||
|     int32x4_t q1s32, q2s32, q14s32, q15s32; |  | ||||||
|  |  | ||||||
|     if (y_step_q4 != 16) { |  | ||||||
|         vp9_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, |  | ||||||
|                                  filter_x, x_step_q4, |  | ||||||
|                                  filter_y, y_step_q4, w, h); |  | ||||||
|         return; |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|     src -= src_stride * 3; |  | ||||||
|     q0s16 = vld1q_s16(filter_y); |  | ||||||
|     for (; w > 0; w -= 4, src += 4, dst += 4) {  // loop_vert_h |  | ||||||
|         s = src; |  | ||||||
|         d16u32 = vld1_lane_u32((const uint32_t *)s, d16u32, 0); |  | ||||||
|         s += src_stride; |  | ||||||
|         d16u32 = vld1_lane_u32((const uint32_t *)s, d16u32, 1); |  | ||||||
|         s += src_stride; |  | ||||||
|         d18u32 = vld1_lane_u32((const uint32_t *)s, d18u32, 0); |  | ||||||
|         s += src_stride; |  | ||||||
|         d18u32 = vld1_lane_u32((const uint32_t *)s, d18u32, 1); |  | ||||||
|         s += src_stride; |  | ||||||
|         d20u32 = vld1_lane_u32((const uint32_t *)s, d20u32, 0); |  | ||||||
|         s += src_stride; |  | ||||||
|         d20u32 = vld1_lane_u32((const uint32_t *)s, d20u32, 1); |  | ||||||
|         s += src_stride; |  | ||||||
|         d22u32 = vld1_lane_u32((const uint32_t *)s, d22u32, 0); |  | ||||||
|         s += src_stride; |  | ||||||
|  |  | ||||||
|         q8u16  = vmovl_u8(vreinterpret_u8_u32(d16u32)); |  | ||||||
|         q9u16  = vmovl_u8(vreinterpret_u8_u32(d18u32)); |  | ||||||
|         q10u16 = vmovl_u8(vreinterpret_u8_u32(d20u32)); |  | ||||||
|         q11u16 = vmovl_u8(vreinterpret_u8_u32(d22u32)); |  | ||||||
|  |  | ||||||
|         d18s16 = vreinterpret_s16_u16(vget_low_u16(q9u16)); |  | ||||||
|         d19s16 = vreinterpret_s16_u16(vget_high_u16(q9u16)); |  | ||||||
|         d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16)); |  | ||||||
|         d = dst; |  | ||||||
|         for (height = h; height > 0; height -= 4) {  // loop_vert |  | ||||||
|             d24u32 = vld1_lane_u32((const uint32_t *)s, d24u32, 0); |  | ||||||
|             s += src_stride; |  | ||||||
|             d26u32 = vld1_lane_u32((const uint32_t *)s, d26u32, 0); |  | ||||||
|             s += src_stride; |  | ||||||
|             d26u32 = vld1_lane_u32((const uint32_t *)s, d26u32, 1); |  | ||||||
|             s += src_stride; |  | ||||||
|             d24u32 = vld1_lane_u32((const uint32_t *)s, d24u32, 1); |  | ||||||
|             s += src_stride; |  | ||||||
|  |  | ||||||
|             q12u16 = vmovl_u8(vreinterpret_u8_u32(d24u32)); |  | ||||||
|             q13u16 = vmovl_u8(vreinterpret_u8_u32(d26u32)); |  | ||||||
|  |  | ||||||
|             d6u32 = vld1_lane_u32((const uint32_t *)d, d6u32, 0); |  | ||||||
|             d += dst_stride; |  | ||||||
|             d6u32 = vld1_lane_u32((const uint32_t *)d, d6u32, 1); |  | ||||||
|             d += dst_stride; |  | ||||||
|             d7u32 = vld1_lane_u32((const uint32_t *)d, d7u32, 0); |  | ||||||
|             d += dst_stride; |  | ||||||
|             d7u32 = vld1_lane_u32((const uint32_t *)d, d7u32, 1); |  | ||||||
|             d -= dst_stride * 3; |  | ||||||
|  |  | ||||||
|             d16s16 = vreinterpret_s16_u16(vget_low_u16(q8u16)); |  | ||||||
|             d17s16 = vreinterpret_s16_u16(vget_high_u16(q8u16)); |  | ||||||
|             d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16)); |  | ||||||
|             d21s16 = vreinterpret_s16_u16(vget_high_u16(q10u16)); |  | ||||||
|             d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16)); |  | ||||||
|             d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16)); |  | ||||||
|             d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16)); |  | ||||||
|             d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16)); |  | ||||||
|  |  | ||||||
|             __builtin_prefetch(s); |  | ||||||
|             __builtin_prefetch(s + src_stride); |  | ||||||
|             q1s32  = MULTIPLY_BY_Q0(d16s16, d17s16, d18s16, d19s16, |  | ||||||
|                                     d20s16, d21s16, d22s16, d24s16, q0s16); |  | ||||||
|             __builtin_prefetch(s + src_stride * 2); |  | ||||||
|             __builtin_prefetch(s + src_stride * 3); |  | ||||||
|             q2s32  = MULTIPLY_BY_Q0(d17s16, d18s16, d19s16, d20s16, |  | ||||||
|                                     d21s16, d22s16, d24s16, d26s16, q0s16); |  | ||||||
|             __builtin_prefetch(d); |  | ||||||
|             __builtin_prefetch(d + dst_stride); |  | ||||||
|             q14s32 = MULTIPLY_BY_Q0(d18s16, d19s16, d20s16, d21s16, |  | ||||||
|                                     d22s16, d24s16, d26s16, d27s16, q0s16); |  | ||||||
|             __builtin_prefetch(d + dst_stride * 2); |  | ||||||
|             __builtin_prefetch(d + dst_stride * 3); |  | ||||||
|             q15s32 = MULTIPLY_BY_Q0(d19s16, d20s16, d21s16, d22s16, |  | ||||||
|                                     d24s16, d26s16, d27s16, d25s16, q0s16); |  | ||||||
|  |  | ||||||
|             d2u16 = vqrshrun_n_s32(q1s32, 7); |  | ||||||
|             d3u16 = vqrshrun_n_s32(q2s32, 7); |  | ||||||
|             d4u16 = vqrshrun_n_s32(q14s32, 7); |  | ||||||
|             d5u16 = vqrshrun_n_s32(q15s32, 7); |  | ||||||
|  |  | ||||||
|             q1u16 = vcombine_u16(d2u16, d3u16); |  | ||||||
|             q2u16 = vcombine_u16(d4u16, d5u16); |  | ||||||
|  |  | ||||||
|             d2u8 = vqmovn_u16(q1u16); |  | ||||||
|             d3u8 = vqmovn_u16(q2u16); |  | ||||||
|  |  | ||||||
|             q1u8 = vcombine_u8(d2u8, d3u8); |  | ||||||
|             q3u8 = vreinterpretq_u8_u32(vcombine_u32(d6u32, d7u32)); |  | ||||||
|  |  | ||||||
|             q1u8 = vrhaddq_u8(q1u8, q3u8); |  | ||||||
|  |  | ||||||
|             d2u32 = vreinterpret_u32_u8(vget_low_u8(q1u8)); |  | ||||||
|             d3u32 = vreinterpret_u32_u8(vget_high_u8(q1u8)); |  | ||||||
|  |  | ||||||
|             vst1_lane_u32((uint32_t *)d, d2u32, 0); |  | ||||||
|             d += dst_stride; |  | ||||||
|             vst1_lane_u32((uint32_t *)d, d2u32, 1); |  | ||||||
|             d += dst_stride; |  | ||||||
|             vst1_lane_u32((uint32_t *)d, d3u32, 0); |  | ||||||
|             d += dst_stride; |  | ||||||
|             vst1_lane_u32((uint32_t *)d, d3u32, 1); |  | ||||||
|             d += dst_stride; |  | ||||||
|  |  | ||||||
|             q8u16 = q10u16; |  | ||||||
|             d18s16 = d22s16; |  | ||||||
|             d19s16 = d24s16; |  | ||||||
|             q10u16 = q13u16; |  | ||||||
|             d22s16 = d25s16; |  | ||||||
|         } |  | ||||||
|     } |  | ||||||
|     return; |  | ||||||
| } |  | ||||||
| @@ -1,357 +0,0 @@ | |||||||
| /* |  | ||||||
|  *  Copyright (c) 2014 The WebM project authors. All Rights Reserved. |  | ||||||
|  * |  | ||||||
|  *  Use of this source code is governed by a BSD-style license |  | ||||||
|  *  that can be found in the LICENSE file in the root of the source |  | ||||||
|  *  tree. An additional intellectual property rights grant can be found |  | ||||||
|  *  in the file PATENTS.  All contributing project authors may |  | ||||||
|  *  be found in the AUTHORS file in the root of the source tree. |  | ||||||
|  */ |  | ||||||
|  |  | ||||||
| #include <stddef.h> |  | ||||||
| #include <arm_neon.h> |  | ||||||
|  |  | ||||||
| #include "./vpx_config.h" |  | ||||||
| #include "vpx_ports/mem.h" |  | ||||||
|  |  | ||||||
| void vp9_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, |  | ||||||
|                            uint8_t *dst, ptrdiff_t dst_stride, |  | ||||||
|                            const int16_t *filter_x, int x_step_q4, |  | ||||||
|                            const int16_t *filter_y, int y_step_q4, |  | ||||||
|                            int w, int h); |  | ||||||
| void vp9_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, |  | ||||||
|                            uint8_t *dst, ptrdiff_t dst_stride, |  | ||||||
|                            const int16_t *filter_x, int x_step_q4, |  | ||||||
|                            const int16_t *filter_y, int y_step_q4, |  | ||||||
|                            int w, int h); |  | ||||||
|  |  | ||||||
/*
 * Widening 8-term multiply-accumulate on four lanes at once.
 *
 * For each lane i the result is
 *     dsrc0[i] * q0s16[0] + dsrc1[i] * q0s16[1] + ... + dsrc7[i] * q0s16[7]
 * accumulated in 32 bits.  No rounding or narrowing happens here.
 */
static INLINE int32x4_t MULTIPLY_BY_Q0(
        int16x4_t dsrc0, int16x4_t dsrc1, int16x4_t dsrc2, int16x4_t dsrc3,
        int16x4_t dsrc4, int16x4_t dsrc5, int16x4_t dsrc6, int16x4_t dsrc7,
        int16x8_t q0s16) {
    const int16x4_t coef_lo = vget_low_s16(q0s16);   // coefficients 0..3
    const int16x4_t coef_hi = vget_high_s16(q0s16);  // coefficients 4..7
    int32x4_t acc = vmull_lane_s16(dsrc0, coef_lo, 0);

    acc = vmlal_lane_s16(acc, dsrc1, coef_lo, 1);
    acc = vmlal_lane_s16(acc, dsrc2, coef_lo, 2);
    acc = vmlal_lane_s16(acc, dsrc3, coef_lo, 3);
    acc = vmlal_lane_s16(acc, dsrc4, coef_hi, 0);
    acc = vmlal_lane_s16(acc, dsrc5, coef_hi, 1);
    acc = vmlal_lane_s16(acc, dsrc6, coef_hi, 2);
    acc = vmlal_lane_s16(acc, dsrc7, coef_hi, 3);
    return acc;
}
|  |  | ||||||
/*
 * Horizontal 8-tap convolution written with NEON intrinsics.
 *
 * Only x_step_q4 == 16 is handled here; any other step is forwarded
 * unchanged to vp9_convolve8_horiz_c().  filter_y and y_step_q4 are used
 * solely for that forwarding call.
 *
 * The outer loop advances four rows per pass and the inner loop stores a
 * 4x4 tile per pass, so they run ceil(h / 4) and ceil(w / 4) times.  The
 * 32-bit sums are rounded with a 7-bit right shift and saturated down to
 * 8 bits before being written.
 */
void vp9_convolve8_horiz_neon(
        uint8_t *src,
        ptrdiff_t src_stride,
        uint8_t *dst,
        ptrdiff_t dst_stride,
        const int16_t *filter_x,
        int x_step_q4,
        const int16_t *filter_y,
        int y_step_q4,
        int w,
        int h) {
    int width;
    uint8_t *psrc, *pdst;
    int16x8_t q0s16;
    int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d22s16, d23s16;
    int16x4_t d24s16, d25s16, d26s16, d27s16;
    int32x4_t q1s32, q2s32, q14s32, q15s32;
    uint8x8_t d2u8, d3u8, d24u8, d25u8, d26u8, d27u8;
    uint16x4_t d2u16, d3u16, d4u16, d5u16, d17u16, d18u16;
    uint16x8_t q8u16, q9u16, q10u16, q11u16, q12u16, q13u16;
    uint32x2_t d2u32, d3u32, d28u32, d29u32, d30u32, d31u32;
    uint8x8x2_t d0x2u8, d1x2u8;
    uint16x4x2_t d0x2u16, d1x2u16;
    uint16x8x2_t q0x2u16;
    uint32x2x2_t d0x2u32;
    uint32x4x2_t q0x2u32;

    // Anything but the unit step goes to the C implementation.
    if (x_step_q4 != 16) {
        vp9_convolve8_horiz_c(src, src_stride, dst, dst_stride,
                              filter_x, x_step_q4,
                              filter_y, y_step_q4, w, h);
        return;
    }

    q0s16 = vld1q_s16(filter_x);  // eight filter coefficients
    src -= 3;                     // adjust for taps

    while (h > 0) {  // loop_horiz_v
        // First eight bytes of each of the four rows in this band.
        d24u8 = vld1_u8(src);
        d25u8 = vld1_u8(src + src_stride);
        d26u8 = vld1_u8(src + src_stride * 2);
        d27u8 = vld1_u8(src + src_stride * 3);

        // Interleave the rows with 16-bit, then 8-bit, vtrn shuffles.
        q0x2u16 = vtrnq_u16(
                vreinterpretq_u16_u8(vcombine_u8(d24u8, d25u8)),
                vreinterpretq_u16_u8(vcombine_u8(d26u8, d27u8)));
        d24u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[0]));
        d25u8 = vreinterpret_u8_u16(vget_high_u16(q0x2u16.val[0]));
        d26u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[1]));
        d27u8 = vreinterpret_u8_u16(vget_high_u16(q0x2u16.val[1]));
        d0x2u8 = vtrn_u8(d24u8, d25u8);
        d1x2u8 = vtrn_u8(d26u8, d27u8);

        __builtin_prefetch(src + src_stride * 4);
        __builtin_prefetch(src + src_stride * 5);
        __builtin_prefetch(src + src_stride * 6);

        // Widen to 16 bits for the multiply-accumulate.
        q8u16  = vmovl_u8(d0x2u8.val[0]);
        q9u16  = vmovl_u8(d0x2u8.val[1]);
        q10u16 = vmovl_u8(d1x2u8.val[0]);
        q11u16 = vmovl_u8(d1x2u8.val[1]);

        // vswp 17 18: exchange the high half of q8 with the low half of q9.
        d17u16 = vget_high_u16(q8u16);
        d18u16 = vget_low_u16(q9u16);
        q8u16 = vcombine_u16(vget_low_u16(q8u16), d18u16);
        q9u16 = vcombine_u16(d17u16, vget_high_u16(q9u16));

        d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16));
        d23s16 = vreinterpret_s16_u16(vget_high_u16(q10u16));  // vmov 23 21

        width = w;
        psrc = src + 7;
        pdst = dst;
        while (width > 0) {  // loop_horiz
            // Four more bytes from each row; note rows 2 and 3 land in
            // d31 and d30 respectively.
            d28u32 = vld1_dup_u32((const uint32_t *)psrc);
            d29u32 = vld1_dup_u32((const uint32_t *)(psrc + src_stride));
            d31u32 = vld1_dup_u32(
                    (const uint32_t *)(psrc + src_stride * 2));
            d30u32 = vld1_dup_u32(
                    (const uint32_t *)(psrc + src_stride * 3));

            __builtin_prefetch(psrc + 64);

            d0x2u16 = vtrn_u16(vreinterpret_u16_u32(d28u32),
                               vreinterpret_u16_u32(d31u32));
            d1x2u16 = vtrn_u16(vreinterpret_u16_u32(d29u32),
                               vreinterpret_u16_u32(d30u32));
            d0x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[0]),   // d28
                             vreinterpret_u8_u16(d1x2u16.val[0]));  // d29
            d1x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[1]),   // d31
                             vreinterpret_u8_u16(d1x2u16.val[1]));  // d30

            __builtin_prefetch(psrc + 64 + src_stride);

            q0x2u32 = vtrnq_u32(
                    vreinterpretq_u32_u8(
                            vcombine_u8(d0x2u8.val[0], d0x2u8.val[1])),
                    vreinterpretq_u32_u8(
                            vcombine_u8(d1x2u8.val[1], d1x2u8.val[0])));

            q12u16 = vmovl_u8(
                    vreinterpret_u8_u32(vget_low_u32(q0x2u32.val[0])));
            q13u16 = vmovl_u8(
                    vreinterpret_u8_u32(vget_high_u32(q0x2u32.val[0])));

            __builtin_prefetch(psrc + 64 + src_stride * 2);

            // Signed 4-lane views of everything the filter reads.
            d16s16 = vreinterpret_s16_u16(vget_low_u16(q8u16));
            d17s16 = vreinterpret_s16_u16(vget_high_u16(q8u16));
            d18s16 = vreinterpret_s16_u16(vget_low_u16(q9u16));
            d19s16 = vreinterpret_s16_u16(vget_high_u16(q9u16));
            d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
            d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
            d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
            d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
            d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));

            // Four filter evaluations, each shifted by one input vector.
            q1s32  = MULTIPLY_BY_Q0(d16s16, d17s16, d20s16, d22s16,
                                    d18s16, d19s16, d23s16, d24s16, q0s16);
            q2s32  = MULTIPLY_BY_Q0(d17s16, d20s16, d22s16, d18s16,
                                    d19s16, d23s16, d24s16, d26s16, q0s16);
            q14s32 = MULTIPLY_BY_Q0(d20s16, d22s16, d18s16, d19s16,
                                    d23s16, d24s16, d26s16, d27s16, q0s16);
            q15s32 = MULTIPLY_BY_Q0(d22s16, d18s16, d19s16, d23s16,
                                    d24s16, d26s16, d27s16, d25s16, q0s16);

            __builtin_prefetch(psrc + 60 + src_stride * 3);

            // Round (>> 7) with unsigned saturation, then narrow to bytes.
            d2u16 = vqrshrun_n_s32(q1s32, 7);
            d3u16 = vqrshrun_n_s32(q2s32, 7);
            d4u16 = vqrshrun_n_s32(q14s32, 7);
            d5u16 = vqrshrun_n_s32(q15s32, 7);

            d2u8 = vqmovn_u16(vcombine_u16(d2u16, d3u16));
            d3u8 = vqmovn_u16(vcombine_u16(d4u16, d5u16));

            // Shuffle the results back so each 32-bit lane is one row.
            d0x2u16 = vtrn_u16(vreinterpret_u16_u8(d2u8),
                               vreinterpret_u16_u8(d3u8));
            d0x2u32 = vtrn_u32(vreinterpret_u32_u16(d0x2u16.val[0]),
                               vreinterpret_u32_u16(d0x2u16.val[1]));
            d0x2u8 = vtrn_u8(vreinterpret_u8_u32(d0x2u32.val[0]),
                             vreinterpret_u8_u32(d0x2u32.val[1]));

            d2u32 = vreinterpret_u32_u8(d0x2u8.val[0]);
            d3u32 = vreinterpret_u32_u8(d0x2u8.val[1]);

            vst1_lane_u32((uint32_t *)pdst, d2u32, 0);
            vst1_lane_u32((uint32_t *)(pdst + dst_stride), d3u32, 0);
            vst1_lane_u32((uint32_t *)(pdst + dst_stride * 2), d2u32, 1);
            vst1_lane_u32((uint32_t *)(pdst + dst_stride * 3), d3u32, 1);

            // Slide the input window along for the next four columns.
            q8u16 = q9u16;
            d20s16 = d23s16;
            q11u16 = q12u16;
            q9u16 = q13u16;
            d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));

            width -= 4;
            psrc += 4;
            pdst += 4;
        }

        h -= 4;
        src += src_stride * 4;
        dst += dst_stride * 4;
    }
}
|  |  | ||||||
/*
 * Vertical 8-tap convolution written with NEON intrinsics.
 *
 * Only y_step_q4 == 16 is handled here; any other step is forwarded
 * unchanged to vp9_convolve8_vert_c().  filter_x and x_step_q4 are used
 * solely for that forwarding call.
 *
 * The outer loop walks the block in 4-pixel-wide column strips; the inner
 * loop produces four output rows of the strip per pass.  The 32-bit sums
 * are rounded with a 7-bit right shift and saturated down to 8 bits.
 */
void vp9_convolve8_vert_neon(
        uint8_t *src,
        ptrdiff_t src_stride,
        uint8_t *dst,
        ptrdiff_t dst_stride,
        const int16_t *filter_x,  // unused
        int x_step_q4,            // unused
        const int16_t *filter_y,
        int y_step_q4,
        int w,
        int h) {
    int height;
    uint8_t *s, *d;
    uint32x2_t d2u32, d3u32;
    // vld1_lane_u32 merges one lane into an existing vector, so the
    // destination vectors must hold a defined value before the first lane
    // load.  d22u32 only ever receives lane 0, yet the whole vector is
    // widened afterwards.
    uint32x2_t d16u32 = vdup_n_u32(0);
    uint32x2_t d18u32 = vdup_n_u32(0);
    uint32x2_t d20u32 = vdup_n_u32(0);
    uint32x2_t d22u32 = vdup_n_u32(0);
    uint32x2_t d24u32 = vdup_n_u32(0);
    uint32x2_t d26u32 = vdup_n_u32(0);
    int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16;
    int16x4_t d24s16, d25s16, d26s16, d27s16;
    uint16x4_t d2u16, d3u16, d4u16, d5u16;
    int16x8_t q0s16;
    uint16x8_t q1u16, q2u16, q8u16, q9u16, q10u16, q11u16, q12u16, q13u16;
    int32x4_t q1s32, q2s32, q14s32, q15s32;

    // Anything but the unit step goes to the C implementation.
    if (y_step_q4 != 16) {
        vp9_convolve8_vert_c(src, src_stride, dst, dst_stride,
                             filter_x, x_step_q4,
                             filter_y, y_step_q4, w, h);
        return;
    }

    src -= src_stride * 3;        // start three rows above the block
    q0s16 = vld1q_s16(filter_y);  // eight filter coefficients
    for (; w > 0; w -= 4, src += 4, dst += 4) {  // loop_vert_h
        // Prime the window with seven rows, four pixels from each.
        s = src;
        d16u32 = vld1_lane_u32((const uint32_t *)s, d16u32, 0);
        s += src_stride;
        d16u32 = vld1_lane_u32((const uint32_t *)s, d16u32, 1);
        s += src_stride;
        d18u32 = vld1_lane_u32((const uint32_t *)s, d18u32, 0);
        s += src_stride;
        d18u32 = vld1_lane_u32((const uint32_t *)s, d18u32, 1);
        s += src_stride;
        d20u32 = vld1_lane_u32((const uint32_t *)s, d20u32, 0);
        s += src_stride;
        d20u32 = vld1_lane_u32((const uint32_t *)s, d20u32, 1);
        s += src_stride;
        d22u32 = vld1_lane_u32((const uint32_t *)s, d22u32, 0);
        s += src_stride;

        q8u16  = vmovl_u8(vreinterpret_u8_u32(d16u32));
        q9u16  = vmovl_u8(vreinterpret_u8_u32(d18u32));
        q10u16 = vmovl_u8(vreinterpret_u8_u32(d20u32));
        q11u16 = vmovl_u8(vreinterpret_u8_u32(d22u32));

        d18s16 = vreinterpret_s16_u16(vget_low_u16(q9u16));
        d19s16 = vreinterpret_s16_u16(vget_high_u16(q9u16));
        d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
        d = dst;
        for (height = h; height > 0; height -= 4) {  // loop_vert
            // Four new rows; the last one goes to lane 1 of d24, the two
            // in between fill d26.
            d24u32 = vld1_lane_u32((const uint32_t *)s, d24u32, 0);
            s += src_stride;
            d26u32 = vld1_lane_u32((const uint32_t *)s, d26u32, 0);
            s += src_stride;
            d26u32 = vld1_lane_u32((const uint32_t *)s, d26u32, 1);
            s += src_stride;
            d24u32 = vld1_lane_u32((const uint32_t *)s, d24u32, 1);
            s += src_stride;

            q12u16 = vmovl_u8(vreinterpret_u8_u32(d24u32));
            q13u16 = vmovl_u8(vreinterpret_u8_u32(d26u32));

            d16s16 = vreinterpret_s16_u16(vget_low_u16(q8u16));
            d17s16 = vreinterpret_s16_u16(vget_high_u16(q8u16));
            d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16));
            d21s16 = vreinterpret_s16_u16(vget_high_u16(q10u16));
            d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
            d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
            d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
            d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));

            // One filter evaluation per output row, each starting one
            // input row further down.
            __builtin_prefetch(d);
            __builtin_prefetch(d + dst_stride);
            q1s32  = MULTIPLY_BY_Q0(d16s16, d17s16, d18s16, d19s16,
                                    d20s16, d21s16, d22s16, d24s16, q0s16);
            __builtin_prefetch(d + dst_stride * 2);
            __builtin_prefetch(d + dst_stride * 3);
            q2s32  = MULTIPLY_BY_Q0(d17s16, d18s16, d19s16, d20s16,
                                    d21s16, d22s16, d24s16, d26s16, q0s16);
            __builtin_prefetch(s);
            __builtin_prefetch(s + src_stride);
            q14s32 = MULTIPLY_BY_Q0(d18s16, d19s16, d20s16, d21s16,
                                    d22s16, d24s16, d26s16, d27s16, q0s16);
            __builtin_prefetch(s + src_stride * 2);
            __builtin_prefetch(s + src_stride * 3);
            q15s32 = MULTIPLY_BY_Q0(d19s16, d20s16, d21s16, d22s16,
                                    d24s16, d26s16, d27s16, d25s16, q0s16);

            // Round (>> 7) with unsigned saturation, then narrow to bytes.
            d2u16 = vqrshrun_n_s32(q1s32, 7);
            d3u16 = vqrshrun_n_s32(q2s32, 7);
            d4u16 = vqrshrun_n_s32(q14s32, 7);
            d5u16 = vqrshrun_n_s32(q15s32, 7);

            q1u16 = vcombine_u16(d2u16, d3u16);
            q2u16 = vcombine_u16(d4u16, d5u16);

            d2u32 = vreinterpret_u32_u8(vqmovn_u16(q1u16));
            d3u32 = vreinterpret_u32_u8(vqmovn_u16(q2u16));

            vst1_lane_u32((uint32_t *)d, d2u32, 0);
            d += dst_stride;
            vst1_lane_u32((uint32_t *)d, d2u32, 1);
            d += dst_stride;
            vst1_lane_u32((uint32_t *)d, d3u32, 0);
            d += dst_stride;
            vst1_lane_u32((uint32_t *)d, d3u32, 1);
            d += dst_stride;

            // Slide the row window down by four for the next pass.
            q8u16 = q10u16;
            d18s16 = d22s16;
            d19s16 = d24s16;
            q10u16 = q13u16;
            d22s16 = d25s16;
        }
    }
    return;
}
| @@ -1,145 +0,0 @@ | |||||||
| /* |  | ||||||
|  *  Copyright (c) 2014 The WebM project authors. All Rights Reserved. |  | ||||||
|  * |  | ||||||
|  *  Use of this source code is governed by a BSD-style license |  | ||||||
|  *  that can be found in the LICENSE file in the root of the source |  | ||||||
|  *  tree. An additional intellectual property rights grant can be found |  | ||||||
|  *  in the file PATENTS.  All contributing project authors may |  | ||||||
|  *  be found in the AUTHORS file in the root of the source tree. |  | ||||||
|  */ |  | ||||||
|  |  | ||||||
| #include <stddef.h> |  | ||||||
| #include <arm_neon.h> |  | ||||||
|  |  | ||||||
/*
 * Rounding average of a source block into the destination block:
 *     dst[x] = (dst[x] + src[x] + 1) >> 1
 *
 * The filter arguments are ignored.  The width picks one of five fixed
 * paths (64, 32, 16, 8 or 4 pixels per row); a width that is not one of
 * those values is still processed at the chosen path's fixed width.  Every
 * path except the 64-wide one handles two rows per pass, so for those
 * widths an odd h results in one extra row being read and written.
 */
void vp9_convolve_avg_neon(
        const uint8_t *src,    // r0
        ptrdiff_t src_stride,  // r1
        uint8_t *dst,          // r2
        ptrdiff_t dst_stride,  // r3
        const int16_t *filter_x,
        int filter_x_stride,
        const int16_t *filter_y,
        int filter_y_stride,
        int w,
        int h) {
    uint8_t *d;
    uint8x8_t d0u8, d1u8, d2u8, d3u8;
    // vld1_lane_u32 merges into an existing vector, so these need a defined
    // starting value before the first lane load in the avg4 path.
    uint32x2_t d0u32 = vdup_n_u32(0);
    uint32x2_t d2u32 = vdup_n_u32(0);
    uint8x16_t q0u8, q1u8, q2u8, q3u8, q8u8, q9u8, q10u8, q11u8;
    (void)filter_x;  (void)filter_x_stride;
    (void)filter_y;  (void)filter_y_stride;

    d = dst;  // read cursor over dst; dst itself is the write cursor
    if (w > 32) {  // avg64
        for (; h > 0; h -= 1) {
            q0u8  = vld1q_u8(src);
            q1u8  = vld1q_u8(src + 16);
            q2u8  = vld1q_u8(src + 32);
            q3u8  = vld1q_u8(src + 48);
            src += src_stride;
            q8u8  = vld1q_u8(d);
            q9u8  = vld1q_u8(d + 16);
            q10u8 = vld1q_u8(d + 32);
            q11u8 = vld1q_u8(d + 48);
            d += dst_stride;

            q0u8 = vrhaddq_u8(q0u8, q8u8);
            q1u8 = vrhaddq_u8(q1u8, q9u8);
            q2u8 = vrhaddq_u8(q2u8, q10u8);
            q3u8 = vrhaddq_u8(q3u8, q11u8);

            vst1q_u8(dst, q0u8);
            vst1q_u8(dst + 16, q1u8);
            vst1q_u8(dst + 32, q2u8);
            vst1q_u8(dst + 48, q3u8);
            dst += dst_stride;
        }
    } else if (w == 32) {  // avg32
        for (; h > 0; h -= 2) {
            q0u8 = vld1q_u8(src);
            q1u8 = vld1q_u8(src + 16);
            src += src_stride;
            q2u8 = vld1q_u8(src);
            q3u8 = vld1q_u8(src + 16);
            src += src_stride;
            q8u8 = vld1q_u8(d);
            q9u8 = vld1q_u8(d + 16);
            d += dst_stride;
            q10u8 = vld1q_u8(d);
            q11u8 = vld1q_u8(d + 16);
            d += dst_stride;

            q0u8 = vrhaddq_u8(q0u8, q8u8);
            q1u8 = vrhaddq_u8(q1u8, q9u8);
            q2u8 = vrhaddq_u8(q2u8, q10u8);
            q3u8 = vrhaddq_u8(q3u8, q11u8);

            vst1q_u8(dst, q0u8);
            vst1q_u8(dst + 16, q1u8);
            dst += dst_stride;
            vst1q_u8(dst, q2u8);
            vst1q_u8(dst + 16, q3u8);
            dst += dst_stride;
        }
    } else if (w > 8) {  // avg16
        for (; h > 0; h -= 2) {
            q0u8 = vld1q_u8(src);
            src += src_stride;
            q1u8 = vld1q_u8(src);
            src += src_stride;
            q2u8 = vld1q_u8(d);
            d += dst_stride;
            q3u8 = vld1q_u8(d);
            d += dst_stride;

            q0u8 = vrhaddq_u8(q0u8, q2u8);
            q1u8 = vrhaddq_u8(q1u8, q3u8);

            vst1q_u8(dst, q0u8);
            dst += dst_stride;
            vst1q_u8(dst, q1u8);
            dst += dst_stride;
        }
    } else if (w == 8) {  // avg8
        for (; h > 0; h -= 2) {
            d0u8 = vld1_u8(src);
            src += src_stride;
            d1u8 = vld1_u8(src);
            src += src_stride;
            d2u8 = vld1_u8(d);
            d += dst_stride;
            d3u8 = vld1_u8(d);
            d += dst_stride;

            // Pair the two rows so one 128-bit average covers both.
            q0u8 = vcombine_u8(d0u8, d1u8);
            q1u8 = vcombine_u8(d2u8, d3u8);
            q0u8 = vrhaddq_u8(q0u8, q1u8);

            vst1_u8(dst, vget_low_u8(q0u8));
            dst += dst_stride;
            vst1_u8(dst, vget_high_u8(q0u8));
            dst += dst_stride;
        }
    } else {  // avg4
        for (; h > 0; h -= 2) {
            // Two 4-pixel rows share one 64-bit vector, one row per lane.
            d0u32 = vld1_lane_u32((const uint32_t *)src, d0u32, 0);
            src += src_stride;
            d0u32 = vld1_lane_u32((const uint32_t *)src, d0u32, 1);
            src += src_stride;
            d2u32 = vld1_lane_u32((const uint32_t *)d, d2u32, 0);
            d += dst_stride;
            d2u32 = vld1_lane_u32((const uint32_t *)d, d2u32, 1);
            d += dst_stride;

            d0u8 = vrhadd_u8(vreinterpret_u8_u32(d0u32),
                             vreinterpret_u8_u32(d2u32));

            d0u32 = vreinterpret_u32_u8(d0u8);
            vst1_lane_u32((uint32_t *)dst, d0u32, 0);
            dst += dst_stride;
            vst1_lane_u32((uint32_t *)dst, d0u32, 1);
            dst += dst_stride;
        }
    }
    return;
}
| @@ -1,92 +0,0 @@ | |||||||
| /* |  | ||||||
|  *  Copyright (c) 2014 The WebM project authors. All Rights Reserved. |  | ||||||
|  * |  | ||||||
|  *  Use of this source code is governed by a BSD-style license |  | ||||||
|  *  that can be found in the LICENSE file in the root of the source |  | ||||||
|  *  tree. An additional intellectual property rights grant can be found |  | ||||||
|  *  in the file PATENTS.  All contributing project authors may |  | ||||||
|  *  be found in the AUTHORS file in the root of the source tree. |  | ||||||
|  */ |  | ||||||
|  |  | ||||||
#include <stddef.h>
#include <string.h>
#include <arm_neon.h>
|  |  | ||||||
/*
 * Copies a block of pixels from src to dst, row by row.
 *
 * The filter arguments are ignored.  The width picks one of five fixed
 * paths (64, 32, 16, 8 or 4 bytes per row); a width that is not one of
 * those values is still copied at the chosen path's fixed width.  The 32-,
 * 16- and 8-wide paths move two rows per pass, so for those widths an odd h
 * results in one extra row being copied.
 */
void vp9_convolve_copy_neon(
        const uint8_t *src,    // r0
        ptrdiff_t src_stride,  // r1
        uint8_t *dst,          // r2
        ptrdiff_t dst_stride,  // r3
        const int16_t *filter_x,
        int filter_x_stride,
        const int16_t *filter_y,
        int filter_y_stride,
        int w,
        int h) {
    uint8x8_t d0u8, d2u8;
    uint8x16_t q0u8, q1u8, q2u8, q3u8;
    (void)filter_x;  (void)filter_x_stride;
    (void)filter_y;  (void)filter_y_stride;

    if (w > 32) {  // copy64
        for (; h > 0; h--) {
            q0u8 = vld1q_u8(src);
            q1u8 = vld1q_u8(src + 16);
            q2u8 = vld1q_u8(src + 32);
            q3u8 = vld1q_u8(src + 48);
            src += src_stride;

            vst1q_u8(dst, q0u8);
            vst1q_u8(dst + 16, q1u8);
            vst1q_u8(dst + 32, q2u8);
            vst1q_u8(dst + 48, q3u8);
            dst += dst_stride;
        }
    } else if (w == 32) {  // copy32
        for (; h > 0; h -= 2) {
            q0u8 = vld1q_u8(src);
            q1u8 = vld1q_u8(src + 16);
            src += src_stride;
            q2u8 = vld1q_u8(src);
            q3u8 = vld1q_u8(src + 16);
            src += src_stride;

            vst1q_u8(dst, q0u8);
            vst1q_u8(dst + 16, q1u8);
            dst += dst_stride;
            vst1q_u8(dst, q2u8);
            vst1q_u8(dst + 16, q3u8);
            dst += dst_stride;
        }
    } else if (w > 8) {  // copy16
        for (; h > 0; h -= 2) {
            q0u8 = vld1q_u8(src);
            src += src_stride;
            q1u8 = vld1q_u8(src);
            src += src_stride;

            vst1q_u8(dst, q0u8);
            dst += dst_stride;
            vst1q_u8(dst, q1u8);
            dst += dst_stride;
        }
    } else if (w == 8) {  // copy8
        for (; h > 0; h -= 2) {
            d0u8 = vld1_u8(src);
            src += src_stride;
            d2u8 = vld1_u8(src);
            src += src_stride;

            vst1_u8(dst, d0u8);
            dst += dst_stride;
            vst1_u8(dst, d2u8);
            dst += dst_stride;
        }
    } else {  // copy4
        for (; h > 0; h--) {
            // memcpy rather than a uint32_t dereference: the byte pointers
            // carry no 4-byte alignment guarantee, and reading uint8_t
            // storage through a uint32_t lvalue breaks strict aliasing.
            memcpy(dst, src, 4);
            src += src_stride;
            dst += dst_stride;
        }
    }
    return;
}
| @@ -11,9 +11,10 @@ | |||||||
| #ifndef VP9_COMMON_VP9_ENTROPYMODE_H_ | #ifndef VP9_COMMON_VP9_ENTROPYMODE_H_ | ||||||
| #define VP9_COMMON_VP9_ENTROPYMODE_H_ | #define VP9_COMMON_VP9_ENTROPYMODE_H_ | ||||||
|  |  | ||||||
| #include "vp9/common/vp9_filter.h" |  | ||||||
| #include "vp9/common/vp9_entropy.h" | #include "vp9/common/vp9_entropy.h" | ||||||
| #include "vp9/common/vp9_entropymv.h" | #include "vp9/common/vp9_entropymv.h" | ||||||
|  | #include "vp9/common/vp9_filter.h" | ||||||
|  | #include "vpx_dsp/vpx_filter.h" | ||||||
|  |  | ||||||
| #ifdef __cplusplus | #ifdef __cplusplus | ||||||
| extern "C" { | extern "C" { | ||||||
|   | |||||||
| @@ -13,6 +13,7 @@ | |||||||
|  |  | ||||||
| #include "./vpx_config.h" | #include "./vpx_config.h" | ||||||
| #include "vpx/vpx_integer.h" | #include "vpx/vpx_integer.h" | ||||||
|  | #include "vpx_dsp/vpx_filter.h" | ||||||
| #include "vpx_ports/mem.h" | #include "vpx_ports/mem.h" | ||||||
|  |  | ||||||
|  |  | ||||||
| @@ -20,13 +21,6 @@ | |||||||
| extern "C" { | extern "C" { | ||||||
| #endif | #endif | ||||||
|  |  | ||||||
| #define FILTER_BITS 7 |  | ||||||
|  |  | ||||||
| #define SUBPEL_BITS 4 |  | ||||||
| #define SUBPEL_MASK ((1 << SUBPEL_BITS) - 1) |  | ||||||
| #define SUBPEL_SHIFTS (1 << SUBPEL_BITS) |  | ||||||
| #define SUBPEL_TAPS 8 |  | ||||||
|  |  | ||||||
| #define EIGHTTAP            0 | #define EIGHTTAP            0 | ||||||
| #define EIGHTTAP_SMOOTH     1 | #define EIGHTTAP_SMOOTH     1 | ||||||
| #define EIGHTTAP_SHARP      2 | #define EIGHTTAP_SHARP      2 | ||||||
| @@ -36,9 +30,8 @@ extern "C" { | |||||||
| // 8-tap, 8-tap-smooth, 8-tap-sharp, and switching between the three. | // 8-tap, 8-tap-smooth, 8-tap-sharp, and switching between the three. | ||||||
| #define SWITCHABLE_FILTER_CONTEXTS (SWITCHABLE_FILTERS + 1) | #define SWITCHABLE_FILTER_CONTEXTS (SWITCHABLE_FILTERS + 1) | ||||||
| #define SWITCHABLE 4 /* should be the last one */ | #define SWITCHABLE 4 /* should be the last one */ | ||||||
| typedef uint8_t INTERP_FILTER; |  | ||||||
|  |  | ||||||
| typedef int16_t InterpKernel[SUBPEL_TAPS]; | typedef uint8_t INTERP_FILTER; | ||||||
|  |  | ||||||
| extern const InterpKernel *vp9_filter_kernels[4]; | extern const InterpKernel *vp9_filter_kernels[4]; | ||||||
|  |  | ||||||
|   | |||||||
| @@ -15,6 +15,9 @@ | |||||||
|  |  | ||||||
| #include "./vpx_config.h" | #include "./vpx_config.h" | ||||||
| #include "vpx_dsp/txfm_common.h" | #include "vpx_dsp/txfm_common.h" | ||||||
|  | #if CONFIG_VP9_HIGHBITDEPTH | ||||||
|  | #include "vpx_dsp/vpx_dsp_common.h" | ||||||
|  | #endif  // CONFIG_VP9_HIGHBITDEPTH | ||||||
| #include "vpx_ports/mem.h" | #include "vpx_ports/mem.h" | ||||||
| #include "vp9/common/vp9_common.h" | #include "vp9/common/vp9_common.h" | ||||||
| #include "vp9/common/vp9_enums.h" | #include "vp9/common/vp9_enums.h" | ||||||
|   | |||||||
| @@ -16,7 +16,6 @@ | |||||||
| #include "vpx/vpx_integer.h" | #include "vpx/vpx_integer.h" | ||||||
|  |  | ||||||
| #include "vp9/common/vp9_blockd.h" | #include "vp9/common/vp9_blockd.h" | ||||||
| #include "vp9/common/vp9_filter.h" |  | ||||||
| #include "vp9/common/vp9_reconinter.h" | #include "vp9/common/vp9_reconinter.h" | ||||||
| #include "vp9/common/vp9_reconintra.h" | #include "vp9/common/vp9_reconintra.h" | ||||||
|  |  | ||||||
|   | |||||||
| @@ -11,8 +11,10 @@ | |||||||
| #ifndef VP9_COMMON_VP9_RECONINTER_H_ | #ifndef VP9_COMMON_VP9_RECONINTER_H_ | ||||||
| #define VP9_COMMON_VP9_RECONINTER_H_ | #define VP9_COMMON_VP9_RECONINTER_H_ | ||||||
|  |  | ||||||
| #include "vpx/vpx_integer.h" | #include "vp9/common/vp9_filter.h" | ||||||
| #include "vp9/common/vp9_onyxc_int.h" | #include "vp9/common/vp9_onyxc_int.h" | ||||||
|  | #include "vpx/vpx_integer.h" | ||||||
|  | #include "vpx_dsp/vpx_filter.h" | ||||||
|  |  | ||||||
| #ifdef __cplusplus | #ifdef __cplusplus | ||||||
| extern "C" { | extern "C" { | ||||||
|   | |||||||
| @@ -11,6 +11,9 @@ | |||||||
| #include "./vpx_config.h" | #include "./vpx_config.h" | ||||||
| #include "./vpx_dsp_rtcd.h" | #include "./vpx_dsp_rtcd.h" | ||||||
|  |  | ||||||
|  | #if CONFIG_VP9_HIGHBITDEPTH | ||||||
|  | #include "vpx_dsp/vpx_dsp_common.h" | ||||||
|  | #endif  // CONFIG_VP9_HIGHBITDEPTH | ||||||
| #include "vpx_mem/vpx_mem.h" | #include "vpx_mem/vpx_mem.h" | ||||||
| #include "vpx_ports/mem.h" | #include "vpx_ports/mem.h" | ||||||
| #include "vpx_ports/vpx_once.h" | #include "vpx_ports/vpx_once.h" | ||||||
|   | |||||||
| @@ -54,12 +54,6 @@ if ($opts{arch} eq "x86_64") { | |||||||
|   $avx2_x86_64 = 'avx2'; |   $avx2_x86_64 = 'avx2'; | ||||||
| } | } | ||||||
|  |  | ||||||
| # optimizations which depend on multiple features |  | ||||||
| $avx2_ssse3 = ''; |  | ||||||
| if ((vpx_config("HAVE_AVX2") eq "yes") && (vpx_config("HAVE_SSSE3") eq "yes")) { |  | ||||||
|   $avx2_ssse3 = 'avx2'; |  | ||||||
| } |  | ||||||
|  |  | ||||||
| # | # | ||||||
| # post proc | # post proc | ||||||
| # | # | ||||||
| @@ -87,33 +81,6 @@ add_proto qw/void vp9_filter_by_weight8x8/, "const uint8_t *src, int src_stride, | |||||||
| specialize qw/vp9_filter_by_weight8x8 sse2 msa/; | specialize qw/vp9_filter_by_weight8x8 sse2 msa/; | ||||||
| } | } | ||||||
|  |  | ||||||
| # |  | ||||||
| # Sub Pixel Filters |  | ||||||
| # |  | ||||||
| add_proto qw/void vp9_convolve_copy/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; |  | ||||||
| specialize qw/vp9_convolve_copy neon dspr2 msa/, "$sse2_x86inc"; |  | ||||||
|  |  | ||||||
| add_proto qw/void vp9_convolve_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; |  | ||||||
| specialize qw/vp9_convolve_avg neon dspr2 msa/, "$sse2_x86inc"; |  | ||||||
|  |  | ||||||
| add_proto qw/void vp9_convolve8/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; |  | ||||||
| specialize qw/vp9_convolve8 sse2 ssse3 neon dspr2 msa/, "$avx2_ssse3"; |  | ||||||
|  |  | ||||||
| add_proto qw/void vp9_convolve8_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; |  | ||||||
| specialize qw/vp9_convolve8_horiz sse2 ssse3 neon dspr2 msa/, "$avx2_ssse3"; |  | ||||||
|  |  | ||||||
| add_proto qw/void vp9_convolve8_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; |  | ||||||
| specialize qw/vp9_convolve8_vert sse2 ssse3 neon dspr2 msa/, "$avx2_ssse3"; |  | ||||||
|  |  | ||||||
| add_proto qw/void vp9_convolve8_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; |  | ||||||
| specialize qw/vp9_convolve8_avg sse2 ssse3 neon dspr2 msa/; |  | ||||||
|  |  | ||||||
| add_proto qw/void vp9_convolve8_avg_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; |  | ||||||
| specialize qw/vp9_convolve8_avg_horiz sse2 ssse3 neon dspr2 msa/; |  | ||||||
|  |  | ||||||
| add_proto qw/void vp9_convolve8_avg_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; |  | ||||||
| specialize qw/vp9_convolve8_avg_vert sse2 ssse3 neon dspr2 msa/; |  | ||||||
|  |  | ||||||
| # | # | ||||||
| # dct | # dct | ||||||
| # | # | ||||||
|   | |||||||
| @@ -8,9 +8,10 @@ | |||||||
|  *  be found in the AUTHORS file in the root of the source tree. |  *  be found in the AUTHORS file in the root of the source tree. | ||||||
|  */ |  */ | ||||||
|  |  | ||||||
| #include "./vp9_rtcd.h" | #include "./vpx_dsp_rtcd.h" | ||||||
| #include "vp9/common/vp9_filter.h" | #include "vp9/common/vp9_filter.h" | ||||||
| #include "vp9/common/vp9_scale.h" | #include "vp9/common/vp9_scale.h" | ||||||
|  | #include "vpx_dsp/vpx_filter.h" | ||||||
|  |  | ||||||
| static INLINE int scaled_x(int val, const struct scale_factors *sf) { | static INLINE int scaled_x(int val, const struct scale_factors *sf) { | ||||||
|   return (int)((int64_t)val * sf->x_scale_fp >> REF_SCALE_SHIFT); |   return (int)((int64_t)val * sf->x_scale_fp >> REF_SCALE_SHIFT); | ||||||
| @@ -81,85 +82,85 @@ void vp9_setup_scale_factors_for_frame(struct scale_factors *sf, | |||||||
|   if (sf->x_step_q4 == 16) { |   if (sf->x_step_q4 == 16) { | ||||||
|     if (sf->y_step_q4 == 16) { |     if (sf->y_step_q4 == 16) { | ||||||
|       // No scaling in either direction. |       // No scaling in either direction. | ||||||
|       sf->predict[0][0][0] = vp9_convolve_copy; |       sf->predict[0][0][0] = vpx_convolve_copy; | ||||||
|       sf->predict[0][0][1] = vp9_convolve_avg; |       sf->predict[0][0][1] = vpx_convolve_avg; | ||||||
|       sf->predict[0][1][0] = vp9_convolve8_vert; |       sf->predict[0][1][0] = vpx_convolve8_vert; | ||||||
|       sf->predict[0][1][1] = vp9_convolve8_avg_vert; |       sf->predict[0][1][1] = vpx_convolve8_avg_vert; | ||||||
|       sf->predict[1][0][0] = vp9_convolve8_horiz; |       sf->predict[1][0][0] = vpx_convolve8_horiz; | ||||||
|       sf->predict[1][0][1] = vp9_convolve8_avg_horiz; |       sf->predict[1][0][1] = vpx_convolve8_avg_horiz; | ||||||
|     } else { |     } else { | ||||||
|       // No scaling in x direction. Must always scale in the y direction. |       // No scaling in x direction. Must always scale in the y direction. | ||||||
|       sf->predict[0][0][0] = vp9_convolve8_vert; |       sf->predict[0][0][0] = vpx_convolve8_vert; | ||||||
|       sf->predict[0][0][1] = vp9_convolve8_avg_vert; |       sf->predict[0][0][1] = vpx_convolve8_avg_vert; | ||||||
|       sf->predict[0][1][0] = vp9_convolve8_vert; |       sf->predict[0][1][0] = vpx_convolve8_vert; | ||||||
|       sf->predict[0][1][1] = vp9_convolve8_avg_vert; |       sf->predict[0][1][1] = vpx_convolve8_avg_vert; | ||||||
|       sf->predict[1][0][0] = vp9_convolve8; |       sf->predict[1][0][0] = vpx_convolve8; | ||||||
|       sf->predict[1][0][1] = vp9_convolve8_avg; |       sf->predict[1][0][1] = vpx_convolve8_avg; | ||||||
|     } |     } | ||||||
|   } else { |   } else { | ||||||
|     if (sf->y_step_q4 == 16) { |     if (sf->y_step_q4 == 16) { | ||||||
|       // No scaling in the y direction. Must always scale in the x direction. |       // No scaling in the y direction. Must always scale in the x direction. | ||||||
|       sf->predict[0][0][0] = vp9_convolve8_horiz; |       sf->predict[0][0][0] = vpx_convolve8_horiz; | ||||||
|       sf->predict[0][0][1] = vp9_convolve8_avg_horiz; |       sf->predict[0][0][1] = vpx_convolve8_avg_horiz; | ||||||
|       sf->predict[0][1][0] = vp9_convolve8; |       sf->predict[0][1][0] = vpx_convolve8; | ||||||
|       sf->predict[0][1][1] = vp9_convolve8_avg; |       sf->predict[0][1][1] = vpx_convolve8_avg; | ||||||
|       sf->predict[1][0][0] = vp9_convolve8_horiz; |       sf->predict[1][0][0] = vpx_convolve8_horiz; | ||||||
|       sf->predict[1][0][1] = vp9_convolve8_avg_horiz; |       sf->predict[1][0][1] = vpx_convolve8_avg_horiz; | ||||||
|     } else { |     } else { | ||||||
|       // Must always scale in both directions. |       // Must always scale in both directions. | ||||||
|       sf->predict[0][0][0] = vp9_convolve8; |       sf->predict[0][0][0] = vpx_convolve8; | ||||||
|       sf->predict[0][0][1] = vp9_convolve8_avg; |       sf->predict[0][0][1] = vpx_convolve8_avg; | ||||||
|       sf->predict[0][1][0] = vp9_convolve8; |       sf->predict[0][1][0] = vpx_convolve8; | ||||||
|       sf->predict[0][1][1] = vp9_convolve8_avg; |       sf->predict[0][1][1] = vpx_convolve8_avg; | ||||||
|       sf->predict[1][0][0] = vp9_convolve8; |       sf->predict[1][0][0] = vpx_convolve8; | ||||||
|       sf->predict[1][0][1] = vp9_convolve8_avg; |       sf->predict[1][0][1] = vpx_convolve8_avg; | ||||||
|     } |     } | ||||||
|   } |   } | ||||||
|   // 2D subpel motion always gets filtered in both directions |   // 2D subpel motion always gets filtered in both directions | ||||||
|   sf->predict[1][1][0] = vp9_convolve8; |   sf->predict[1][1][0] = vpx_convolve8; | ||||||
|   sf->predict[1][1][1] = vp9_convolve8_avg; |   sf->predict[1][1][1] = vpx_convolve8_avg; | ||||||
| #if CONFIG_VP9_HIGHBITDEPTH | #if CONFIG_VP9_HIGHBITDEPTH | ||||||
|   if (use_highbd) { |   if (use_highbd) { | ||||||
|     if (sf->x_step_q4 == 16) { |     if (sf->x_step_q4 == 16) { | ||||||
|       if (sf->y_step_q4 == 16) { |       if (sf->y_step_q4 == 16) { | ||||||
|         // No scaling in either direction. |         // No scaling in either direction. | ||||||
|         sf->highbd_predict[0][0][0] = vp9_highbd_convolve_copy; |         sf->highbd_predict[0][0][0] = vpx_highbd_convolve_copy; | ||||||
|         sf->highbd_predict[0][0][1] = vp9_highbd_convolve_avg; |         sf->highbd_predict[0][0][1] = vpx_highbd_convolve_avg; | ||||||
|         sf->highbd_predict[0][1][0] = vp9_highbd_convolve8_vert; |         sf->highbd_predict[0][1][0] = vpx_highbd_convolve8_vert; | ||||||
|         sf->highbd_predict[0][1][1] = vp9_highbd_convolve8_avg_vert; |         sf->highbd_predict[0][1][1] = vpx_highbd_convolve8_avg_vert; | ||||||
|         sf->highbd_predict[1][0][0] = vp9_highbd_convolve8_horiz; |         sf->highbd_predict[1][0][0] = vpx_highbd_convolve8_horiz; | ||||||
|         sf->highbd_predict[1][0][1] = vp9_highbd_convolve8_avg_horiz; |         sf->highbd_predict[1][0][1] = vpx_highbd_convolve8_avg_horiz; | ||||||
|       } else { |       } else { | ||||||
|         // No scaling in x direction. Must always scale in the y direction. |         // No scaling in x direction. Must always scale in the y direction. | ||||||
|         sf->highbd_predict[0][0][0] = vp9_highbd_convolve8_vert; |         sf->highbd_predict[0][0][0] = vpx_highbd_convolve8_vert; | ||||||
|         sf->highbd_predict[0][0][1] = vp9_highbd_convolve8_avg_vert; |         sf->highbd_predict[0][0][1] = vpx_highbd_convolve8_avg_vert; | ||||||
|         sf->highbd_predict[0][1][0] = vp9_highbd_convolve8_vert; |         sf->highbd_predict[0][1][0] = vpx_highbd_convolve8_vert; | ||||||
|         sf->highbd_predict[0][1][1] = vp9_highbd_convolve8_avg_vert; |         sf->highbd_predict[0][1][1] = vpx_highbd_convolve8_avg_vert; | ||||||
|         sf->highbd_predict[1][0][0] = vp9_highbd_convolve8; |         sf->highbd_predict[1][0][0] = vpx_highbd_convolve8; | ||||||
|         sf->highbd_predict[1][0][1] = vp9_highbd_convolve8_avg; |         sf->highbd_predict[1][0][1] = vpx_highbd_convolve8_avg; | ||||||
|       } |       } | ||||||
|     } else { |     } else { | ||||||
|       if (sf->y_step_q4 == 16) { |       if (sf->y_step_q4 == 16) { | ||||||
|         // No scaling in the y direction. Must always scale in the x direction. |         // No scaling in the y direction. Must always scale in the x direction. | ||||||
|         sf->highbd_predict[0][0][0] = vp9_highbd_convolve8_horiz; |         sf->highbd_predict[0][0][0] = vpx_highbd_convolve8_horiz; | ||||||
|         sf->highbd_predict[0][0][1] = vp9_highbd_convolve8_avg_horiz; |         sf->highbd_predict[0][0][1] = vpx_highbd_convolve8_avg_horiz; | ||||||
|         sf->highbd_predict[0][1][0] = vp9_highbd_convolve8; |         sf->highbd_predict[0][1][0] = vpx_highbd_convolve8; | ||||||
|         sf->highbd_predict[0][1][1] = vp9_highbd_convolve8_avg; |         sf->highbd_predict[0][1][1] = vpx_highbd_convolve8_avg; | ||||||
|         sf->highbd_predict[1][0][0] = vp9_highbd_convolve8_horiz; |         sf->highbd_predict[1][0][0] = vpx_highbd_convolve8_horiz; | ||||||
|         sf->highbd_predict[1][0][1] = vp9_highbd_convolve8_avg_horiz; |         sf->highbd_predict[1][0][1] = vpx_highbd_convolve8_avg_horiz; | ||||||
|       } else { |       } else { | ||||||
|         // Must always scale in both directions. |         // Must always scale in both directions. | ||||||
|         sf->highbd_predict[0][0][0] = vp9_highbd_convolve8; |         sf->highbd_predict[0][0][0] = vpx_highbd_convolve8; | ||||||
|         sf->highbd_predict[0][0][1] = vp9_highbd_convolve8_avg; |         sf->highbd_predict[0][0][1] = vpx_highbd_convolve8_avg; | ||||||
|         sf->highbd_predict[0][1][0] = vp9_highbd_convolve8; |         sf->highbd_predict[0][1][0] = vpx_highbd_convolve8; | ||||||
|         sf->highbd_predict[0][1][1] = vp9_highbd_convolve8_avg; |         sf->highbd_predict[0][1][1] = vpx_highbd_convolve8_avg; | ||||||
|         sf->highbd_predict[1][0][0] = vp9_highbd_convolve8; |         sf->highbd_predict[1][0][0] = vpx_highbd_convolve8; | ||||||
|         sf->highbd_predict[1][0][1] = vp9_highbd_convolve8_avg; |         sf->highbd_predict[1][0][1] = vpx_highbd_convolve8_avg; | ||||||
|       } |       } | ||||||
|     } |     } | ||||||
|     // 2D subpel motion always gets filtered in both directions. |     // 2D subpel motion always gets filtered in both directions. | ||||||
|     sf->highbd_predict[1][1][0] = vp9_highbd_convolve8; |     sf->highbd_predict[1][1][0] = vpx_highbd_convolve8; | ||||||
|     sf->highbd_predict[1][1][1] = vp9_highbd_convolve8_avg; |     sf->highbd_predict[1][1][1] = vpx_highbd_convolve8_avg; | ||||||
|   } |   } | ||||||
| #endif | #endif | ||||||
| } | } | ||||||
|   | |||||||
| @@ -12,7 +12,7 @@ | |||||||
| #define VP9_COMMON_VP9_SCALE_H_ | #define VP9_COMMON_VP9_SCALE_H_ | ||||||
|  |  | ||||||
| #include "vp9/common/vp9_mv.h" | #include "vp9/common/vp9_mv.h" | ||||||
| #include "vp9/common/vp9_convolve.h" | #include "vpx_dsp/vpx_convolve.h" | ||||||
|  |  | ||||||
| #ifdef __cplusplus | #ifdef __cplusplus | ||||||
| extern "C" { | extern "C" { | ||||||
|   | |||||||
| @@ -12,6 +12,7 @@ | |||||||
| #include <stdlib.h>  // qsort() | #include <stdlib.h>  // qsort() | ||||||
|  |  | ||||||
| #include "./vp9_rtcd.h" | #include "./vp9_rtcd.h" | ||||||
|  | #include "./vpx_dsp_rtcd.h" | ||||||
| #include "./vpx_scale_rtcd.h" | #include "./vpx_scale_rtcd.h" | ||||||
|  |  | ||||||
| #include "vpx_dsp/bitreader_buffer.h" | #include "vpx_dsp/bitreader_buffer.h" | ||||||
|   | |||||||
| @@ -8,12 +8,14 @@ | |||||||
|  *  be found in the AUTHORS file in the root of the source tree. |  *  be found in the AUTHORS file in the root of the source tree. | ||||||
|  */ |  */ | ||||||
|  |  | ||||||
| #include "./vpx_config.h" |  | ||||||
| #include "./vp9_rtcd.h" | #include "./vp9_rtcd.h" | ||||||
|  | #include "./vpx_config.h" | ||||||
|  | #include "./vpx_dsp_rtcd.h" | ||||||
| #include "vp9/common/vp9_common.h" | #include "vp9/common/vp9_common.h" | ||||||
| #include "vp9/common/vp9_convolve.h" |  | ||||||
| #include "vp9/common/vp9_filter.h" | #include "vp9/common/vp9_filter.h" | ||||||
| #include "vpx/vpx_integer.h" | #include "vpx/vpx_integer.h" | ||||||
|  | #include "vpx_dsp/vpx_convolve.h" | ||||||
|  | #include "vpx_dsp/vpx_filter.h" | ||||||
| #include "vpx_ports/mem.h" | #include "vpx_ports/mem.h" | ||||||
|  |  | ||||||
| static int horizontal_filter(const uint8_t *s) { | static int horizontal_filter(const uint8_t *s) { | ||||||
|   | |||||||
| @@ -10,6 +10,7 @@ | |||||||
|  |  | ||||||
| #include <assert.h> | #include <assert.h> | ||||||
| #include <limits.h> | #include <limits.h> | ||||||
|  | #include "./vpx_dsp_rtcd.h" | ||||||
| #include "vpx_scale/yv12config.h" | #include "vpx_scale/yv12config.h" | ||||||
| #include "vpx/vpx_integer.h" | #include "vpx/vpx_integer.h" | ||||||
| #include "vp9/common/vp9_reconinter.h" | #include "vp9/common/vp9_reconinter.h" | ||||||
| @@ -336,12 +337,12 @@ void vp9_denoiser_denoise(VP9_DENOISER *denoiser, MACROBLOCK *mb, | |||||||
|   } |   } | ||||||
|  |  | ||||||
|   if (decision == FILTER_BLOCK) { |   if (decision == FILTER_BLOCK) { | ||||||
|     vp9_convolve_copy(avg_start, avg.y_stride, src.buf, src.stride, |     vpx_convolve_copy(avg_start, avg.y_stride, src.buf, src.stride, | ||||||
|                       NULL, 0, NULL, 0, |                       NULL, 0, NULL, 0, | ||||||
|                       num_4x4_blocks_wide_lookup[bs] << 2, |                       num_4x4_blocks_wide_lookup[bs] << 2, | ||||||
|                       num_4x4_blocks_high_lookup[bs] << 2); |                       num_4x4_blocks_high_lookup[bs] << 2); | ||||||
|   } else {  // COPY_BLOCK |   } else {  // COPY_BLOCK | ||||||
|     vp9_convolve_copy(src.buf, src.stride, avg_start, avg.y_stride, |     vpx_convolve_copy(src.buf, src.stride, avg_start, avg.y_stride, | ||||||
|                       NULL, 0, NULL, 0, |                       NULL, 0, NULL, 0, | ||||||
|                       num_4x4_blocks_wide_lookup[bs] << 2, |                       num_4x4_blocks_wide_lookup[bs] << 2, | ||||||
|                       num_4x4_blocks_high_lookup[bs] << 2); |                       num_4x4_blocks_high_lookup[bs] << 2); | ||||||
|   | |||||||
| @@ -12,11 +12,12 @@ | |||||||
| #include <stdio.h> | #include <stdio.h> | ||||||
| #include <limits.h> | #include <limits.h> | ||||||
|  |  | ||||||
| #include "./vpx_config.h" |  | ||||||
| #include "./vp9_rtcd.h" | #include "./vp9_rtcd.h" | ||||||
|  | #include "./vpx_config.h" | ||||||
| #include "./vpx_dsp_rtcd.h" | #include "./vpx_dsp_rtcd.h" | ||||||
| #include "./vpx_scale_rtcd.h" | #include "./vpx_scale_rtcd.h" | ||||||
| #include "vpx/internal/vpx_psnr.h" | #include "vpx/internal/vpx_psnr.h" | ||||||
|  | #include "vpx_dsp/vpx_filter.h" | ||||||
| #include "vpx_ports/mem.h" | #include "vpx_ports/mem.h" | ||||||
| #include "vpx_ports/vpx_timer.h" | #include "vpx_ports/vpx_timer.h" | ||||||
| #include "vpx_scale/vpx_scale.h" | #include "vpx_scale/vpx_scale.h" | ||||||
| @@ -2580,18 +2581,18 @@ static void scale_and_extend_frame(const YV12_BUFFER_CONFIG *src, | |||||||
|  |  | ||||||
| #if CONFIG_VP9_HIGHBITDEPTH | #if CONFIG_VP9_HIGHBITDEPTH | ||||||
|         if (src->flags & YV12_FLAG_HIGHBITDEPTH) { |         if (src->flags & YV12_FLAG_HIGHBITDEPTH) { | ||||||
|           vp9_highbd_convolve8(src_ptr, src_stride, dst_ptr, dst_stride, |           vpx_highbd_convolve8(src_ptr, src_stride, dst_ptr, dst_stride, | ||||||
|                                kernel[x_q4 & 0xf], 16 * src_w / dst_w, |                                kernel[x_q4 & 0xf], 16 * src_w / dst_w, | ||||||
|                                kernel[y_q4 & 0xf], 16 * src_h / dst_h, |                                kernel[y_q4 & 0xf], 16 * src_h / dst_h, | ||||||
|                                16 / factor, 16 / factor, bd); |                                16 / factor, 16 / factor, bd); | ||||||
|         } else { |         } else { | ||||||
|           vp9_convolve8(src_ptr, src_stride, dst_ptr, dst_stride, |           vpx_convolve8(src_ptr, src_stride, dst_ptr, dst_stride, | ||||||
|                         kernel[x_q4 & 0xf], 16 * src_w / dst_w, |                         kernel[x_q4 & 0xf], 16 * src_w / dst_w, | ||||||
|                         kernel[y_q4 & 0xf], 16 * src_h / dst_h, |                         kernel[y_q4 & 0xf], 16 * src_h / dst_h, | ||||||
|                         16 / factor, 16 / factor); |                         16 / factor, 16 / factor); | ||||||
|         } |         } | ||||||
| #else | #else | ||||||
|         vp9_convolve8(src_ptr, src_stride, dst_ptr, dst_stride, |         vpx_convolve8(src_ptr, src_stride, dst_ptr, dst_stride, | ||||||
|                       kernel[x_q4 & 0xf], 16 * src_w / dst_w, |                       kernel[x_q4 & 0xf], 16 * src_w / dst_w, | ||||||
|                       kernel[y_q4 & 0xf], 16 * src_h / dst_h, |                       kernel[y_q4 & 0xf], 16 * src_h / dst_h, | ||||||
|                       16 / factor, 16 / factor); |                       16 / factor, 16 / factor); | ||||||
|   | |||||||
| @@ -1504,15 +1504,15 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, | |||||||
|         this_mode_pred = &tmp[get_pred_buffer(tmp, 3)]; |         this_mode_pred = &tmp[get_pred_buffer(tmp, 3)]; | ||||||
| #if CONFIG_VP9_HIGHBITDEPTH | #if CONFIG_VP9_HIGHBITDEPTH | ||||||
|         if (cm->use_highbitdepth) |         if (cm->use_highbitdepth) | ||||||
|           vp9_highbd_convolve_copy(best_pred->data, best_pred->stride, |           vpx_highbd_convolve_copy(best_pred->data, best_pred->stride, | ||||||
|                                    this_mode_pred->data, this_mode_pred->stride, |                                    this_mode_pred->data, this_mode_pred->stride, | ||||||
|                                    NULL, 0, NULL, 0, bw, bh, xd->bd); |                                    NULL, 0, NULL, 0, bw, bh, xd->bd); | ||||||
|         else |         else | ||||||
|           vp9_convolve_copy(best_pred->data, best_pred->stride, |           vpx_convolve_copy(best_pred->data, best_pred->stride, | ||||||
|                           this_mode_pred->data, this_mode_pred->stride, |                           this_mode_pred->data, this_mode_pred->stride, | ||||||
|                           NULL, 0, NULL, 0, bw, bh); |                           NULL, 0, NULL, 0, bw, bh); | ||||||
| #else | #else | ||||||
|         vp9_convolve_copy(best_pred->data, best_pred->stride, |         vpx_convolve_copy(best_pred->data, best_pred->stride, | ||||||
|                           this_mode_pred->data, this_mode_pred->stride, |                           this_mode_pred->data, this_mode_pred->stride, | ||||||
|                           NULL, 0, NULL, 0, bw, bh); |                           NULL, 0, NULL, 0, bw, bh); | ||||||
| #endif  // CONFIG_VP9_HIGHBITDEPTH | #endif  // CONFIG_VP9_HIGHBITDEPTH | ||||||
| @@ -1577,15 +1577,15 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, | |||||||
|     if (best_pred->data != orig_dst.buf && is_inter_mode(mbmi->mode)) { |     if (best_pred->data != orig_dst.buf && is_inter_mode(mbmi->mode)) { | ||||||
| #if CONFIG_VP9_HIGHBITDEPTH | #if CONFIG_VP9_HIGHBITDEPTH | ||||||
|       if (cm->use_highbitdepth) |       if (cm->use_highbitdepth) | ||||||
|         vp9_highbd_convolve_copy(best_pred->data, best_pred->stride, |         vpx_highbd_convolve_copy(best_pred->data, best_pred->stride, | ||||||
|                                  pd->dst.buf, pd->dst.stride, NULL, 0, |                                  pd->dst.buf, pd->dst.stride, NULL, 0, | ||||||
|                                  NULL, 0, bw, bh, xd->bd); |                                  NULL, 0, bw, bh, xd->bd); | ||||||
|       else |       else | ||||||
|         vp9_convolve_copy(best_pred->data, best_pred->stride, |         vpx_convolve_copy(best_pred->data, best_pred->stride, | ||||||
|                           pd->dst.buf, pd->dst.stride, NULL, 0, |                           pd->dst.buf, pd->dst.stride, NULL, 0, | ||||||
|                           NULL, 0, bw, bh); |                           NULL, 0, bw, bh); | ||||||
| #else | #else | ||||||
|       vp9_convolve_copy(best_pred->data, best_pred->stride, |       vpx_convolve_copy(best_pred->data, best_pred->stride, | ||||||
|                         pd->dst.buf, pd->dst.stride, NULL, 0, |                         pd->dst.buf, pd->dst.stride, NULL, 0, | ||||||
|                         NULL, 0, bw, bh); |                         NULL, 0, bw, bh); | ||||||
| #endif  // CONFIG_VP9_HIGHBITDEPTH | #endif  // CONFIG_VP9_HIGHBITDEPTH | ||||||
|   | |||||||
| @@ -15,6 +15,9 @@ | |||||||
| #include <stdlib.h> | #include <stdlib.h> | ||||||
| #include <string.h> | #include <string.h> | ||||||
|  |  | ||||||
|  | #if CONFIG_VP9_HIGHBITDEPTH | ||||||
|  | #include "vpx_dsp/vpx_dsp_common.h" | ||||||
|  | #endif  // CONFIG_VP9_HIGHBITDEPTH | ||||||
| #include "vpx_ports/mem.h" | #include "vpx_ports/mem.h" | ||||||
| #include "vp9/common/vp9_common.h" | #include "vp9/common/vp9_common.h" | ||||||
| #include "vp9/encoder/vp9_resize.h" | #include "vp9/encoder/vp9_resize.h" | ||||||
|   | |||||||
| @@ -13,14 +13,10 @@ VP9_COMMON_SRCS-yes += vp9_iface_common.h | |||||||
| VP9_COMMON_SRCS-yes += common/vp9_ppflags.h | VP9_COMMON_SRCS-yes += common/vp9_ppflags.h | ||||||
| VP9_COMMON_SRCS-yes += common/vp9_alloccommon.c | VP9_COMMON_SRCS-yes += common/vp9_alloccommon.c | ||||||
| VP9_COMMON_SRCS-yes += common/vp9_blockd.c | VP9_COMMON_SRCS-yes += common/vp9_blockd.c | ||||||
| VP9_COMMON_SRCS-yes += common/vp9_convolve.c |  | ||||||
| VP9_COMMON_SRCS-yes += common/vp9_convolve.h |  | ||||||
| VP9_COMMON_SRCS-yes += common/vp9_debugmodes.c | VP9_COMMON_SRCS-yes += common/vp9_debugmodes.c | ||||||
| VP9_COMMON_SRCS-yes += common/vp9_entropy.c | VP9_COMMON_SRCS-yes += common/vp9_entropy.c | ||||||
| VP9_COMMON_SRCS-yes += common/vp9_entropymode.c | VP9_COMMON_SRCS-yes += common/vp9_entropymode.c | ||||||
| VP9_COMMON_SRCS-yes += common/vp9_entropymv.c | VP9_COMMON_SRCS-yes += common/vp9_entropymv.c | ||||||
| VP9_COMMON_SRCS-yes += common/vp9_filter.c |  | ||||||
| VP9_COMMON_SRCS-yes += common/vp9_filter.h |  | ||||||
| VP9_COMMON_SRCS-yes += common/vp9_frame_buffers.c | VP9_COMMON_SRCS-yes += common/vp9_frame_buffers.c | ||||||
| VP9_COMMON_SRCS-yes += common/vp9_frame_buffers.h | VP9_COMMON_SRCS-yes += common/vp9_frame_buffers.h | ||||||
| VP9_COMMON_SRCS-yes += common/vp9_idct.c | VP9_COMMON_SRCS-yes += common/vp9_idct.c | ||||||
| @@ -31,6 +27,8 @@ VP9_COMMON_SRCS-yes += common/vp9_entropy.h | |||||||
| VP9_COMMON_SRCS-yes += common/vp9_entropymode.h | VP9_COMMON_SRCS-yes += common/vp9_entropymode.h | ||||||
| VP9_COMMON_SRCS-yes += common/vp9_entropymv.h | VP9_COMMON_SRCS-yes += common/vp9_entropymv.h | ||||||
| VP9_COMMON_SRCS-yes += common/vp9_enums.h | VP9_COMMON_SRCS-yes += common/vp9_enums.h | ||||||
|  | VP9_COMMON_SRCS-yes += common/vp9_filter.h | ||||||
|  | VP9_COMMON_SRCS-yes += common/vp9_filter.c | ||||||
| VP9_COMMON_SRCS-yes += common/vp9_idct.h | VP9_COMMON_SRCS-yes += common/vp9_idct.h | ||||||
| VP9_COMMON_SRCS-yes += common/vp9_loopfilter.h | VP9_COMMON_SRCS-yes += common/vp9_loopfilter.h | ||||||
| VP9_COMMON_SRCS-yes += common/vp9_thread_common.h | VP9_COMMON_SRCS-yes += common/vp9_thread_common.h | ||||||
| @@ -64,33 +62,16 @@ VP9_COMMON_SRCS-yes += common/vp9_common_data.h | |||||||
| VP9_COMMON_SRCS-yes += common/vp9_scan.c | VP9_COMMON_SRCS-yes += common/vp9_scan.c | ||||||
| VP9_COMMON_SRCS-yes += common/vp9_scan.h | VP9_COMMON_SRCS-yes += common/vp9_scan.h | ||||||
|  |  | ||||||
| VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/convolve.h |  | ||||||
| VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_asm_stubs.c |  | ||||||
| VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_postproc.h | VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_postproc.h | ||||||
| VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_postproc.c | VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_postproc.c | ||||||
| VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_mfqe.h | VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_mfqe.h | ||||||
| VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_mfqe.c | VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_mfqe.c | ||||||
| VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_idct_sse2.asm | VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_idct_sse2.asm | ||||||
| VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_subpixel_8t_sse2.asm |  | ||||||
| VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_subpixel_bilinear_sse2.asm |  | ||||||
| VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_subpixel_8t_ssse3.asm |  | ||||||
| VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_subpixel_bilinear_ssse3.asm |  | ||||||
| VP9_COMMON_SRCS-$(HAVE_AVX2) += common/x86/vp9_subpixel_8t_intrin_avx2.c |  | ||||||
| VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_subpixel_8t_intrin_ssse3.c |  | ||||||
| ifeq ($(CONFIG_VP9_POSTPROC),yes) | ifeq ($(CONFIG_VP9_POSTPROC),yes) | ||||||
| VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_mfqe_sse2.asm | VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_mfqe_sse2.asm | ||||||
| VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_postproc_sse2.asm | VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_postproc_sse2.asm | ||||||
| endif | endif | ||||||
|  |  | ||||||
| ifeq ($(CONFIG_USE_X86INC),yes) |  | ||||||
| VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_copy_sse2.asm |  | ||||||
| endif |  | ||||||
|  |  | ||||||
| ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes) |  | ||||||
| VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_high_subpixel_8t_sse2.asm |  | ||||||
| VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_high_subpixel_bilinear_sse2.asm |  | ||||||
| endif |  | ||||||
|  |  | ||||||
| # common (c) | # common (c) | ||||||
| VP9_COMMON_SRCS-$(HAVE_DSPR2)  += common/mips/dspr2/vp9_common_dspr2.h | VP9_COMMON_SRCS-$(HAVE_DSPR2)  += common/mips/dspr2/vp9_common_dspr2.h | ||||||
| VP9_COMMON_SRCS-$(HAVE_DSPR2)  += common/mips/dspr2/vp9_convolve2_avg_dspr2.c | VP9_COMMON_SRCS-$(HAVE_DSPR2)  += common/mips/dspr2/vp9_convolve2_avg_dspr2.c | ||||||
| @@ -113,15 +94,6 @@ VP9_COMMON_SRCS-$(HAVE_DSPR2)  += common/mips/dspr2/vp9_itrans32_dspr2.c | |||||||
| endif | endif | ||||||
|  |  | ||||||
| # common (msa) | # common (msa) | ||||||
| VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_convolve8_avg_horiz_msa.c |  | ||||||
| VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_convolve8_avg_msa.c |  | ||||||
| VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_convolve8_avg_vert_msa.c |  | ||||||
| VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_convolve8_horiz_msa.c |  | ||||||
| VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_convolve8_msa.c |  | ||||||
| VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_convolve8_vert_msa.c |  | ||||||
| VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_convolve_avg_msa.c |  | ||||||
| VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_convolve_copy_msa.c |  | ||||||
| VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_convolve_msa.h |  | ||||||
| VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_idct4x4_msa.c | VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_idct4x4_msa.c | ||||||
| VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_idct8x8_msa.c | VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_idct8x8_msa.c | ||||||
| VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_idct16x16_msa.c | VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_idct16x16_msa.c | ||||||
| @@ -151,11 +123,6 @@ endif | |||||||
| # neon with assembly and intrinsics implementations. If both are available | # neon with assembly and intrinsics implementations. If both are available | ||||||
| # prefer assembly. | # prefer assembly. | ||||||
| ifeq ($(HAVE_NEON_ASM), yes) | ifeq ($(HAVE_NEON_ASM), yes) | ||||||
| VP9_COMMON_SRCS-yes += common/arm/neon/vp9_convolve8_avg_neon_asm$(ASM) |  | ||||||
| VP9_COMMON_SRCS-yes += common/arm/neon/vp9_convolve8_neon_asm$(ASM) |  | ||||||
| VP9_COMMON_SRCS-yes += common/arm/neon/vp9_convolve_avg_neon_asm$(ASM) |  | ||||||
| VP9_COMMON_SRCS-yes += common/arm/neon/vp9_convolve_neon.c |  | ||||||
| VP9_COMMON_SRCS-yes += common/arm/neon/vp9_copy_neon_asm$(ASM) |  | ||||||
| VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct16x16_1_add_neon_asm$(ASM) | VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct16x16_1_add_neon_asm$(ASM) | ||||||
| VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct16x16_add_neon_asm$(ASM) | VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct16x16_add_neon_asm$(ASM) | ||||||
| VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct16x16_neon.c | VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct16x16_neon.c | ||||||
| @@ -167,11 +134,6 @@ VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct8x8_1_add_neon_asm$(ASM) | |||||||
| VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct8x8_add_neon_asm$(ASM) | VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct8x8_add_neon_asm$(ASM) | ||||||
| else | else | ||||||
| ifeq ($(HAVE_NEON), yes) | ifeq ($(HAVE_NEON), yes) | ||||||
| VP9_COMMON_SRCS-yes += common/arm/neon/vp9_convolve8_avg_neon.c |  | ||||||
| VP9_COMMON_SRCS-yes += common/arm/neon/vp9_convolve8_neon.c |  | ||||||
| VP9_COMMON_SRCS-yes += common/arm/neon/vp9_convolve_avg_neon.c |  | ||||||
| VP9_COMMON_SRCS-yes += common/arm/neon/vp9_convolve_neon.c |  | ||||||
| VP9_COMMON_SRCS-yes += common/arm/neon/vp9_copy_neon.c |  | ||||||
| VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct16x16_1_add_neon.c | VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct16x16_1_add_neon.c | ||||||
| VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct16x16_add_neon.c | VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct16x16_add_neon.c | ||||||
| VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct16x16_neon.c | VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct16x16_neon.c | ||||||
|   | |||||||
							
								
								
									
										393
									
								
								vpx_dsp/arm/vpx_convolve8_avg_neon.c
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										393
									
								
								vpx_dsp/arm/vpx_convolve8_avg_neon.c
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,393 @@ | |||||||
|  | /* | ||||||
|  |  *  Copyright (c) 2014 The WebM project authors. All Rights Reserved. | ||||||
|  |  * | ||||||
|  |  *  Use of this source code is governed by a BSD-style license | ||||||
|  |  *  that can be found in the LICENSE file in the root of the source | ||||||
|  |  *  tree. An additional intellectual property rights grant can be found | ||||||
|  |  *  in the file PATENTS.  All contributing project authors may | ||||||
|  |  *  be found in the AUTHORS file in the root of the source tree. | ||||||
|  |  */ | ||||||
|  |  | ||||||
|  | #include <arm_neon.h> | ||||||
|  |  | ||||||
|  | #include "./vpx_config.h" | ||||||
|  | #include "./vpx_dsp_rtcd.h" | ||||||
|  | #include "vpx/vpx_integer.h" | ||||||
|  | #include "vpx_ports/mem.h" | ||||||
|  |  | ||||||
|  | // Forward declarations of the plain C implementations. The NEON entry points | ||||||
|  | // in this file hand the whole call over to them whenever the relevant step | ||||||
|  | // (x_step_q4 / y_step_q4) is not 16. | ||||||
|  | void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, | ||||||
|  |                                uint8_t *dst, ptrdiff_t dst_stride, | ||||||
|  |                                const int16_t *filter_x, int x_step_q4, | ||||||
|  |                                const int16_t *filter_y, int y_step_q4, | ||||||
|  |                                int w, int h); | ||||||
|  | void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, | ||||||
|  |                                uint8_t *dst, ptrdiff_t dst_stride, | ||||||
|  |                                const int16_t *filter_x, int x_step_q4, | ||||||
|  |                                const int16_t *filter_y, int y_step_q4, | ||||||
|  |                                int w, int h); | ||||||
|  |  | ||||||
|  | // Eight-tap multiply-accumulate on four lanes at once. | ||||||
|  | // Returns dsrc0 * q0s16[0] + dsrc1 * q0s16[1] + ... + dsrc7 * q0s16[7], | ||||||
|  | // computed per lane with widening to 32 bits (no rounding or shifting is | ||||||
|  | // done here; callers narrow the result themselves). | ||||||
|  | static INLINE int32x4_t MULTIPLY_BY_Q0( | ||||||
|  |     int16x4_t dsrc0, | ||||||
|  |     int16x4_t dsrc1, | ||||||
|  |     int16x4_t dsrc2, | ||||||
|  |     int16x4_t dsrc3, | ||||||
|  |     int16x4_t dsrc4, | ||||||
|  |     int16x4_t dsrc5, | ||||||
|  |     int16x4_t dsrc6, | ||||||
|  |     int16x4_t dsrc7, | ||||||
|  |     int16x8_t q0s16) { | ||||||
|  |   // Split the eight coefficients into the halves the lane intrinsics need. | ||||||
|  |   const int16x4_t taps_lo = vget_low_s16(q0s16);   // coefficients 0..3 | ||||||
|  |   const int16x4_t taps_hi = vget_high_s16(q0s16);  // coefficients 4..7 | ||||||
|  |   int32x4_t acc; | ||||||
|  |  | ||||||
|  |   acc = vmull_lane_s16(dsrc0, taps_lo, 0); | ||||||
|  |   acc = vmlal_lane_s16(acc, dsrc1, taps_lo, 1); | ||||||
|  |   acc = vmlal_lane_s16(acc, dsrc2, taps_lo, 2); | ||||||
|  |   acc = vmlal_lane_s16(acc, dsrc3, taps_lo, 3); | ||||||
|  |   acc = vmlal_lane_s16(acc, dsrc4, taps_hi, 0); | ||||||
|  |   acc = vmlal_lane_s16(acc, dsrc5, taps_hi, 1); | ||||||
|  |   acc = vmlal_lane_s16(acc, dsrc6, taps_hi, 2); | ||||||
|  |   acc = vmlal_lane_s16(acc, dsrc7, taps_hi, 3); | ||||||
|  |   return acc; | ||||||
|  | } | ||||||
|  |  | ||||||
|  | // Horizontal 8-tap convolution whose result is averaged (with rounding) into | ||||||
|  | // the pixels already stored in dst. | ||||||
|  | // | ||||||
|  | // Only the unscaled case (x_step_q4 == 16) is implemented with NEON; any | ||||||
|  | // other step is forwarded unchanged to vpx_convolve8_avg_horiz_c(). | ||||||
|  | // | ||||||
|  | //   src, src_stride  source pixels and row pitch | ||||||
|  | //   dst, dst_stride  destination pixels and row pitch (read and written) | ||||||
|  | //   filter_x         eight 16-bit taps, loaded with a single vld1q_s16 | ||||||
|  | //   filter_y, y_step_q4  only passed through to the C fallback | ||||||
|  | //   w, h             block size; the loops consume 4 columns / 4 rows per | ||||||
|  | //                    iteration, so both are presumably multiples of 4 | ||||||
|  | //                    (TODO confirm against callers) | ||||||
|  | void vpx_convolve8_avg_horiz_neon( | ||||||
|  |     const uint8_t *src, | ||||||
|  |     ptrdiff_t src_stride, | ||||||
|  |     uint8_t *dst, | ||||||
|  |     ptrdiff_t dst_stride, | ||||||
|  |     const int16_t *filter_x, | ||||||
|  |     int x_step_q4, | ||||||
|  |     const int16_t *filter_y,  // unused | ||||||
|  |     int y_step_q4,            // unused | ||||||
|  |     int w, | ||||||
|  |     int h) { | ||||||
|  |   int width; | ||||||
|  |   const uint8_t *s; | ||||||
|  |   uint8_t *d; | ||||||
|  |   uint8x8_t d2u8, d3u8, d24u8, d25u8, d26u8, d27u8, d28u8, d29u8; | ||||||
|  |   uint32x2_t d2u32, d3u32, d28u32, d29u32, d30u32, d31u32; | ||||||
|  |   // vld1_lane_u32() takes the previous vector value as an input operand, so | ||||||
|  |   // these must not be read uninitialised. Both lanes are overwritten by loads | ||||||
|  |   // before the vectors are consumed, so the zero never reaches the output. | ||||||
|  |   uint32x2_t d6u32 = vdup_n_u32(0); | ||||||
|  |   uint32x2_t d7u32 = vdup_n_u32(0); | ||||||
|  |   uint8x16_t q1u8, q3u8, q12u8, q13u8, q14u8, q15u8; | ||||||
|  |   int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d22s16, d23s16; | ||||||
|  |   int16x4_t d24s16, d25s16, d26s16, d27s16; | ||||||
|  |   uint16x4_t d2u16, d3u16, d4u16, d5u16, d16u16, d17u16, d18u16, d19u16; | ||||||
|  |   int16x8_t q0s16; | ||||||
|  |   uint16x8_t q1u16, q2u16, q8u16, q9u16, q10u16, q11u16, q12u16, q13u16; | ||||||
|  |   int32x4_t q1s32, q2s32, q14s32, q15s32; | ||||||
|  |   uint16x8x2_t q0x2u16; | ||||||
|  |   uint8x8x2_t d0x2u8, d1x2u8; | ||||||
|  |   uint32x2x2_t d0x2u32; | ||||||
|  |   uint16x4x2_t d0x2u16, d1x2u16; | ||||||
|  |   uint32x4x2_t q0x2u32; | ||||||
|  |  | ||||||
|  |   // Scaled prediction is not handled here: defer to the C implementation. | ||||||
|  |   if (x_step_q4 != 16) { | ||||||
|  |     vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, | ||||||
|  |                               filter_x, x_step_q4, | ||||||
|  |                               filter_y, y_step_q4, w, h); | ||||||
|  |     return; | ||||||
|  |   } | ||||||
|  |  | ||||||
|  |   q0s16 = vld1q_s16(filter_x); | ||||||
|  |  | ||||||
|  |   src -= 3;  // adjust for taps | ||||||
|  |   for (; h > 0; h -= 4) {  // loop_horiz_v | ||||||
|  |     // Prime the filter window: 8 bytes from each of four consecutive rows. | ||||||
|  |     s = src; | ||||||
|  |     d24u8 = vld1_u8(s); | ||||||
|  |     s += src_stride; | ||||||
|  |     d25u8 = vld1_u8(s); | ||||||
|  |     s += src_stride; | ||||||
|  |     d26u8 = vld1_u8(s); | ||||||
|  |     s += src_stride; | ||||||
|  |     d27u8 = vld1_u8(s); | ||||||
|  |  | ||||||
|  |     q12u8 = vcombine_u8(d24u8, d25u8); | ||||||
|  |     q13u8 = vcombine_u8(d26u8, d27u8); | ||||||
|  |  | ||||||
|  |     // 16-bit then 8-bit vtrn steps regroup the four rows by column position. | ||||||
|  |     q0x2u16 = vtrnq_u16(vreinterpretq_u16_u8(q12u8), | ||||||
|  |                         vreinterpretq_u16_u8(q13u8)); | ||||||
|  |     d24u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[0])); | ||||||
|  |     d25u8 = vreinterpret_u8_u16(vget_high_u16(q0x2u16.val[0])); | ||||||
|  |     d26u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[1])); | ||||||
|  |     d27u8 = vreinterpret_u8_u16(vget_high_u16(q0x2u16.val[1])); | ||||||
|  |     d0x2u8 = vtrn_u8(d24u8, d25u8); | ||||||
|  |     d1x2u8 = vtrn_u8(d26u8, d27u8); | ||||||
|  |  | ||||||
|  |     __builtin_prefetch(src + src_stride * 4); | ||||||
|  |     __builtin_prefetch(src + src_stride * 5); | ||||||
|  |  | ||||||
|  |     // Widen to 16 bits so the samples can feed the signed multiplies. | ||||||
|  |     q8u16 = vmovl_u8(d0x2u8.val[0]); | ||||||
|  |     q9u16 = vmovl_u8(d0x2u8.val[1]); | ||||||
|  |     q10u16 = vmovl_u8(d1x2u8.val[0]); | ||||||
|  |     q11u16 = vmovl_u8(d1x2u8.val[1]); | ||||||
|  |  | ||||||
|  |     // Step past the bytes consumed above; undone again after the inner loop. | ||||||
|  |     src += 7; | ||||||
|  |     d16u16 = vget_low_u16(q8u16); | ||||||
|  |     d17u16 = vget_high_u16(q8u16); | ||||||
|  |     d18u16 = vget_low_u16(q9u16); | ||||||
|  |     d19u16 = vget_high_u16(q9u16); | ||||||
|  |     q8u16 = vcombine_u16(d16u16, d18u16);  // vswp 17 18 | ||||||
|  |     q9u16 = vcombine_u16(d17u16, d19u16); | ||||||
|  |  | ||||||
|  |     d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16)); | ||||||
|  |     d23s16 = vreinterpret_s16_u16(vget_high_u16(q10u16));  // vmov 23 21 | ||||||
|  |     for (width = w; | ||||||
|  |          width > 0; | ||||||
|  |          width -= 4, src += 4, dst += 4) {  // loop_horiz | ||||||
|  |       // Fetch the next 4 source bytes of each of the four rows. | ||||||
|  |       s = src; | ||||||
|  |       d28u32 = vld1_dup_u32((const uint32_t *)s); | ||||||
|  |       s += src_stride; | ||||||
|  |       d29u32 = vld1_dup_u32((const uint32_t *)s); | ||||||
|  |       s += src_stride; | ||||||
|  |       d31u32 = vld1_dup_u32((const uint32_t *)s); | ||||||
|  |       s += src_stride; | ||||||
|  |       d30u32 = vld1_dup_u32((const uint32_t *)s); | ||||||
|  |  | ||||||
|  |       __builtin_prefetch(src + 64); | ||||||
|  |  | ||||||
|  |       d0x2u16 = vtrn_u16(vreinterpret_u16_u32(d28u32), | ||||||
|  |                          vreinterpret_u16_u32(d31u32)); | ||||||
|  |       d1x2u16 = vtrn_u16(vreinterpret_u16_u32(d29u32), | ||||||
|  |                          vreinterpret_u16_u32(d30u32)); | ||||||
|  |       d0x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[0]),   // d28 | ||||||
|  |                        vreinterpret_u8_u16(d1x2u16.val[0]));  // d29 | ||||||
|  |       d1x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[1]),   // d31 | ||||||
|  |                        vreinterpret_u8_u16(d1x2u16.val[1]));  // d30 | ||||||
|  |  | ||||||
|  |       __builtin_prefetch(src + 64 + src_stride); | ||||||
|  |  | ||||||
|  |       q14u8 = vcombine_u8(d0x2u8.val[0], d0x2u8.val[1]); | ||||||
|  |       q15u8 = vcombine_u8(d1x2u8.val[1], d1x2u8.val[0]); | ||||||
|  |       q0x2u32 = vtrnq_u32(vreinterpretq_u32_u8(q14u8), | ||||||
|  |                           vreinterpretq_u32_u8(q15u8)); | ||||||
|  |  | ||||||
|  |       d28u8 = vreinterpret_u8_u32(vget_low_u32(q0x2u32.val[0])); | ||||||
|  |       d29u8 = vreinterpret_u8_u32(vget_high_u32(q0x2u32.val[0])); | ||||||
|  |       q12u16 = vmovl_u8(d28u8); | ||||||
|  |       q13u16 = vmovl_u8(d29u8); | ||||||
|  |  | ||||||
|  |       __builtin_prefetch(src + 64 + src_stride * 2); | ||||||
|  |  | ||||||
|  |       // Current destination 4x4 tile: d6 = rows 0 and 2, d7 = rows 1 and 3. | ||||||
|  |       d = dst; | ||||||
|  |       d6u32 = vld1_lane_u32((const uint32_t *)d, d6u32, 0); | ||||||
|  |       d += dst_stride; | ||||||
|  |       d7u32 = vld1_lane_u32((const uint32_t *)d, d7u32, 0); | ||||||
|  |       d += dst_stride; | ||||||
|  |       d6u32 = vld1_lane_u32((const uint32_t *)d, d6u32, 1); | ||||||
|  |       d += dst_stride; | ||||||
|  |       d7u32 = vld1_lane_u32((const uint32_t *)d, d7u32, 1); | ||||||
|  |  | ||||||
|  |       d16s16 = vreinterpret_s16_u16(vget_low_u16(q8u16)); | ||||||
|  |       d17s16 = vreinterpret_s16_u16(vget_high_u16(q8u16)); | ||||||
|  |       d18s16 = vreinterpret_s16_u16(vget_low_u16(q9u16)); | ||||||
|  |       d19s16 = vreinterpret_s16_u16(vget_high_u16(q9u16)); | ||||||
|  |       d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16)); | ||||||
|  |       d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16)); | ||||||
|  |       d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16)); | ||||||
|  |       d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16)); | ||||||
|  |       d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16)); | ||||||
|  |  | ||||||
|  |       // Four filter evaluations; each call shifts the 8-sample window by | ||||||
|  |       // one position. | ||||||
|  |       q1s32  = MULTIPLY_BY_Q0(d16s16, d17s16, d20s16, d22s16, | ||||||
|  |                               d18s16, d19s16, d23s16, d24s16, q0s16); | ||||||
|  |       q2s32  = MULTIPLY_BY_Q0(d17s16, d20s16, d22s16, d18s16, | ||||||
|  |                               d19s16, d23s16, d24s16, d26s16, q0s16); | ||||||
|  |       q14s32 = MULTIPLY_BY_Q0(d20s16, d22s16, d18s16, d19s16, | ||||||
|  |                               d23s16, d24s16, d26s16, d27s16, q0s16); | ||||||
|  |       q15s32 = MULTIPLY_BY_Q0(d22s16, d18s16, d19s16, d23s16, | ||||||
|  |                               d24s16, d26s16, d27s16, d25s16, q0s16); | ||||||
|  |  | ||||||
|  |       __builtin_prefetch(src + 64 + src_stride * 3); | ||||||
|  |  | ||||||
|  |       // Rounding shift right by 7 with unsigned saturation to 16 bits. | ||||||
|  |       d2u16 = vqrshrun_n_s32(q1s32, 7); | ||||||
|  |       d3u16 = vqrshrun_n_s32(q2s32, 7); | ||||||
|  |       d4u16 = vqrshrun_n_s32(q14s32, 7); | ||||||
|  |       d5u16 = vqrshrun_n_s32(q15s32, 7); | ||||||
|  |  | ||||||
|  |       q1u16 = vcombine_u16(d2u16, d3u16); | ||||||
|  |       q2u16 = vcombine_u16(d4u16, d5u16); | ||||||
|  |  | ||||||
|  |       // Saturating narrow to 8-bit pixels. | ||||||
|  |       d2u8 = vqmovn_u16(q1u16); | ||||||
|  |       d3u8 = vqmovn_u16(q2u16); | ||||||
|  |  | ||||||
|  |       // Rearrange the results into the row layout used for the dst tile. | ||||||
|  |       d0x2u16 = vtrn_u16(vreinterpret_u16_u8(d2u8), | ||||||
|  |                          vreinterpret_u16_u8(d3u8)); | ||||||
|  |       d0x2u32 = vtrn_u32(vreinterpret_u32_u16(d0x2u16.val[0]), | ||||||
|  |                          vreinterpret_u32_u16(d0x2u16.val[1])); | ||||||
|  |       d0x2u8 = vtrn_u8(vreinterpret_u8_u32(d0x2u32.val[0]), | ||||||
|  |                        vreinterpret_u8_u32(d0x2u32.val[1])); | ||||||
|  |  | ||||||
|  |       q1u8 = vcombine_u8(d0x2u8.val[0], d0x2u8.val[1]); | ||||||
|  |       q3u8 = vreinterpretq_u8_u32(vcombine_u32(d6u32, d7u32)); | ||||||
|  |  | ||||||
|  |       // The "avg" step: rounding halving add with the existing dst pixels. | ||||||
|  |       q1u8 = vrhaddq_u8(q1u8, q3u8); | ||||||
|  |  | ||||||
|  |       d2u32 = vreinterpret_u32_u8(vget_low_u8(q1u8)); | ||||||
|  |       d3u32 = vreinterpret_u32_u8(vget_high_u8(q1u8)); | ||||||
|  |  | ||||||
|  |       // Store in the same lane order the tile was loaded in. | ||||||
|  |       d = dst; | ||||||
|  |       vst1_lane_u32((uint32_t *)d, d2u32, 0); | ||||||
|  |       d += dst_stride; | ||||||
|  |       vst1_lane_u32((uint32_t *)d, d3u32, 0); | ||||||
|  |       d += dst_stride; | ||||||
|  |       vst1_lane_u32((uint32_t *)d, d2u32, 1); | ||||||
|  |       d += dst_stride; | ||||||
|  |       vst1_lane_u32((uint32_t *)d, d3u32, 1); | ||||||
|  |  | ||||||
|  |       // Slide the sample window along for the next group of 4 columns. | ||||||
|  |       q8u16 = q9u16; | ||||||
|  |       d20s16 = d23s16; | ||||||
|  |       q11u16 = q12u16; | ||||||
|  |       q9u16 = q13u16; | ||||||
|  |       d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16)); | ||||||
|  |     } | ||||||
|  |     // Undo the +7 above and the columns advanced by the inner loop, then | ||||||
|  |     // move both pointers down four rows. | ||||||
|  |     src += src_stride * 4 - w - 7; | ||||||
|  |     dst += dst_stride * 4 - w; | ||||||
|  |   } | ||||||
|  |   return; | ||||||
|  | } | ||||||
|  |  | ||||||
|  | // Vertical 8-tap convolution whose result is averaged (with rounding) into | ||||||
|  | // the pixels already stored in dst. | ||||||
|  | // | ||||||
|  | // Only the unscaled case (y_step_q4 == 16) is implemented with NEON; any | ||||||
|  | // other step is forwarded unchanged to vpx_convolve8_avg_vert_c(). | ||||||
|  | // | ||||||
|  | //   src, src_stride  source pixels and row pitch | ||||||
|  | //   dst, dst_stride  destination pixels and row pitch (read and written) | ||||||
|  | //   filter_x, x_step_q4  only passed through to the C fallback | ||||||
|  | //   filter_y         eight 16-bit taps, loaded with a single vld1q_s16 | ||||||
|  | //   w, h             block size; the loops consume 4 columns / 4 rows per | ||||||
|  | //                    iteration, so both are presumably multiples of 4 | ||||||
|  | //                    (TODO confirm against callers) | ||||||
|  | void vpx_convolve8_avg_vert_neon( | ||||||
|  |     const uint8_t *src, | ||||||
|  |     ptrdiff_t src_stride, | ||||||
|  |     uint8_t *dst, | ||||||
|  |     ptrdiff_t dst_stride, | ||||||
|  |     const int16_t *filter_x,  // unused | ||||||
|  |     int x_step_q4,            // unused | ||||||
|  |     const int16_t *filter_y, | ||||||
|  |     int y_step_q4, | ||||||
|  |     int w, | ||||||
|  |     int h) { | ||||||
|  |   int height; | ||||||
|  |   const uint8_t *s; | ||||||
|  |   uint8_t *d; | ||||||
|  |   uint8x8_t d2u8, d3u8; | ||||||
|  |   uint32x2_t d2u32, d3u32; | ||||||
|  |   // vld1_lane_u32() takes the previous vector value as an input operand, so | ||||||
|  |   // every vector filled lane-by-lane starts from a defined value. Lanes that | ||||||
|  |   // feed the filter are always overwritten by a load first; lane 1 of d22u32 | ||||||
|  |   // is never loaded and its widened half is never read. | ||||||
|  |   uint32x2_t d6u32 = vdup_n_u32(0); | ||||||
|  |   uint32x2_t d7u32 = vdup_n_u32(0); | ||||||
|  |   uint32x2_t d16u32 = vdup_n_u32(0); | ||||||
|  |   uint32x2_t d18u32 = vdup_n_u32(0); | ||||||
|  |   uint32x2_t d20u32 = vdup_n_u32(0); | ||||||
|  |   uint32x2_t d22u32 = vdup_n_u32(0); | ||||||
|  |   uint32x2_t d24u32 = vdup_n_u32(0); | ||||||
|  |   uint32x2_t d26u32 = vdup_n_u32(0); | ||||||
|  |   uint8x16_t q1u8, q3u8; | ||||||
|  |   int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16; | ||||||
|  |   int16x4_t d24s16, d25s16, d26s16, d27s16; | ||||||
|  |   uint16x4_t d2u16, d3u16, d4u16, d5u16; | ||||||
|  |   int16x8_t q0s16; | ||||||
|  |   uint16x8_t q1u16, q2u16, q8u16, q9u16, q10u16, q11u16, q12u16, q13u16; | ||||||
|  |   int32x4_t q1s32, q2s32, q14s32, q15s32; | ||||||
|  |  | ||||||
|  |   // Scaled prediction is not handled here: defer to the C implementation. | ||||||
|  |   if (y_step_q4 != 16) { | ||||||
|  |     vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, | ||||||
|  |                              filter_x, x_step_q4, | ||||||
|  |                              filter_y, y_step_q4, w, h); | ||||||
|  |     return; | ||||||
|  |   } | ||||||
|  |  | ||||||
|  |   // Start three rows above the first output row to cover the leading taps. | ||||||
|  |   src -= src_stride * 3; | ||||||
|  |   q0s16 = vld1q_s16(filter_y); | ||||||
|  |   for (; w > 0; w -= 4, src += 4, dst += 4) {  // loop_vert_h | ||||||
|  |     // Prime the window with seven rows of 4 pixels, two rows per vector. | ||||||
|  |     s = src; | ||||||
|  |     d16u32 = vld1_lane_u32((const uint32_t *)s, d16u32, 0); | ||||||
|  |     s += src_stride; | ||||||
|  |     d16u32 = vld1_lane_u32((const uint32_t *)s, d16u32, 1); | ||||||
|  |     s += src_stride; | ||||||
|  |     d18u32 = vld1_lane_u32((const uint32_t *)s, d18u32, 0); | ||||||
|  |     s += src_stride; | ||||||
|  |     d18u32 = vld1_lane_u32((const uint32_t *)s, d18u32, 1); | ||||||
|  |     s += src_stride; | ||||||
|  |     d20u32 = vld1_lane_u32((const uint32_t *)s, d20u32, 0); | ||||||
|  |     s += src_stride; | ||||||
|  |     d20u32 = vld1_lane_u32((const uint32_t *)s, d20u32, 1); | ||||||
|  |     s += src_stride; | ||||||
|  |     d22u32 = vld1_lane_u32((const uint32_t *)s, d22u32, 0); | ||||||
|  |     s += src_stride; | ||||||
|  |  | ||||||
|  |     // Widen to 16 bits so the samples can feed the signed multiplies. | ||||||
|  |     q8u16  = vmovl_u8(vreinterpret_u8_u32(d16u32)); | ||||||
|  |     q9u16  = vmovl_u8(vreinterpret_u8_u32(d18u32)); | ||||||
|  |     q10u16 = vmovl_u8(vreinterpret_u8_u32(d20u32)); | ||||||
|  |     q11u16 = vmovl_u8(vreinterpret_u8_u32(d22u32)); | ||||||
|  |  | ||||||
|  |     d18s16 = vreinterpret_s16_u16(vget_low_u16(q9u16)); | ||||||
|  |     d19s16 = vreinterpret_s16_u16(vget_high_u16(q9u16)); | ||||||
|  |     d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16)); | ||||||
|  |     d = dst; | ||||||
|  |     for (height = h; height > 0; height -= 4) {  // loop_vert | ||||||
|  |       // Four new source rows: d24 = {row 0, row 3}, d26 = {row 1, row 2}. | ||||||
|  |       d24u32 = vld1_lane_u32((const uint32_t *)s, d24u32, 0); | ||||||
|  |       s += src_stride; | ||||||
|  |       d26u32 = vld1_lane_u32((const uint32_t *)s, d26u32, 0); | ||||||
|  |       s += src_stride; | ||||||
|  |       d26u32 = vld1_lane_u32((const uint32_t *)s, d26u32, 1); | ||||||
|  |       s += src_stride; | ||||||
|  |       d24u32 = vld1_lane_u32((const uint32_t *)s, d24u32, 1); | ||||||
|  |       s += src_stride; | ||||||
|  |  | ||||||
|  |       q12u16 = vmovl_u8(vreinterpret_u8_u32(d24u32)); | ||||||
|  |       q13u16 = vmovl_u8(vreinterpret_u8_u32(d26u32)); | ||||||
|  |  | ||||||
|  |       // Current destination 4x4 tile: d6 = rows 0 and 1, d7 = rows 2 and 3. | ||||||
|  |       d6u32 = vld1_lane_u32((const uint32_t *)d, d6u32, 0); | ||||||
|  |       d += dst_stride; | ||||||
|  |       d6u32 = vld1_lane_u32((const uint32_t *)d, d6u32, 1); | ||||||
|  |       d += dst_stride; | ||||||
|  |       d7u32 = vld1_lane_u32((const uint32_t *)d, d7u32, 0); | ||||||
|  |       d += dst_stride; | ||||||
|  |       d7u32 = vld1_lane_u32((const uint32_t *)d, d7u32, 1); | ||||||
|  |       d -= dst_stride * 3;  // back to the first row of the tile for the store | ||||||
|  |  | ||||||
|  |       d16s16 = vreinterpret_s16_u16(vget_low_u16(q8u16)); | ||||||
|  |       d17s16 = vreinterpret_s16_u16(vget_high_u16(q8u16)); | ||||||
|  |       d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16)); | ||||||
|  |       d21s16 = vreinterpret_s16_u16(vget_high_u16(q10u16)); | ||||||
|  |       d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16)); | ||||||
|  |       d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16)); | ||||||
|  |       d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16)); | ||||||
|  |       d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16)); | ||||||
|  |  | ||||||
|  |       // Four output rows; each call shifts the 8-row window down by one. | ||||||
|  |       __builtin_prefetch(s); | ||||||
|  |       __builtin_prefetch(s + src_stride); | ||||||
|  |       q1s32  = MULTIPLY_BY_Q0(d16s16, d17s16, d18s16, d19s16, | ||||||
|  |                               d20s16, d21s16, d22s16, d24s16, q0s16); | ||||||
|  |       __builtin_prefetch(s + src_stride * 2); | ||||||
|  |       __builtin_prefetch(s + src_stride * 3); | ||||||
|  |       q2s32  = MULTIPLY_BY_Q0(d17s16, d18s16, d19s16, d20s16, | ||||||
|  |                               d21s16, d22s16, d24s16, d26s16, q0s16); | ||||||
|  |       __builtin_prefetch(d); | ||||||
|  |       __builtin_prefetch(d + dst_stride); | ||||||
|  |       q14s32 = MULTIPLY_BY_Q0(d18s16, d19s16, d20s16, d21s16, | ||||||
|  |                               d22s16, d24s16, d26s16, d27s16, q0s16); | ||||||
|  |       __builtin_prefetch(d + dst_stride * 2); | ||||||
|  |       __builtin_prefetch(d + dst_stride * 3); | ||||||
|  |       q15s32 = MULTIPLY_BY_Q0(d19s16, d20s16, d21s16, d22s16, | ||||||
|  |                               d24s16, d26s16, d27s16, d25s16, q0s16); | ||||||
|  |  | ||||||
|  |       // Rounding shift right by 7 with unsigned saturation to 16 bits. | ||||||
|  |       d2u16 = vqrshrun_n_s32(q1s32, 7); | ||||||
|  |       d3u16 = vqrshrun_n_s32(q2s32, 7); | ||||||
|  |       d4u16 = vqrshrun_n_s32(q14s32, 7); | ||||||
|  |       d5u16 = vqrshrun_n_s32(q15s32, 7); | ||||||
|  |  | ||||||
|  |       q1u16 = vcombine_u16(d2u16, d3u16); | ||||||
|  |       q2u16 = vcombine_u16(d4u16, d5u16); | ||||||
|  |  | ||||||
|  |       // Saturating narrow to 8-bit pixels. | ||||||
|  |       d2u8 = vqmovn_u16(q1u16); | ||||||
|  |       d3u8 = vqmovn_u16(q2u16); | ||||||
|  |  | ||||||
|  |       q1u8 = vcombine_u8(d2u8, d3u8); | ||||||
|  |       q3u8 = vreinterpretq_u8_u32(vcombine_u32(d6u32, d7u32)); | ||||||
|  |  | ||||||
|  |       // The "avg" step: rounding halving add with the existing dst pixels. | ||||||
|  |       q1u8 = vrhaddq_u8(q1u8, q3u8); | ||||||
|  |  | ||||||
|  |       d2u32 = vreinterpret_u32_u8(vget_low_u8(q1u8)); | ||||||
|  |       d3u32 = vreinterpret_u32_u8(vget_high_u8(q1u8)); | ||||||
|  |  | ||||||
|  |       // Write the four rows back; d ends up on the next tile down. | ||||||
|  |       vst1_lane_u32((uint32_t *)d, d2u32, 0); | ||||||
|  |       d += dst_stride; | ||||||
|  |       vst1_lane_u32((uint32_t *)d, d2u32, 1); | ||||||
|  |       d += dst_stride; | ||||||
|  |       vst1_lane_u32((uint32_t *)d, d3u32, 0); | ||||||
|  |       d += dst_stride; | ||||||
|  |       vst1_lane_u32((uint32_t *)d, d3u32, 1); | ||||||
|  |       d += dst_stride; | ||||||
|  |  | ||||||
|  |       // Slide the 7-row history down by four rows: the last three old rows | ||||||
|  |       // plus the four rows just loaded become the new history. | ||||||
|  |       q8u16 = q10u16; | ||||||
|  |       d18s16 = d22s16; | ||||||
|  |       d19s16 = d24s16; | ||||||
|  |       q10u16 = q13u16; | ||||||
|  |       d22s16 = d25s16; | ||||||
|  |     } | ||||||
|  |   } | ||||||
|  |   return; | ||||||
|  | } | ||||||
| @@ -17,10 +17,10 @@ | |||||||
|     ; VP9_FILTER_WEIGHT == 128 |     ; VP9_FILTER_WEIGHT == 128 | ||||||
|     ; VP9_FILTER_SHIFT == 7 |     ; VP9_FILTER_SHIFT == 7 | ||||||
| 
 | 
 | ||||||
|     EXPORT  |vp9_convolve8_avg_horiz_neon| |     EXPORT  |vpx_convolve8_avg_horiz_neon| | ||||||
|     EXPORT  |vp9_convolve8_avg_vert_neon| |     EXPORT  |vpx_convolve8_avg_vert_neon| | ||||||
|     IMPORT  |vp9_convolve8_avg_horiz_c| |     IMPORT  |vpx_convolve8_avg_horiz_c| | ||||||
|     IMPORT  |vp9_convolve8_avg_vert_c| |     IMPORT  |vpx_convolve8_avg_vert_c| | ||||||
|     ARM |     ARM | ||||||
|     REQUIRE8 |     REQUIRE8 | ||||||
|     PRESERVE8 |     PRESERVE8 | ||||||
| @@ -51,10 +51,10 @@ | |||||||
| ; sp[]int w | ; sp[]int w | ||||||
| ; sp[]int h | ; sp[]int h | ||||||
| 
 | 
 | ||||||
| |vp9_convolve8_avg_horiz_neon| PROC | |vpx_convolve8_avg_horiz_neon| PROC | ||||||
|     ldr             r12, [sp, #4]           ; x_step_q4 |     ldr             r12, [sp, #4]           ; x_step_q4 | ||||||
|     cmp             r12, #16 |     cmp             r12, #16 | ||||||
|     bne             vp9_convolve8_avg_horiz_c |     bne             vpx_convolve8_avg_horiz_c | ||||||
| 
 | 
 | ||||||
|     push            {r4-r10, lr} |     push            {r4-r10, lr} | ||||||
| 
 | 
 | ||||||
| @@ -78,7 +78,7 @@ | |||||||
| 
 | 
 | ||||||
|     mov             r10, r6                 ; w loop counter |     mov             r10, r6                 ; w loop counter | ||||||
| 
 | 
 | ||||||
| vp9_convolve8_avg_loop_horiz_v | vpx_convolve8_avg_loop_horiz_v | ||||||
|     vld1.8          {d24}, [r0], r1 |     vld1.8          {d24}, [r0], r1 | ||||||
|     vld1.8          {d25}, [r0], r1 |     vld1.8          {d25}, [r0], r1 | ||||||
|     vld1.8          {d26}, [r0], r1 |     vld1.8          {d26}, [r0], r1 | ||||||
| @@ -101,7 +101,7 @@ vp9_convolve8_avg_loop_horiz_v | |||||||
| 
 | 
 | ||||||
|     add             r0, r0, #3 |     add             r0, r0, #3 | ||||||
| 
 | 
 | ||||||
| vp9_convolve8_avg_loop_horiz | vpx_convolve8_avg_loop_horiz | ||||||
|     add             r5, r0, #64 |     add             r5, r0, #64 | ||||||
| 
 | 
 | ||||||
|     vld1.32         {d28[]}, [r0], r1 |     vld1.32         {d28[]}, [r0], r1 | ||||||
| @@ -170,23 +170,23 @@ vp9_convolve8_avg_loop_horiz | |||||||
|     vmov            q9,  q13 |     vmov            q9,  q13 | ||||||
| 
 | 
 | ||||||
|     subs            r6, r6, #4              ; w -= 4 |     subs            r6, r6, #4              ; w -= 4 | ||||||
|     bgt             vp9_convolve8_avg_loop_horiz |     bgt             vpx_convolve8_avg_loop_horiz | ||||||
| 
 | 
 | ||||||
|     ; outer loop |     ; outer loop | ||||||
|     mov             r6, r10                 ; restore w counter |     mov             r6, r10                 ; restore w counter | ||||||
|     add             r0, r0, r9              ; src += src_stride * 4 - w |     add             r0, r0, r9              ; src += src_stride * 4 - w | ||||||
|     add             r2, r2, r12             ; dst += dst_stride * 4 - w |     add             r2, r2, r12             ; dst += dst_stride * 4 - w | ||||||
|     subs            r7, r7, #4              ; h -= 4 |     subs            r7, r7, #4              ; h -= 4 | ||||||
|     bgt vp9_convolve8_avg_loop_horiz_v |     bgt vpx_convolve8_avg_loop_horiz_v | ||||||
| 
 | 
 | ||||||
|     pop             {r4-r10, pc} |     pop             {r4-r10, pc} | ||||||
| 
 | 
 | ||||||
|     ENDP |     ENDP | ||||||
| 
 | 
 | ||||||
| |vp9_convolve8_avg_vert_neon| PROC | |vpx_convolve8_avg_vert_neon| PROC | ||||||
|     ldr             r12, [sp, #12] |     ldr             r12, [sp, #12] | ||||||
|     cmp             r12, #16 |     cmp             r12, #16 | ||||||
|     bne             vp9_convolve8_avg_vert_c |     bne             vpx_convolve8_avg_vert_c | ||||||
| 
 | 
 | ||||||
|     push            {r4-r8, lr} |     push            {r4-r8, lr} | ||||||
| 
 | 
 | ||||||
| @@ -203,7 +203,7 @@ vp9_convolve8_avg_loop_horiz | |||||||
|     lsl             r1, r1, #1 |     lsl             r1, r1, #1 | ||||||
|     lsl             r3, r3, #1 |     lsl             r3, r3, #1 | ||||||
| 
 | 
 | ||||||
| vp9_convolve8_avg_loop_vert_h | vpx_convolve8_avg_loop_vert_h | ||||||
|     mov             r4, r0 |     mov             r4, r0 | ||||||
|     add             r7, r0, r1, asr #1 |     add             r7, r0, r1, asr #1 | ||||||
|     mov             r5, r2 |     mov             r5, r2 | ||||||
| @@ -223,7 +223,7 @@ vp9_convolve8_avg_loop_vert_h | |||||||
|     vmovl.u8        q10, d20 |     vmovl.u8        q10, d20 | ||||||
|     vmovl.u8        q11, d22 |     vmovl.u8        q11, d22 | ||||||
| 
 | 
 | ||||||
| vp9_convolve8_avg_loop_vert | vpx_convolve8_avg_loop_vert | ||||||
|     ; always process a 4x4 block at a time |     ; always process a 4x4 block at a time | ||||||
|     vld1.u32        {d24[0]}, [r7], r1 |     vld1.u32        {d24[0]}, [r7], r1 | ||||||
|     vld1.u32        {d26[0]}, [r4], r1 |     vld1.u32        {d26[0]}, [r4], r1 | ||||||
| @@ -288,13 +288,13 @@ vp9_convolve8_avg_loop_vert | |||||||
|     vmov            d22, d25 |     vmov            d22, d25 | ||||||
| 
 | 
 | ||||||
|     subs            r12, r12, #4            ; h -= 4 |     subs            r12, r12, #4            ; h -= 4 | ||||||
|     bgt             vp9_convolve8_avg_loop_vert |     bgt             vpx_convolve8_avg_loop_vert | ||||||
| 
 | 
 | ||||||
|     ; outer loop |     ; outer loop | ||||||
|     add             r0, r0, #4 |     add             r0, r0, #4 | ||||||
|     add             r2, r2, #4 |     add             r2, r2, #4 | ||||||
|     subs            r6, r6, #4              ; w -= 4 |     subs            r6, r6, #4              ; w -= 4 | ||||||
|     bgt             vp9_convolve8_avg_loop_vert_h |     bgt             vpx_convolve8_avg_loop_vert_h | ||||||
| 
 | 
 | ||||||
|     pop             {r4-r8, pc} |     pop             {r4-r8, pc} | ||||||
| 
 | 
 | ||||||
							
								
								
									
										360
									
								
								vpx_dsp/arm/vpx_convolve8_neon.c
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										360
									
								
								vpx_dsp/arm/vpx_convolve8_neon.c
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,360 @@ | |||||||
|  | /* | ||||||
|  |  *  Copyright (c) 2014 The WebM project authors. All Rights Reserved. | ||||||
|  |  * | ||||||
|  |  *  Use of this source code is governed by a BSD-style license | ||||||
|  |  *  that can be found in the LICENSE file in the root of the source | ||||||
|  |  *  tree. An additional intellectual property rights grant can be found | ||||||
|  |  *  in the file PATENTS.  All contributing project authors may | ||||||
|  |  *  be found in the AUTHORS file in the root of the source tree. | ||||||
|  |  */ | ||||||
|  |  | ||||||
|  | #include <arm_neon.h> | ||||||
|  |  | ||||||
|  | #include "./vpx_config.h" | ||||||
|  | #include "./vpx_dsp_rtcd.h" | ||||||
|  | #include "vpx/vpx_integer.h" | ||||||
|  | #include "vpx_ports/mem.h" | ||||||
|  |  | ||||||
|  | void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, | ||||||
|  |                            uint8_t *dst, ptrdiff_t dst_stride, | ||||||
|  |                            const int16_t *filter_x, int x_step_q4, | ||||||
|  |                            const int16_t *filter_y, int y_step_q4, | ||||||
|  |                            int w, int h); | ||||||
|  | void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, | ||||||
|  |                            uint8_t *dst, ptrdiff_t dst_stride, | ||||||
|  |                            const int16_t *filter_x, int x_step_q4, | ||||||
|  |                            const int16_t *filter_y, int y_step_q4, | ||||||
|  |                            int w, int h); | ||||||
|  |  | ||||||
// Multiply-accumulate eight vectors of four 16-bit samples against the
// eight filter taps packed in q0s16: dsrcN is scaled by tap N and all
// products are summed per lane. The 32-bit sums are returned as-is;
// rounding, shifting and narrowing are left to the caller.
static INLINE int32x4_t MULTIPLY_BY_Q0(
    int16x4_t dsrc0,
    int16x4_t dsrc1,
    int16x4_t dsrc2,
    int16x4_t dsrc3,
    int16x4_t dsrc4,
    int16x4_t dsrc5,
    int16x4_t dsrc6,
    int16x4_t dsrc7,
    int16x8_t q0s16) {
  const int16x4_t taps_lo = vget_low_s16(q0s16);   // taps 0..3
  const int16x4_t taps_hi = vget_high_s16(q0s16);  // taps 4..7
  int32x4_t acc = vmull_lane_s16(dsrc0, taps_lo, 0);

  acc = vmlal_lane_s16(acc, dsrc1, taps_lo, 1);
  acc = vmlal_lane_s16(acc, dsrc2, taps_lo, 2);
  acc = vmlal_lane_s16(acc, dsrc3, taps_lo, 3);
  acc = vmlal_lane_s16(acc, dsrc4, taps_hi, 0);
  acc = vmlal_lane_s16(acc, dsrc5, taps_hi, 1);
  acc = vmlal_lane_s16(acc, dsrc6, taps_hi, 2);
  acc = vmlal_lane_s16(acc, dsrc7, taps_hi, 3);
  return acc;
}
|  |  | ||||||
// Horizontal 8-tap convolution using NEON intrinsics.
//
// Only the unscaled case (x_step_q4 == 16) is handled here; any other step
// is handed to vpx_convolve8_horiz_c() with all arguments unchanged.
// filter_y and y_step_q4 are not read by the NEON path; they are only
// forwarded to that fallback.
//
// The picture is processed in 4x4 tiles: the outer loop steps four rows at a
// time, the inner loop four columns at a time, and every tile is stored with
// four 32-bit writes. Both loops run while their counter is > 0 and step by
// 4, so w and h are presumably multiples of 4 -- confirm against callers.
// Sums are rounded, shifted right by 7 and saturated to 8 bits.
void vpx_convolve8_horiz_neon(
    const uint8_t *src,
    ptrdiff_t src_stride,
    uint8_t *dst,
    ptrdiff_t dst_stride,
    const int16_t *filter_x,
    int x_step_q4,
    const int16_t *filter_y,
    int y_step_q4,
    int w,
    int h) {
  int16x8_t taps;
  // Widened sample columns carried from one 4-column step to the next.
  // The names follow the NEON registers of the assembly version.
  uint16x8_t q8u16, q9u16, q11u16;
  int16x4_t d20s16, d23s16;

  if (x_step_q4 != 16) {
    vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride,
                          filter_x, x_step_q4,
                          filter_y, y_step_q4, w, h);
    return;
  }

  taps = vld1q_s16(filter_x);
  src -= 3;  // step back to the first sample the taps read

  while (h > 0) {  // four rows per pass
    const uint8_t *row = src;
    const uint8_t *src_col = src + 7;
    uint8_t *dst_col = dst;
    int cols_left = w;
    uint8x8_t row0, row1, row2, row3;
    uint16x8x2_t pairs;
    uint8x8x2_t cols_a, cols_b;
    uint16x8_t q10u16;

    // Load the first 8 bytes of each of the four rows.
    row0 = vld1_u8(row);
    row += src_stride;
    row1 = vld1_u8(row);
    row += src_stride;
    row2 = vld1_u8(row);
    row += src_stride;
    row3 = vld1_u8(row);

    // Transpose so that each vector half holds one column of the four rows.
    pairs = vtrnq_u16(vreinterpretq_u16_u8(vcombine_u8(row0, row1)),
                      vreinterpretq_u16_u8(vcombine_u8(row2, row3)));
    cols_a = vtrn_u8(vreinterpret_u8_u16(vget_low_u16(pairs.val[0])),
                     vreinterpret_u8_u16(vget_high_u16(pairs.val[0])));
    cols_b = vtrn_u8(vreinterpret_u8_u16(vget_low_u16(pairs.val[1])),
                     vreinterpret_u8_u16(vget_high_u16(pairs.val[1])));

    __builtin_prefetch(src + src_stride * 4);
    __builtin_prefetch(src + src_stride * 5);
    __builtin_prefetch(src + src_stride * 6);

    // Widen to 16 bits.
    q8u16  = vmovl_u8(cols_a.val[0]);
    q9u16  = vmovl_u8(cols_a.val[1]);
    q10u16 = vmovl_u8(cols_b.val[0]);
    q11u16 = vmovl_u8(cols_b.val[1]);

    {
      // Exchange the high half of q8 with the low half of q9
      // (the "vswp d17, d18" of the assembly version).
      const uint16x4_t q8_high = vget_high_u16(q8u16);
      const uint16x4_t q9_low = vget_low_u16(q9u16);
      q8u16 = vcombine_u16(vget_low_u16(q8u16), q9_low);
      q9u16 = vcombine_u16(q8_high, vget_high_u16(q9u16));
    }

    d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16));
    d23s16 = vreinterpret_s16_u16(vget_high_u16(q10u16));

    while (cols_left > 0) {  // four output columns per pass
      const uint8_t *in = src_col;
      uint8_t *out = dst_col;
      uint32x2_t dup0, dup1, dup2, dup3;
      uint16x4x2_t trn16_a, trn16_b;
      uint8x8x2_t trn8_a, trn8_b;
      uint32x4x2_t trn32;
      uint32x2x2_t out32;
      uint16x8_t q12u16, q13u16;
      int16x4_t x[11];  // sliding window of 11 consecutive sample columns
      int32x4_t sum0, sum1, sum2, sum3;
      uint8x8_t narrow_lo, narrow_hi;
      uint32x2_t rows02, rows13;

      // Fetch the next four bytes of every row, replicated to both lanes.
      dup0 = vld1_dup_u32((const uint32_t *)in);
      in += src_stride;
      dup1 = vld1_dup_u32((const uint32_t *)in);
      in += src_stride;
      dup2 = vld1_dup_u32((const uint32_t *)in);
      in += src_stride;
      dup3 = vld1_dup_u32((const uint32_t *)in);

      __builtin_prefetch(src_col + 64);

      // Transpose the new 4x4 bytes into columns.
      trn16_a = vtrn_u16(vreinterpret_u16_u32(dup0),
                         vreinterpret_u16_u32(dup2));
      trn16_b = vtrn_u16(vreinterpret_u16_u32(dup1),
                         vreinterpret_u16_u32(dup3));
      trn8_a = vtrn_u8(vreinterpret_u8_u16(trn16_a.val[0]),
                       vreinterpret_u8_u16(trn16_b.val[0]));
      trn8_b = vtrn_u8(vreinterpret_u8_u16(trn16_a.val[1]),
                       vreinterpret_u8_u16(trn16_b.val[1]));

      __builtin_prefetch(src_col + 64 + src_stride);

      trn32 = vtrnq_u32(
          vreinterpretq_u32_u8(vcombine_u8(trn8_a.val[0], trn8_a.val[1])),
          vreinterpretq_u32_u8(vcombine_u8(trn8_b.val[1], trn8_b.val[0])));

      q12u16 = vmovl_u8(vreinterpret_u8_u32(vget_low_u32(trn32.val[0])));
      q13u16 = vmovl_u8(vreinterpret_u8_u32(vget_high_u32(trn32.val[0])));

      __builtin_prefetch(src_col + 64 + src_stride * 2);

      // Assemble the window in sample order: carried columns first, then
      // the four columns loaded in this pass.
      x[0]  = vreinterpret_s16_u16(vget_low_u16(q8u16));
      x[1]  = vreinterpret_s16_u16(vget_high_u16(q8u16));
      x[2]  = d20s16;
      x[3]  = vreinterpret_s16_u16(vget_low_u16(q11u16));
      x[4]  = vreinterpret_s16_u16(vget_low_u16(q9u16));
      x[5]  = vreinterpret_s16_u16(vget_high_u16(q9u16));
      x[6]  = d23s16;
      x[7]  = vreinterpret_s16_u16(vget_low_u16(q12u16));
      x[8]  = vreinterpret_s16_u16(vget_low_u16(q13u16));
      x[9]  = vreinterpret_s16_u16(vget_high_u16(q13u16));
      x[10] = vreinterpret_s16_u16(vget_high_u16(q12u16));

      // One filtered output column per call; each call shifts the window
      // by one sample.
      sum0 = MULTIPLY_BY_Q0(x[0], x[1], x[2], x[3],
                            x[4], x[5], x[6], x[7], taps);
      sum1 = MULTIPLY_BY_Q0(x[1], x[2], x[3], x[4],
                            x[5], x[6], x[7], x[8], taps);
      sum2 = MULTIPLY_BY_Q0(x[2], x[3], x[4], x[5],
                            x[6], x[7], x[8], x[9], taps);
      sum3 = MULTIPLY_BY_Q0(x[3], x[4], x[5], x[6],
                            x[7], x[8], x[9], x[10], taps);

      __builtin_prefetch(src_col + 60 + src_stride * 3);

      // Round, shift right by 7 and saturate down to unsigned 8 bits.
      narrow_lo = vqmovn_u16(vcombine_u16(vqrshrun_n_s32(sum0, 7),
                                          vqrshrun_n_s32(sum1, 7)));
      narrow_hi = vqmovn_u16(vcombine_u16(vqrshrun_n_s32(sum2, 7),
                                          vqrshrun_n_s32(sum3, 7)));

      // Transpose back from column order to row order.
      trn16_a = vtrn_u16(vreinterpret_u16_u8(narrow_lo),
                         vreinterpret_u16_u8(narrow_hi));
      out32 = vtrn_u32(vreinterpret_u32_u16(trn16_a.val[0]),
                       vreinterpret_u32_u16(trn16_a.val[1]));
      trn8_a = vtrn_u8(vreinterpret_u8_u32(out32.val[0]),
                       vreinterpret_u8_u32(out32.val[1]));

      rows02 = vreinterpret_u32_u8(trn8_a.val[0]);
      rows13 = vreinterpret_u32_u8(trn8_a.val[1]);

      vst1_lane_u32((uint32_t *)out, rows02, 0);
      out += dst_stride;
      vst1_lane_u32((uint32_t *)out, rows13, 0);
      out += dst_stride;
      vst1_lane_u32((uint32_t *)out, rows02, 1);
      out += dst_stride;
      vst1_lane_u32((uint32_t *)out, rows13, 1);

      // Slide the window four columns to the right for the next pass.
      q8u16 = q9u16;
      d20s16 = d23s16;
      q11u16 = q12u16;
      q9u16 = q13u16;
      d23s16 = x[10];  // high half of the new q11u16

      cols_left -= 4;
      src_col += 4;
      dst_col += 4;
    }

    h -= 4;
    src += src_stride * 4;
    dst += dst_stride * 4;
  }
}
|  |  | ||||||
// Vertical 8-tap convolution using NEON intrinsics.
//
// Only the unscaled case (y_step_q4 == 16) is handled here; any other step
// is handed to vpx_convolve8_vert_c() with all arguments unchanged.
// filter_x and x_step_q4 are not read by the NEON path; they are only
// forwarded to that fallback.
//
// The outer loop walks the picture in 4-pixel-wide column strips, the inner
// loop produces four output rows per pass. Both loops run while their
// counter is > 0 and step by 4, so w and h are presumably multiples of 4 --
// confirm against callers. Sums are rounded, shifted right by 7 and
// saturated to 8 bits.
void vpx_convolve8_vert_neon(
    const uint8_t *src,
    ptrdiff_t src_stride,
    uint8_t *dst,
    ptrdiff_t dst_stride,
    const int16_t *filter_x,
    int x_step_q4,
    const int16_t *filter_y,
    int y_step_q4,
    int w,
    int h) {
  int height;
  const uint8_t *s;
  uint8_t *d;
  uint32x2_t d2u32, d3u32;
  uint32x2_t d16u32, d18u32, d20u32, d22u32, d24u32, d26u32;
  int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16;
  int16x4_t d24s16, d25s16, d26s16, d27s16;
  uint16x4_t d2u16, d3u16, d4u16, d5u16;
  int16x8_t q0s16;
  uint16x8_t q1u16, q2u16, q8u16, q9u16, q10u16, q11u16, q12u16, q13u16;
  int32x4_t q1s32, q2s32, q14s32, q15s32;

  if (y_step_q4 != 16) {
    vpx_convolve8_vert_c(src, src_stride, dst, dst_stride,
                         filter_x, x_step_q4,
                         filter_y, y_step_q4, w, h);
    return;
  }

  // vld1_lane_u32() takes the destination vector as an input operand, so
  // every vector filled lane by lane must hold a defined value first.
  // Without this the first lane load reads an uninitialized variable.
  d16u32 = vdup_n_u32(0);
  d18u32 = vdup_n_u32(0);
  d20u32 = vdup_n_u32(0);
  d22u32 = vdup_n_u32(0);
  d24u32 = vdup_n_u32(0);
  d26u32 = vdup_n_u32(0);

  src -= src_stride * 3;  // step up to the first row the taps read
  q0s16 = vld1q_s16(filter_y);
  for (; w > 0; w -= 4, src += 4, dst += 4) {  // loop_vert_h
    // Prime the window with the first seven rows of this 4-wide strip,
    // two rows per 64-bit vector.
    s = src;
    d16u32 = vld1_lane_u32((const uint32_t *)s, d16u32, 0);
    s += src_stride;
    d16u32 = vld1_lane_u32((const uint32_t *)s, d16u32, 1);
    s += src_stride;
    d18u32 = vld1_lane_u32((const uint32_t *)s, d18u32, 0);
    s += src_stride;
    d18u32 = vld1_lane_u32((const uint32_t *)s, d18u32, 1);
    s += src_stride;
    d20u32 = vld1_lane_u32((const uint32_t *)s, d20u32, 0);
    s += src_stride;
    d20u32 = vld1_lane_u32((const uint32_t *)s, d20u32, 1);
    s += src_stride;
    d22u32 = vld1_lane_u32((const uint32_t *)s, d22u32, 0);
    s += src_stride;

    // Widen to 16 bits.
    q8u16  = vmovl_u8(vreinterpret_u8_u32(d16u32));
    q9u16  = vmovl_u8(vreinterpret_u8_u32(d18u32));
    q10u16 = vmovl_u8(vreinterpret_u8_u32(d20u32));
    q11u16 = vmovl_u8(vreinterpret_u8_u32(d22u32));

    d18s16 = vreinterpret_s16_u16(vget_low_u16(q9u16));
    d19s16 = vreinterpret_s16_u16(vget_high_u16(q9u16));
    d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
    d = dst;
    for (height = h; height > 0; height -= 4) {  // loop_vert
      // Load the next four rows. They are spread over d24/d26 so that
      // d24 = {row 0, row 3} and d26 = {row 1, row 2}.
      d24u32 = vld1_lane_u32((const uint32_t *)s, d24u32, 0);
      s += src_stride;
      d26u32 = vld1_lane_u32((const uint32_t *)s, d26u32, 0);
      s += src_stride;
      d26u32 = vld1_lane_u32((const uint32_t *)s, d26u32, 1);
      s += src_stride;
      d24u32 = vld1_lane_u32((const uint32_t *)s, d24u32, 1);
      s += src_stride;

      q12u16 = vmovl_u8(vreinterpret_u8_u32(d24u32));
      q13u16 = vmovl_u8(vreinterpret_u8_u32(d26u32));

      d16s16 = vreinterpret_s16_u16(vget_low_u16(q8u16));
      d17s16 = vreinterpret_s16_u16(vget_high_u16(q8u16));
      d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16));
      d21s16 = vreinterpret_s16_u16(vget_high_u16(q10u16));
      d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
      d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
      d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
      d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));

      // One output row per call; each call shifts the 8-row window down by
      // one row. Prefetches are interleaved with the arithmetic.
      __builtin_prefetch(d);
      __builtin_prefetch(d + dst_stride);
      q1s32  = MULTIPLY_BY_Q0(d16s16, d17s16, d18s16, d19s16,
                              d20s16, d21s16, d22s16, d24s16, q0s16);
      __builtin_prefetch(d + dst_stride * 2);
      __builtin_prefetch(d + dst_stride * 3);
      q2s32  = MULTIPLY_BY_Q0(d17s16, d18s16, d19s16, d20s16,
                              d21s16, d22s16, d24s16, d26s16, q0s16);
      __builtin_prefetch(s);
      __builtin_prefetch(s + src_stride);
      q14s32 = MULTIPLY_BY_Q0(d18s16, d19s16, d20s16, d21s16,
                              d22s16, d24s16, d26s16, d27s16, q0s16);
      __builtin_prefetch(s + src_stride * 2);
      __builtin_prefetch(s + src_stride * 3);
      q15s32 = MULTIPLY_BY_Q0(d19s16, d20s16, d21s16, d22s16,
                              d24s16, d26s16, d27s16, d25s16, q0s16);

      // Round, shift right by 7 and saturate down to unsigned 8 bits.
      d2u16 = vqrshrun_n_s32(q1s32, 7);
      d3u16 = vqrshrun_n_s32(q2s32, 7);
      d4u16 = vqrshrun_n_s32(q14s32, 7);
      d5u16 = vqrshrun_n_s32(q15s32, 7);

      q1u16 = vcombine_u16(d2u16, d3u16);
      q2u16 = vcombine_u16(d4u16, d5u16);

      d2u32 = vreinterpret_u32_u8(vqmovn_u16(q1u16));
      d3u32 = vreinterpret_u32_u8(vqmovn_u16(q2u16));

      vst1_lane_u32((uint32_t *)d, d2u32, 0);
      d += dst_stride;
      vst1_lane_u32((uint32_t *)d, d2u32, 1);
      d += dst_stride;
      vst1_lane_u32((uint32_t *)d, d3u32, 0);
      d += dst_stride;
      vst1_lane_u32((uint32_t *)d, d3u32, 1);
      d += dst_stride;

      // Slide the window four rows down for the next pass.
      q8u16 = q10u16;
      d18s16 = d22s16;
      d19s16 = d24s16;
      q10u16 = q13u16;
      d22s16 = d25s16;
    }
  }
}
| @@ -17,10 +17,10 @@ | |||||||
|     ; VP9_FILTER_WEIGHT == 128 |     ; VP9_FILTER_WEIGHT == 128 | ||||||
|     ; VP9_FILTER_SHIFT == 7 |     ; VP9_FILTER_SHIFT == 7 | ||||||
| 
 | 
 | ||||||
|     EXPORT  |vp9_convolve8_horiz_neon| |     EXPORT  |vpx_convolve8_horiz_neon| | ||||||
|     EXPORT  |vp9_convolve8_vert_neon| |     EXPORT  |vpx_convolve8_vert_neon| | ||||||
|     IMPORT  |vp9_convolve8_horiz_c| |     IMPORT  |vpx_convolve8_horiz_c| | ||||||
|     IMPORT  |vp9_convolve8_vert_c| |     IMPORT  |vpx_convolve8_vert_c| | ||||||
|     ARM |     ARM | ||||||
|     REQUIRE8 |     REQUIRE8 | ||||||
|     PRESERVE8 |     PRESERVE8 | ||||||
| @@ -51,10 +51,10 @@ | |||||||
| ; sp[]int w | ; sp[]int w | ||||||
| ; sp[]int h | ; sp[]int h | ||||||
| 
 | 
 | ||||||
| |vp9_convolve8_horiz_neon| PROC | |vpx_convolve8_horiz_neon| PROC | ||||||
|     ldr             r12, [sp, #4]           ; x_step_q4 |     ldr             r12, [sp, #4]           ; x_step_q4 | ||||||
|     cmp             r12, #16 |     cmp             r12, #16 | ||||||
|     bne             vp9_convolve8_horiz_c |     bne             vpx_convolve8_horiz_c | ||||||
| 
 | 
 | ||||||
|     push            {r4-r10, lr} |     push            {r4-r10, lr} | ||||||
| 
 | 
 | ||||||
| @@ -78,7 +78,7 @@ | |||||||
| 
 | 
 | ||||||
|     mov             r10, r6                 ; w loop counter |     mov             r10, r6                 ; w loop counter | ||||||
| 
 | 
 | ||||||
| vp9_convolve8_loop_horiz_v | vpx_convolve8_loop_horiz_v | ||||||
|     vld1.8          {d24}, [r0], r1 |     vld1.8          {d24}, [r0], r1 | ||||||
|     vld1.8          {d25}, [r0], r1 |     vld1.8          {d25}, [r0], r1 | ||||||
|     vld1.8          {d26}, [r0], r1 |     vld1.8          {d26}, [r0], r1 | ||||||
| @@ -101,7 +101,7 @@ vp9_convolve8_loop_horiz_v | |||||||
| 
 | 
 | ||||||
|     add             r0, r0, #3 |     add             r0, r0, #3 | ||||||
| 
 | 
 | ||||||
| vp9_convolve8_loop_horiz | vpx_convolve8_loop_horiz | ||||||
|     add             r5, r0, #64 |     add             r5, r0, #64 | ||||||
| 
 | 
 | ||||||
|     vld1.32         {d28[]}, [r0], r1 |     vld1.32         {d28[]}, [r0], r1 | ||||||
| @@ -159,23 +159,23 @@ vp9_convolve8_loop_horiz | |||||||
|     vmov            q9,  q13 |     vmov            q9,  q13 | ||||||
| 
 | 
 | ||||||
|     subs            r6, r6, #4              ; w -= 4 |     subs            r6, r6, #4              ; w -= 4 | ||||||
|     bgt             vp9_convolve8_loop_horiz |     bgt             vpx_convolve8_loop_horiz | ||||||
| 
 | 
 | ||||||
|     ; outer loop |     ; outer loop | ||||||
|     mov             r6, r10                 ; restore w counter |     mov             r6, r10                 ; restore w counter | ||||||
|     add             r0, r0, r9              ; src += src_stride * 4 - w |     add             r0, r0, r9              ; src += src_stride * 4 - w | ||||||
|     add             r2, r2, r12             ; dst += dst_stride * 4 - w |     add             r2, r2, r12             ; dst += dst_stride * 4 - w | ||||||
|     subs            r7, r7, #4              ; h -= 4 |     subs            r7, r7, #4              ; h -= 4 | ||||||
|     bgt vp9_convolve8_loop_horiz_v |     bgt vpx_convolve8_loop_horiz_v | ||||||
| 
 | 
 | ||||||
|     pop             {r4-r10, pc} |     pop             {r4-r10, pc} | ||||||
| 
 | 
 | ||||||
|     ENDP |     ENDP | ||||||
| 
 | 
 | ||||||
| |vp9_convolve8_vert_neon| PROC | |vpx_convolve8_vert_neon| PROC | ||||||
|     ldr             r12, [sp, #12] |     ldr             r12, [sp, #12] | ||||||
|     cmp             r12, #16 |     cmp             r12, #16 | ||||||
|     bne             vp9_convolve8_vert_c |     bne             vpx_convolve8_vert_c | ||||||
| 
 | 
 | ||||||
|     push            {r4-r8, lr} |     push            {r4-r8, lr} | ||||||
| 
 | 
 | ||||||
| @@ -192,7 +192,7 @@ vp9_convolve8_loop_horiz | |||||||
|     lsl             r1, r1, #1 |     lsl             r1, r1, #1 | ||||||
|     lsl             r3, r3, #1 |     lsl             r3, r3, #1 | ||||||
| 
 | 
 | ||||||
| vp9_convolve8_loop_vert_h | vpx_convolve8_loop_vert_h | ||||||
|     mov             r4, r0 |     mov             r4, r0 | ||||||
|     add             r7, r0, r1, asr #1 |     add             r7, r0, r1, asr #1 | ||||||
|     mov             r5, r2 |     mov             r5, r2 | ||||||
| @@ -212,7 +212,7 @@ vp9_convolve8_loop_vert_h | |||||||
|     vmovl.u8        q10, d20 |     vmovl.u8        q10, d20 | ||||||
|     vmovl.u8        q11, d22 |     vmovl.u8        q11, d22 | ||||||
| 
 | 
 | ||||||
| vp9_convolve8_loop_vert | vpx_convolve8_loop_vert | ||||||
|     ; always process a 4x4 block at a time |     ; always process a 4x4 block at a time | ||||||
|     vld1.u32        {d24[0]}, [r7], r1 |     vld1.u32        {d24[0]}, [r7], r1 | ||||||
|     vld1.u32        {d26[0]}, [r4], r1 |     vld1.u32        {d26[0]}, [r4], r1 | ||||||
| @@ -266,13 +266,13 @@ vp9_convolve8_loop_vert | |||||||
|     vmov            d22, d25 |     vmov            d22, d25 | ||||||
| 
 | 
 | ||||||
|     subs            r12, r12, #4            ; h -= 4 |     subs            r12, r12, #4            ; h -= 4 | ||||||
|     bgt             vp9_convolve8_loop_vert |     bgt             vpx_convolve8_loop_vert | ||||||
| 
 | 
 | ||||||
|     ; outer loop |     ; outer loop | ||||||
|     add             r0, r0, #4 |     add             r0, r0, #4 | ||||||
|     add             r2, r2, #4 |     add             r2, r2, #4 | ||||||
|     subs            r6, r6, #4              ; w -= 4 |     subs            r6, r6, #4              ; w -= 4 | ||||||
|     bgt             vp9_convolve8_loop_vert_h |     bgt             vpx_convolve8_loop_vert_h | ||||||
| 
 | 
 | ||||||
|     pop             {r4-r8, pc} |     pop             {r4-r8, pc} | ||||||
| 
 | 
 | ||||||
							
								
								
									
										147
									
								
								vpx_dsp/arm/vpx_convolve_avg_neon.c
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										147
									
								
								vpx_dsp/arm/vpx_convolve_avg_neon.c
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,147 @@ | |||||||
|  | /* | ||||||
|  |  *  Copyright (c) 2014 The WebM project authors. All Rights Reserved. | ||||||
|  |  * | ||||||
|  |  *  Use of this source code is governed by a BSD-style license | ||||||
|  |  *  that can be found in the LICENSE file in the root of the source | ||||||
|  |  *  tree. An additional intellectual property rights grant can be found | ||||||
|  |  *  in the file PATENTS.  All contributing project authors may | ||||||
|  |  *  be found in the AUTHORS file in the root of the source tree. | ||||||
|  |  */ | ||||||
|  |  | ||||||
|  | #include <arm_neon.h> | ||||||
|  |  | ||||||
|  | #include "./vpx_dsp_rtcd.h" | ||||||
|  | #include "vpx/vpx_integer.h" | ||||||
|  |  | ||||||
// Averages the source block into the destination block in place:
// every dst byte becomes (src + dst + 1) >> 1 (rounding halving add).
//
// The width selects one of five code paths:
//   w > 32  -> 64 bytes per row, one row per pass
//   w == 32 -> 32 bytes per row, two rows per pass
//   w > 8   -> 16 bytes per row, two rows per pass
//   w == 8  ->  8 bytes per row, two rows per pass
//   else    ->  4 bytes per row, two rows per pass
// Each path always touches its full fixed width, and all but the 64-wide
// path consume two rows per pass, so w is presumably one of 4/8/16/32/64
// and h even -- confirm against callers.
//
// The filter arguments are accepted for signature compatibility and ignored.
void vpx_convolve_avg_neon(
    const uint8_t *src,    // r0
    ptrdiff_t src_stride,  // r1
    uint8_t *dst,          // r2
    ptrdiff_t dst_stride,  // r3
    const int16_t *filter_x,
    int filter_x_stride,
    const int16_t *filter_y,
    int filter_y_stride,
    int w,
    int h) {
  uint8_t *d;  // read cursor into dst; dst itself is the write cursor
  uint8x8_t d0u8, d1u8, d2u8, d3u8;
  uint32x2_t d0u32, d2u32;
  uint8x16_t q0u8, q1u8, q2u8, q3u8, q8u8, q9u8, q10u8, q11u8;
  (void)filter_x;  (void)filter_x_stride;
  (void)filter_y;  (void)filter_y_stride;

  // vld1_lane_u32() takes the destination vector as an input operand, so
  // the vectors filled lane by lane in the 4-wide path need a defined value
  // before their first lane load.
  d0u32 = vdup_n_u32(0);
  d2u32 = vdup_n_u32(0);

  d = dst;
  if (w > 32) {  // avg64
    for (; h > 0; h -= 1) {
      q0u8  = vld1q_u8(src);
      q1u8  = vld1q_u8(src + 16);
      q2u8  = vld1q_u8(src + 32);
      q3u8  = vld1q_u8(src + 48);
      src += src_stride;
      q8u8  = vld1q_u8(d);
      q9u8  = vld1q_u8(d + 16);
      q10u8 = vld1q_u8(d + 32);
      q11u8 = vld1q_u8(d + 48);
      d += dst_stride;

      q0u8 = vrhaddq_u8(q0u8, q8u8);
      q1u8 = vrhaddq_u8(q1u8, q9u8);
      q2u8 = vrhaddq_u8(q2u8, q10u8);
      q3u8 = vrhaddq_u8(q3u8, q11u8);

      vst1q_u8(dst, q0u8);
      vst1q_u8(dst + 16, q1u8);
      vst1q_u8(dst + 32, q2u8);
      vst1q_u8(dst + 48, q3u8);
      dst += dst_stride;
    }
  } else if (w == 32) {  // avg32
    for (; h > 0; h -= 2) {
      q0u8 = vld1q_u8(src);
      q1u8 = vld1q_u8(src + 16);
      src += src_stride;
      q2u8 = vld1q_u8(src);
      q3u8 = vld1q_u8(src + 16);
      src += src_stride;
      q8u8 = vld1q_u8(d);
      q9u8 = vld1q_u8(d + 16);
      d += dst_stride;
      q10u8 = vld1q_u8(d);
      q11u8 = vld1q_u8(d + 16);
      d += dst_stride;

      q0u8 = vrhaddq_u8(q0u8, q8u8);
      q1u8 = vrhaddq_u8(q1u8, q9u8);
      q2u8 = vrhaddq_u8(q2u8, q10u8);
      q3u8 = vrhaddq_u8(q3u8, q11u8);

      vst1q_u8(dst, q0u8);
      vst1q_u8(dst + 16, q1u8);
      dst += dst_stride;
      vst1q_u8(dst, q2u8);
      vst1q_u8(dst + 16, q3u8);
      dst += dst_stride;
    }
  } else if (w > 8) {  // avg16
    for (; h > 0; h -= 2) {
      q0u8 = vld1q_u8(src);
      src += src_stride;
      q1u8 = vld1q_u8(src);
      src += src_stride;
      q2u8 = vld1q_u8(d);
      d += dst_stride;
      q3u8 = vld1q_u8(d);
      d += dst_stride;

      q0u8 = vrhaddq_u8(q0u8, q2u8);
      q1u8 = vrhaddq_u8(q1u8, q3u8);

      vst1q_u8(dst, q0u8);
      dst += dst_stride;
      vst1q_u8(dst, q1u8);
      dst += dst_stride;
    }
  } else if (w == 8) {  // avg8
    for (; h > 0; h -= 2) {
      d0u8 = vld1_u8(src);
      src += src_stride;
      d1u8 = vld1_u8(src);
      src += src_stride;
      d2u8 = vld1_u8(d);
      d += dst_stride;
      d3u8 = vld1_u8(d);
      d += dst_stride;

      // Pack both rows into one 128-bit vector so a single averaging
      // instruction covers them.
      q0u8 = vcombine_u8(d0u8, d1u8);
      q1u8 = vcombine_u8(d2u8, d3u8);
      q0u8 = vrhaddq_u8(q0u8, q1u8);

      vst1_u8(dst, vget_low_u8(q0u8));
      dst += dst_stride;
      vst1_u8(dst, vget_high_u8(q0u8));
      dst += dst_stride;
    }
  } else {  // avg4
    for (; h > 0; h -= 2) {
      // Two 4-byte rows share one 64-bit vector, one row per lane.
      d0u32 = vld1_lane_u32((const uint32_t *)src, d0u32, 0);
      src += src_stride;
      d0u32 = vld1_lane_u32((const uint32_t *)src, d0u32, 1);
      src += src_stride;
      d2u32 = vld1_lane_u32((const uint32_t *)d, d2u32, 0);
      d += dst_stride;
      d2u32 = vld1_lane_u32((const uint32_t *)d, d2u32, 1);
      d += dst_stride;

      d0u8 = vrhadd_u8(vreinterpret_u8_u32(d0u32),
                       vreinterpret_u8_u32(d2u32));

      d0u32 = vreinterpret_u32_u8(d0u8);
      vst1_lane_u32((uint32_t *)dst, d0u32, 0);
      dst += dst_stride;
      vst1_lane_u32((uint32_t *)dst, d0u32, 1);
      dst += dst_stride;
    }
  }
}
| @@ -8,14 +8,14 @@ | |||||||
| ;  be found in the AUTHORS file in the root of the source tree. | ;  be found in the AUTHORS file in the root of the source tree. | ||||||
| ; | ; | ||||||
| 
 | 
 | ||||||
|     EXPORT  |vp9_convolve_avg_neon| |     EXPORT  |vpx_convolve_avg_neon| | ||||||
|     ARM |     ARM | ||||||
|     REQUIRE8 |     REQUIRE8 | ||||||
|     PRESERVE8 |     PRESERVE8 | ||||||
| 
 | 
 | ||||||
|     AREA ||.text||, CODE, READONLY, ALIGN=2 |     AREA ||.text||, CODE, READONLY, ALIGN=2 | ||||||
| 
 | 
 | ||||||
| |vp9_convolve_avg_neon| PROC | |vpx_convolve_avg_neon| PROC | ||||||
|     push                {r4-r6, lr} |     push                {r4-r6, lr} | ||||||
|     ldrd                r4, r5, [sp, #32] |     ldrd                r4, r5, [sp, #32] | ||||||
|     mov                 r6, r2 |     mov                 r6, r2 | ||||||
							
								
								
									
										94
									
								
								vpx_dsp/arm/vpx_convolve_copy_neon.c
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										94
									
								
								vpx_dsp/arm/vpx_convolve_copy_neon.c
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,94 @@ | |||||||
|  | /* | ||||||
|  |  *  Copyright (c) 2014 The WebM project authors. All Rights Reserved. | ||||||
|  |  * | ||||||
|  |  *  Use of this source code is governed by a BSD-style license | ||||||
|  |  *  that can be found in the LICENSE file in the root of the source | ||||||
|  |  *  tree. An additional intellectual property rights grant can be found | ||||||
|  |  *  in the file PATENTS.  All contributing project authors may | ||||||
|  |  *  be found in the AUTHORS file in the root of the source tree. | ||||||
|  |  */ | ||||||
|  |  | ||||||
#include <arm_neon.h>
#include <string.h>

#include "./vpx_dsp_rtcd.h"
#include "vpx/vpx_integer.h"
|  |  | ||||||
// Copies a block of pixels from src to dst, row by row, with NEON loads and
// stores. No filtering is performed: the four filter arguments exist only so
// the signature matches the other convolve functions and are ignored.
//
// The number of bytes copied per row is selected from w:
//   w > 32      -> 64 bytes
//   w == 32     -> 32 bytes
//   8 < w < 32  -> 16 bytes
//   w == 8      ->  8 bytes
//   w < 8       ->  4 bytes
// so callers are expected to pass one of 4, 8, 16, 32 or 64.
//
// The 8-, 16- and 32-wide paths copy two rows per loop iteration; an odd h
// would therefore copy one row more than requested. The 4- and 64-wide paths
// copy exactly h rows.
void vpx_convolve_copy_neon(
    const uint8_t *src,    // r0
    ptrdiff_t src_stride,  // r1
    uint8_t *dst,          // r2
    ptrdiff_t dst_stride,  // r3
    const int16_t *filter_x,
    int filter_x_stride,
    const int16_t *filter_y,
    int filter_y_stride,
    int w,
    int h) {
  uint8x8_t d0u8, d2u8;
  uint8x16_t q0u8, q1u8, q2u8, q3u8;
  (void)filter_x;  (void)filter_x_stride;
  (void)filter_y;  (void)filter_y_stride;

  if (w > 32) {  // copy64
    for (; h > 0; h--) {
      q0u8 = vld1q_u8(src);
      q1u8 = vld1q_u8(src + 16);
      q2u8 = vld1q_u8(src + 32);
      q3u8 = vld1q_u8(src + 48);
      src += src_stride;

      vst1q_u8(dst, q0u8);
      vst1q_u8(dst + 16, q1u8);
      vst1q_u8(dst + 32, q2u8);
      vst1q_u8(dst + 48, q3u8);
      dst += dst_stride;
    }
  } else if (w == 32) {  // copy32
    for (; h > 0; h -= 2) {
      q0u8 = vld1q_u8(src);
      q1u8 = vld1q_u8(src + 16);
      src += src_stride;
      q2u8 = vld1q_u8(src);
      q3u8 = vld1q_u8(src + 16);
      src += src_stride;

      vst1q_u8(dst, q0u8);
      vst1q_u8(dst + 16, q1u8);
      dst += dst_stride;
      vst1q_u8(dst, q2u8);
      vst1q_u8(dst + 16, q3u8);
      dst += dst_stride;
    }
  } else if (w > 8) {  // copy16
    for (; h > 0; h -= 2) {
      q0u8 = vld1q_u8(src);
      src += src_stride;
      q1u8 = vld1q_u8(src);
      src += src_stride;

      vst1q_u8(dst, q0u8);
      dst += dst_stride;
      vst1q_u8(dst, q1u8);
      dst += dst_stride;
    }
  } else if (w == 8) {  // copy8
    for (; h > 0; h -= 2) {
      d0u8 = vld1_u8(src);
      src += src_stride;
      d2u8 = vld1_u8(src);
      src += src_stride;

      vst1_u8(dst, d0u8);
      dst += dst_stride;
      vst1_u8(dst, d2u8);
      dst += dst_stride;
    }
  } else {  // copy4
    for (; h > 0; h--) {
      // src and dst are byte pointers with no 4-byte alignment guarantee, so
      // copy the four pixels with memcpy rather than through a uint32_t
      // cast (which would be an unaligned, type-punned access).
      memcpy(dst, src, 4);
      src += src_stride;
      dst += dst_stride;
    }
  }
}
| @@ -8,14 +8,14 @@ | |||||||
| ;  be found in the AUTHORS file in the root of the source tree. | ;  be found in the AUTHORS file in the root of the source tree. | ||||||
| ; | ; | ||||||
| 
 | 
 | ||||||
|     EXPORT  |vp9_convolve_copy_neon| |     EXPORT  |vpx_convolve_copy_neon| | ||||||
|     ARM |     ARM | ||||||
|     REQUIRE8 |     REQUIRE8 | ||||||
|     PRESERVE8 |     PRESERVE8 | ||||||
| 
 | 
 | ||||||
|     AREA ||.text||, CODE, READONLY, ALIGN=2 |     AREA ||.text||, CODE, READONLY, ALIGN=2 | ||||||
| 
 | 
 | ||||||
| |vp9_convolve_copy_neon| PROC | |vpx_convolve_copy_neon| PROC | ||||||
|     push                {r4-r5, lr} |     push                {r4-r5, lr} | ||||||
|     ldrd                r4, r5, [sp, #28] |     ldrd                r4, r5, [sp, #28] | ||||||
| 
 | 
 | ||||||
| @@ -8,11 +8,11 @@ | |||||||
|  *  be found in the AUTHORS file in the root of the source tree. |  *  be found in the AUTHORS file in the root of the source tree. | ||||||
|  */ |  */ | ||||||
| 
 | 
 | ||||||
| #include "./vp9_rtcd.h" | #include "./vpx_dsp_rtcd.h" | ||||||
| #include "vp9/common/vp9_common.h" | #include "vpx_dsp/vpx_dsp_common.h" | ||||||
| #include "vpx_ports/mem.h" | #include "vpx_ports/mem.h" | ||||||
| 
 | 
 | ||||||
| void vp9_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride, | void vpx_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride, | ||||||
|                         uint8_t *dst, ptrdiff_t dst_stride, |                         uint8_t *dst, ptrdiff_t dst_stride, | ||||||
|                         const int16_t *filter_x, int x_step_q4, |                         const int16_t *filter_x, int x_step_q4, | ||||||
|                         const int16_t *filter_y, int y_step_q4, |                         const int16_t *filter_y, int y_step_q4, | ||||||
| @@ -26,7 +26,7 @@ void vp9_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride, | |||||||
|   int intermediate_height = h + 7; |   int intermediate_height = h + 7; | ||||||
| 
 | 
 | ||||||
|   if (x_step_q4 != 16 || y_step_q4 != 16) { |   if (x_step_q4 != 16 || y_step_q4 != 16) { | ||||||
|     vp9_convolve8_c(src, src_stride, |     vpx_convolve8_c(src, src_stride, | ||||||
|                     dst, dst_stride, |                     dst, dst_stride, | ||||||
|                     filter_x, x_step_q4, |                     filter_x, x_step_q4, | ||||||
|                     filter_y, y_step_q4, |                     filter_y, y_step_q4, | ||||||
| @@ -39,19 +39,19 @@ void vp9_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride, | |||||||
|    * the temp buffer which has lots of extra room and is subsequently discarded |    * the temp buffer which has lots of extra room and is subsequently discarded | ||||||
|    * this is safe if somewhat less than ideal. |    * this is safe if somewhat less than ideal. | ||||||
|    */ |    */ | ||||||
|   vp9_convolve8_horiz_neon(src - src_stride * 3, src_stride, |   vpx_convolve8_horiz_neon(src - src_stride * 3, src_stride, | ||||||
|                            temp, 64, |                            temp, 64, | ||||||
|                            filter_x, x_step_q4, filter_y, y_step_q4, |                            filter_x, x_step_q4, filter_y, y_step_q4, | ||||||
|                            w, intermediate_height); |                            w, intermediate_height); | ||||||
| 
 | 
 | ||||||
|   /* Step into the temp buffer 3 lines to get the actual frame data */ |   /* Step into the temp buffer 3 lines to get the actual frame data */ | ||||||
|   vp9_convolve8_vert_neon(temp + 64 * 3, 64, |   vpx_convolve8_vert_neon(temp + 64 * 3, 64, | ||||||
|                           dst, dst_stride, |                           dst, dst_stride, | ||||||
|                           filter_x, x_step_q4, filter_y, y_step_q4, |                           filter_x, x_step_q4, filter_y, y_step_q4, | ||||||
|                           w, h); |                           w, h); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| void vp9_convolve8_avg_neon(const uint8_t *src, ptrdiff_t src_stride, | void vpx_convolve8_avg_neon(const uint8_t *src, ptrdiff_t src_stride, | ||||||
|                             uint8_t *dst, ptrdiff_t dst_stride, |                             uint8_t *dst, ptrdiff_t dst_stride, | ||||||
|                             const int16_t *filter_x, int x_step_q4, |                             const int16_t *filter_x, int x_step_q4, | ||||||
|                             const int16_t *filter_y, int y_step_q4, |                             const int16_t *filter_y, int y_step_q4, | ||||||
| @@ -60,7 +60,7 @@ void vp9_convolve8_avg_neon(const uint8_t *src, ptrdiff_t src_stride, | |||||||
|   int intermediate_height = h + 7; |   int intermediate_height = h + 7; | ||||||
| 
 | 
 | ||||||
|   if (x_step_q4 != 16 || y_step_q4 != 16) { |   if (x_step_q4 != 16 || y_step_q4 != 16) { | ||||||
|     vp9_convolve8_avg_c(src, src_stride, |     vpx_convolve8_avg_c(src, src_stride, | ||||||
|                         dst, dst_stride, |                         dst, dst_stride, | ||||||
|                         filter_x, x_step_q4, |                         filter_x, x_step_q4, | ||||||
|                         filter_y, y_step_q4, |                         filter_y, y_step_q4, | ||||||
| @@ -71,11 +71,11 @@ void vp9_convolve8_avg_neon(const uint8_t *src, ptrdiff_t src_stride, | |||||||
|   /* This implementation has the same issues as above. In addition, we only want
 |   /* This implementation has the same issues as above. In addition, we only want
 | ||||||
|    * to average the values after both passes. |    * to average the values after both passes. | ||||||
|    */ |    */ | ||||||
|   vp9_convolve8_horiz_neon(src - src_stride * 3, src_stride, |   vpx_convolve8_horiz_neon(src - src_stride * 3, src_stride, | ||||||
|                            temp, 64, |                            temp, 64, | ||||||
|                            filter_x, x_step_q4, filter_y, y_step_q4, |                            filter_x, x_step_q4, filter_y, y_step_q4, | ||||||
|                            w, intermediate_height); |                            w, intermediate_height); | ||||||
|   vp9_convolve8_avg_vert_neon(temp + 64 * 3, |   vpx_convolve8_avg_vert_neon(temp + 64 * 3, | ||||||
|                               64, dst, dst_stride, |                               64, dst, dst_stride, | ||||||
|                               filter_x, x_step_q4, filter_y, y_step_q4, |                               filter_x, x_step_q4, filter_y, y_step_q4, | ||||||
|                               w, h); |                               w, h); | ||||||
| @@ -8,6 +8,8 @@ | |||||||
|  *  be found in the AUTHORS file in the root of the source tree. |  *  be found in the AUTHORS file in the root of the source tree. | ||||||
|  */ |  */ | ||||||
|  |  | ||||||
|  | #include <stdlib.h> | ||||||
|  |  | ||||||
| #include "./vpx_config.h" | #include "./vpx_config.h" | ||||||
| #include "vpx_dsp/vpx_dsp_common.h" | #include "vpx_dsp/vpx_dsp_common.h" | ||||||
| #include "vpx_ports/mem.h" | #include "vpx_ports/mem.h" | ||||||
|   | |||||||
| @@ -8,8 +8,8 @@ | |||||||
|  *  be found in the AUTHORS file in the root of the source tree. |  *  be found in the AUTHORS file in the root of the source tree. | ||||||
|  */ |  */ | ||||||
| 
 | 
 | ||||||
| #include "./vp9_rtcd.h" | #include "./vpx_dsp_rtcd.h" | ||||||
| #include "vp9/common/mips/msa/vp9_convolve_msa.h" | #include "vpx_dsp/mips/vpx_convolve_msa.h" | ||||||
| 
 | 
 | ||||||
| static void common_hz_8t_and_aver_dst_4x4_msa(const uint8_t *src, | static void common_hz_8t_and_aver_dst_4x4_msa(const uint8_t *src, | ||||||
|                                               int32_t src_stride, |                                               int32_t src_stride, | ||||||
| @@ -687,7 +687,7 @@ static void common_hz_2t_and_aver_dst_64w_msa(const uint8_t *src, | |||||||
|   } |   } | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| void vp9_convolve8_avg_horiz_msa(const uint8_t *src, ptrdiff_t src_stride, | void vpx_convolve8_avg_horiz_msa(const uint8_t *src, ptrdiff_t src_stride, | ||||||
|                                  uint8_t *dst, ptrdiff_t dst_stride, |                                  uint8_t *dst, ptrdiff_t dst_stride, | ||||||
|                                  const int16_t *filter_x, int x_step_q4, |                                  const int16_t *filter_x, int x_step_q4, | ||||||
|                                  const int16_t *filter_y, int y_step_q4, |                                  const int16_t *filter_y, int y_step_q4, | ||||||
| @@ -695,14 +695,14 @@ void vp9_convolve8_avg_horiz_msa(const uint8_t *src, ptrdiff_t src_stride, | |||||||
|   int8_t cnt, filt_hor[8]; |   int8_t cnt, filt_hor[8]; | ||||||
| 
 | 
 | ||||||
|   if (16 != x_step_q4) { |   if (16 != x_step_q4) { | ||||||
|     vp9_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, |     vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, | ||||||
|                               filter_x, x_step_q4, filter_y, y_step_q4, |                               filter_x, x_step_q4, filter_y, y_step_q4, | ||||||
|                               w, h); |                               w, h); | ||||||
|     return; |     return; | ||||||
|   } |   } | ||||||
| 
 | 
 | ||||||
|   if (((const int32_t *)filter_x)[1] == 0x800000) { |   if (((const int32_t *)filter_x)[1] == 0x800000) { | ||||||
|     vp9_convolve_avg(src, src_stride, dst, dst_stride, |     vpx_convolve_avg(src, src_stride, dst, dst_stride, | ||||||
|                      filter_x, x_step_q4, filter_y, y_step_q4, |                      filter_x, x_step_q4, filter_y, y_step_q4, | ||||||
|                      w, h); |                      w, h); | ||||||
|     return; |     return; | ||||||
| @@ -740,7 +740,7 @@ void vp9_convolve8_avg_horiz_msa(const uint8_t *src, ptrdiff_t src_stride, | |||||||
|                                           &filt_hor[3], h); |                                           &filt_hor[3], h); | ||||||
|         break; |         break; | ||||||
|       default: |       default: | ||||||
|         vp9_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, |         vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, | ||||||
|                                   filter_x, x_step_q4, filter_y, y_step_q4, |                                   filter_x, x_step_q4, filter_y, y_step_q4, | ||||||
|                                   w, h); |                                   w, h); | ||||||
|         break; |         break; | ||||||
| @@ -773,7 +773,7 @@ void vp9_convolve8_avg_horiz_msa(const uint8_t *src, ptrdiff_t src_stride, | |||||||
|                                           filt_hor, h); |                                           filt_hor, h); | ||||||
|         break; |         break; | ||||||
|       default: |       default: | ||||||
|         vp9_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, |         vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, | ||||||
|                                   filter_x, x_step_q4, filter_y, y_step_q4, |                                   filter_x, x_step_q4, filter_y, y_step_q4, | ||||||
|                                   w, h); |                                   w, h); | ||||||
|         break; |         break; | ||||||
| @@ -8,8 +8,8 @@ | |||||||
|  *  be found in the AUTHORS file in the root of the source tree. |  *  be found in the AUTHORS file in the root of the source tree. | ||||||
|  */ |  */ | ||||||
| 
 | 
 | ||||||
| #include "./vp9_rtcd.h" | #include "./vpx_dsp_rtcd.h" | ||||||
| #include "vp9/common/mips/msa/vp9_convolve_msa.h" | #include "vpx_dsp/mips/vpx_convolve_msa.h" | ||||||
| 
 | 
 | ||||||
| static void common_hv_8ht_8vt_and_aver_dst_4w_msa(const uint8_t *src, | static void common_hv_8ht_8vt_and_aver_dst_4w_msa(const uint8_t *src, | ||||||
|                                                   int32_t src_stride, |                                                   int32_t src_stride, | ||||||
| @@ -576,7 +576,7 @@ static void common_hv_2ht_2vt_and_aver_dst_64w_msa(const uint8_t *src, | |||||||
|   } |   } | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| void vp9_convolve8_avg_msa(const uint8_t *src, ptrdiff_t src_stride, | void vpx_convolve8_avg_msa(const uint8_t *src, ptrdiff_t src_stride, | ||||||
|                            uint8_t *dst, ptrdiff_t dst_stride, |                            uint8_t *dst, ptrdiff_t dst_stride, | ||||||
|                            const int16_t *filter_x, int x_step_q4, |                            const int16_t *filter_x, int x_step_q4, | ||||||
|                            const int16_t *filter_y, int y_step_q4, |                            const int16_t *filter_y, int y_step_q4, | ||||||
| @@ -584,7 +584,7 @@ void vp9_convolve8_avg_msa(const uint8_t *src, ptrdiff_t src_stride, | |||||||
|   int8_t cnt, filt_hor[8], filt_ver[8]; |   int8_t cnt, filt_hor[8], filt_ver[8]; | ||||||
| 
 | 
 | ||||||
|   if (16 != x_step_q4 || 16 != y_step_q4) { |   if (16 != x_step_q4 || 16 != y_step_q4) { | ||||||
|     vp9_convolve8_avg_c(src, src_stride, dst, dst_stride, |     vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, | ||||||
|                         filter_x, x_step_q4, filter_y, y_step_q4, |                         filter_x, x_step_q4, filter_y, y_step_q4, | ||||||
|                         w, h); |                         w, h); | ||||||
|     return; |     return; | ||||||
| @@ -592,7 +592,7 @@ void vp9_convolve8_avg_msa(const uint8_t *src, ptrdiff_t src_stride, | |||||||
| 
 | 
 | ||||||
|   if (((const int32_t *)filter_x)[1] == 0x800000 && |   if (((const int32_t *)filter_x)[1] == 0x800000 && | ||||||
|       ((const int32_t *)filter_y)[1] == 0x800000) { |       ((const int32_t *)filter_y)[1] == 0x800000) { | ||||||
|     vp9_convolve_avg(src, src_stride, dst, dst_stride, |     vpx_convolve_avg(src, src_stride, dst, dst_stride, | ||||||
|                      filter_x, x_step_q4, filter_y, y_step_q4, |                      filter_x, x_step_q4, filter_y, y_step_q4, | ||||||
|                      w, h); |                      w, h); | ||||||
|     return; |     return; | ||||||
| @@ -632,14 +632,14 @@ void vp9_convolve8_avg_msa(const uint8_t *src, ptrdiff_t src_stride, | |||||||
|                                                &filt_hor[3], &filt_ver[3], h); |                                                &filt_hor[3], &filt_ver[3], h); | ||||||
|         break; |         break; | ||||||
|       default: |       default: | ||||||
|         vp9_convolve8_avg_c(src, src_stride, dst, dst_stride, |         vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, | ||||||
|                             filter_x, x_step_q4, filter_y, y_step_q4, |                             filter_x, x_step_q4, filter_y, y_step_q4, | ||||||
|                             w, h); |                             w, h); | ||||||
|         break; |         break; | ||||||
|     } |     } | ||||||
|   } else if (((const int32_t *)filter_x)[0] == 0 || |   } else if (((const int32_t *)filter_x)[0] == 0 || | ||||||
|              ((const int32_t *)filter_y)[0] == 0) { |              ((const int32_t *)filter_y)[0] == 0) { | ||||||
|     vp9_convolve8_avg_c(src, src_stride, dst, dst_stride, |     vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, | ||||||
|                         filter_x, x_step_q4, filter_y, y_step_q4, |                         filter_x, x_step_q4, filter_y, y_step_q4, | ||||||
|                         w, h); |                         w, h); | ||||||
|   } else { |   } else { | ||||||
| @@ -670,7 +670,7 @@ void vp9_convolve8_avg_msa(const uint8_t *src, ptrdiff_t src_stride, | |||||||
|                                                filt_hor, filt_ver, h); |                                                filt_hor, filt_ver, h); | ||||||
|         break; |         break; | ||||||
|       default: |       default: | ||||||
|         vp9_convolve8_avg_c(src, src_stride, dst, dst_stride, |         vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, | ||||||
|                             filter_x, x_step_q4, filter_y, y_step_q4, |                             filter_x, x_step_q4, filter_y, y_step_q4, | ||||||
|                             w, h); |                             w, h); | ||||||
|         break; |         break; | ||||||
| @@ -8,8 +8,8 @@ | |||||||
|  *  be found in the AUTHORS file in the root of the source tree. |  *  be found in the AUTHORS file in the root of the source tree. | ||||||
|  */ |  */ | ||||||
| 
 | 
 | ||||||
| #include "./vp9_rtcd.h" | #include "./vpx_dsp_rtcd.h" | ||||||
| #include "vp9/common/mips/msa/vp9_convolve_msa.h" | #include "vpx_dsp/mips/vpx_convolve_msa.h" | ||||||
| 
 | 
 | ||||||
| static void common_vt_8t_and_aver_dst_4w_msa(const uint8_t *src, | static void common_vt_8t_and_aver_dst_4w_msa(const uint8_t *src, | ||||||
|                                              int32_t src_stride, |                                              int32_t src_stride, | ||||||
| @@ -657,7 +657,7 @@ static void common_vt_2t_and_aver_dst_64w_msa(const uint8_t *src, | |||||||
|   } |   } | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| void vp9_convolve8_avg_vert_msa(const uint8_t *src, ptrdiff_t src_stride, | void vpx_convolve8_avg_vert_msa(const uint8_t *src, ptrdiff_t src_stride, | ||||||
|                                 uint8_t *dst, ptrdiff_t dst_stride, |                                 uint8_t *dst, ptrdiff_t dst_stride, | ||||||
|                                 const int16_t *filter_x, int x_step_q4, |                                 const int16_t *filter_x, int x_step_q4, | ||||||
|                                 const int16_t *filter_y, int y_step_q4, |                                 const int16_t *filter_y, int y_step_q4, | ||||||
| @@ -665,14 +665,14 @@ void vp9_convolve8_avg_vert_msa(const uint8_t *src, ptrdiff_t src_stride, | |||||||
|   int8_t cnt, filt_ver[8]; |   int8_t cnt, filt_ver[8]; | ||||||
| 
 | 
 | ||||||
|   if (16 != y_step_q4) { |   if (16 != y_step_q4) { | ||||||
|     vp9_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, |     vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, | ||||||
|                              filter_x, x_step_q4, filter_y, y_step_q4, |                              filter_x, x_step_q4, filter_y, y_step_q4, | ||||||
|                              w, h); |                              w, h); | ||||||
|     return; |     return; | ||||||
|   } |   } | ||||||
| 
 | 
 | ||||||
|   if (((const int32_t *)filter_y)[1] == 0x800000) { |   if (((const int32_t *)filter_y)[1] == 0x800000) { | ||||||
|     vp9_convolve_avg(src, src_stride, dst, dst_stride, |     vpx_convolve_avg(src, src_stride, dst, dst_stride, | ||||||
|                      filter_x, x_step_q4, filter_y, y_step_q4, |                      filter_x, x_step_q4, filter_y, y_step_q4, | ||||||
|                      w, h); |                      w, h); | ||||||
|     return; |     return; | ||||||
| @@ -710,7 +710,7 @@ void vp9_convolve8_avg_vert_msa(const uint8_t *src, ptrdiff_t src_stride, | |||||||
|                                           &filt_ver[3], h); |                                           &filt_ver[3], h); | ||||||
|         break; |         break; | ||||||
|       default: |       default: | ||||||
|         vp9_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, |         vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, | ||||||
|                                  filter_x, x_step_q4, filter_y, y_step_q4, |                                  filter_x, x_step_q4, filter_y, y_step_q4, | ||||||
|                                  w, h); |                                  w, h); | ||||||
|         break; |         break; | ||||||
| @@ -744,7 +744,7 @@ void vp9_convolve8_avg_vert_msa(const uint8_t *src, ptrdiff_t src_stride, | |||||||
|                                           filt_ver, h); |                                           filt_ver, h); | ||||||
|         break; |         break; | ||||||
|       default: |       default: | ||||||
|         vp9_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, |         vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, | ||||||
|                                  filter_x, x_step_q4, filter_y, y_step_q4, |                                  filter_x, x_step_q4, filter_y, y_step_q4, | ||||||
|                                  w, h); |                                  w, h); | ||||||
|         break; |         break; | ||||||
| @@ -8,8 +8,8 @@ | |||||||
|  *  be found in the AUTHORS file in the root of the source tree. |  *  be found in the AUTHORS file in the root of the source tree. | ||||||
|  */ |  */ | ||||||
| 
 | 
 | ||||||
| #include "./vp9_rtcd.h" | #include "./vpx_dsp_rtcd.h" | ||||||
| #include "vp9/common/mips/msa/vp9_convolve_msa.h" | #include "vpx_dsp/mips/vpx_convolve_msa.h" | ||||||
| 
 | 
 | ||||||
| static void common_hz_8t_4x4_msa(const uint8_t *src, int32_t src_stride, | static void common_hz_8t_4x4_msa(const uint8_t *src, int32_t src_stride, | ||||||
|                                  uint8_t *dst, int32_t dst_stride, |                                  uint8_t *dst, int32_t dst_stride, | ||||||
| @@ -647,7 +647,7 @@ static void common_hz_2t_64w_msa(const uint8_t *src, int32_t src_stride, | |||||||
|   } |   } | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| void vp9_convolve8_horiz_msa(const uint8_t *src, ptrdiff_t src_stride, | void vpx_convolve8_horiz_msa(const uint8_t *src, ptrdiff_t src_stride, | ||||||
|                              uint8_t *dst, ptrdiff_t dst_stride, |                              uint8_t *dst, ptrdiff_t dst_stride, | ||||||
|                              const int16_t *filter_x, int x_step_q4, |                              const int16_t *filter_x, int x_step_q4, | ||||||
|                              const int16_t *filter_y, int y_step_q4, |                              const int16_t *filter_y, int y_step_q4, | ||||||
| @@ -655,14 +655,14 @@ void vp9_convolve8_horiz_msa(const uint8_t *src, ptrdiff_t src_stride, | |||||||
|   int8_t cnt, filt_hor[8]; |   int8_t cnt, filt_hor[8]; | ||||||
| 
 | 
 | ||||||
|   if (16 != x_step_q4) { |   if (16 != x_step_q4) { | ||||||
|     vp9_convolve8_horiz_c(src, src_stride, dst, dst_stride, |     vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, | ||||||
|                           filter_x, x_step_q4, filter_y, y_step_q4, |                           filter_x, x_step_q4, filter_y, y_step_q4, | ||||||
|                           w, h); |                           w, h); | ||||||
|     return; |     return; | ||||||
|   } |   } | ||||||
| 
 | 
 | ||||||
|   if (((const int32_t *)filter_x)[1] == 0x800000) { |   if (((const int32_t *)filter_x)[1] == 0x800000) { | ||||||
|     vp9_convolve_copy(src, src_stride, dst, dst_stride, |     vpx_convolve_copy(src, src_stride, dst, dst_stride, | ||||||
|                       filter_x, x_step_q4, filter_y, y_step_q4, |                       filter_x, x_step_q4, filter_y, y_step_q4, | ||||||
|                       w, h); |                       w, h); | ||||||
|     return; |     return; | ||||||
| @@ -700,7 +700,7 @@ void vp9_convolve8_horiz_msa(const uint8_t *src, ptrdiff_t src_stride, | |||||||
|                              &filt_hor[3], h); |                              &filt_hor[3], h); | ||||||
|         break; |         break; | ||||||
|       default: |       default: | ||||||
|         vp9_convolve8_horiz_c(src, src_stride, dst, dst_stride, |         vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, | ||||||
|                               filter_x, x_step_q4, filter_y, y_step_q4, |                               filter_x, x_step_q4, filter_y, y_step_q4, | ||||||
|                               w, h); |                               w, h); | ||||||
|         break; |         break; | ||||||
| @@ -733,7 +733,7 @@ void vp9_convolve8_horiz_msa(const uint8_t *src, ptrdiff_t src_stride, | |||||||
|                              filt_hor, h); |                              filt_hor, h); | ||||||
|         break; |         break; | ||||||
|       default: |       default: | ||||||
|         vp9_convolve8_horiz_c(src, src_stride, dst, dst_stride, |         vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, | ||||||
|                               filter_x, x_step_q4, filter_y, y_step_q4, |                               filter_x, x_step_q4, filter_y, y_step_q4, | ||||||
|                               w, h); |                               w, h); | ||||||
|         break; |         break; | ||||||
| @@ -8,8 +8,8 @@ | |||||||
|  *  be found in the AUTHORS file in the root of the source tree. |  *  be found in the AUTHORS file in the root of the source tree. | ||||||
|  */ |  */ | ||||||
| 
 | 
 | ||||||
| #include "./vp9_rtcd.h" | #include "./vpx_dsp_rtcd.h" | ||||||
| #include "vp9/common/mips/msa/vp9_convolve_msa.h" | #include "vpx_dsp/mips/vpx_convolve_msa.h" | ||||||
| 
 | 
 | ||||||
| const uint8_t mc_filt_mask_arr[16 * 3] = { | const uint8_t mc_filt_mask_arr[16 * 3] = { | ||||||
|   /* 8 width cases */ |   /* 8 width cases */ | ||||||
| @@ -551,7 +551,7 @@ static void common_hv_2ht_2vt_64w_msa(const uint8_t *src, int32_t src_stride, | |||||||
|   } |   } | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| void vp9_convolve8_msa(const uint8_t *src, ptrdiff_t src_stride, | void vpx_convolve8_msa(const uint8_t *src, ptrdiff_t src_stride, | ||||||
|                        uint8_t *dst, ptrdiff_t dst_stride, |                        uint8_t *dst, ptrdiff_t dst_stride, | ||||||
|                        const int16_t *filter_x, int32_t x_step_q4, |                        const int16_t *filter_x, int32_t x_step_q4, | ||||||
|                        const int16_t *filter_y, int32_t y_step_q4, |                        const int16_t *filter_y, int32_t y_step_q4, | ||||||
| @@ -559,7 +559,7 @@ void vp9_convolve8_msa(const uint8_t *src, ptrdiff_t src_stride, | |||||||
|   int8_t cnt, filt_hor[8], filt_ver[8]; |   int8_t cnt, filt_hor[8], filt_ver[8]; | ||||||
| 
 | 
 | ||||||
|   if (16 != x_step_q4 || 16 != y_step_q4) { |   if (16 != x_step_q4 || 16 != y_step_q4) { | ||||||
|     vp9_convolve8_c(src, src_stride, dst, dst_stride, |     vpx_convolve8_c(src, src_stride, dst, dst_stride, | ||||||
|                     filter_x, x_step_q4, filter_y, y_step_q4, |                     filter_x, x_step_q4, filter_y, y_step_q4, | ||||||
|                     w, h); |                     w, h); | ||||||
|     return; |     return; | ||||||
| @@ -567,7 +567,7 @@ void vp9_convolve8_msa(const uint8_t *src, ptrdiff_t src_stride, | |||||||
| 
 | 
 | ||||||
|   if (((const int32_t *)filter_x)[1] == 0x800000 && |   if (((const int32_t *)filter_x)[1] == 0x800000 && | ||||||
|       ((const int32_t *)filter_y)[1] == 0x800000) { |       ((const int32_t *)filter_y)[1] == 0x800000) { | ||||||
|     vp9_convolve_copy(src, src_stride, dst, dst_stride, |     vpx_convolve_copy(src, src_stride, dst, dst_stride, | ||||||
|                       filter_x, x_step_q4, filter_y, y_step_q4, |                       filter_x, x_step_q4, filter_y, y_step_q4, | ||||||
|                       w, h); |                       w, h); | ||||||
|     return; |     return; | ||||||
| @@ -607,14 +607,14 @@ void vp9_convolve8_msa(const uint8_t *src, ptrdiff_t src_stride, | |||||||
|                                   &filt_hor[3], &filt_ver[3], (int32_t)h); |                                   &filt_hor[3], &filt_ver[3], (int32_t)h); | ||||||
|         break; |         break; | ||||||
|       default: |       default: | ||||||
|         vp9_convolve8_c(src, src_stride, dst, dst_stride, |         vpx_convolve8_c(src, src_stride, dst, dst_stride, | ||||||
|                         filter_x, x_step_q4, filter_y, y_step_q4, |                         filter_x, x_step_q4, filter_y, y_step_q4, | ||||||
|                         w, h); |                         w, h); | ||||||
|         break; |         break; | ||||||
|     } |     } | ||||||
|   } else if (((const int32_t *)filter_x)[0] == 0 || |   } else if (((const int32_t *)filter_x)[0] == 0 || | ||||||
|              ((const int32_t *)filter_y)[0] == 0) { |              ((const int32_t *)filter_y)[0] == 0) { | ||||||
|     vp9_convolve8_c(src, src_stride, dst, dst_stride, |     vpx_convolve8_c(src, src_stride, dst, dst_stride, | ||||||
|                     filter_x, x_step_q4, filter_y, y_step_q4, |                     filter_x, x_step_q4, filter_y, y_step_q4, | ||||||
|                     w, h); |                     w, h); | ||||||
|   } else { |   } else { | ||||||
| @@ -645,7 +645,7 @@ void vp9_convolve8_msa(const uint8_t *src, ptrdiff_t src_stride, | |||||||
|                                   filt_hor, filt_ver, (int32_t)h); |                                   filt_hor, filt_ver, (int32_t)h); | ||||||
|         break; |         break; | ||||||
|       default: |       default: | ||||||
|         vp9_convolve8_c(src, src_stride, dst, dst_stride, |         vpx_convolve8_c(src, src_stride, dst, dst_stride, | ||||||
|                         filter_x, x_step_q4, filter_y, y_step_q4, |                         filter_x, x_step_q4, filter_y, y_step_q4, | ||||||
|                         w, h); |                         w, h); | ||||||
|         break; |         break; | ||||||
| @@ -8,8 +8,8 @@ | |||||||
|  *  be found in the AUTHORS file in the root of the source tree. |  *  be found in the AUTHORS file in the root of the source tree. | ||||||
|  */ |  */ | ||||||
| 
 | 
 | ||||||
| #include "./vp9_rtcd.h" | #include "./vpx_dsp_rtcd.h" | ||||||
| #include "vp9/common/mips/msa/vp9_convolve_msa.h" | #include "vpx_dsp/mips/vpx_convolve_msa.h" | ||||||
| 
 | 
 | ||||||
| static void common_vt_8t_4w_msa(const uint8_t *src, int32_t src_stride, | static void common_vt_8t_4w_msa(const uint8_t *src, int32_t src_stride, | ||||||
|                                 uint8_t *dst, int32_t dst_stride, |                                 uint8_t *dst, int32_t dst_stride, | ||||||
| @@ -650,7 +650,7 @@ static void common_vt_2t_64w_msa(const uint8_t *src, int32_t src_stride, | |||||||
|   } |   } | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| void vp9_convolve8_vert_msa(const uint8_t *src, ptrdiff_t src_stride, | void vpx_convolve8_vert_msa(const uint8_t *src, ptrdiff_t src_stride, | ||||||
|                             uint8_t *dst, ptrdiff_t dst_stride, |                             uint8_t *dst, ptrdiff_t dst_stride, | ||||||
|                             const int16_t *filter_x, int x_step_q4, |                             const int16_t *filter_x, int x_step_q4, | ||||||
|                             const int16_t *filter_y, int y_step_q4, |                             const int16_t *filter_y, int y_step_q4, | ||||||
| @@ -658,14 +658,14 @@ void vp9_convolve8_vert_msa(const uint8_t *src, ptrdiff_t src_stride, | |||||||
|   int8_t cnt, filt_ver[8]; |   int8_t cnt, filt_ver[8]; | ||||||
| 
 | 
 | ||||||
|   if (16 != y_step_q4) { |   if (16 != y_step_q4) { | ||||||
|     vp9_convolve8_vert_c(src, src_stride, dst, dst_stride, |     vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, | ||||||
|                          filter_x, x_step_q4, filter_y, y_step_q4, |                          filter_x, x_step_q4, filter_y, y_step_q4, | ||||||
|                          w, h); |                          w, h); | ||||||
|     return; |     return; | ||||||
|   } |   } | ||||||
| 
 | 
 | ||||||
|   if (((const int32_t *)filter_y)[1] == 0x800000) { |   if (((const int32_t *)filter_y)[1] == 0x800000) { | ||||||
|     vp9_convolve_copy(src, src_stride, dst, dst_stride, |     vpx_convolve_copy(src, src_stride, dst, dst_stride, | ||||||
|                       filter_x, x_step_q4, filter_y, y_step_q4, |                       filter_x, x_step_q4, filter_y, y_step_q4, | ||||||
|                       w, h); |                       w, h); | ||||||
|     return; |     return; | ||||||
| @@ -703,7 +703,7 @@ void vp9_convolve8_vert_msa(const uint8_t *src, ptrdiff_t src_stride, | |||||||
|                              &filt_ver[3], h); |                              &filt_ver[3], h); | ||||||
|         break; |         break; | ||||||
|       default: |       default: | ||||||
|         vp9_convolve8_vert_c(src, src_stride, dst, dst_stride, |         vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, | ||||||
|                              filter_x, x_step_q4, filter_y, y_step_q4, |                              filter_x, x_step_q4, filter_y, y_step_q4, | ||||||
|                              w, h); |                              w, h); | ||||||
|         break; |         break; | ||||||
| @@ -736,7 +736,7 @@ void vp9_convolve8_vert_msa(const uint8_t *src, ptrdiff_t src_stride, | |||||||
|                              filt_ver, h); |                              filt_ver, h); | ||||||
|         break; |         break; | ||||||
|       default: |       default: | ||||||
|         vp9_convolve8_vert_c(src, src_stride, dst, dst_stride, |         vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, | ||||||
|                              filter_x, x_step_q4, filter_y, y_step_q4, |                              filter_x, x_step_q4, filter_y, y_step_q4, | ||||||
|                              w, h); |                              w, h); | ||||||
|         break; |         break; | ||||||
| @@ -186,7 +186,7 @@ static void avg_width64_msa(const uint8_t *src, int32_t src_stride, | |||||||
|   } |   } | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| void vp9_convolve_avg_msa(const uint8_t *src, ptrdiff_t src_stride, | void vpx_convolve_avg_msa(const uint8_t *src, ptrdiff_t src_stride, | ||||||
|                           uint8_t *dst, ptrdiff_t dst_stride, |                           uint8_t *dst, ptrdiff_t dst_stride, | ||||||
|                           const int16_t *filter_x, int32_t filter_x_stride, |                           const int16_t *filter_x, int32_t filter_x_stride, | ||||||
|                           const int16_t *filter_y, int32_t filter_y_stride, |                           const int16_t *filter_y, int32_t filter_y_stride, | ||||||
| @@ -196,7 +196,7 @@ static void copy_width64_msa(const uint8_t *src, int32_t src_stride, | |||||||
|   copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 64); |   copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 64); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| void vp9_convolve_copy_msa(const uint8_t *src, ptrdiff_t src_stride, | void vpx_convolve_copy_msa(const uint8_t *src, ptrdiff_t src_stride, | ||||||
|                            uint8_t *dst, ptrdiff_t dst_stride, |                            uint8_t *dst, ptrdiff_t dst_stride, | ||||||
|                            const int16_t *filter_x, int32_t filter_x_stride, |                            const int16_t *filter_x, int32_t filter_x_stride, | ||||||
|                            const int16_t *filter_y, int32_t filter_y_stride, |                            const int16_t *filter_y, int32_t filter_y_stride, | ||||||
| @@ -8,11 +8,11 @@ | |||||||
|  *  be found in the AUTHORS file in the root of the source tree. |  *  be found in the AUTHORS file in the root of the source tree. | ||||||
|  */ |  */ | ||||||
| 
 | 
 | ||||||
| #ifndef VP9_COMMON_MIPS_MSA_VP9_CONVOLVE_MSA_H_ | #ifndef VPX_DSP_MIPS_VPX_CONVOLVE_MSA_H_ | ||||||
| #define VP9_COMMON_MIPS_MSA_VP9_CONVOLVE_MSA_H_ | #define VPX_DSP_MIPS_VPX_CONVOLVE_MSA_H_ | ||||||
| 
 | 
 | ||||||
| #include "vp9/common/vp9_filter.h" |  | ||||||
| #include "vpx_dsp/mips/macros_msa.h" | #include "vpx_dsp/mips/macros_msa.h" | ||||||
|  | #include "vpx_dsp/vpx_filter.h" | ||||||
| 
 | 
 | ||||||
| extern const uint8_t mc_filt_mask_arr[16 * 3]; | extern const uint8_t mc_filt_mask_arr[16 * 3]; | ||||||
| 
 | 
 | ||||||
| @@ -116,4 +116,4 @@ extern const uint8_t mc_filt_mask_arr[16 * 3]; | |||||||
|   AVER_UB2_UB(tmp0_m, tmp2_m, tmp1_m, tmp3_m, tmp0_m, tmp1_m);          \ |   AVER_UB2_UB(tmp0_m, tmp2_m, tmp1_m, tmp3_m, tmp0_m, tmp1_m);          \ | ||||||
|   ST8x4_UB(tmp0_m, tmp1_m, pdst_m, stride);                             \ |   ST8x4_UB(tmp0_m, tmp1_m, pdst_m, stride);                             \ | ||||||
| } | } | ||||||
| #endif  /* VP9_COMMON_MIPS_MSA_VP9_CONVOLVE_MSA_H_ */ | #endif  /* VPX_DSP_MIPS_VPX_CONVOLVE_MSA_H_ */ | ||||||
| @@ -9,13 +9,14 @@ | |||||||
|  */ |  */ | ||||||
| 
 | 
 | ||||||
| #include <assert.h> | #include <assert.h> | ||||||
|  | #include <string.h> | ||||||
| 
 | 
 | ||||||
| #include "./vpx_config.h" | #include "./vpx_config.h" | ||||||
| #include "./vp9_rtcd.h" | #include "./vpx_dsp_rtcd.h" | ||||||
| #include "vp9/common/vp9_common.h" |  | ||||||
| #include "vp9/common/vp9_convolve.h" |  | ||||||
| #include "vp9/common/vp9_filter.h" |  | ||||||
| #include "vpx/vpx_integer.h" | #include "vpx/vpx_integer.h" | ||||||
|  | #include "vpx_dsp/vpx_convolve.h" | ||||||
|  | #include "vpx_dsp/vpx_dsp_common.h" | ||||||
|  | #include "vpx_dsp/vpx_filter.h" | ||||||
| #include "vpx_ports/mem.h" | #include "vpx_ports/mem.h" | ||||||
| 
 | 
 | ||||||
| static void convolve_horiz(const uint8_t *src, ptrdiff_t src_stride, | static void convolve_horiz(const uint8_t *src, ptrdiff_t src_stride, | ||||||
| @@ -154,7 +155,7 @@ static int get_filter_offset(const int16_t *f, const InterpKernel *base) { | |||||||
|   return (int)((const InterpKernel *)(intptr_t)f - base); |   return (int)((const InterpKernel *)(intptr_t)f - base); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| void vp9_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, | void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, | ||||||
|                            uint8_t *dst, ptrdiff_t dst_stride, |                            uint8_t *dst, ptrdiff_t dst_stride, | ||||||
|                            const int16_t *filter_x, int x_step_q4, |                            const int16_t *filter_x, int x_step_q4, | ||||||
|                            const int16_t *filter_y, int y_step_q4, |                            const int16_t *filter_y, int y_step_q4, | ||||||
| @@ -169,7 +170,7 @@ void vp9_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, | |||||||
|                  x0_q4, x_step_q4, w, h); |                  x0_q4, x_step_q4, w, h); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| void vp9_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, | void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, | ||||||
|                                uint8_t *dst, ptrdiff_t dst_stride, |                                uint8_t *dst, ptrdiff_t dst_stride, | ||||||
|                                const int16_t *filter_x, int x_step_q4, |                                const int16_t *filter_x, int x_step_q4, | ||||||
|                                const int16_t *filter_y, int y_step_q4, |                                const int16_t *filter_y, int y_step_q4, | ||||||
| @@ -184,7 +185,7 @@ void vp9_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, | |||||||
|                      x0_q4, x_step_q4, w, h); |                      x0_q4, x_step_q4, w, h); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| void vp9_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, | void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, | ||||||
|                           uint8_t *dst, ptrdiff_t dst_stride, |                           uint8_t *dst, ptrdiff_t dst_stride, | ||||||
|                           const int16_t *filter_x, int x_step_q4, |                           const int16_t *filter_x, int x_step_q4, | ||||||
|                           const int16_t *filter_y, int y_step_q4, |                           const int16_t *filter_y, int y_step_q4, | ||||||
| @@ -199,7 +200,7 @@ void vp9_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, | |||||||
|                 y0_q4, y_step_q4, w, h); |                 y0_q4, y_step_q4, w, h); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| void vp9_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, | void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, | ||||||
|                               uint8_t *dst, ptrdiff_t dst_stride, |                               uint8_t *dst, ptrdiff_t dst_stride, | ||||||
|                               const int16_t *filter_x, int x_step_q4, |                               const int16_t *filter_x, int x_step_q4, | ||||||
|                               const int16_t *filter_y, int y_step_q4, |                               const int16_t *filter_y, int y_step_q4, | ||||||
| @@ -214,7 +215,7 @@ void vp9_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, | |||||||
|                     y0_q4, y_step_q4, w, h); |                     y0_q4, y_step_q4, w, h); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| void vp9_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, | void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, | ||||||
|                      uint8_t *dst, ptrdiff_t dst_stride, |                      uint8_t *dst, ptrdiff_t dst_stride, | ||||||
|                      const int16_t *filter_x, int x_step_q4, |                      const int16_t *filter_x, int x_step_q4, | ||||||
|                      const int16_t *filter_y, int y_step_q4, |                      const int16_t *filter_y, int y_step_q4, | ||||||
| @@ -230,7 +231,7 @@ void vp9_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, | |||||||
|            filters_y, y0_q4, y_step_q4, w, h); |            filters_y, y0_q4, y_step_q4, w, h); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| void vp9_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, | void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, | ||||||
|                          uint8_t *dst, ptrdiff_t dst_stride, |                          uint8_t *dst, ptrdiff_t dst_stride, | ||||||
|                          const int16_t *filter_x, int x_step_q4, |                          const int16_t *filter_x, int x_step_q4, | ||||||
|                          const int16_t *filter_y, int y_step_q4, |                          const int16_t *filter_y, int y_step_q4, | ||||||
| @@ -240,12 +241,12 @@ void vp9_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, | |||||||
|   assert(w <= 64); |   assert(w <= 64); | ||||||
|   assert(h <= 64); |   assert(h <= 64); | ||||||
| 
 | 
 | ||||||
|   vp9_convolve8_c(src, src_stride, temp, 64, |   vpx_convolve8_c(src, src_stride, temp, 64, | ||||||
|                   filter_x, x_step_q4, filter_y, y_step_q4, w, h); |                   filter_x, x_step_q4, filter_y, y_step_q4, w, h); | ||||||
|   vp9_convolve_avg_c(temp, 64, dst, dst_stride, NULL, 0, NULL, 0, w, h); |   vpx_convolve_avg_c(temp, 64, dst, dst_stride, NULL, 0, NULL, 0, w, h); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| void vp9_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, | void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, | ||||||
|                          uint8_t *dst, ptrdiff_t dst_stride, |                          uint8_t *dst, ptrdiff_t dst_stride, | ||||||
|                          const int16_t *filter_x, int filter_x_stride, |                          const int16_t *filter_x, int filter_x_stride, | ||||||
|                          const int16_t *filter_y, int filter_y_stride, |                          const int16_t *filter_y, int filter_y_stride, | ||||||
| @@ -262,7 +263,7 @@ void vp9_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, | |||||||
|   } |   } | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| void vp9_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, | void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, | ||||||
|                         uint8_t *dst, ptrdiff_t dst_stride, |                         uint8_t *dst, ptrdiff_t dst_stride, | ||||||
|                         const int16_t *filter_x, int filter_x_stride, |                         const int16_t *filter_x, int filter_x_stride, | ||||||
|                         const int16_t *filter_y, int filter_y_stride, |                         const int16_t *filter_y, int filter_y_stride, | ||||||
| @@ -423,7 +424,7 @@ static void highbd_convolve(const uint8_t *src, ptrdiff_t src_stride, | |||||||
| } | } | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| void vp9_highbd_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, | void vpx_highbd_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, | ||||||
|                                   uint8_t *dst, ptrdiff_t dst_stride, |                                   uint8_t *dst, ptrdiff_t dst_stride, | ||||||
|                                   const int16_t *filter_x, int x_step_q4, |                                   const int16_t *filter_x, int x_step_q4, | ||||||
|                                   const int16_t *filter_y, int y_step_q4, |                                   const int16_t *filter_y, int y_step_q4, | ||||||
| @@ -437,7 +438,7 @@ void vp9_highbd_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, | |||||||
|                         x0_q4, x_step_q4, w, h, bd); |                         x0_q4, x_step_q4, w, h, bd); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| void vp9_highbd_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, | void vpx_highbd_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, | ||||||
|                                       uint8_t *dst, ptrdiff_t dst_stride, |                                       uint8_t *dst, ptrdiff_t dst_stride, | ||||||
|                                       const int16_t *filter_x, int x_step_q4, |                                       const int16_t *filter_x, int x_step_q4, | ||||||
|                                       const int16_t *filter_y, int y_step_q4, |                                       const int16_t *filter_y, int y_step_q4, | ||||||
| @@ -451,7 +452,7 @@ void vp9_highbd_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, | |||||||
|                             x0_q4, x_step_q4, w, h, bd); |                             x0_q4, x_step_q4, w, h, bd); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| void vp9_highbd_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, | void vpx_highbd_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, | ||||||
|                                  uint8_t *dst, ptrdiff_t dst_stride, |                                  uint8_t *dst, ptrdiff_t dst_stride, | ||||||
|                                  const int16_t *filter_x, int x_step_q4, |                                  const int16_t *filter_x, int x_step_q4, | ||||||
|                                  const int16_t *filter_y, int y_step_q4, |                                  const int16_t *filter_y, int y_step_q4, | ||||||
| @@ -465,7 +466,7 @@ void vp9_highbd_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, | |||||||
|                        y0_q4, y_step_q4, w, h, bd); |                        y0_q4, y_step_q4, w, h, bd); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| void vp9_highbd_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, | void vpx_highbd_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, | ||||||
|                                      uint8_t *dst, ptrdiff_t dst_stride, |                                      uint8_t *dst, ptrdiff_t dst_stride, | ||||||
|                                      const int16_t *filter_x, int x_step_q4, |                                      const int16_t *filter_x, int x_step_q4, | ||||||
|                                      const int16_t *filter_y, int y_step_q4, |                                      const int16_t *filter_y, int y_step_q4, | ||||||
| @@ -479,7 +480,7 @@ void vp9_highbd_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, | |||||||
|                            y0_q4, y_step_q4, w, h, bd); |                            y0_q4, y_step_q4, w, h, bd); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| void vp9_highbd_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, | void vpx_highbd_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, | ||||||
|                             uint8_t *dst, ptrdiff_t dst_stride, |                             uint8_t *dst, ptrdiff_t dst_stride, | ||||||
|                             const int16_t *filter_x, int x_step_q4, |                             const int16_t *filter_x, int x_step_q4, | ||||||
|                             const int16_t *filter_y, int y_step_q4, |                             const int16_t *filter_y, int y_step_q4, | ||||||
| @@ -495,7 +496,7 @@ void vp9_highbd_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, | |||||||
|                   filters_y, y0_q4, y_step_q4, w, h, bd); |                   filters_y, y0_q4, y_step_q4, w, h, bd); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| void vp9_highbd_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, | void vpx_highbd_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, | ||||||
|                                 uint8_t *dst, ptrdiff_t dst_stride, |                                 uint8_t *dst, ptrdiff_t dst_stride, | ||||||
|                                 const int16_t *filter_x, int x_step_q4, |                                 const int16_t *filter_x, int x_step_q4, | ||||||
|                                 const int16_t *filter_y, int y_step_q4, |                                 const int16_t *filter_y, int y_step_q4, | ||||||
| @@ -505,13 +506,13 @@ void vp9_highbd_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, | |||||||
|   assert(w <= 64); |   assert(w <= 64); | ||||||
|   assert(h <= 64); |   assert(h <= 64); | ||||||
| 
 | 
 | ||||||
|   vp9_highbd_convolve8_c(src, src_stride, CONVERT_TO_BYTEPTR(temp), 64, |   vpx_highbd_convolve8_c(src, src_stride, CONVERT_TO_BYTEPTR(temp), 64, | ||||||
|                          filter_x, x_step_q4, filter_y, y_step_q4, w, h, bd); |                          filter_x, x_step_q4, filter_y, y_step_q4, w, h, bd); | ||||||
|   vp9_highbd_convolve_avg_c(CONVERT_TO_BYTEPTR(temp), 64, dst, dst_stride, |   vpx_highbd_convolve_avg_c(CONVERT_TO_BYTEPTR(temp), 64, dst, dst_stride, | ||||||
|                             NULL, 0, NULL, 0, w, h, bd); |                             NULL, 0, NULL, 0, w, h, bd); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| void vp9_highbd_convolve_copy_c(const uint8_t *src8, ptrdiff_t src_stride, | void vpx_highbd_convolve_copy_c(const uint8_t *src8, ptrdiff_t src_stride, | ||||||
|                                 uint8_t *dst8, ptrdiff_t dst_stride, |                                 uint8_t *dst8, ptrdiff_t dst_stride, | ||||||
|                                 const int16_t *filter_x, int filter_x_stride, |                                 const int16_t *filter_x, int filter_x_stride, | ||||||
|                                 const int16_t *filter_y, int filter_y_stride, |                                 const int16_t *filter_y, int filter_y_stride, | ||||||
| @@ -532,7 +533,7 @@ void vp9_highbd_convolve_copy_c(const uint8_t *src8, ptrdiff_t src_stride, | |||||||
|   } |   } | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| void vp9_highbd_convolve_avg_c(const uint8_t *src8, ptrdiff_t src_stride, | void vpx_highbd_convolve_avg_c(const uint8_t *src8, ptrdiff_t src_stride, | ||||||
|                                uint8_t *dst8, ptrdiff_t dst_stride, |                                uint8_t *dst8, ptrdiff_t dst_stride, | ||||||
|                                const int16_t *filter_x, int filter_x_stride, |                                const int16_t *filter_x, int filter_x_stride, | ||||||
|                                const int16_t *filter_y, int filter_y_stride, |                                const int16_t *filter_y, int filter_y_stride, | ||||||
| @@ -7,8 +7,8 @@ | |||||||
|  *  in the file PATENTS.  All contributing project authors may |  *  in the file PATENTS.  All contributing project authors may | ||||||
|  *  be found in the AUTHORS file in the root of the source tree. |  *  be found in the AUTHORS file in the root of the source tree. | ||||||
|  */ |  */ | ||||||
| #ifndef VP9_COMMON_VP9_CONVOLVE_H_ | #ifndef VPX_DSP_VPX_CONVOLVE_H_ | ||||||
| #define VP9_COMMON_VP9_CONVOLVE_H_ | #define VPX_DSP_VPX_CONVOLVE_H_ | ||||||
| 
 | 
 | ||||||
| #include "./vpx_config.h" | #include "./vpx_config.h" | ||||||
| #include "vpx/vpx_integer.h" | #include "vpx/vpx_integer.h" | ||||||
| @@ -35,4 +35,4 @@ typedef void (*highbd_convolve_fn_t)(const uint8_t *src, ptrdiff_t src_stride, | |||||||
| }  // extern "C"
 | }  // extern "C"
 | ||||||
| #endif | #endif | ||||||
| 
 | 
 | ||||||
| #endif  // VP9_COMMON_VP9_CONVOLVE_H_
 | #endif  // VPX_DSP_VPX_CONVOLVE_H_
 | ||||||
| @@ -54,6 +54,54 @@ DSP_SRCS-$(HAVE_DSPR2)  += mips/intrapred8_dspr2.c | |||||||
| DSP_SRCS-$(HAVE_DSPR2)  += mips/intrapred16_dspr2.c | DSP_SRCS-$(HAVE_DSPR2)  += mips/intrapred16_dspr2.c | ||||||
| endif  # CONFIG_VP9 | endif  # CONFIG_VP9 | ||||||
|  |  | ||||||
|  | # interpolation filters | ||||||
|  | DSP_SRCS-yes += vpx_convolve.c | ||||||
|  | DSP_SRCS-yes += vpx_convolve.h | ||||||
|  | DSP_SRCS-yes += vpx_filter.h | ||||||
|  |  | ||||||
|  | DSP_SRCS-$(ARCH_X86)$(ARCH_X86_64) += x86/convolve.h | ||||||
|  | DSP_SRCS-$(ARCH_X86)$(ARCH_X86_64) += x86/vpx_asm_stubs.c | ||||||
|  | DSP_SRCS-$(HAVE_SSE2)  += x86/vpx_subpixel_8t_sse2.asm | ||||||
|  | DSP_SRCS-$(HAVE_SSE2)  += x86/vpx_subpixel_bilinear_sse2.asm | ||||||
|  | DSP_SRCS-$(HAVE_SSSE3) += x86/vpx_subpixel_8t_ssse3.asm | ||||||
|  | DSP_SRCS-$(HAVE_SSSE3) += x86/vpx_subpixel_bilinear_ssse3.asm | ||||||
|  | DSP_SRCS-$(HAVE_AVX2)  += x86/vpx_subpixel_8t_intrin_avx2.c | ||||||
|  | DSP_SRCS-$(HAVE_SSSE3) += x86/vpx_subpixel_8t_intrin_ssse3.c | ||||||
|  | ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes) | ||||||
|  | DSP_SRCS-$(HAVE_SSE2)  += x86/vpx_high_subpixel_8t_sse2.asm | ||||||
|  | DSP_SRCS-$(HAVE_SSE2)  += x86/vpx_high_subpixel_bilinear_sse2.asm | ||||||
|  | endif | ||||||
|  | ifeq ($(CONFIG_USE_X86INC),yes) | ||||||
|  | DSP_SRCS-$(HAVE_SSE2)  += x86/vpx_convolve_copy_sse2.asm | ||||||
|  | endif | ||||||
|  |  | ||||||
|  | ifeq ($(HAVE_NEON_ASM),yes) | ||||||
|  | DSP_SRCS-yes += arm/vpx_convolve_copy_neon_asm$(ASM) | ||||||
|  | DSP_SRCS-yes += arm/vpx_convolve8_avg_neon_asm$(ASM) | ||||||
|  | DSP_SRCS-yes += arm/vpx_convolve8_neon_asm$(ASM) | ||||||
|  | DSP_SRCS-yes += arm/vpx_convolve_avg_neon_asm$(ASM) | ||||||
|  | DSP_SRCS-yes += arm/vpx_convolve_neon.c | ||||||
|  | else | ||||||
|  | ifeq ($(HAVE_NEON),yes) | ||||||
|  | DSP_SRCS-yes += arm/vpx_convolve_copy_neon.c | ||||||
|  | DSP_SRCS-yes += arm/vpx_convolve8_avg_neon.c | ||||||
|  | DSP_SRCS-yes += arm/vpx_convolve8_neon.c | ||||||
|  | DSP_SRCS-yes += arm/vpx_convolve_avg_neon.c | ||||||
|  | DSP_SRCS-yes += arm/vpx_convolve_neon.c | ||||||
|  | endif  # HAVE_NEON | ||||||
|  | endif  # HAVE_NEON_ASM | ||||||
|  |  | ||||||
|  | # common (msa) | ||||||
|  | DSP_SRCS-$(HAVE_MSA) += mips/vpx_convolve8_avg_horiz_msa.c | ||||||
|  | DSP_SRCS-$(HAVE_MSA) += mips/vpx_convolve8_avg_msa.c | ||||||
|  | DSP_SRCS-$(HAVE_MSA) += mips/vpx_convolve8_avg_vert_msa.c | ||||||
|  | DSP_SRCS-$(HAVE_MSA) += mips/vpx_convolve8_horiz_msa.c | ||||||
|  | DSP_SRCS-$(HAVE_MSA) += mips/vpx_convolve8_msa.c | ||||||
|  | DSP_SRCS-$(HAVE_MSA) += mips/vpx_convolve8_vert_msa.c | ||||||
|  | DSP_SRCS-$(HAVE_MSA) += mips/vpx_convolve_avg_msa.c | ||||||
|  | DSP_SRCS-$(HAVE_MSA) += mips/vpx_convolve_copy_msa.c | ||||||
|  | DSP_SRCS-$(HAVE_MSA) += mips/vpx_convolve_msa.h | ||||||
|  |  | ||||||
| # loop filters | # loop filters | ||||||
| DSP_SRCS-yes += loopfilter.c | DSP_SRCS-yes += loopfilter.c | ||||||
|  |  | ||||||
|   | |||||||
| @@ -11,8 +11,6 @@ | |||||||
| #ifndef VPX_DSP_COMMON_H_ | #ifndef VPX_DSP_COMMON_H_ | ||||||
| #define VPX_DSP_COMMON_H_ | #define VPX_DSP_COMMON_H_ | ||||||
|  |  | ||||||
| #include <stdlib.h> |  | ||||||
|  |  | ||||||
| #include "./vpx_config.h" | #include "./vpx_config.h" | ||||||
| #include "vpx/vpx_integer.h" | #include "vpx/vpx_integer.h" | ||||||
| #include "vpx_ports/mem.h" | #include "vpx_ports/mem.h" | ||||||
|   | |||||||
| @@ -34,6 +34,12 @@ if (vpx_config("CONFIG_USE_X86INC") eq "yes") { | |||||||
|   } |   } | ||||||
| } | } | ||||||
|  |  | ||||||
|  | # optimizations which depend on multiple features | ||||||
|  | $avx2_ssse3 = ''; | ||||||
|  | if ((vpx_config("HAVE_AVX2") eq "yes") && (vpx_config("HAVE_SSSE3") eq "yes")) { | ||||||
|  |   $avx2_ssse3 = 'avx2'; | ||||||
|  | } | ||||||
|  |  | ||||||
| # functions that are 64 bit only. | # functions that are 64 bit only. | ||||||
| $mmx_x86_64 = $sse2_x86_64 = $ssse3_x86_64 = $avx_x86_64 = $avx2_x86_64 = ''; | $mmx_x86_64 = $sse2_x86_64 = $ssse3_x86_64 = $avx_x86_64 = $avx2_x86_64 = ''; | ||||||
| if ($opts{arch} eq "x86_64") { | if ($opts{arch} eq "x86_64") { | ||||||
| @@ -365,6 +371,62 @@ if (vpx_config("CONFIG_VP9") eq "yes") { | |||||||
|   }  # CONFIG_VP9_HIGHBITDEPTH |   }  # CONFIG_VP9_HIGHBITDEPTH | ||||||
| }  # CONFIG_VP9 | }  # CONFIG_VP9 | ||||||
|  |  | ||||||
|  | # | ||||||
|  | # Sub Pixel Filters | ||||||
|  | # | ||||||
|  | add_proto qw/void vpx_convolve_copy/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; | ||||||
|  | specialize qw/vpx_convolve_copy neon msa/, "$sse2_x86inc"; | ||||||
|  |  | ||||||
|  | add_proto qw/void vpx_convolve_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; | ||||||
|  | specialize qw/vpx_convolve_avg neon msa/, "$sse2_x86inc"; | ||||||
|  |  | ||||||
|  | add_proto qw/void vpx_convolve8/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; | ||||||
|  | specialize qw/vpx_convolve8 sse2 ssse3 neon msa/, "$avx2_ssse3"; | ||||||
|  |  | ||||||
|  | add_proto qw/void vpx_convolve8_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; | ||||||
|  | specialize qw/vpx_convolve8_horiz sse2 ssse3 neon msa/, "$avx2_ssse3"; | ||||||
|  |  | ||||||
|  | add_proto qw/void vpx_convolve8_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; | ||||||
|  | specialize qw/vpx_convolve8_vert sse2 ssse3 neon msa/, "$avx2_ssse3"; | ||||||
|  |  | ||||||
|  | add_proto qw/void vpx_convolve8_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; | ||||||
|  | specialize qw/vpx_convolve8_avg sse2 ssse3 neon msa/; | ||||||
|  |  | ||||||
|  | add_proto qw/void vpx_convolve8_avg_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; | ||||||
|  | specialize qw/vpx_convolve8_avg_horiz sse2 ssse3 neon msa/; | ||||||
|  |  | ||||||
|  | add_proto qw/void vpx_convolve8_avg_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; | ||||||
|  | specialize qw/vpx_convolve8_avg_vert sse2 ssse3 neon msa/; | ||||||
|  |  | ||||||
|  | if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { | ||||||
|  |   # | ||||||
|  |   # Sub Pixel Filters | ||||||
|  |   # | ||||||
|  |   add_proto qw/void vpx_highbd_convolve_copy/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps"; | ||||||
|  |   specialize qw/vpx_highbd_convolve_copy/; | ||||||
|  |  | ||||||
|  |   add_proto qw/void vpx_highbd_convolve_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps"; | ||||||
|  |   specialize qw/vpx_highbd_convolve_avg/; | ||||||
|  |  | ||||||
|  |   add_proto qw/void vpx_highbd_convolve8/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps"; | ||||||
|  |   specialize qw/vpx_highbd_convolve8/, "$sse2_x86_64"; | ||||||
|  |  | ||||||
|  |   add_proto qw/void vpx_highbd_convolve8_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps"; | ||||||
|  |   specialize qw/vpx_highbd_convolve8_horiz/, "$sse2_x86_64"; | ||||||
|  |  | ||||||
|  |   add_proto qw/void vpx_highbd_convolve8_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps"; | ||||||
|  |   specialize qw/vpx_highbd_convolve8_vert/, "$sse2_x86_64"; | ||||||
|  |  | ||||||
|  |   add_proto qw/void vpx_highbd_convolve8_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps"; | ||||||
|  |   specialize qw/vpx_highbd_convolve8_avg/, "$sse2_x86_64"; | ||||||
|  |  | ||||||
|  |   add_proto qw/void vpx_highbd_convolve8_avg_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps"; | ||||||
|  |   specialize qw/vpx_highbd_convolve8_avg_horiz/, "$sse2_x86_64"; | ||||||
|  |  | ||||||
|  |   add_proto qw/void vpx_highbd_convolve8_avg_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps"; | ||||||
|  |   specialize qw/vpx_highbd_convolve8_avg_vert/, "$sse2_x86_64"; | ||||||
|  | }  # CONFIG_VP9_HIGHBITDEPTH | ||||||
|  |  | ||||||
| # | # | ||||||
| # Loopfilter | # Loopfilter | ||||||
| # | # | ||||||
|   | |||||||
							
								
								
									
										34
									
								
								vpx_dsp/vpx_filter.h
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										34
									
								
								vpx_dsp/vpx_filter.h
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,34 @@ | |||||||
|  | /* | ||||||
|  |  *  Copyright (c) 2015 The WebM project authors. All Rights Reserved. | ||||||
|  |  * | ||||||
|  |  *  Use of this source code is governed by a BSD-style license | ||||||
|  |  *  that can be found in the LICENSE file in the root of the source | ||||||
|  |  *  tree. An additional intellectual property rights grant can be found | ||||||
|  |  *  in the file PATENTS.  All contributing project authors may | ||||||
|  |  *  be found in the AUTHORS file in the root of the source tree. | ||||||
|  |  */ | ||||||
|  |  | ||||||
|  | #ifndef VPX_DSP_VPX_FILTER_H_ | ||||||
|  | #define VPX_DSP_VPX_FILTER_H_ | ||||||
|  |  | ||||||
|  | #include "vpx/vpx_integer.h" | ||||||
|  |  | ||||||
|  |  | ||||||
|  | #ifdef __cplusplus | ||||||
|  | extern "C" { | ||||||
|  | #endif | ||||||
|  |  | ||||||
|  | #define FILTER_BITS 7 | ||||||
|  |  | ||||||
|  | #define SUBPEL_BITS 4 | ||||||
|  | #define SUBPEL_MASK ((1 << SUBPEL_BITS) - 1) | ||||||
|  | #define SUBPEL_SHIFTS (1 << SUBPEL_BITS) | ||||||
|  | #define SUBPEL_TAPS 8 | ||||||
|  |  | ||||||
|  | typedef int16_t InterpKernel[SUBPEL_TAPS]; | ||||||
|  |  | ||||||
|  | #ifdef __cplusplus | ||||||
|  | }  // extern "C" | ||||||
|  | #endif | ||||||
|  |  | ||||||
|  | #endif  // VPX_DSP_VPX_FILTER_H_ | ||||||
| @@ -7,8 +7,8 @@ | |||||||
|  *  in the file PATENTS.  All contributing project authors may |  *  in the file PATENTS.  All contributing project authors may | ||||||
|  *  be found in the AUTHORS file in the root of the source tree. |  *  be found in the AUTHORS file in the root of the source tree. | ||||||
|  */ |  */ | ||||||
| #ifndef VP9_COMMON_X86_CONVOLVE_H_ | #ifndef VPX_DSP_X86_CONVOLVE_H_ | ||||||
| #define VP9_COMMON_X86_CONVOLVE_H_ | #define VPX_DSP_X86_CONVOLVE_H_ | ||||||
| 
 | 
 | ||||||
| #include <assert.h> | #include <assert.h> | ||||||
| 
 | 
 | ||||||
| @@ -26,7 +26,7 @@ typedef void filter8_1dfunction ( | |||||||
| ); | ); | ||||||
| 
 | 
 | ||||||
| #define FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt) \ | #define FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt) \ | ||||||
|   void vp9_convolve8_##name##_##opt(const uint8_t *src, ptrdiff_t src_stride, \ |   void vpx_convolve8_##name##_##opt(const uint8_t *src, ptrdiff_t src_stride, \ | ||||||
|                                     uint8_t *dst, ptrdiff_t dst_stride, \ |                                     uint8_t *dst, ptrdiff_t dst_stride, \ | ||||||
|                                     const int16_t *filter_x, int x_step_q4, \ |                                     const int16_t *filter_x, int x_step_q4, \ | ||||||
|                                     const int16_t *filter_y, int y_step_q4, \ |                                     const int16_t *filter_y, int y_step_q4, \ | ||||||
| @@ -34,7 +34,7 @@ typedef void filter8_1dfunction ( | |||||||
|   if (step_q4 == 16 && filter[3] != 128) { \ |   if (step_q4 == 16 && filter[3] != 128) { \ | ||||||
|     if (filter[0] || filter[1] || filter[2]) { \ |     if (filter[0] || filter[1] || filter[2]) { \ | ||||||
|       while (w >= 16) { \ |       while (w >= 16) { \ | ||||||
|         vp9_filter_block1d16_##dir##8_##avg##opt(src_start, \ |         vpx_filter_block1d16_##dir##8_##avg##opt(src_start, \ | ||||||
|                                                  src_stride, \ |                                                  src_stride, \ | ||||||
|                                                  dst, \ |                                                  dst, \ | ||||||
|                                                  dst_stride, \ |                                                  dst_stride, \ | ||||||
| @@ -45,7 +45,7 @@ typedef void filter8_1dfunction ( | |||||||
|         w -= 16; \ |         w -= 16; \ | ||||||
|       } \ |       } \ | ||||||
|       while (w >= 8) { \ |       while (w >= 8) { \ | ||||||
|         vp9_filter_block1d8_##dir##8_##avg##opt(src_start, \ |         vpx_filter_block1d8_##dir##8_##avg##opt(src_start, \ | ||||||
|                                                 src_stride, \ |                                                 src_stride, \ | ||||||
|                                                 dst, \ |                                                 dst, \ | ||||||
|                                                 dst_stride, \ |                                                 dst_stride, \ | ||||||
| @@ -56,7 +56,7 @@ typedef void filter8_1dfunction ( | |||||||
|         w -= 8; \ |         w -= 8; \ | ||||||
|       } \ |       } \ | ||||||
|       while (w >= 4) { \ |       while (w >= 4) { \ | ||||||
|         vp9_filter_block1d4_##dir##8_##avg##opt(src_start, \ |         vpx_filter_block1d4_##dir##8_##avg##opt(src_start, \ | ||||||
|                                                 src_stride, \ |                                                 src_stride, \ | ||||||
|                                                 dst, \ |                                                 dst, \ | ||||||
|                                                 dst_stride, \ |                                                 dst_stride, \ | ||||||
| @@ -68,7 +68,7 @@ typedef void filter8_1dfunction ( | |||||||
|       } \ |       } \ | ||||||
|     } else { \ |     } else { \ | ||||||
|       while (w >= 16) { \ |       while (w >= 16) { \ | ||||||
|         vp9_filter_block1d16_##dir##2_##avg##opt(src, \ |         vpx_filter_block1d16_##dir##2_##avg##opt(src, \ | ||||||
|                                                  src_stride, \ |                                                  src_stride, \ | ||||||
|                                                  dst, \ |                                                  dst, \ | ||||||
|                                                  dst_stride, \ |                                                  dst_stride, \ | ||||||
| @@ -79,7 +79,7 @@ typedef void filter8_1dfunction ( | |||||||
|         w -= 16; \ |         w -= 16; \ | ||||||
|       } \ |       } \ | ||||||
|       while (w >= 8) { \ |       while (w >= 8) { \ | ||||||
|         vp9_filter_block1d8_##dir##2_##avg##opt(src, \ |         vpx_filter_block1d8_##dir##2_##avg##opt(src, \ | ||||||
|                                                 src_stride, \ |                                                 src_stride, \ | ||||||
|                                                 dst, \ |                                                 dst, \ | ||||||
|                                                 dst_stride, \ |                                                 dst_stride, \ | ||||||
| @@ -90,7 +90,7 @@ typedef void filter8_1dfunction ( | |||||||
|         w -= 8; \ |         w -= 8; \ | ||||||
|       } \ |       } \ | ||||||
|       while (w >= 4) { \ |       while (w >= 4) { \ | ||||||
|         vp9_filter_block1d4_##dir##2_##avg##opt(src, \ |         vpx_filter_block1d4_##dir##2_##avg##opt(src, \ | ||||||
|                                                 src_stride, \ |                                                 src_stride, \ | ||||||
|                                                 dst, \ |                                                 dst, \ | ||||||
|                                                 dst_stride, \ |                                                 dst_stride, \ | ||||||
| @@ -103,14 +103,14 @@ typedef void filter8_1dfunction ( | |||||||
|     } \ |     } \ | ||||||
|   } \ |   } \ | ||||||
|   if (w) { \ |   if (w) { \ | ||||||
|     vp9_convolve8_##name##_c(src, src_stride, dst, dst_stride, \ |     vpx_convolve8_##name##_c(src, src_stride, dst, dst_stride, \ | ||||||
|                              filter_x, x_step_q4, filter_y, y_step_q4, \ |                              filter_x, x_step_q4, filter_y, y_step_q4, \ | ||||||
|                              w, h); \ |                              w, h); \ | ||||||
|   } \ |   } \ | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| #define FUN_CONV_2D(avg, opt) \ | #define FUN_CONV_2D(avg, opt) \ | ||||||
| void vp9_convolve8_##avg##opt(const uint8_t *src, ptrdiff_t src_stride, \ | void vpx_convolve8_##avg##opt(const uint8_t *src, ptrdiff_t src_stride, \ | ||||||
|                               uint8_t *dst, ptrdiff_t dst_stride, \ |                               uint8_t *dst, ptrdiff_t dst_stride, \ | ||||||
|                               const int16_t *filter_x, int x_step_q4, \ |                               const int16_t *filter_x, int x_step_q4, \ | ||||||
|                               const int16_t *filter_y, int y_step_q4, \ |                               const int16_t *filter_y, int y_step_q4, \ | ||||||
| @@ -121,23 +121,23 @@ void vp9_convolve8_##avg##opt(const uint8_t *src, ptrdiff_t src_stride, \ | |||||||
|     if (filter_x[0] || filter_x[1] || filter_x[2] || filter_x[3] == 128 || \ |     if (filter_x[0] || filter_x[1] || filter_x[2] || filter_x[3] == 128 || \ | ||||||
|         filter_y[0] || filter_y[1] || filter_y[2] || filter_y[3] == 128) { \ |         filter_y[0] || filter_y[1] || filter_y[2] || filter_y[3] == 128) { \ | ||||||
|       DECLARE_ALIGNED(16, uint8_t, fdata2[64 * 71]); \ |       DECLARE_ALIGNED(16, uint8_t, fdata2[64 * 71]); \ | ||||||
|       vp9_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, fdata2, 64, \ |       vpx_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, fdata2, 64, \ | ||||||
|                                 filter_x, x_step_q4, filter_y, y_step_q4, \ |                                 filter_x, x_step_q4, filter_y, y_step_q4, \ | ||||||
|                                 w, h + 7); \ |                                 w, h + 7); \ | ||||||
|       vp9_convolve8_##avg##vert_##opt(fdata2 + 3 * 64, 64, dst, dst_stride, \ |       vpx_convolve8_##avg##vert_##opt(fdata2 + 3 * 64, 64, dst, dst_stride, \ | ||||||
|                                       filter_x, x_step_q4, filter_y, \ |                                       filter_x, x_step_q4, filter_y, \ | ||||||
|                                       y_step_q4, w, h); \ |                                       y_step_q4, w, h); \ | ||||||
|     } else { \ |     } else { \ | ||||||
|       DECLARE_ALIGNED(16, uint8_t, fdata2[64 * 65]); \ |       DECLARE_ALIGNED(16, uint8_t, fdata2[64 * 65]); \ | ||||||
|       vp9_convolve8_horiz_##opt(src, src_stride, fdata2, 64, \ |       vpx_convolve8_horiz_##opt(src, src_stride, fdata2, 64, \ | ||||||
|                                 filter_x, x_step_q4, filter_y, y_step_q4, \ |                                 filter_x, x_step_q4, filter_y, y_step_q4, \ | ||||||
|                                 w, h + 1); \ |                                 w, h + 1); \ | ||||||
|       vp9_convolve8_##avg##vert_##opt(fdata2, 64, dst, dst_stride, \ |       vpx_convolve8_##avg##vert_##opt(fdata2, 64, dst, dst_stride, \ | ||||||
|                                       filter_x, x_step_q4, filter_y, \ |                                       filter_x, x_step_q4, filter_y, \ | ||||||
|                                       y_step_q4, w, h); \ |                                       y_step_q4, w, h); \ | ||||||
|     } \ |     } \ | ||||||
|   } else { \ |   } else { \ | ||||||
|     vp9_convolve8_##avg##c(src, src_stride, dst, dst_stride, \ |     vpx_convolve8_##avg##c(src, src_stride, dst, dst_stride, \ | ||||||
|                            filter_x, x_step_q4, filter_y, y_step_q4, w, h); \ |                            filter_x, x_step_q4, filter_y, y_step_q4, w, h); \ | ||||||
|   } \ |   } \ | ||||||
| } | } | ||||||
| @@ -155,7 +155,7 @@ typedef void highbd_filter8_1dfunction ( | |||||||
| ); | ); | ||||||
| 
 | 
 | ||||||
| #define HIGH_FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt) \ | #define HIGH_FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt) \ | ||||||
|   void vp9_highbd_convolve8_##name##_##opt(const uint8_t *src8, \ |   void vpx_highbd_convolve8_##name##_##opt(const uint8_t *src8, \ | ||||||
|                                            ptrdiff_t src_stride, \ |                                            ptrdiff_t src_stride, \ | ||||||
|                                            uint8_t *dst8, \ |                                            uint8_t *dst8, \ | ||||||
|                                            ptrdiff_t dst_stride, \ |                                            ptrdiff_t dst_stride, \ | ||||||
| @@ -169,7 +169,7 @@ typedef void highbd_filter8_1dfunction ( | |||||||
|     uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ |     uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ | ||||||
|     if (filter[0] || filter[1] || filter[2]) { \ |     if (filter[0] || filter[1] || filter[2]) { \ | ||||||
|       while (w >= 16) { \ |       while (w >= 16) { \ | ||||||
|         vp9_highbd_filter_block1d16_##dir##8_##avg##opt(src_start, \ |         vpx_highbd_filter_block1d16_##dir##8_##avg##opt(src_start, \ | ||||||
|                                                         src_stride, \ |                                                         src_stride, \ | ||||||
|                                                         dst, \ |                                                         dst, \ | ||||||
|                                                         dst_stride, \ |                                                         dst_stride, \ | ||||||
| @@ -181,7 +181,7 @@ typedef void highbd_filter8_1dfunction ( | |||||||
|         w -= 16; \ |         w -= 16; \ | ||||||
|       } \ |       } \ | ||||||
|       while (w >= 8) { \ |       while (w >= 8) { \ | ||||||
|         vp9_highbd_filter_block1d8_##dir##8_##avg##opt(src_start, \ |         vpx_highbd_filter_block1d8_##dir##8_##avg##opt(src_start, \ | ||||||
|                                                        src_stride, \ |                                                        src_stride, \ | ||||||
|                                                        dst, \ |                                                        dst, \ | ||||||
|                                                        dst_stride, \ |                                                        dst_stride, \ | ||||||
| @@ -193,7 +193,7 @@ typedef void highbd_filter8_1dfunction ( | |||||||
|         w -= 8; \ |         w -= 8; \ | ||||||
|       } \ |       } \ | ||||||
|       while (w >= 4) { \ |       while (w >= 4) { \ | ||||||
|         vp9_highbd_filter_block1d4_##dir##8_##avg##opt(src_start, \ |         vpx_highbd_filter_block1d4_##dir##8_##avg##opt(src_start, \ | ||||||
|                                                        src_stride, \ |                                                        src_stride, \ | ||||||
|                                                        dst, \ |                                                        dst, \ | ||||||
|                                                        dst_stride, \ |                                                        dst_stride, \ | ||||||
| @@ -206,7 +206,7 @@ typedef void highbd_filter8_1dfunction ( | |||||||
|       } \ |       } \ | ||||||
|     } else { \ |     } else { \ | ||||||
|       while (w >= 16) { \ |       while (w >= 16) { \ | ||||||
|         vp9_highbd_filter_block1d16_##dir##2_##avg##opt(src, \ |         vpx_highbd_filter_block1d16_##dir##2_##avg##opt(src, \ | ||||||
|                                                         src_stride, \ |                                                         src_stride, \ | ||||||
|                                                         dst, \ |                                                         dst, \ | ||||||
|                                                         dst_stride, \ |                                                         dst_stride, \ | ||||||
| @@ -218,7 +218,7 @@ typedef void highbd_filter8_1dfunction ( | |||||||
|         w -= 16; \ |         w -= 16; \ | ||||||
|       } \ |       } \ | ||||||
|       while (w >= 8) { \ |       while (w >= 8) { \ | ||||||
|         vp9_highbd_filter_block1d8_##dir##2_##avg##opt(src, \ |         vpx_highbd_filter_block1d8_##dir##2_##avg##opt(src, \ | ||||||
|                                                        src_stride, \ |                                                        src_stride, \ | ||||||
|                                                        dst, \ |                                                        dst, \ | ||||||
|                                                        dst_stride, \ |                                                        dst_stride, \ | ||||||
| @@ -230,7 +230,7 @@ typedef void highbd_filter8_1dfunction ( | |||||||
|         w -= 8; \ |         w -= 8; \ | ||||||
|       } \ |       } \ | ||||||
|       while (w >= 4) { \ |       while (w >= 4) { \ | ||||||
|         vp9_highbd_filter_block1d4_##dir##2_##avg##opt(src, \ |         vpx_highbd_filter_block1d4_##dir##2_##avg##opt(src, \ | ||||||
|                                                        src_stride, \ |                                                        src_stride, \ | ||||||
|                                                        dst, \ |                                                        dst, \ | ||||||
|                                                        dst_stride, \ |                                                        dst_stride, \ | ||||||
| @@ -244,14 +244,14 @@ typedef void highbd_filter8_1dfunction ( | |||||||
|     } \ |     } \ | ||||||
|   } \ |   } \ | ||||||
|   if (w) { \ |   if (w) { \ | ||||||
|     vp9_highbd_convolve8_##name##_c(src8, src_stride, dst8, dst_stride, \ |     vpx_highbd_convolve8_##name##_c(src8, src_stride, dst8, dst_stride, \ | ||||||
|                                     filter_x, x_step_q4, filter_y, y_step_q4, \ |                                     filter_x, x_step_q4, filter_y, y_step_q4, \ | ||||||
|                                     w, h, bd); \ |                                     w, h, bd); \ | ||||||
|   } \ |   } \ | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| #define HIGH_FUN_CONV_2D(avg, opt) \ | #define HIGH_FUN_CONV_2D(avg, opt) \ | ||||||
| void vp9_highbd_convolve8_##avg##opt(const uint8_t *src, ptrdiff_t src_stride, \ | void vpx_highbd_convolve8_##avg##opt(const uint8_t *src, ptrdiff_t src_stride, \ | ||||||
|                                      uint8_t *dst, ptrdiff_t dst_stride, \ |                                      uint8_t *dst, ptrdiff_t dst_stride, \ | ||||||
|                                      const int16_t *filter_x, int x_step_q4, \ |                                      const int16_t *filter_x, int x_step_q4, \ | ||||||
|                                      const int16_t *filter_y, int y_step_q4, \ |                                      const int16_t *filter_y, int y_step_q4, \ | ||||||
| @@ -262,35 +262,35 @@ void vp9_highbd_convolve8_##avg##opt(const uint8_t *src, ptrdiff_t src_stride, \ | |||||||
|     if (filter_x[0] || filter_x[1] || filter_x[2] || filter_x[3] == 128 || \ |     if (filter_x[0] || filter_x[1] || filter_x[2] || filter_x[3] == 128 || \ | ||||||
|         filter_y[0] || filter_y[1] || filter_y[2] || filter_y[3] == 128) { \ |         filter_y[0] || filter_y[1] || filter_y[2] || filter_y[3] == 128) { \ | ||||||
|       DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 71]); \ |       DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 71]); \ | ||||||
|       vp9_highbd_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, \ |       vpx_highbd_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, \ | ||||||
|                                        CONVERT_TO_BYTEPTR(fdata2), 64, \ |                                        CONVERT_TO_BYTEPTR(fdata2), 64, \ | ||||||
|                                        filter_x, x_step_q4, \ |                                        filter_x, x_step_q4, \ | ||||||
|                                        filter_y, y_step_q4, \ |                                        filter_y, y_step_q4, \ | ||||||
|                                        w, h + 7, bd); \ |                                        w, h + 7, bd); \ | ||||||
|       vp9_highbd_convolve8_##avg##vert_##opt(CONVERT_TO_BYTEPTR(fdata2) + 192, \ |       vpx_highbd_convolve8_##avg##vert_##opt(CONVERT_TO_BYTEPTR(fdata2) + 192, \ | ||||||
|                                              64, dst, dst_stride, \ |                                              64, dst, dst_stride, \ | ||||||
|                                              filter_x, x_step_q4, \ |                                              filter_x, x_step_q4, \ | ||||||
|                                              filter_y, y_step_q4, \ |                                              filter_y, y_step_q4, \ | ||||||
|                                              w, h, bd); \ |                                              w, h, bd); \ | ||||||
|     } else { \ |     } else { \ | ||||||
|       DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 65]); \ |       DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 65]); \ | ||||||
|       vp9_highbd_convolve8_horiz_##opt(src, src_stride, \ |       vpx_highbd_convolve8_horiz_##opt(src, src_stride, \ | ||||||
|                                        CONVERT_TO_BYTEPTR(fdata2), 64, \ |                                        CONVERT_TO_BYTEPTR(fdata2), 64, \ | ||||||
|                                        filter_x, x_step_q4, \ |                                        filter_x, x_step_q4, \ | ||||||
|                                        filter_y, y_step_q4, \ |                                        filter_y, y_step_q4, \ | ||||||
|                                        w, h + 1, bd); \ |                                        w, h + 1, bd); \ | ||||||
|       vp9_highbd_convolve8_##avg##vert_##opt(CONVERT_TO_BYTEPTR(fdata2), 64, \ |       vpx_highbd_convolve8_##avg##vert_##opt(CONVERT_TO_BYTEPTR(fdata2), 64, \ | ||||||
|                                              dst, dst_stride, \ |                                              dst, dst_stride, \ | ||||||
|                                              filter_x, x_step_q4, \ |                                              filter_x, x_step_q4, \ | ||||||
|                                              filter_y, y_step_q4, \ |                                              filter_y, y_step_q4, \ | ||||||
|                                              w, h, bd); \ |                                              w, h, bd); \ | ||||||
|     } \ |     } \ | ||||||
|   } else { \ |   } else { \ | ||||||
|     vp9_highbd_convolve8_##avg##c(src, src_stride, dst, dst_stride, \ |     vpx_highbd_convolve8_##avg##c(src, src_stride, dst, dst_stride, \ | ||||||
|                                   filter_x, x_step_q4, filter_y, y_step_q4, w, \ |                                   filter_x, x_step_q4, filter_y, y_step_q4, w, \ | ||||||
|                                   h, bd); \ |                                   h, bd); \ | ||||||
|   } \ |   } \ | ||||||
| } | } | ||||||
| #endif  // CONFIG_VP9_HIGHBITDEPTH
 | #endif  // CONFIG_VP9_HIGHBITDEPTH
 | ||||||
| 
 | 
 | ||||||
| #endif  // VP9_COMMON_X86_CONVOLVE_H_
 | #endif  // VPX_DSP_X86_CONVOLVE_H_
 | ||||||
| @@ -8,53 +8,53 @@ | |||||||
|  *  be found in the AUTHORS file in the root of the source tree. |  *  be found in the AUTHORS file in the root of the source tree. | ||||||
|  */ |  */ | ||||||
| 
 | 
 | ||||||
| #include "./vp9_rtcd.h" |  | ||||||
| #include "./vpx_config.h" | #include "./vpx_config.h" | ||||||
| #include "vp9/common/x86/convolve.h" | #include "./vpx_dsp_rtcd.h" | ||||||
|  | #include "vpx_dsp/x86/convolve.h" | ||||||
| 
 | 
 | ||||||
| #if HAVE_SSE2 | #if HAVE_SSE2 | ||||||
| filter8_1dfunction vp9_filter_block1d16_v8_sse2; | filter8_1dfunction vpx_filter_block1d16_v8_sse2; | ||||||
| filter8_1dfunction vp9_filter_block1d16_h8_sse2; | filter8_1dfunction vpx_filter_block1d16_h8_sse2; | ||||||
| filter8_1dfunction vp9_filter_block1d8_v8_sse2; | filter8_1dfunction vpx_filter_block1d8_v8_sse2; | ||||||
| filter8_1dfunction vp9_filter_block1d8_h8_sse2; | filter8_1dfunction vpx_filter_block1d8_h8_sse2; | ||||||
| filter8_1dfunction vp9_filter_block1d4_v8_sse2; | filter8_1dfunction vpx_filter_block1d4_v8_sse2; | ||||||
| filter8_1dfunction vp9_filter_block1d4_h8_sse2; | filter8_1dfunction vpx_filter_block1d4_h8_sse2; | ||||||
| filter8_1dfunction vp9_filter_block1d16_v8_avg_sse2; | filter8_1dfunction vpx_filter_block1d16_v8_avg_sse2; | ||||||
| filter8_1dfunction vp9_filter_block1d16_h8_avg_sse2; | filter8_1dfunction vpx_filter_block1d16_h8_avg_sse2; | ||||||
| filter8_1dfunction vp9_filter_block1d8_v8_avg_sse2; | filter8_1dfunction vpx_filter_block1d8_v8_avg_sse2; | ||||||
| filter8_1dfunction vp9_filter_block1d8_h8_avg_sse2; | filter8_1dfunction vpx_filter_block1d8_h8_avg_sse2; | ||||||
| filter8_1dfunction vp9_filter_block1d4_v8_avg_sse2; | filter8_1dfunction vpx_filter_block1d4_v8_avg_sse2; | ||||||
| filter8_1dfunction vp9_filter_block1d4_h8_avg_sse2; | filter8_1dfunction vpx_filter_block1d4_h8_avg_sse2; | ||||||
| 
 | 
 | ||||||
| filter8_1dfunction vp9_filter_block1d16_v2_sse2; | filter8_1dfunction vpx_filter_block1d16_v2_sse2; | ||||||
| filter8_1dfunction vp9_filter_block1d16_h2_sse2; | filter8_1dfunction vpx_filter_block1d16_h2_sse2; | ||||||
| filter8_1dfunction vp9_filter_block1d8_v2_sse2; | filter8_1dfunction vpx_filter_block1d8_v2_sse2; | ||||||
| filter8_1dfunction vp9_filter_block1d8_h2_sse2; | filter8_1dfunction vpx_filter_block1d8_h2_sse2; | ||||||
| filter8_1dfunction vp9_filter_block1d4_v2_sse2; | filter8_1dfunction vpx_filter_block1d4_v2_sse2; | ||||||
| filter8_1dfunction vp9_filter_block1d4_h2_sse2; | filter8_1dfunction vpx_filter_block1d4_h2_sse2; | ||||||
| filter8_1dfunction vp9_filter_block1d16_v2_avg_sse2; | filter8_1dfunction vpx_filter_block1d16_v2_avg_sse2; | ||||||
| filter8_1dfunction vp9_filter_block1d16_h2_avg_sse2; | filter8_1dfunction vpx_filter_block1d16_h2_avg_sse2; | ||||||
| filter8_1dfunction vp9_filter_block1d8_v2_avg_sse2; | filter8_1dfunction vpx_filter_block1d8_v2_avg_sse2; | ||||||
| filter8_1dfunction vp9_filter_block1d8_h2_avg_sse2; | filter8_1dfunction vpx_filter_block1d8_h2_avg_sse2; | ||||||
| filter8_1dfunction vp9_filter_block1d4_v2_avg_sse2; | filter8_1dfunction vpx_filter_block1d4_v2_avg_sse2; | ||||||
| filter8_1dfunction vp9_filter_block1d4_h2_avg_sse2; | filter8_1dfunction vpx_filter_block1d4_h2_avg_sse2; | ||||||
| 
 | 
 | ||||||
| // void vp9_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride,
 | // void vpx_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride,
 | ||||||
| //                               uint8_t *dst, ptrdiff_t dst_stride,
 | //                               uint8_t *dst, ptrdiff_t dst_stride,
 | ||||||
| //                               const int16_t *filter_x, int x_step_q4,
 | //                               const int16_t *filter_x, int x_step_q4,
 | ||||||
| //                               const int16_t *filter_y, int y_step_q4,
 | //                               const int16_t *filter_y, int y_step_q4,
 | ||||||
| //                               int w, int h);
 | //                               int w, int h);
 | ||||||
| // void vp9_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride,
 | // void vpx_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride,
 | ||||||
| //                              uint8_t *dst, ptrdiff_t dst_stride,
 | //                              uint8_t *dst, ptrdiff_t dst_stride,
 | ||||||
| //                              const int16_t *filter_x, int x_step_q4,
 | //                              const int16_t *filter_x, int x_step_q4,
 | ||||||
| //                              const int16_t *filter_y, int y_step_q4,
 | //                              const int16_t *filter_y, int y_step_q4,
 | ||||||
| //                              int w, int h);
 | //                              int w, int h);
 | ||||||
| // void vp9_convolve8_avg_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride,
 | // void vpx_convolve8_avg_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride,
 | ||||||
| //                                   uint8_t *dst, ptrdiff_t dst_stride,
 | //                                   uint8_t *dst, ptrdiff_t dst_stride,
 | ||||||
| //                                   const int16_t *filter_x, int x_step_q4,
 | //                                   const int16_t *filter_x, int x_step_q4,
 | ||||||
| //                                   const int16_t *filter_y, int y_step_q4,
 | //                                   const int16_t *filter_y, int y_step_q4,
 | ||||||
| //                                   int w, int h);
 | //                                   int w, int h);
 | ||||||
| // void vp9_convolve8_avg_vert_sse2(const uint8_t *src, ptrdiff_t src_stride,
 | // void vpx_convolve8_avg_vert_sse2(const uint8_t *src, ptrdiff_t src_stride,
 | ||||||
| //                                  uint8_t *dst, ptrdiff_t dst_stride,
 | //                                  uint8_t *dst, ptrdiff_t dst_stride,
 | ||||||
| //                                  const int16_t *filter_x, int x_step_q4,
 | //                                  const int16_t *filter_x, int x_step_q4,
 | ||||||
| //                                  const int16_t *filter_y, int y_step_q4,
 | //                                  const int16_t *filter_y, int y_step_q4,
 | ||||||
| @@ -64,12 +64,12 @@ FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , sse2); | |||||||
| FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, sse2); | FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, sse2); | ||||||
| FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_, sse2); | FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_, sse2); | ||||||
| 
 | 
 | ||||||
| // void vp9_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride,
 | // void vpx_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride,
 | ||||||
| //                         uint8_t *dst, ptrdiff_t dst_stride,
 | //                         uint8_t *dst, ptrdiff_t dst_stride,
 | ||||||
| //                         const int16_t *filter_x, int x_step_q4,
 | //                         const int16_t *filter_x, int x_step_q4,
 | ||||||
| //                         const int16_t *filter_y, int y_step_q4,
 | //                         const int16_t *filter_y, int y_step_q4,
 | ||||||
| //                         int w, int h);
 | //                         int w, int h);
 | ||||||
| // void vp9_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride,
 | // void vpx_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride,
 | ||||||
| //                             uint8_t *dst, ptrdiff_t dst_stride,
 | //                             uint8_t *dst, ptrdiff_t dst_stride,
 | ||||||
| //                             const int16_t *filter_x, int x_step_q4,
 | //                             const int16_t *filter_x, int x_step_q4,
 | ||||||
| //                             const int16_t *filter_y, int y_step_q4,
 | //                             const int16_t *filter_y, int y_step_q4,
 | ||||||
| @@ -78,33 +78,33 @@ FUN_CONV_2D(, sse2); | |||||||
| FUN_CONV_2D(avg_ , sse2); | FUN_CONV_2D(avg_ , sse2); | ||||||
| 
 | 
 | ||||||
| #if CONFIG_VP9_HIGHBITDEPTH && ARCH_X86_64 | #if CONFIG_VP9_HIGHBITDEPTH && ARCH_X86_64 | ||||||
| highbd_filter8_1dfunction vp9_highbd_filter_block1d16_v8_sse2; | highbd_filter8_1dfunction vpx_highbd_filter_block1d16_v8_sse2; | ||||||
| highbd_filter8_1dfunction vp9_highbd_filter_block1d16_h8_sse2; | highbd_filter8_1dfunction vpx_highbd_filter_block1d16_h8_sse2; | ||||||
| highbd_filter8_1dfunction vp9_highbd_filter_block1d8_v8_sse2; | highbd_filter8_1dfunction vpx_highbd_filter_block1d8_v8_sse2; | ||||||
| highbd_filter8_1dfunction vp9_highbd_filter_block1d8_h8_sse2; | highbd_filter8_1dfunction vpx_highbd_filter_block1d8_h8_sse2; | ||||||
| highbd_filter8_1dfunction vp9_highbd_filter_block1d4_v8_sse2; | highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v8_sse2; | ||||||
| highbd_filter8_1dfunction vp9_highbd_filter_block1d4_h8_sse2; | highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h8_sse2; | ||||||
| highbd_filter8_1dfunction vp9_highbd_filter_block1d16_v8_avg_sse2; | highbd_filter8_1dfunction vpx_highbd_filter_block1d16_v8_avg_sse2; | ||||||
| highbd_filter8_1dfunction vp9_highbd_filter_block1d16_h8_avg_sse2; | highbd_filter8_1dfunction vpx_highbd_filter_block1d16_h8_avg_sse2; | ||||||
| highbd_filter8_1dfunction vp9_highbd_filter_block1d8_v8_avg_sse2; | highbd_filter8_1dfunction vpx_highbd_filter_block1d8_v8_avg_sse2; | ||||||
| highbd_filter8_1dfunction vp9_highbd_filter_block1d8_h8_avg_sse2; | highbd_filter8_1dfunction vpx_highbd_filter_block1d8_h8_avg_sse2; | ||||||
| highbd_filter8_1dfunction vp9_highbd_filter_block1d4_v8_avg_sse2; | highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v8_avg_sse2; | ||||||
| highbd_filter8_1dfunction vp9_highbd_filter_block1d4_h8_avg_sse2; | highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h8_avg_sse2; | ||||||
| 
 | 
 | ||||||
| highbd_filter8_1dfunction vp9_highbd_filter_block1d16_v2_sse2; | highbd_filter8_1dfunction vpx_highbd_filter_block1d16_v2_sse2; | ||||||
| highbd_filter8_1dfunction vp9_highbd_filter_block1d16_h2_sse2; | highbd_filter8_1dfunction vpx_highbd_filter_block1d16_h2_sse2; | ||||||
| highbd_filter8_1dfunction vp9_highbd_filter_block1d8_v2_sse2; | highbd_filter8_1dfunction vpx_highbd_filter_block1d8_v2_sse2; | ||||||
| highbd_filter8_1dfunction vp9_highbd_filter_block1d8_h2_sse2; | highbd_filter8_1dfunction vpx_highbd_filter_block1d8_h2_sse2; | ||||||
| highbd_filter8_1dfunction vp9_highbd_filter_block1d4_v2_sse2; | highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v2_sse2; | ||||||
| highbd_filter8_1dfunction vp9_highbd_filter_block1d4_h2_sse2; | highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h2_sse2; | ||||||
| highbd_filter8_1dfunction vp9_highbd_filter_block1d16_v2_avg_sse2; | highbd_filter8_1dfunction vpx_highbd_filter_block1d16_v2_avg_sse2; | ||||||
| highbd_filter8_1dfunction vp9_highbd_filter_block1d16_h2_avg_sse2; | highbd_filter8_1dfunction vpx_highbd_filter_block1d16_h2_avg_sse2; | ||||||
| highbd_filter8_1dfunction vp9_highbd_filter_block1d8_v2_avg_sse2; | highbd_filter8_1dfunction vpx_highbd_filter_block1d8_v2_avg_sse2; | ||||||
| highbd_filter8_1dfunction vp9_highbd_filter_block1d8_h2_avg_sse2; | highbd_filter8_1dfunction vpx_highbd_filter_block1d8_h2_avg_sse2; | ||||||
| highbd_filter8_1dfunction vp9_highbd_filter_block1d4_v2_avg_sse2; | highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v2_avg_sse2; | ||||||
| highbd_filter8_1dfunction vp9_highbd_filter_block1d4_h2_avg_sse2; | highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h2_avg_sse2; | ||||||
| 
 | 
 | ||||||
| // void vp9_highbd_convolve8_horiz_sse2(const uint8_t *src,
 | // void vpx_highbd_convolve8_horiz_sse2(const uint8_t *src,
 | ||||||
| //                                      ptrdiff_t src_stride,
 | //                                      ptrdiff_t src_stride,
 | ||||||
| //                                      uint8_t *dst,
 | //                                      uint8_t *dst,
 | ||||||
| //                                      ptrdiff_t dst_stride,
 | //                                      ptrdiff_t dst_stride,
 | ||||||
| @@ -113,7 +113,7 @@ highbd_filter8_1dfunction vp9_highbd_filter_block1d4_h2_avg_sse2; | |||||||
| //                                      const int16_t *filter_y,
 | //                                      const int16_t *filter_y,
 | ||||||
| //                                      int y_step_q4,
 | //                                      int y_step_q4,
 | ||||||
| //                                      int w, int h, int bd);
 | //                                      int w, int h, int bd);
 | ||||||
| // void vp9_highbd_convolve8_vert_sse2(const uint8_t *src,
 | // void vpx_highbd_convolve8_vert_sse2(const uint8_t *src,
 | ||||||
| //                                     ptrdiff_t src_stride,
 | //                                     ptrdiff_t src_stride,
 | ||||||
| //                                     uint8_t *dst,
 | //                                     uint8_t *dst,
 | ||||||
| //                                     ptrdiff_t dst_stride,
 | //                                     ptrdiff_t dst_stride,
 | ||||||
| @@ -122,7 +122,7 @@ highbd_filter8_1dfunction vp9_highbd_filter_block1d4_h2_avg_sse2; | |||||||
| //                                     const int16_t *filter_y,
 | //                                     const int16_t *filter_y,
 | ||||||
| //                                     int y_step_q4,
 | //                                     int y_step_q4,
 | ||||||
| //                                     int w, int h, int bd);
 | //                                     int w, int h, int bd);
 | ||||||
| // void vp9_highbd_convolve8_avg_horiz_sse2(const uint8_t *src,
 | // void vpx_highbd_convolve8_avg_horiz_sse2(const uint8_t *src,
 | ||||||
| //                                          ptrdiff_t src_stride,
 | //                                          ptrdiff_t src_stride,
 | ||||||
| //                                          uint8_t *dst,
 | //                                          uint8_t *dst,
 | ||||||
| //                                          ptrdiff_t dst_stride,
 | //                                          ptrdiff_t dst_stride,
 | ||||||
| @@ -131,7 +131,7 @@ highbd_filter8_1dfunction vp9_highbd_filter_block1d4_h2_avg_sse2; | |||||||
| //                                          const int16_t *filter_y,
 | //                                          const int16_t *filter_y,
 | ||||||
| //                                          int y_step_q4,
 | //                                          int y_step_q4,
 | ||||||
| //                                          int w, int h, int bd);
 | //                                          int w, int h, int bd);
 | ||||||
| // void vp9_highbd_convolve8_avg_vert_sse2(const uint8_t *src,
 | // void vpx_highbd_convolve8_avg_vert_sse2(const uint8_t *src,
 | ||||||
| //                                         ptrdiff_t src_stride,
 | //                                         ptrdiff_t src_stride,
 | ||||||
| //                                         uint8_t *dst,
 | //                                         uint8_t *dst,
 | ||||||
| //                                         ptrdiff_t dst_stride,
 | //                                         ptrdiff_t dst_stride,
 | ||||||
| @@ -146,12 +146,12 @@ HIGH_FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, sse2); | |||||||
| HIGH_FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_, | HIGH_FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_, | ||||||
|                  sse2); |                  sse2); | ||||||
| 
 | 
 | ||||||
| // void vp9_highbd_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride,
 | // void vpx_highbd_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride,
 | ||||||
| //                                uint8_t *dst, ptrdiff_t dst_stride,
 | //                                uint8_t *dst, ptrdiff_t dst_stride,
 | ||||||
| //                                const int16_t *filter_x, int x_step_q4,
 | //                                const int16_t *filter_x, int x_step_q4,
 | ||||||
| //                                const int16_t *filter_y, int y_step_q4,
 | //                                const int16_t *filter_y, int y_step_q4,
 | ||||||
| //                                int w, int h, int bd);
 | //                                int w, int h, int bd);
 | ||||||
| // void vp9_highbd_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride,
 | // void vpx_highbd_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride,
 | ||||||
| //                                    uint8_t *dst, ptrdiff_t dst_stride,
 | //                                    uint8_t *dst, ptrdiff_t dst_stride,
 | ||||||
| //                                    const int16_t *filter_x, int x_step_q4,
 | //                                    const int16_t *filter_x, int x_step_q4,
 | ||||||
| //                                    const int16_t *filter_y, int y_step_q4,
 | //                                    const int16_t *filter_y, int y_step_q4,
 | ||||||
| @@ -8,6 +8,8 @@ | |||||||
| ;  be found in the AUTHORS file in the root of the source tree. | ;  be found in the AUTHORS file in the root of the source tree. | ||||||
| ; | ; | ||||||
| 
 | 
 | ||||||
|  | %define program_name vpx | ||||||
|  | 
 | ||||||
| %include "third_party/x86inc/x86inc.asm" | %include "third_party/x86inc/x86inc.asm" | ||||||
| 
 | 
 | ||||||
| SECTION .text | SECTION .text | ||||||
| @@ -197,7 +197,7 @@ | |||||||
|     movdqu      [rdi + %2], xmm0 |     movdqu      [rdi + %2], xmm0 | ||||||
| %endm | %endm | ||||||
| 
 | 
 | ||||||
| ;void vp9_filter_block1d4_v8_sse2 | ;void vpx_filter_block1d4_v8_sse2 | ||||||
| ;( | ;( | ||||||
| ;    unsigned char *src_ptr, | ;    unsigned char *src_ptr, | ||||||
| ;    unsigned int   src_pitch, | ;    unsigned int   src_pitch, | ||||||
| @@ -206,8 +206,8 @@ | |||||||
| ;    unsigned int   output_height, | ;    unsigned int   output_height, | ||||||
| ;    short *filter | ;    short *filter | ||||||
| ;) | ;) | ||||||
| global sym(vp9_highbd_filter_block1d4_v8_sse2) PRIVATE | global sym(vpx_highbd_filter_block1d4_v8_sse2) PRIVATE | ||||||
| sym(vp9_highbd_filter_block1d4_v8_sse2): | sym(vpx_highbd_filter_block1d4_v8_sse2): | ||||||
|     push        rbp |     push        rbp | ||||||
|     mov         rbp, rsp |     mov         rbp, rsp | ||||||
|     SHADOW_ARGS_TO_STACK 7 |     SHADOW_ARGS_TO_STACK 7 | ||||||
| @@ -267,7 +267,7 @@ sym(vp9_highbd_filter_block1d4_v8_sse2): | |||||||
|     pop         rbp |     pop         rbp | ||||||
|     ret |     ret | ||||||
| 
 | 
 | ||||||
| ;void vp9_filter_block1d8_v8_sse2 | ;void vpx_filter_block1d8_v8_sse2 | ||||||
| ;( | ;( | ||||||
| ;    unsigned char *src_ptr, | ;    unsigned char *src_ptr, | ||||||
| ;    unsigned int   src_pitch, | ;    unsigned int   src_pitch, | ||||||
| @@ -276,8 +276,8 @@ sym(vp9_highbd_filter_block1d4_v8_sse2): | |||||||
| ;    unsigned int   output_height, | ;    unsigned int   output_height, | ||||||
| ;    short *filter | ;    short *filter | ||||||
| ;) | ;) | ||||||
| global sym(vp9_highbd_filter_block1d8_v8_sse2) PRIVATE | global sym(vpx_highbd_filter_block1d8_v8_sse2) PRIVATE | ||||||
| sym(vp9_highbd_filter_block1d8_v8_sse2): | sym(vpx_highbd_filter_block1d8_v8_sse2): | ||||||
|     push        rbp |     push        rbp | ||||||
|     mov         rbp, rsp |     mov         rbp, rsp | ||||||
|     SHADOW_ARGS_TO_STACK 7 |     SHADOW_ARGS_TO_STACK 7 | ||||||
| @@ -326,7 +326,7 @@ sym(vp9_highbd_filter_block1d8_v8_sse2): | |||||||
|     pop         rbp |     pop         rbp | ||||||
|     ret |     ret | ||||||
| 
 | 
 | ||||||
| ;void vp9_filter_block1d16_v8_sse2 | ;void vpx_filter_block1d16_v8_sse2 | ||||||
| ;( | ;( | ||||||
| ;    unsigned char *src_ptr, | ;    unsigned char *src_ptr, | ||||||
| ;    unsigned int   src_pitch, | ;    unsigned int   src_pitch, | ||||||
| @@ -335,8 +335,8 @@ sym(vp9_highbd_filter_block1d8_v8_sse2): | |||||||
| ;    unsigned int   output_height, | ;    unsigned int   output_height, | ||||||
| ;    short *filter | ;    short *filter | ||||||
| ;) | ;) | ||||||
| global sym(vp9_highbd_filter_block1d16_v8_sse2) PRIVATE | global sym(vpx_highbd_filter_block1d16_v8_sse2) PRIVATE | ||||||
| sym(vp9_highbd_filter_block1d16_v8_sse2): | sym(vpx_highbd_filter_block1d16_v8_sse2): | ||||||
|     push        rbp |     push        rbp | ||||||
|     mov         rbp, rsp |     mov         rbp, rsp | ||||||
|     SHADOW_ARGS_TO_STACK 7 |     SHADOW_ARGS_TO_STACK 7 | ||||||
| @@ -389,8 +389,8 @@ sym(vp9_highbd_filter_block1d16_v8_sse2): | |||||||
|     pop         rbp |     pop         rbp | ||||||
|     ret |     ret | ||||||
| 
 | 
 | ||||||
| global sym(vp9_highbd_filter_block1d4_v8_avg_sse2) PRIVATE | global sym(vpx_highbd_filter_block1d4_v8_avg_sse2) PRIVATE | ||||||
| sym(vp9_highbd_filter_block1d4_v8_avg_sse2): | sym(vpx_highbd_filter_block1d4_v8_avg_sse2): | ||||||
|     push        rbp |     push        rbp | ||||||
|     mov         rbp, rsp |     mov         rbp, rsp | ||||||
|     SHADOW_ARGS_TO_STACK 7 |     SHADOW_ARGS_TO_STACK 7 | ||||||
| @@ -450,8 +450,8 @@ sym(vp9_highbd_filter_block1d4_v8_avg_sse2): | |||||||
|     pop         rbp |     pop         rbp | ||||||
|     ret |     ret | ||||||
| 
 | 
 | ||||||
| global sym(vp9_highbd_filter_block1d8_v8_avg_sse2) PRIVATE | global sym(vpx_highbd_filter_block1d8_v8_avg_sse2) PRIVATE | ||||||
| sym(vp9_highbd_filter_block1d8_v8_avg_sse2): | sym(vpx_highbd_filter_block1d8_v8_avg_sse2): | ||||||
|     push        rbp |     push        rbp | ||||||
|     mov         rbp, rsp |     mov         rbp, rsp | ||||||
|     SHADOW_ARGS_TO_STACK 7 |     SHADOW_ARGS_TO_STACK 7 | ||||||
| @@ -499,8 +499,8 @@ sym(vp9_highbd_filter_block1d8_v8_avg_sse2): | |||||||
|     pop         rbp |     pop         rbp | ||||||
|     ret |     ret | ||||||
| 
 | 
 | ||||||
| global sym(vp9_highbd_filter_block1d16_v8_avg_sse2) PRIVATE | global sym(vpx_highbd_filter_block1d16_v8_avg_sse2) PRIVATE | ||||||
| sym(vp9_highbd_filter_block1d16_v8_avg_sse2): | sym(vpx_highbd_filter_block1d16_v8_avg_sse2): | ||||||
|     push        rbp |     push        rbp | ||||||
|     mov         rbp, rsp |     mov         rbp, rsp | ||||||
|     SHADOW_ARGS_TO_STACK 7 |     SHADOW_ARGS_TO_STACK 7 | ||||||
| @@ -552,7 +552,7 @@ sym(vp9_highbd_filter_block1d16_v8_avg_sse2): | |||||||
|     pop         rbp |     pop         rbp | ||||||
|     ret |     ret | ||||||
| 
 | 
 | ||||||
| ;void vp9_filter_block1d4_h8_sse2 | ;void vpx_filter_block1d4_h8_sse2 | ||||||
| ;( | ;( | ||||||
| ;    unsigned char  *src_ptr, | ;    unsigned char  *src_ptr, | ||||||
| ;    unsigned int    src_pixels_per_line, | ;    unsigned int    src_pixels_per_line, | ||||||
| @@ -561,8 +561,8 @@ sym(vp9_highbd_filter_block1d16_v8_avg_sse2): | |||||||
| ;    unsigned int    output_height, | ;    unsigned int    output_height, | ||||||
| ;    short *filter | ;    short *filter | ||||||
| ;) | ;) | ||||||
| global sym(vp9_highbd_filter_block1d4_h8_sse2) PRIVATE | global sym(vpx_highbd_filter_block1d4_h8_sse2) PRIVATE | ||||||
| sym(vp9_highbd_filter_block1d4_h8_sse2): | sym(vpx_highbd_filter_block1d4_h8_sse2): | ||||||
|     push        rbp |     push        rbp | ||||||
|     mov         rbp, rsp |     mov         rbp, rsp | ||||||
|     SHADOW_ARGS_TO_STACK 7 |     SHADOW_ARGS_TO_STACK 7 | ||||||
| @@ -627,7 +627,7 @@ sym(vp9_highbd_filter_block1d4_h8_sse2): | |||||||
|     pop         rbp |     pop         rbp | ||||||
|     ret |     ret | ||||||
| 
 | 
 | ||||||
| ;void vp9_filter_block1d8_h8_sse2 | ;void vpx_filter_block1d8_h8_sse2 | ||||||
| ;( | ;( | ||||||
| ;    unsigned char  *src_ptr, | ;    unsigned char  *src_ptr, | ||||||
| ;    unsigned int    src_pixels_per_line, | ;    unsigned int    src_pixels_per_line, | ||||||
| @@ -636,8 +636,8 @@ sym(vp9_highbd_filter_block1d4_h8_sse2): | |||||||
| ;    unsigned int    output_height, | ;    unsigned int    output_height, | ||||||
| ;    short *filter | ;    short *filter | ||||||
| ;) | ;) | ||||||
| global sym(vp9_highbd_filter_block1d8_h8_sse2) PRIVATE | global sym(vpx_highbd_filter_block1d8_h8_sse2) PRIVATE | ||||||
| sym(vp9_highbd_filter_block1d8_h8_sse2): | sym(vpx_highbd_filter_block1d8_h8_sse2): | ||||||
|     push        rbp |     push        rbp | ||||||
|     mov         rbp, rsp |     mov         rbp, rsp | ||||||
|     SHADOW_ARGS_TO_STACK 7 |     SHADOW_ARGS_TO_STACK 7 | ||||||
| @@ -693,7 +693,7 @@ sym(vp9_highbd_filter_block1d8_h8_sse2): | |||||||
|     pop         rbp |     pop         rbp | ||||||
|     ret |     ret | ||||||
| 
 | 
 | ||||||
| ;void vp9_filter_block1d16_h8_sse2 | ;void vpx_filter_block1d16_h8_sse2 | ||||||
| ;( | ;( | ||||||
| ;    unsigned char  *src_ptr, | ;    unsigned char  *src_ptr, | ||||||
| ;    unsigned int    src_pixels_per_line, | ;    unsigned int    src_pixels_per_line, | ||||||
| @@ -702,8 +702,8 @@ sym(vp9_highbd_filter_block1d8_h8_sse2): | |||||||
| ;    unsigned int    output_height, | ;    unsigned int    output_height, | ||||||
| ;    short *filter | ;    short *filter | ||||||
| ;) | ;) | ||||||
| global sym(vp9_highbd_filter_block1d16_h8_sse2) PRIVATE | global sym(vpx_highbd_filter_block1d16_h8_sse2) PRIVATE | ||||||
| sym(vp9_highbd_filter_block1d16_h8_sse2): | sym(vpx_highbd_filter_block1d16_h8_sse2): | ||||||
|     push        rbp |     push        rbp | ||||||
|     mov         rbp, rsp |     mov         rbp, rsp | ||||||
|     SHADOW_ARGS_TO_STACK 7 |     SHADOW_ARGS_TO_STACK 7 | ||||||
| @@ -770,8 +770,8 @@ sym(vp9_highbd_filter_block1d16_h8_sse2): | |||||||
|     pop         rbp |     pop         rbp | ||||||
|     ret |     ret | ||||||
| 
 | 
 | ||||||
| global sym(vp9_highbd_filter_block1d4_h8_avg_sse2) PRIVATE | global sym(vpx_highbd_filter_block1d4_h8_avg_sse2) PRIVATE | ||||||
| sym(vp9_highbd_filter_block1d4_h8_avg_sse2): | sym(vpx_highbd_filter_block1d4_h8_avg_sse2): | ||||||
|     push        rbp |     push        rbp | ||||||
|     mov         rbp, rsp |     mov         rbp, rsp | ||||||
|     SHADOW_ARGS_TO_STACK 7 |     SHADOW_ARGS_TO_STACK 7 | ||||||
| @@ -836,8 +836,8 @@ sym(vp9_highbd_filter_block1d4_h8_avg_sse2): | |||||||
|     pop         rbp |     pop         rbp | ||||||
|     ret |     ret | ||||||
| 
 | 
 | ||||||
| global sym(vp9_highbd_filter_block1d8_h8_avg_sse2) PRIVATE | global sym(vpx_highbd_filter_block1d8_h8_avg_sse2) PRIVATE | ||||||
| sym(vp9_highbd_filter_block1d8_h8_avg_sse2): | sym(vpx_highbd_filter_block1d8_h8_avg_sse2): | ||||||
|     push        rbp |     push        rbp | ||||||
|     mov         rbp, rsp |     mov         rbp, rsp | ||||||
|     SHADOW_ARGS_TO_STACK 7 |     SHADOW_ARGS_TO_STACK 7 | ||||||
| @@ -893,8 +893,8 @@ sym(vp9_highbd_filter_block1d8_h8_avg_sse2): | |||||||
|     pop         rbp |     pop         rbp | ||||||
|     ret |     ret | ||||||
| 
 | 
 | ||||||
| global sym(vp9_highbd_filter_block1d16_h8_avg_sse2) PRIVATE | global sym(vpx_highbd_filter_block1d16_h8_avg_sse2) PRIVATE | ||||||
| sym(vp9_highbd_filter_block1d16_h8_avg_sse2): | sym(vpx_highbd_filter_block1d16_h8_avg_sse2): | ||||||
|     push        rbp |     push        rbp | ||||||
|     mov         rbp, rsp |     mov         rbp, rsp | ||||||
|     SHADOW_ARGS_TO_STACK 7 |     SHADOW_ARGS_TO_STACK 7 | ||||||
| @@ -171,8 +171,8 @@ | |||||||
| %endm | %endm | ||||||
| %endif | %endif | ||||||
| 
 | 
 | ||||||
| global sym(vp9_highbd_filter_block1d4_v2_sse2) PRIVATE | global sym(vpx_highbd_filter_block1d4_v2_sse2) PRIVATE | ||||||
| sym(vp9_highbd_filter_block1d4_v2_sse2): | sym(vpx_highbd_filter_block1d4_v2_sse2): | ||||||
|     push        rbp |     push        rbp | ||||||
|     mov         rbp, rsp |     mov         rbp, rsp | ||||||
|     SHADOW_ARGS_TO_STACK 7 |     SHADOW_ARGS_TO_STACK 7 | ||||||
| @@ -196,8 +196,8 @@ sym(vp9_highbd_filter_block1d4_v2_sse2): | |||||||
|     ret |     ret | ||||||
| 
 | 
 | ||||||
| %if ARCH_X86_64 | %if ARCH_X86_64 | ||||||
| global sym(vp9_highbd_filter_block1d8_v2_sse2) PRIVATE | global sym(vpx_highbd_filter_block1d8_v2_sse2) PRIVATE | ||||||
| sym(vp9_highbd_filter_block1d8_v2_sse2): | sym(vpx_highbd_filter_block1d8_v2_sse2): | ||||||
|     push        rbp |     push        rbp | ||||||
|     mov         rbp, rsp |     mov         rbp, rsp | ||||||
|     SHADOW_ARGS_TO_STACK 7 |     SHADOW_ARGS_TO_STACK 7 | ||||||
| @@ -222,8 +222,8 @@ sym(vp9_highbd_filter_block1d8_v2_sse2): | |||||||
|     pop         rbp |     pop         rbp | ||||||
|     ret |     ret | ||||||
| 
 | 
 | ||||||
| global sym(vp9_highbd_filter_block1d16_v2_sse2) PRIVATE | global sym(vpx_highbd_filter_block1d16_v2_sse2) PRIVATE | ||||||
| sym(vp9_highbd_filter_block1d16_v2_sse2): | sym(vpx_highbd_filter_block1d16_v2_sse2): | ||||||
|     push        rbp |     push        rbp | ||||||
|     mov         rbp, rsp |     mov         rbp, rsp | ||||||
|     SHADOW_ARGS_TO_STACK 7 |     SHADOW_ARGS_TO_STACK 7 | ||||||
| @@ -251,8 +251,8 @@ sym(vp9_highbd_filter_block1d16_v2_sse2): | |||||||
|     ret |     ret | ||||||
| %endif | %endif | ||||||
| 
 | 
 | ||||||
| global sym(vp9_highbd_filter_block1d4_v2_avg_sse2) PRIVATE | global sym(vpx_highbd_filter_block1d4_v2_avg_sse2) PRIVATE | ||||||
| sym(vp9_highbd_filter_block1d4_v2_avg_sse2): | sym(vpx_highbd_filter_block1d4_v2_avg_sse2): | ||||||
|     push        rbp |     push        rbp | ||||||
|     mov         rbp, rsp |     mov         rbp, rsp | ||||||
|     SHADOW_ARGS_TO_STACK 7 |     SHADOW_ARGS_TO_STACK 7 | ||||||
| @@ -276,8 +276,8 @@ sym(vp9_highbd_filter_block1d4_v2_avg_sse2): | |||||||
|     ret |     ret | ||||||
| 
 | 
 | ||||||
| %if ARCH_X86_64 | %if ARCH_X86_64 | ||||||
| global sym(vp9_highbd_filter_block1d8_v2_avg_sse2) PRIVATE | global sym(vpx_highbd_filter_block1d8_v2_avg_sse2) PRIVATE | ||||||
| sym(vp9_highbd_filter_block1d8_v2_avg_sse2): | sym(vpx_highbd_filter_block1d8_v2_avg_sse2): | ||||||
|     push        rbp |     push        rbp | ||||||
|     mov         rbp, rsp |     mov         rbp, rsp | ||||||
|     SHADOW_ARGS_TO_STACK 7 |     SHADOW_ARGS_TO_STACK 7 | ||||||
| @@ -302,8 +302,8 @@ sym(vp9_highbd_filter_block1d8_v2_avg_sse2): | |||||||
|     pop         rbp |     pop         rbp | ||||||
|     ret |     ret | ||||||
| 
 | 
 | ||||||
| global sym(vp9_highbd_filter_block1d16_v2_avg_sse2) PRIVATE | global sym(vpx_highbd_filter_block1d16_v2_avg_sse2) PRIVATE | ||||||
| sym(vp9_highbd_filter_block1d16_v2_avg_sse2): | sym(vpx_highbd_filter_block1d16_v2_avg_sse2): | ||||||
|     push        rbp |     push        rbp | ||||||
|     mov         rbp, rsp |     mov         rbp, rsp | ||||||
|     SHADOW_ARGS_TO_STACK 7 |     SHADOW_ARGS_TO_STACK 7 | ||||||
| @@ -331,8 +331,8 @@ sym(vp9_highbd_filter_block1d16_v2_avg_sse2): | |||||||
|     ret |     ret | ||||||
| %endif | %endif | ||||||
| 
 | 
 | ||||||
| global sym(vp9_highbd_filter_block1d4_h2_sse2) PRIVATE | global sym(vpx_highbd_filter_block1d4_h2_sse2) PRIVATE | ||||||
| sym(vp9_highbd_filter_block1d4_h2_sse2): | sym(vpx_highbd_filter_block1d4_h2_sse2): | ||||||
|     push        rbp |     push        rbp | ||||||
|     mov         rbp, rsp |     mov         rbp, rsp | ||||||
|     SHADOW_ARGS_TO_STACK 7 |     SHADOW_ARGS_TO_STACK 7 | ||||||
| @@ -357,8 +357,8 @@ sym(vp9_highbd_filter_block1d4_h2_sse2): | |||||||
|     ret |     ret | ||||||
| 
 | 
 | ||||||
| %if ARCH_X86_64 | %if ARCH_X86_64 | ||||||
| global sym(vp9_highbd_filter_block1d8_h2_sse2) PRIVATE | global sym(vpx_highbd_filter_block1d8_h2_sse2) PRIVATE | ||||||
| sym(vp9_highbd_filter_block1d8_h2_sse2): | sym(vpx_highbd_filter_block1d8_h2_sse2): | ||||||
|     push        rbp |     push        rbp | ||||||
|     mov         rbp, rsp |     mov         rbp, rsp | ||||||
|     SHADOW_ARGS_TO_STACK 7 |     SHADOW_ARGS_TO_STACK 7 | ||||||
| @@ -383,8 +383,8 @@ sym(vp9_highbd_filter_block1d8_h2_sse2): | |||||||
|     pop         rbp |     pop         rbp | ||||||
|     ret |     ret | ||||||
| 
 | 
 | ||||||
| global sym(vp9_highbd_filter_block1d16_h2_sse2) PRIVATE | global sym(vpx_highbd_filter_block1d16_h2_sse2) PRIVATE | ||||||
| sym(vp9_highbd_filter_block1d16_h2_sse2): | sym(vpx_highbd_filter_block1d16_h2_sse2): | ||||||
|     push        rbp |     push        rbp | ||||||
|     mov         rbp, rsp |     mov         rbp, rsp | ||||||
|     SHADOW_ARGS_TO_STACK 7 |     SHADOW_ARGS_TO_STACK 7 | ||||||
| @@ -412,8 +412,8 @@ sym(vp9_highbd_filter_block1d16_h2_sse2): | |||||||
|     ret |     ret | ||||||
| %endif | %endif | ||||||
| 
 | 
 | ||||||
| global sym(vp9_highbd_filter_block1d4_h2_avg_sse2) PRIVATE | global sym(vpx_highbd_filter_block1d4_h2_avg_sse2) PRIVATE | ||||||
| sym(vp9_highbd_filter_block1d4_h2_avg_sse2): | sym(vpx_highbd_filter_block1d4_h2_avg_sse2): | ||||||
|     push        rbp |     push        rbp | ||||||
|     mov         rbp, rsp |     mov         rbp, rsp | ||||||
|     SHADOW_ARGS_TO_STACK 7 |     SHADOW_ARGS_TO_STACK 7 | ||||||
| @@ -438,8 +438,8 @@ sym(vp9_highbd_filter_block1d4_h2_avg_sse2): | |||||||
|     ret |     ret | ||||||
| 
 | 
 | ||||||
| %if ARCH_X86_64 | %if ARCH_X86_64 | ||||||
| global sym(vp9_highbd_filter_block1d8_h2_avg_sse2) PRIVATE | global sym(vpx_highbd_filter_block1d8_h2_avg_sse2) PRIVATE | ||||||
| sym(vp9_highbd_filter_block1d8_h2_avg_sse2): | sym(vpx_highbd_filter_block1d8_h2_avg_sse2): | ||||||
|     push        rbp |     push        rbp | ||||||
|     mov         rbp, rsp |     mov         rbp, rsp | ||||||
|     SHADOW_ARGS_TO_STACK 7 |     SHADOW_ARGS_TO_STACK 7 | ||||||
| @@ -464,8 +464,8 @@ sym(vp9_highbd_filter_block1d8_h2_avg_sse2): | |||||||
|     pop         rbp |     pop         rbp | ||||||
|     ret |     ret | ||||||
| 
 | 
 | ||||||
| global sym(vp9_highbd_filter_block1d16_h2_avg_sse2) PRIVATE | global sym(vpx_highbd_filter_block1d16_h2_avg_sse2) PRIVATE | ||||||
| sym(vp9_highbd_filter_block1d16_h2_avg_sse2): | sym(vpx_highbd_filter_block1d16_h2_avg_sse2): | ||||||
|     push        rbp |     push        rbp | ||||||
|     mov         rbp, rsp |     mov         rbp, rsp | ||||||
|     SHADOW_ARGS_TO_STACK 7 |     SHADOW_ARGS_TO_STACK 7 | ||||||
| @@ -11,11 +11,11 @@ | |||||||
| // Due to a header conflict between math.h and intrinsics includes with ceil()
 | // Due to a header conflict between math.h and intrinsics includes with ceil()
 | ||||||
| // in certain configurations under vs9 this include needs to precede
 | // in certain configurations under vs9 this include needs to precede
 | ||||||
| // immintrin.h.
 | // immintrin.h.
 | ||||||
| #include "./vp9_rtcd.h" |  | ||||||
| 
 | 
 | ||||||
| #include <immintrin.h> | #include <immintrin.h> | ||||||
| 
 | 
 | ||||||
| #include "vp9/common/x86/convolve.h" | #include "./vpx_dsp_rtcd.h" | ||||||
|  | #include "vpx_dsp/x86/convolve.h" | ||||||
| #include "vpx_ports/mem.h" | #include "vpx_ports/mem.h" | ||||||
| 
 | 
 | ||||||
| // filters for 16_h8 and 16_v8
 | // filters for 16_h8 and 16_v8
 | ||||||
| @@ -60,7 +60,7 @@ DECLARE_ALIGNED(32, static const uint8_t, filt4_global_avx2[32]) = { | |||||||
| # define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x) | # define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x) | ||||||
| #endif  // __clang__
 | #endif  // __clang__
 | ||||||
| 
 | 
 | ||||||
| static void vp9_filter_block1d16_h8_avx2(const uint8_t *src_ptr, | static void vpx_filter_block1d16_h8_avx2(const uint8_t *src_ptr, | ||||||
|                                          ptrdiff_t src_pixels_per_line, |                                          ptrdiff_t src_pixels_per_line, | ||||||
|                                          uint8_t *output_ptr, |                                          uint8_t *output_ptr, | ||||||
|                                          ptrdiff_t output_pitch, |                                          ptrdiff_t output_pitch, | ||||||
| @@ -304,7 +304,7 @@ static void vp9_filter_block1d16_h8_avx2(const uint8_t *src_ptr, | |||||||
|   } |   } | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| static void vp9_filter_block1d16_v8_avx2(const uint8_t *src_ptr, | static void vpx_filter_block1d16_v8_avx2(const uint8_t *src_ptr, | ||||||
|                                          ptrdiff_t src_pitch, |                                          ptrdiff_t src_pitch, | ||||||
|                                          uint8_t *output_ptr, |                                          uint8_t *output_ptr, | ||||||
|                                          ptrdiff_t out_pitch, |                                          ptrdiff_t out_pitch, | ||||||
| @@ -551,41 +551,41 @@ static void vp9_filter_block1d16_v8_avx2(const uint8_t *src_ptr, | |||||||
| } | } | ||||||
| 
 | 
 | ||||||
| #if HAVE_AVX2 && HAVE_SSSE3 | #if HAVE_AVX2 && HAVE_SSSE3 | ||||||
| filter8_1dfunction vp9_filter_block1d4_v8_ssse3; | filter8_1dfunction vpx_filter_block1d4_v8_ssse3; | ||||||
| #if ARCH_X86_64 | #if ARCH_X86_64 | ||||||
| filter8_1dfunction vp9_filter_block1d8_v8_intrin_ssse3; | filter8_1dfunction vpx_filter_block1d8_v8_intrin_ssse3; | ||||||
| filter8_1dfunction vp9_filter_block1d8_h8_intrin_ssse3; | filter8_1dfunction vpx_filter_block1d8_h8_intrin_ssse3; | ||||||
| filter8_1dfunction vp9_filter_block1d4_h8_intrin_ssse3; | filter8_1dfunction vpx_filter_block1d4_h8_intrin_ssse3; | ||||||
| #define vp9_filter_block1d8_v8_avx2 vp9_filter_block1d8_v8_intrin_ssse3 | #define vpx_filter_block1d8_v8_avx2 vpx_filter_block1d8_v8_intrin_ssse3 | ||||||
| #define vp9_filter_block1d8_h8_avx2 vp9_filter_block1d8_h8_intrin_ssse3 | #define vpx_filter_block1d8_h8_avx2 vpx_filter_block1d8_h8_intrin_ssse3 | ||||||
| #define vp9_filter_block1d4_h8_avx2 vp9_filter_block1d4_h8_intrin_ssse3 | #define vpx_filter_block1d4_h8_avx2 vpx_filter_block1d4_h8_intrin_ssse3 | ||||||
| #else  // ARCH_X86
 | #else  // ARCH_X86
 | ||||||
| filter8_1dfunction vp9_filter_block1d8_v8_ssse3; | filter8_1dfunction vpx_filter_block1d8_v8_ssse3; | ||||||
| filter8_1dfunction vp9_filter_block1d8_h8_ssse3; | filter8_1dfunction vpx_filter_block1d8_h8_ssse3; | ||||||
| filter8_1dfunction vp9_filter_block1d4_h8_ssse3; | filter8_1dfunction vpx_filter_block1d4_h8_ssse3; | ||||||
| #define vp9_filter_block1d8_v8_avx2 vp9_filter_block1d8_v8_ssse3 | #define vpx_filter_block1d8_v8_avx2 vpx_filter_block1d8_v8_ssse3 | ||||||
| #define vp9_filter_block1d8_h8_avx2 vp9_filter_block1d8_h8_ssse3 | #define vpx_filter_block1d8_h8_avx2 vpx_filter_block1d8_h8_ssse3 | ||||||
| #define vp9_filter_block1d4_h8_avx2 vp9_filter_block1d4_h8_ssse3 | #define vpx_filter_block1d4_h8_avx2 vpx_filter_block1d4_h8_ssse3 | ||||||
| #endif  // ARCH_X86_64
 | #endif  // ARCH_X86_64
 | ||||||
| filter8_1dfunction vp9_filter_block1d16_v2_ssse3; | filter8_1dfunction vpx_filter_block1d16_v2_ssse3; | ||||||
| filter8_1dfunction vp9_filter_block1d16_h2_ssse3; | filter8_1dfunction vpx_filter_block1d16_h2_ssse3; | ||||||
| filter8_1dfunction vp9_filter_block1d8_v2_ssse3; | filter8_1dfunction vpx_filter_block1d8_v2_ssse3; | ||||||
| filter8_1dfunction vp9_filter_block1d8_h2_ssse3; | filter8_1dfunction vpx_filter_block1d8_h2_ssse3; | ||||||
| filter8_1dfunction vp9_filter_block1d4_v2_ssse3; | filter8_1dfunction vpx_filter_block1d4_v2_ssse3; | ||||||
| filter8_1dfunction vp9_filter_block1d4_h2_ssse3; | filter8_1dfunction vpx_filter_block1d4_h2_ssse3; | ||||||
| #define vp9_filter_block1d4_v8_avx2 vp9_filter_block1d4_v8_ssse3 | #define vpx_filter_block1d4_v8_avx2 vpx_filter_block1d4_v8_ssse3 | ||||||
| #define vp9_filter_block1d16_v2_avx2 vp9_filter_block1d16_v2_ssse3 | #define vpx_filter_block1d16_v2_avx2 vpx_filter_block1d16_v2_ssse3 | ||||||
| #define vp9_filter_block1d16_h2_avx2 vp9_filter_block1d16_h2_ssse3 | #define vpx_filter_block1d16_h2_avx2 vpx_filter_block1d16_h2_ssse3 | ||||||
| #define vp9_filter_block1d8_v2_avx2  vp9_filter_block1d8_v2_ssse3 | #define vpx_filter_block1d8_v2_avx2  vpx_filter_block1d8_v2_ssse3 | ||||||
| #define vp9_filter_block1d8_h2_avx2  vp9_filter_block1d8_h2_ssse3 | #define vpx_filter_block1d8_h2_avx2  vpx_filter_block1d8_h2_ssse3 | ||||||
| #define vp9_filter_block1d4_v2_avx2  vp9_filter_block1d4_v2_ssse3 | #define vpx_filter_block1d4_v2_avx2  vpx_filter_block1d4_v2_ssse3 | ||||||
| #define vp9_filter_block1d4_h2_avx2  vp9_filter_block1d4_h2_ssse3 | #define vpx_filter_block1d4_h2_avx2  vpx_filter_block1d4_h2_ssse3 | ||||||
| // void vp9_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride,
 | // void vpx_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride,
 | ||||||
| //                                uint8_t *dst, ptrdiff_t dst_stride,
 | //                                uint8_t *dst, ptrdiff_t dst_stride,
 | ||||||
| //                                const int16_t *filter_x, int x_step_q4,
 | //                                const int16_t *filter_x, int x_step_q4,
 | ||||||
| //                                const int16_t *filter_y, int y_step_q4,
 | //                                const int16_t *filter_y, int y_step_q4,
 | ||||||
| //                                int w, int h);
 | //                                int w, int h);
 | ||||||
| // void vp9_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride,
 | // void vpx_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride,
 | ||||||
| //                               uint8_t *dst, ptrdiff_t dst_stride,
 | //                               uint8_t *dst, ptrdiff_t dst_stride,
 | ||||||
| //                               const int16_t *filter_x, int x_step_q4,
 | //                               const int16_t *filter_x, int x_step_q4,
 | ||||||
| //                               const int16_t *filter_y, int y_step_q4,
 | //                               const int16_t *filter_y, int y_step_q4,
 | ||||||
| @@ -593,7 +593,7 @@ filter8_1dfunction vp9_filter_block1d4_h2_ssse3; | |||||||
| FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , avx2); | FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , avx2); | ||||||
| FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , avx2); | FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , avx2); | ||||||
| 
 | 
 | ||||||
| // void vp9_convolve8_avx2(const uint8_t *src, ptrdiff_t src_stride,
 | // void vpx_convolve8_avx2(const uint8_t *src, ptrdiff_t src_stride,
 | ||||||
| //                          uint8_t *dst, ptrdiff_t dst_stride,
 | //                          uint8_t *dst, ptrdiff_t dst_stride,
 | ||||||
| //                          const int16_t *filter_x, int x_step_q4,
 | //                          const int16_t *filter_x, int x_step_q4,
 | ||||||
| //                          const int16_t *filter_y, int y_step_q4,
 | //                          const int16_t *filter_y, int y_step_q4,
 | ||||||
| @@ -11,11 +11,11 @@ | |||||||
| // Due to a header conflict between math.h and intrinsics includes with ceil()
 | // Due to a header conflict between math.h and intrinsics includes with ceil()
 | ||||||
| // in certain configurations under vs9 this include needs to precede
 | // in certain configurations under vs9 this include needs to precede
 | ||||||
| // tmmintrin.h.
 | // tmmintrin.h.
 | ||||||
| #include "./vp9_rtcd.h" |  | ||||||
| 
 | 
 | ||||||
| #include <tmmintrin.h> | #include <tmmintrin.h> | ||||||
| 
 | 
 | ||||||
| #include "vp9/common/x86/convolve.h" | #include "./vpx_dsp_rtcd.h" | ||||||
|  | #include "vpx_dsp/x86/convolve.h" | ||||||
| #include "vpx_ports/mem.h" | #include "vpx_ports/mem.h" | ||||||
| #include "vpx_ports/emmintrin_compat.h" | #include "vpx_ports/emmintrin_compat.h" | ||||||
| 
 | 
 | ||||||
| @@ -46,11 +46,11 @@ DECLARE_ALIGNED(16, static const uint8_t, filt4_global[16]) = { | |||||||
| }; | }; | ||||||
| 
 | 
 | ||||||
| // These are reused by the avx2 intrinsics.
 | // These are reused by the avx2 intrinsics.
 | ||||||
| filter8_1dfunction vp9_filter_block1d8_v8_intrin_ssse3; | filter8_1dfunction vpx_filter_block1d8_v8_intrin_ssse3; | ||||||
| filter8_1dfunction vp9_filter_block1d8_h8_intrin_ssse3; | filter8_1dfunction vpx_filter_block1d8_h8_intrin_ssse3; | ||||||
| filter8_1dfunction vp9_filter_block1d4_h8_intrin_ssse3; | filter8_1dfunction vpx_filter_block1d4_h8_intrin_ssse3; | ||||||
| 
 | 
 | ||||||
| void vp9_filter_block1d4_h8_intrin_ssse3(const uint8_t *src_ptr, | void vpx_filter_block1d4_h8_intrin_ssse3(const uint8_t *src_ptr, | ||||||
|                                          ptrdiff_t src_pixels_per_line, |                                          ptrdiff_t src_pixels_per_line, | ||||||
|                                          uint8_t *output_ptr, |                                          uint8_t *output_ptr, | ||||||
|                                          ptrdiff_t output_pitch, |                                          ptrdiff_t output_pitch, | ||||||
| @@ -121,7 +121,7 @@ void vp9_filter_block1d4_h8_intrin_ssse3(const uint8_t *src_ptr, | |||||||
|   } |   } | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| void vp9_filter_block1d8_h8_intrin_ssse3(const uint8_t *src_ptr, | void vpx_filter_block1d8_h8_intrin_ssse3(const uint8_t *src_ptr, | ||||||
|                                          ptrdiff_t src_pixels_per_line, |                                          ptrdiff_t src_pixels_per_line, | ||||||
|                                          uint8_t *output_ptr, |                                          uint8_t *output_ptr, | ||||||
|                                          ptrdiff_t output_pitch, |                                          ptrdiff_t output_pitch, | ||||||
| @@ -201,7 +201,7 @@ void vp9_filter_block1d8_h8_intrin_ssse3(const uint8_t *src_ptr, | |||||||
|   } |   } | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| static void vp9_filter_block1d16_h8_intrin_ssse3(const uint8_t *src_ptr, | static void vpx_filter_block1d16_h8_intrin_ssse3(const uint8_t *src_ptr, | ||||||
|                                                  ptrdiff_t src_pixels_per_line, |                                                  ptrdiff_t src_pixels_per_line, | ||||||
|                                                  uint8_t *output_ptr, |                                                  uint8_t *output_ptr, | ||||||
|                                                  ptrdiff_t output_pitch, |                                                  ptrdiff_t output_pitch, | ||||||
| @@ -318,7 +318,7 @@ static void vp9_filter_block1d16_h8_intrin_ssse3(const uint8_t *src_ptr, | |||||||
|   } |   } | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| void vp9_filter_block1d8_v8_intrin_ssse3(const uint8_t *src_ptr, | void vpx_filter_block1d8_v8_intrin_ssse3(const uint8_t *src_ptr, | ||||||
|                                          ptrdiff_t src_pitch, |                                          ptrdiff_t src_pitch, | ||||||
|                                          uint8_t *output_ptr, |                                          uint8_t *output_ptr, | ||||||
|                                          ptrdiff_t out_pitch, |                                          ptrdiff_t out_pitch, | ||||||
| @@ -406,7 +406,7 @@ void vp9_filter_block1d8_v8_intrin_ssse3(const uint8_t *src_ptr, | |||||||
|   } |   } | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| static void vp9_filter_block1d16_v8_intrin_ssse3(const uint8_t *src_ptr, | static void vpx_filter_block1d16_v8_intrin_ssse3(const uint8_t *src_ptr, | ||||||
|                                                  ptrdiff_t src_pitch, |                                                  ptrdiff_t src_pitch, | ||||||
|                                                  uint8_t *output_ptr, |                                                  uint8_t *output_ptr, | ||||||
|                                                  ptrdiff_t out_pitch, |                                                  ptrdiff_t out_pitch, | ||||||
| @@ -522,61 +522,61 @@ static void vp9_filter_block1d16_v8_intrin_ssse3(const uint8_t *src_ptr, | |||||||
| } | } | ||||||
| 
 | 
 | ||||||
| #if ARCH_X86_64 | #if ARCH_X86_64 | ||||||
| filter8_1dfunction vp9_filter_block1d16_v8_intrin_ssse3; | filter8_1dfunction vpx_filter_block1d16_v8_intrin_ssse3; | ||||||
| filter8_1dfunction vp9_filter_block1d16_h8_intrin_ssse3; | filter8_1dfunction vpx_filter_block1d16_h8_intrin_ssse3; | ||||||
| filter8_1dfunction vp9_filter_block1d8_v8_intrin_ssse3; | filter8_1dfunction vpx_filter_block1d8_v8_intrin_ssse3; | ||||||
| filter8_1dfunction vp9_filter_block1d8_h8_intrin_ssse3; | filter8_1dfunction vpx_filter_block1d8_h8_intrin_ssse3; | ||||||
| filter8_1dfunction vp9_filter_block1d4_v8_ssse3; | filter8_1dfunction vpx_filter_block1d4_v8_ssse3; | ||||||
| filter8_1dfunction vp9_filter_block1d4_h8_intrin_ssse3; | filter8_1dfunction vpx_filter_block1d4_h8_intrin_ssse3; | ||||||
| #define vp9_filter_block1d16_v8_ssse3 vp9_filter_block1d16_v8_intrin_ssse3 | #define vpx_filter_block1d16_v8_ssse3 vpx_filter_block1d16_v8_intrin_ssse3 | ||||||
| #define vp9_filter_block1d16_h8_ssse3 vp9_filter_block1d16_h8_intrin_ssse3 | #define vpx_filter_block1d16_h8_ssse3 vpx_filter_block1d16_h8_intrin_ssse3 | ||||||
| #define vp9_filter_block1d8_v8_ssse3 vp9_filter_block1d8_v8_intrin_ssse3 | #define vpx_filter_block1d8_v8_ssse3 vpx_filter_block1d8_v8_intrin_ssse3 | ||||||
| #define vp9_filter_block1d8_h8_ssse3 vp9_filter_block1d8_h8_intrin_ssse3 | #define vpx_filter_block1d8_h8_ssse3 vpx_filter_block1d8_h8_intrin_ssse3 | ||||||
| #define vp9_filter_block1d4_h8_ssse3 vp9_filter_block1d4_h8_intrin_ssse3 | #define vpx_filter_block1d4_h8_ssse3 vpx_filter_block1d4_h8_intrin_ssse3 | ||||||
| #else  // ARCH_X86
 | #else  // ARCH_X86
 | ||||||
| filter8_1dfunction vp9_filter_block1d16_v8_ssse3; | filter8_1dfunction vpx_filter_block1d16_v8_ssse3; | ||||||
| filter8_1dfunction vp9_filter_block1d16_h8_ssse3; | filter8_1dfunction vpx_filter_block1d16_h8_ssse3; | ||||||
| filter8_1dfunction vp9_filter_block1d8_v8_ssse3; | filter8_1dfunction vpx_filter_block1d8_v8_ssse3; | ||||||
| filter8_1dfunction vp9_filter_block1d8_h8_ssse3; | filter8_1dfunction vpx_filter_block1d8_h8_ssse3; | ||||||
| filter8_1dfunction vp9_filter_block1d4_v8_ssse3; | filter8_1dfunction vpx_filter_block1d4_v8_ssse3; | ||||||
| filter8_1dfunction vp9_filter_block1d4_h8_ssse3; | filter8_1dfunction vpx_filter_block1d4_h8_ssse3; | ||||||
| #endif  // ARCH_X86_64
 | #endif  // ARCH_X86_64
 | ||||||
| filter8_1dfunction vp9_filter_block1d16_v8_avg_ssse3; | filter8_1dfunction vpx_filter_block1d16_v8_avg_ssse3; | ||||||
| filter8_1dfunction vp9_filter_block1d16_h8_avg_ssse3; | filter8_1dfunction vpx_filter_block1d16_h8_avg_ssse3; | ||||||
| filter8_1dfunction vp9_filter_block1d8_v8_avg_ssse3; | filter8_1dfunction vpx_filter_block1d8_v8_avg_ssse3; | ||||||
| filter8_1dfunction vp9_filter_block1d8_h8_avg_ssse3; | filter8_1dfunction vpx_filter_block1d8_h8_avg_ssse3; | ||||||
| filter8_1dfunction vp9_filter_block1d4_v8_avg_ssse3; | filter8_1dfunction vpx_filter_block1d4_v8_avg_ssse3; | ||||||
| filter8_1dfunction vp9_filter_block1d4_h8_avg_ssse3; | filter8_1dfunction vpx_filter_block1d4_h8_avg_ssse3; | ||||||
| 
 | 
 | ||||||
| filter8_1dfunction vp9_filter_block1d16_v2_ssse3; | filter8_1dfunction vpx_filter_block1d16_v2_ssse3; | ||||||
| filter8_1dfunction vp9_filter_block1d16_h2_ssse3; | filter8_1dfunction vpx_filter_block1d16_h2_ssse3; | ||||||
| filter8_1dfunction vp9_filter_block1d8_v2_ssse3; | filter8_1dfunction vpx_filter_block1d8_v2_ssse3; | ||||||
| filter8_1dfunction vp9_filter_block1d8_h2_ssse3; | filter8_1dfunction vpx_filter_block1d8_h2_ssse3; | ||||||
| filter8_1dfunction vp9_filter_block1d4_v2_ssse3; | filter8_1dfunction vpx_filter_block1d4_v2_ssse3; | ||||||
| filter8_1dfunction vp9_filter_block1d4_h2_ssse3; | filter8_1dfunction vpx_filter_block1d4_h2_ssse3; | ||||||
| filter8_1dfunction vp9_filter_block1d16_v2_avg_ssse3; | filter8_1dfunction vpx_filter_block1d16_v2_avg_ssse3; | ||||||
| filter8_1dfunction vp9_filter_block1d16_h2_avg_ssse3; | filter8_1dfunction vpx_filter_block1d16_h2_avg_ssse3; | ||||||
| filter8_1dfunction vp9_filter_block1d8_v2_avg_ssse3; | filter8_1dfunction vpx_filter_block1d8_v2_avg_ssse3; | ||||||
| filter8_1dfunction vp9_filter_block1d8_h2_avg_ssse3; | filter8_1dfunction vpx_filter_block1d8_h2_avg_ssse3; | ||||||
| filter8_1dfunction vp9_filter_block1d4_v2_avg_ssse3; | filter8_1dfunction vpx_filter_block1d4_v2_avg_ssse3; | ||||||
| filter8_1dfunction vp9_filter_block1d4_h2_avg_ssse3; | filter8_1dfunction vpx_filter_block1d4_h2_avg_ssse3; | ||||||
| 
 | 
 | ||||||
| // void vp9_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
 | // void vpx_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
 | ||||||
| //                                uint8_t *dst, ptrdiff_t dst_stride,
 | //                                uint8_t *dst, ptrdiff_t dst_stride,
 | ||||||
| //                                const int16_t *filter_x, int x_step_q4,
 | //                                const int16_t *filter_x, int x_step_q4,
 | ||||||
| //                                const int16_t *filter_y, int y_step_q4,
 | //                                const int16_t *filter_y, int y_step_q4,
 | ||||||
| //                                int w, int h);
 | //                                int w, int h);
 | ||||||
| // void vp9_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride,
 | // void vpx_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride,
 | ||||||
| //                               uint8_t *dst, ptrdiff_t dst_stride,
 | //                               uint8_t *dst, ptrdiff_t dst_stride,
 | ||||||
| //                               const int16_t *filter_x, int x_step_q4,
 | //                               const int16_t *filter_x, int x_step_q4,
 | ||||||
| //                               const int16_t *filter_y, int y_step_q4,
 | //                               const int16_t *filter_y, int y_step_q4,
 | ||||||
| //                               int w, int h);
 | //                               int w, int h);
 | ||||||
| // void vp9_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
 | // void vpx_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
 | ||||||
| //                                    uint8_t *dst, ptrdiff_t dst_stride,
 | //                                    uint8_t *dst, ptrdiff_t dst_stride,
 | ||||||
| //                                    const int16_t *filter_x, int x_step_q4,
 | //                                    const int16_t *filter_x, int x_step_q4,
 | ||||||
| //                                    const int16_t *filter_y, int y_step_q4,
 | //                                    const int16_t *filter_y, int y_step_q4,
 | ||||||
| //                                    int w, int h);
 | //                                    int w, int h);
 | ||||||
| // void vp9_convolve8_avg_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride,
 | // void vpx_convolve8_avg_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride,
 | ||||||
| //                                   uint8_t *dst, ptrdiff_t dst_stride,
 | //                                   uint8_t *dst, ptrdiff_t dst_stride,
 | ||||||
| //                                   const int16_t *filter_x, int x_step_q4,
 | //                                   const int16_t *filter_x, int x_step_q4,
 | ||||||
| //                                   const int16_t *filter_y, int y_step_q4,
 | //                                   const int16_t *filter_y, int y_step_q4,
 | ||||||
| @@ -587,12 +587,12 @@ FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, ssse3); | |||||||
| FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_, | FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_, | ||||||
|             ssse3); |             ssse3); | ||||||
| 
 | 
 | ||||||
| // void vp9_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride,
 | // void vpx_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride,
 | ||||||
| //                          uint8_t *dst, ptrdiff_t dst_stride,
 | //                          uint8_t *dst, ptrdiff_t dst_stride,
 | ||||||
| //                          const int16_t *filter_x, int x_step_q4,
 | //                          const int16_t *filter_x, int x_step_q4,
 | ||||||
| //                          const int16_t *filter_y, int y_step_q4,
 | //                          const int16_t *filter_y, int y_step_q4,
 | ||||||
| //                          int w, int h);
 | //                          int w, int h);
 | ||||||
| // void vp9_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride,
 | // void vpx_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride,
 | ||||||
| //                              uint8_t *dst, ptrdiff_t dst_stride,
 | //                              uint8_t *dst, ptrdiff_t dst_stride,
 | ||||||
| //                              const int16_t *filter_x, int x_step_q4,
 | //                              const int16_t *filter_x, int x_step_q4,
 | ||||||
| //                              const int16_t *filter_y, int y_step_q4,
 | //                              const int16_t *filter_y, int y_step_q4,
 | ||||||
| @@ -176,7 +176,7 @@ | |||||||
|     movq        [rdi + %2], xmm0 |     movq        [rdi + %2], xmm0 | ||||||
| %endm | %endm | ||||||
| 
 | 
 | ||||||
| ;void vp9_filter_block1d4_v8_sse2 | ;void vpx_filter_block1d4_v8_sse2 | ||||||
| ;( | ;( | ||||||
| ;    unsigned char *src_ptr, | ;    unsigned char *src_ptr, | ||||||
| ;    unsigned int   src_pitch, | ;    unsigned int   src_pitch, | ||||||
| @@ -185,8 +185,8 @@ | |||||||
| ;    unsigned int   output_height, | ;    unsigned int   output_height, | ||||||
| ;    short *filter | ;    short *filter | ||||||
| ;) | ;) | ||||||
| global sym(vp9_filter_block1d4_v8_sse2) PRIVATE | global sym(vpx_filter_block1d4_v8_sse2) PRIVATE | ||||||
| sym(vp9_filter_block1d4_v8_sse2): | sym(vpx_filter_block1d4_v8_sse2): | ||||||
|     push        rbp |     push        rbp | ||||||
|     mov         rbp, rsp |     mov         rbp, rsp | ||||||
|     SHADOW_ARGS_TO_STACK 6 |     SHADOW_ARGS_TO_STACK 6 | ||||||
| @@ -243,7 +243,7 @@ sym(vp9_filter_block1d4_v8_sse2): | |||||||
|     pop         rbp |     pop         rbp | ||||||
|     ret |     ret | ||||||
| 
 | 
 | ||||||
| ;void vp9_filter_block1d8_v8_sse2 | ;void vpx_filter_block1d8_v8_sse2 | ||||||
| ;( | ;( | ||||||
| ;    unsigned char *src_ptr, | ;    unsigned char *src_ptr, | ||||||
| ;    unsigned int   src_pitch, | ;    unsigned int   src_pitch, | ||||||
| @@ -252,8 +252,8 @@ sym(vp9_filter_block1d4_v8_sse2): | |||||||
| ;    unsigned int   output_height, | ;    unsigned int   output_height, | ||||||
| ;    short *filter | ;    short *filter | ||||||
| ;) | ;) | ||||||
| global sym(vp9_filter_block1d8_v8_sse2) PRIVATE | global sym(vpx_filter_block1d8_v8_sse2) PRIVATE | ||||||
| sym(vp9_filter_block1d8_v8_sse2): | sym(vpx_filter_block1d8_v8_sse2): | ||||||
|     push        rbp |     push        rbp | ||||||
|     mov         rbp, rsp |     mov         rbp, rsp | ||||||
|     SHADOW_ARGS_TO_STACK 6 |     SHADOW_ARGS_TO_STACK 6 | ||||||
| @@ -302,7 +302,7 @@ sym(vp9_filter_block1d8_v8_sse2): | |||||||
|     pop         rbp |     pop         rbp | ||||||
|     ret |     ret | ||||||
| 
 | 
 | ||||||
| ;void vp9_filter_block1d16_v8_sse2 | ;void vpx_filter_block1d16_v8_sse2 | ||||||
| ;( | ;( | ||||||
| ;    unsigned char *src_ptr, | ;    unsigned char *src_ptr, | ||||||
| ;    unsigned int   src_pitch, | ;    unsigned int   src_pitch, | ||||||
| @@ -311,8 +311,8 @@ sym(vp9_filter_block1d8_v8_sse2): | |||||||
| ;    unsigned int   output_height, | ;    unsigned int   output_height, | ||||||
| ;    short *filter | ;    short *filter | ||||||
| ;) | ;) | ||||||
| global sym(vp9_filter_block1d16_v8_sse2) PRIVATE | global sym(vpx_filter_block1d16_v8_sse2) PRIVATE | ||||||
| sym(vp9_filter_block1d16_v8_sse2): | sym(vpx_filter_block1d16_v8_sse2): | ||||||
|     push        rbp |     push        rbp | ||||||
|     mov         rbp, rsp |     mov         rbp, rsp | ||||||
|     SHADOW_ARGS_TO_STACK 6 |     SHADOW_ARGS_TO_STACK 6 | ||||||
| @@ -365,8 +365,8 @@ sym(vp9_filter_block1d16_v8_sse2): | |||||||
|     pop         rbp |     pop         rbp | ||||||
|     ret |     ret | ||||||
| 
 | 
 | ||||||
| global sym(vp9_filter_block1d4_v8_avg_sse2) PRIVATE | global sym(vpx_filter_block1d4_v8_avg_sse2) PRIVATE | ||||||
| sym(vp9_filter_block1d4_v8_avg_sse2): | sym(vpx_filter_block1d4_v8_avg_sse2): | ||||||
|     push        rbp |     push        rbp | ||||||
|     mov         rbp, rsp |     mov         rbp, rsp | ||||||
|     SHADOW_ARGS_TO_STACK 6 |     SHADOW_ARGS_TO_STACK 6 | ||||||
| @@ -423,8 +423,8 @@ sym(vp9_filter_block1d4_v8_avg_sse2): | |||||||
|     pop         rbp |     pop         rbp | ||||||
|     ret |     ret | ||||||
| 
 | 
 | ||||||
| global sym(vp9_filter_block1d8_v8_avg_sse2) PRIVATE | global sym(vpx_filter_block1d8_v8_avg_sse2) PRIVATE | ||||||
| sym(vp9_filter_block1d8_v8_avg_sse2): | sym(vpx_filter_block1d8_v8_avg_sse2): | ||||||
|     push        rbp |     push        rbp | ||||||
|     mov         rbp, rsp |     mov         rbp, rsp | ||||||
|     SHADOW_ARGS_TO_STACK 6 |     SHADOW_ARGS_TO_STACK 6 | ||||||
| @@ -472,8 +472,8 @@ sym(vp9_filter_block1d8_v8_avg_sse2): | |||||||
|     pop         rbp |     pop         rbp | ||||||
|     ret |     ret | ||||||
| 
 | 
 | ||||||
| global sym(vp9_filter_block1d16_v8_avg_sse2) PRIVATE | global sym(vpx_filter_block1d16_v8_avg_sse2) PRIVATE | ||||||
| sym(vp9_filter_block1d16_v8_avg_sse2): | sym(vpx_filter_block1d16_v8_avg_sse2): | ||||||
|     push        rbp |     push        rbp | ||||||
|     mov         rbp, rsp |     mov         rbp, rsp | ||||||
|     SHADOW_ARGS_TO_STACK 6 |     SHADOW_ARGS_TO_STACK 6 | ||||||
| @@ -525,7 +525,7 @@ sym(vp9_filter_block1d16_v8_avg_sse2): | |||||||
|     pop         rbp |     pop         rbp | ||||||
|     ret |     ret | ||||||
| 
 | 
 | ||||||
| ;void vp9_filter_block1d4_h8_sse2 | ;void vpx_filter_block1d4_h8_sse2 | ||||||
| ;( | ;( | ||||||
| ;    unsigned char  *src_ptr, | ;    unsigned char  *src_ptr, | ||||||
| ;    unsigned int    src_pixels_per_line, | ;    unsigned int    src_pixels_per_line, | ||||||
| @@ -534,8 +534,8 @@ sym(vp9_filter_block1d16_v8_avg_sse2): | |||||||
| ;    unsigned int    output_height, | ;    unsigned int    output_height, | ||||||
| ;    short *filter | ;    short *filter | ||||||
| ;) | ;) | ||||||
| global sym(vp9_filter_block1d4_h8_sse2) PRIVATE | global sym(vpx_filter_block1d4_h8_sse2) PRIVATE | ||||||
| sym(vp9_filter_block1d4_h8_sse2): | sym(vpx_filter_block1d4_h8_sse2): | ||||||
|     push        rbp |     push        rbp | ||||||
|     mov         rbp, rsp |     mov         rbp, rsp | ||||||
|     SHADOW_ARGS_TO_STACK 6 |     SHADOW_ARGS_TO_STACK 6 | ||||||
| @@ -599,7 +599,7 @@ sym(vp9_filter_block1d4_h8_sse2): | |||||||
|     pop         rbp |     pop         rbp | ||||||
|     ret |     ret | ||||||
| 
 | 
 | ||||||
| ;void vp9_filter_block1d8_h8_sse2 | ;void vpx_filter_block1d8_h8_sse2 | ||||||
| ;( | ;( | ||||||
| ;    unsigned char  *src_ptr, | ;    unsigned char  *src_ptr, | ||||||
| ;    unsigned int    src_pixels_per_line, | ;    unsigned int    src_pixels_per_line, | ||||||
| @@ -608,8 +608,8 @@ sym(vp9_filter_block1d4_h8_sse2): | |||||||
| ;    unsigned int    output_height, | ;    unsigned int    output_height, | ||||||
| ;    short *filter | ;    short *filter | ||||||
| ;) | ;) | ||||||
| global sym(vp9_filter_block1d8_h8_sse2) PRIVATE | global sym(vpx_filter_block1d8_h8_sse2) PRIVATE | ||||||
| sym(vp9_filter_block1d8_h8_sse2): | sym(vpx_filter_block1d8_h8_sse2): | ||||||
|     push        rbp |     push        rbp | ||||||
|     mov         rbp, rsp |     mov         rbp, rsp | ||||||
|     SHADOW_ARGS_TO_STACK 6 |     SHADOW_ARGS_TO_STACK 6 | ||||||
| @@ -674,7 +674,7 @@ sym(vp9_filter_block1d8_h8_sse2): | |||||||
|     pop         rbp |     pop         rbp | ||||||
|     ret |     ret | ||||||
| 
 | 
 | ||||||
| ;void vp9_filter_block1d16_h8_sse2 | ;void vpx_filter_block1d16_h8_sse2 | ||||||
| ;( | ;( | ||||||
| ;    unsigned char  *src_ptr, | ;    unsigned char  *src_ptr, | ||||||
| ;    unsigned int    src_pixels_per_line, | ;    unsigned int    src_pixels_per_line, | ||||||
| @@ -683,8 +683,8 @@ sym(vp9_filter_block1d8_h8_sse2): | |||||||
| ;    unsigned int    output_height, | ;    unsigned int    output_height, | ||||||
| ;    short *filter | ;    short *filter | ||||||
| ;) | ;) | ||||||
| global sym(vp9_filter_block1d16_h8_sse2) PRIVATE | global sym(vpx_filter_block1d16_h8_sse2) PRIVATE | ||||||
| sym(vp9_filter_block1d16_h8_sse2): | sym(vpx_filter_block1d16_h8_sse2): | ||||||
|     push        rbp |     push        rbp | ||||||
|     mov         rbp, rsp |     mov         rbp, rsp | ||||||
|     SHADOW_ARGS_TO_STACK 6 |     SHADOW_ARGS_TO_STACK 6 | ||||||
| @@ -769,8 +769,8 @@ sym(vp9_filter_block1d16_h8_sse2): | |||||||
|     pop         rbp |     pop         rbp | ||||||
|     ret |     ret | ||||||
| 
 | 
 | ||||||
| global sym(vp9_filter_block1d4_h8_avg_sse2) PRIVATE | global sym(vpx_filter_block1d4_h8_avg_sse2) PRIVATE | ||||||
| sym(vp9_filter_block1d4_h8_avg_sse2): | sym(vpx_filter_block1d4_h8_avg_sse2): | ||||||
|     push        rbp |     push        rbp | ||||||
|     mov         rbp, rsp |     mov         rbp, rsp | ||||||
|     SHADOW_ARGS_TO_STACK 6 |     SHADOW_ARGS_TO_STACK 6 | ||||||
| @@ -834,8 +834,8 @@ sym(vp9_filter_block1d4_h8_avg_sse2): | |||||||
|     pop         rbp |     pop         rbp | ||||||
|     ret |     ret | ||||||
| 
 | 
 | ||||||
| global sym(vp9_filter_block1d8_h8_avg_sse2) PRIVATE | global sym(vpx_filter_block1d8_h8_avg_sse2) PRIVATE | ||||||
| sym(vp9_filter_block1d8_h8_avg_sse2): | sym(vpx_filter_block1d8_h8_avg_sse2): | ||||||
|     push        rbp |     push        rbp | ||||||
|     mov         rbp, rsp |     mov         rbp, rsp | ||||||
|     SHADOW_ARGS_TO_STACK 6 |     SHADOW_ARGS_TO_STACK 6 | ||||||
| @@ -900,8 +900,8 @@ sym(vp9_filter_block1d8_h8_avg_sse2): | |||||||
|     pop         rbp |     pop         rbp | ||||||
|     ret |     ret | ||||||
| 
 | 
 | ||||||
| global sym(vp9_filter_block1d16_h8_avg_sse2) PRIVATE | global sym(vpx_filter_block1d16_h8_avg_sse2) PRIVATE | ||||||
| sym(vp9_filter_block1d16_h8_avg_sse2): | sym(vpx_filter_block1d16_h8_avg_sse2): | ||||||
|     push        rbp |     push        rbp | ||||||
|     mov         rbp, rsp |     mov         rbp, rsp | ||||||
|     SHADOW_ARGS_TO_STACK 6 |     SHADOW_ARGS_TO_STACK 6 | ||||||
| @@ -310,7 +310,7 @@ | |||||||
|     jnz         .loop |     jnz         .loop | ||||||
| %endm | %endm | ||||||
| 
 | 
 | ||||||
| ;void vp9_filter_block1d8_v8_ssse3 | ;void vpx_filter_block1d4_v8_ssse3 | ||||||
| ;( | ;( | ||||||
| ;    unsigned char *src_ptr, | ;    unsigned char *src_ptr, | ||||||
| ;    unsigned int   src_pitch, | ;    unsigned int   src_pitch, | ||||||
| @@ -319,8 +319,8 @@ | |||||||
| ;    unsigned int   output_height, | ;    unsigned int   output_height, | ||||||
| ;    short *filter | ;    short *filter | ||||||
| ;) | ;) | ||||||
| global sym(vp9_filter_block1d4_v8_ssse3) PRIVATE | global sym(vpx_filter_block1d4_v8_ssse3) PRIVATE | ||||||
| sym(vp9_filter_block1d4_v8_ssse3): | sym(vpx_filter_block1d4_v8_ssse3): | ||||||
|     push        rbp |     push        rbp | ||||||
|     mov         rbp, rsp |     mov         rbp, rsp | ||||||
|     SHADOW_ARGS_TO_STACK 6 |     SHADOW_ARGS_TO_STACK 6 | ||||||
| @@ -351,7 +351,7 @@ sym(vp9_filter_block1d4_v8_ssse3): | |||||||
|     pop         rbp |     pop         rbp | ||||||
|     ret |     ret | ||||||
| 
 | 
 | ||||||
| ;void vp9_filter_block1d8_v8_ssse3 | ;void vpx_filter_block1d8_v8_ssse3 | ||||||
| ;( | ;( | ||||||
| ;    unsigned char *src_ptr, | ;    unsigned char *src_ptr, | ||||||
| ;    unsigned int   src_pitch, | ;    unsigned int   src_pitch, | ||||||
| @@ -360,8 +360,8 @@ sym(vp9_filter_block1d4_v8_ssse3): | |||||||
| ;    unsigned int   output_height, | ;    unsigned int   output_height, | ||||||
| ;    short *filter | ;    short *filter | ||||||
| ;) | ;) | ||||||
| global sym(vp9_filter_block1d8_v8_ssse3) PRIVATE | global sym(vpx_filter_block1d8_v8_ssse3) PRIVATE | ||||||
| sym(vp9_filter_block1d8_v8_ssse3): | sym(vpx_filter_block1d8_v8_ssse3): | ||||||
|     push        rbp |     push        rbp | ||||||
|     mov         rbp, rsp |     mov         rbp, rsp | ||||||
|     SHADOW_ARGS_TO_STACK 6 |     SHADOW_ARGS_TO_STACK 6 | ||||||
| @@ -392,7 +392,7 @@ sym(vp9_filter_block1d8_v8_ssse3): | |||||||
|     pop         rbp |     pop         rbp | ||||||
|     ret |     ret | ||||||
| 
 | 
 | ||||||
| ;void vp9_filter_block1d16_v8_ssse3 | ;void vpx_filter_block1d16_v8_ssse3 | ||||||
| ;( | ;( | ||||||
| ;    unsigned char *src_ptr, | ;    unsigned char *src_ptr, | ||||||
| ;    unsigned int   src_pitch, | ;    unsigned int   src_pitch, | ||||||
| @@ -401,8 +401,8 @@ sym(vp9_filter_block1d8_v8_ssse3): | |||||||
| ;    unsigned int   output_height, | ;    unsigned int   output_height, | ||||||
| ;    short *filter | ;    short *filter | ||||||
| ;) | ;) | ||||||
| global sym(vp9_filter_block1d16_v8_ssse3) PRIVATE | global sym(vpx_filter_block1d16_v8_ssse3) PRIVATE | ||||||
| sym(vp9_filter_block1d16_v8_ssse3): | sym(vpx_filter_block1d16_v8_ssse3): | ||||||
|     push        rbp |     push        rbp | ||||||
|     mov         rbp, rsp |     mov         rbp, rsp | ||||||
|     SHADOW_ARGS_TO_STACK 6 |     SHADOW_ARGS_TO_STACK 6 | ||||||
| @@ -436,8 +436,8 @@ sym(vp9_filter_block1d16_v8_ssse3): | |||||||
| ;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | ;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| global sym(vp9_filter_block1d4_v8_avg_ssse3) PRIVATE | global sym(vpx_filter_block1d4_v8_avg_ssse3) PRIVATE | ||||||
| sym(vp9_filter_block1d4_v8_avg_ssse3): | sym(vpx_filter_block1d4_v8_avg_ssse3): | ||||||
|     push        rbp |     push        rbp | ||||||
|     mov         rbp, rsp |     mov         rbp, rsp | ||||||
|     SHADOW_ARGS_TO_STACK 6 |     SHADOW_ARGS_TO_STACK 6 | ||||||
| @@ -468,8 +468,8 @@ sym(vp9_filter_block1d4_v8_avg_ssse3): | |||||||
|     pop         rbp |     pop         rbp | ||||||
|     ret |     ret | ||||||
| 
 | 
 | ||||||
| global sym(vp9_filter_block1d8_v8_avg_ssse3) PRIVATE | global sym(vpx_filter_block1d8_v8_avg_ssse3) PRIVATE | ||||||
| sym(vp9_filter_block1d8_v8_avg_ssse3): | sym(vpx_filter_block1d8_v8_avg_ssse3): | ||||||
|     push        rbp |     push        rbp | ||||||
|     mov         rbp, rsp |     mov         rbp, rsp | ||||||
|     SHADOW_ARGS_TO_STACK 6 |     SHADOW_ARGS_TO_STACK 6 | ||||||
| @@ -500,8 +500,8 @@ sym(vp9_filter_block1d8_v8_avg_ssse3): | |||||||
|     pop         rbp |     pop         rbp | ||||||
|     ret |     ret | ||||||
| 
 | 
 | ||||||
| global sym(vp9_filter_block1d16_v8_avg_ssse3) PRIVATE | global sym(vpx_filter_block1d16_v8_avg_ssse3) PRIVATE | ||||||
| sym(vp9_filter_block1d16_v8_avg_ssse3): | sym(vpx_filter_block1d16_v8_avg_ssse3): | ||||||
|     push        rbp |     push        rbp | ||||||
|     mov         rbp, rsp |     mov         rbp, rsp | ||||||
|     SHADOW_ARGS_TO_STACK 6 |     SHADOW_ARGS_TO_STACK 6 | ||||||
| @@ -838,7 +838,7 @@ sym(vp9_filter_block1d16_v8_avg_ssse3): | |||||||
|     jnz         .loop |     jnz         .loop | ||||||
| %endm | %endm | ||||||
| 
 | 
 | ||||||
| ;void vp9_filter_block1d4_h8_ssse3 | ;void vpx_filter_block1d4_h8_ssse3 | ||||||
| ;( | ;( | ||||||
| ;    unsigned char  *src_ptr, | ;    unsigned char  *src_ptr, | ||||||
| ;    unsigned int    src_pixels_per_line, | ;    unsigned int    src_pixels_per_line, | ||||||
| @@ -847,8 +847,8 @@ sym(vp9_filter_block1d16_v8_avg_ssse3): | |||||||
| ;    unsigned int    output_height, | ;    unsigned int    output_height, | ||||||
| ;    short *filter | ;    short *filter | ||||||
| ;) | ;) | ||||||
| global sym(vp9_filter_block1d4_h8_ssse3) PRIVATE | global sym(vpx_filter_block1d4_h8_ssse3) PRIVATE | ||||||
| sym(vp9_filter_block1d4_h8_ssse3): | sym(vpx_filter_block1d4_h8_ssse3): | ||||||
|     push        rbp |     push        rbp | ||||||
|     mov         rbp, rsp |     mov         rbp, rsp | ||||||
|     SHADOW_ARGS_TO_STACK 6 |     SHADOW_ARGS_TO_STACK 6 | ||||||
| @@ -877,7 +877,7 @@ sym(vp9_filter_block1d4_h8_ssse3): | |||||||
|     pop         rbp |     pop         rbp | ||||||
|     ret |     ret | ||||||
| 
 | 
 | ||||||
| ;void vp9_filter_block1d8_h8_ssse3 | ;void vpx_filter_block1d8_h8_ssse3 | ||||||
| ;( | ;( | ||||||
| ;    unsigned char  *src_ptr, | ;    unsigned char  *src_ptr, | ||||||
| ;    unsigned int    src_pixels_per_line, | ;    unsigned int    src_pixels_per_line, | ||||||
| @@ -886,8 +886,8 @@ sym(vp9_filter_block1d4_h8_ssse3): | |||||||
| ;    unsigned int    output_height, | ;    unsigned int    output_height, | ||||||
| ;    short *filter | ;    short *filter | ||||||
| ;) | ;) | ||||||
| global sym(vp9_filter_block1d8_h8_ssse3) PRIVATE | global sym(vpx_filter_block1d8_h8_ssse3) PRIVATE | ||||||
| sym(vp9_filter_block1d8_h8_ssse3): | sym(vpx_filter_block1d8_h8_ssse3): | ||||||
|     push        rbp |     push        rbp | ||||||
|     mov         rbp, rsp |     mov         rbp, rsp | ||||||
|     SHADOW_ARGS_TO_STACK 6 |     SHADOW_ARGS_TO_STACK 6 | ||||||
| @@ -919,7 +919,7 @@ sym(vp9_filter_block1d8_h8_ssse3): | |||||||
|     pop         rbp |     pop         rbp | ||||||
|     ret |     ret | ||||||
| 
 | 
 | ||||||
| ;void vp9_filter_block1d16_h8_ssse3 | ;void vpx_filter_block1d16_h8_ssse3 | ||||||
| ;( | ;( | ||||||
| ;    unsigned char  *src_ptr, | ;    unsigned char  *src_ptr, | ||||||
| ;    unsigned int    src_pixels_per_line, | ;    unsigned int    src_pixels_per_line, | ||||||
| @@ -928,8 +928,8 @@ sym(vp9_filter_block1d8_h8_ssse3): | |||||||
| ;    unsigned int    output_height, | ;    unsigned int    output_height, | ||||||
| ;    short *filter | ;    short *filter | ||||||
| ;) | ;) | ||||||
| global sym(vp9_filter_block1d16_h8_ssse3) PRIVATE | global sym(vpx_filter_block1d16_h8_ssse3) PRIVATE | ||||||
| sym(vp9_filter_block1d16_h8_ssse3): | sym(vpx_filter_block1d16_h8_ssse3): | ||||||
|     push        rbp |     push        rbp | ||||||
|     mov         rbp, rsp |     mov         rbp, rsp | ||||||
|     SHADOW_ARGS_TO_STACK 6 |     SHADOW_ARGS_TO_STACK 6 | ||||||
| @@ -961,8 +961,8 @@ sym(vp9_filter_block1d16_h8_ssse3): | |||||||
|     pop         rbp |     pop         rbp | ||||||
|     ret |     ret | ||||||
| 
 | 
 | ||||||
| global sym(vp9_filter_block1d4_h8_avg_ssse3) PRIVATE | global sym(vpx_filter_block1d4_h8_avg_ssse3) PRIVATE | ||||||
| sym(vp9_filter_block1d4_h8_avg_ssse3): | sym(vpx_filter_block1d4_h8_avg_ssse3): | ||||||
|     push        rbp |     push        rbp | ||||||
|     mov         rbp, rsp |     mov         rbp, rsp | ||||||
|     SHADOW_ARGS_TO_STACK 6 |     SHADOW_ARGS_TO_STACK 6 | ||||||
| @@ -991,8 +991,8 @@ sym(vp9_filter_block1d4_h8_avg_ssse3): | |||||||
|     pop         rbp |     pop         rbp | ||||||
|     ret |     ret | ||||||
| 
 | 
 | ||||||
| global sym(vp9_filter_block1d8_h8_avg_ssse3) PRIVATE | global sym(vpx_filter_block1d8_h8_avg_ssse3) PRIVATE | ||||||
| sym(vp9_filter_block1d8_h8_avg_ssse3): | sym(vpx_filter_block1d8_h8_avg_ssse3): | ||||||
|     push        rbp |     push        rbp | ||||||
|     mov         rbp, rsp |     mov         rbp, rsp | ||||||
|     SHADOW_ARGS_TO_STACK 6 |     SHADOW_ARGS_TO_STACK 6 | ||||||
| @@ -1024,8 +1024,8 @@ sym(vp9_filter_block1d8_h8_avg_ssse3): | |||||||
|     pop         rbp |     pop         rbp | ||||||
|     ret |     ret | ||||||
| 
 | 
 | ||||||
| global sym(vp9_filter_block1d16_h8_avg_ssse3) PRIVATE | global sym(vpx_filter_block1d16_h8_avg_ssse3) PRIVATE | ||||||
| sym(vp9_filter_block1d16_h8_avg_ssse3): | sym(vpx_filter_block1d16_h8_avg_ssse3): | ||||||
|     push        rbp |     push        rbp | ||||||
|     mov         rbp, rsp |     mov         rbp, rsp | ||||||
|     SHADOW_ARGS_TO_STACK 6 |     SHADOW_ARGS_TO_STACK 6 | ||||||
| @@ -131,8 +131,8 @@ | |||||||
|     dec         rcx |     dec         rcx | ||||||
| %endm | %endm | ||||||
| 
 | 
 | ||||||
| global sym(vp9_filter_block1d4_v2_sse2) PRIVATE | global sym(vpx_filter_block1d4_v2_sse2) PRIVATE | ||||||
| sym(vp9_filter_block1d4_v2_sse2): | sym(vpx_filter_block1d4_v2_sse2): | ||||||
|     push        rbp |     push        rbp | ||||||
|     mov         rbp, rsp |     mov         rbp, rsp | ||||||
|     SHADOW_ARGS_TO_STACK 6 |     SHADOW_ARGS_TO_STACK 6 | ||||||
| @@ -155,8 +155,8 @@ sym(vp9_filter_block1d4_v2_sse2): | |||||||
|     pop         rbp |     pop         rbp | ||||||
|     ret |     ret | ||||||
| 
 | 
 | ||||||
| global sym(vp9_filter_block1d8_v2_sse2) PRIVATE | global sym(vpx_filter_block1d8_v2_sse2) PRIVATE | ||||||
| sym(vp9_filter_block1d8_v2_sse2): | sym(vpx_filter_block1d8_v2_sse2): | ||||||
|     push        rbp |     push        rbp | ||||||
|     mov         rbp, rsp |     mov         rbp, rsp | ||||||
|     SHADOW_ARGS_TO_STACK 6 |     SHADOW_ARGS_TO_STACK 6 | ||||||
| @@ -181,8 +181,8 @@ sym(vp9_filter_block1d8_v2_sse2): | |||||||
|     pop         rbp |     pop         rbp | ||||||
|     ret |     ret | ||||||
| 
 | 
 | ||||||
| global sym(vp9_filter_block1d16_v2_sse2) PRIVATE | global sym(vpx_filter_block1d16_v2_sse2) PRIVATE | ||||||
| sym(vp9_filter_block1d16_v2_sse2): | sym(vpx_filter_block1d16_v2_sse2): | ||||||
|     push        rbp |     push        rbp | ||||||
|     mov         rbp, rsp |     mov         rbp, rsp | ||||||
|     SHADOW_ARGS_TO_STACK 6 |     SHADOW_ARGS_TO_STACK 6 | ||||||
| @@ -209,8 +209,8 @@ sym(vp9_filter_block1d16_v2_sse2): | |||||||
|     pop         rbp |     pop         rbp | ||||||
|     ret |     ret | ||||||
| 
 | 
 | ||||||
| global sym(vp9_filter_block1d4_v2_avg_sse2) PRIVATE | global sym(vpx_filter_block1d4_v2_avg_sse2) PRIVATE | ||||||
| sym(vp9_filter_block1d4_v2_avg_sse2): | sym(vpx_filter_block1d4_v2_avg_sse2): | ||||||
|     push        rbp |     push        rbp | ||||||
|     mov         rbp, rsp |     mov         rbp, rsp | ||||||
|     SHADOW_ARGS_TO_STACK 6 |     SHADOW_ARGS_TO_STACK 6 | ||||||
| @@ -233,8 +233,8 @@ sym(vp9_filter_block1d4_v2_avg_sse2): | |||||||
|     pop         rbp |     pop         rbp | ||||||
|     ret |     ret | ||||||
| 
 | 
 | ||||||
| global sym(vp9_filter_block1d8_v2_avg_sse2) PRIVATE | global sym(vpx_filter_block1d8_v2_avg_sse2) PRIVATE | ||||||
| sym(vp9_filter_block1d8_v2_avg_sse2): | sym(vpx_filter_block1d8_v2_avg_sse2): | ||||||
|     push        rbp |     push        rbp | ||||||
|     mov         rbp, rsp |     mov         rbp, rsp | ||||||
|     SHADOW_ARGS_TO_STACK 6 |     SHADOW_ARGS_TO_STACK 6 | ||||||
| @@ -259,8 +259,8 @@ sym(vp9_filter_block1d8_v2_avg_sse2): | |||||||
|     pop         rbp |     pop         rbp | ||||||
|     ret |     ret | ||||||
| 
 | 
 | ||||||
| global sym(vp9_filter_block1d16_v2_avg_sse2) PRIVATE | global sym(vpx_filter_block1d16_v2_avg_sse2) PRIVATE | ||||||
| sym(vp9_filter_block1d16_v2_avg_sse2): | sym(vpx_filter_block1d16_v2_avg_sse2): | ||||||
|     push        rbp |     push        rbp | ||||||
|     mov         rbp, rsp |     mov         rbp, rsp | ||||||
|     SHADOW_ARGS_TO_STACK 6 |     SHADOW_ARGS_TO_STACK 6 | ||||||
| @@ -287,8 +287,8 @@ sym(vp9_filter_block1d16_v2_avg_sse2): | |||||||
|     pop         rbp |     pop         rbp | ||||||
|     ret |     ret | ||||||
| 
 | 
 | ||||||
| global sym(vp9_filter_block1d4_h2_sse2) PRIVATE | global sym(vpx_filter_block1d4_h2_sse2) PRIVATE | ||||||
| sym(vp9_filter_block1d4_h2_sse2): | sym(vpx_filter_block1d4_h2_sse2): | ||||||
|     push        rbp |     push        rbp | ||||||
|     mov         rbp, rsp |     mov         rbp, rsp | ||||||
|     SHADOW_ARGS_TO_STACK 6 |     SHADOW_ARGS_TO_STACK 6 | ||||||
| @@ -312,8 +312,8 @@ sym(vp9_filter_block1d4_h2_sse2): | |||||||
|     pop         rbp |     pop         rbp | ||||||
|     ret |     ret | ||||||
| 
 | 
 | ||||||
| global sym(vp9_filter_block1d8_h2_sse2) PRIVATE | global sym(vpx_filter_block1d8_h2_sse2) PRIVATE | ||||||
| sym(vp9_filter_block1d8_h2_sse2): | sym(vpx_filter_block1d8_h2_sse2): | ||||||
|     push        rbp |     push        rbp | ||||||
|     mov         rbp, rsp |     mov         rbp, rsp | ||||||
|     SHADOW_ARGS_TO_STACK 6 |     SHADOW_ARGS_TO_STACK 6 | ||||||
| @@ -339,8 +339,8 @@ sym(vp9_filter_block1d8_h2_sse2): | |||||||
|     pop         rbp |     pop         rbp | ||||||
|     ret |     ret | ||||||
| 
 | 
 | ||||||
| global sym(vp9_filter_block1d16_h2_sse2) PRIVATE | global sym(vpx_filter_block1d16_h2_sse2) PRIVATE | ||||||
| sym(vp9_filter_block1d16_h2_sse2): | sym(vpx_filter_block1d16_h2_sse2): | ||||||
|     push        rbp |     push        rbp | ||||||
|     mov         rbp, rsp |     mov         rbp, rsp | ||||||
|     SHADOW_ARGS_TO_STACK 6 |     SHADOW_ARGS_TO_STACK 6 | ||||||
| @@ -367,8 +367,8 @@ sym(vp9_filter_block1d16_h2_sse2): | |||||||
|     pop         rbp |     pop         rbp | ||||||
|     ret |     ret | ||||||
| 
 | 
 | ||||||
| global sym(vp9_filter_block1d4_h2_avg_sse2) PRIVATE | global sym(vpx_filter_block1d4_h2_avg_sse2) PRIVATE | ||||||
| sym(vp9_filter_block1d4_h2_avg_sse2): | sym(vpx_filter_block1d4_h2_avg_sse2): | ||||||
|     push        rbp |     push        rbp | ||||||
|     mov         rbp, rsp |     mov         rbp, rsp | ||||||
|     SHADOW_ARGS_TO_STACK 6 |     SHADOW_ARGS_TO_STACK 6 | ||||||
| @@ -392,8 +392,8 @@ sym(vp9_filter_block1d4_h2_avg_sse2): | |||||||
|     pop         rbp |     pop         rbp | ||||||
|     ret |     ret | ||||||
| 
 | 
 | ||||||
| global sym(vp9_filter_block1d8_h2_avg_sse2) PRIVATE | global sym(vpx_filter_block1d8_h2_avg_sse2) PRIVATE | ||||||
| sym(vp9_filter_block1d8_h2_avg_sse2): | sym(vpx_filter_block1d8_h2_avg_sse2): | ||||||
|     push        rbp |     push        rbp | ||||||
|     mov         rbp, rsp |     mov         rbp, rsp | ||||||
|     SHADOW_ARGS_TO_STACK 6 |     SHADOW_ARGS_TO_STACK 6 | ||||||
| @@ -419,8 +419,8 @@ sym(vp9_filter_block1d8_h2_avg_sse2): | |||||||
|     pop         rbp |     pop         rbp | ||||||
|     ret |     ret | ||||||
| 
 | 
 | ||||||
| global sym(vp9_filter_block1d16_h2_avg_sse2) PRIVATE | global sym(vpx_filter_block1d16_h2_avg_sse2) PRIVATE | ||||||
| sym(vp9_filter_block1d16_h2_avg_sse2): | sym(vpx_filter_block1d16_h2_avg_sse2): | ||||||
|     push        rbp |     push        rbp | ||||||
|     mov         rbp, rsp |     mov         rbp, rsp | ||||||
|     SHADOW_ARGS_TO_STACK 6 |     SHADOW_ARGS_TO_STACK 6 | ||||||
| @@ -109,8 +109,8 @@ | |||||||
|     dec         rcx |     dec         rcx | ||||||
| %endm | %endm | ||||||
| 
 | 
 | ||||||
| global sym(vp9_filter_block1d4_v2_ssse3) PRIVATE | global sym(vpx_filter_block1d4_v2_ssse3) PRIVATE | ||||||
| sym(vp9_filter_block1d4_v2_ssse3): | sym(vpx_filter_block1d4_v2_ssse3): | ||||||
|     push        rbp |     push        rbp | ||||||
|     mov         rbp, rsp |     mov         rbp, rsp | ||||||
|     SHADOW_ARGS_TO_STACK 6 |     SHADOW_ARGS_TO_STACK 6 | ||||||
| @@ -133,8 +133,8 @@ sym(vp9_filter_block1d4_v2_ssse3): | |||||||
|     pop         rbp |     pop         rbp | ||||||
|     ret |     ret | ||||||
| 
 | 
 | ||||||
| global sym(vp9_filter_block1d8_v2_ssse3) PRIVATE | global sym(vpx_filter_block1d8_v2_ssse3) PRIVATE | ||||||
| sym(vp9_filter_block1d8_v2_ssse3): | sym(vpx_filter_block1d8_v2_ssse3): | ||||||
|     push        rbp |     push        rbp | ||||||
|     mov         rbp, rsp |     mov         rbp, rsp | ||||||
|     SHADOW_ARGS_TO_STACK 6 |     SHADOW_ARGS_TO_STACK 6 | ||||||
| @@ -159,8 +159,8 @@ sym(vp9_filter_block1d8_v2_ssse3): | |||||||
|     pop         rbp |     pop         rbp | ||||||
|     ret |     ret | ||||||
| 
 | 
 | ||||||
| global sym(vp9_filter_block1d16_v2_ssse3) PRIVATE | global sym(vpx_filter_block1d16_v2_ssse3) PRIVATE | ||||||
| sym(vp9_filter_block1d16_v2_ssse3): | sym(vpx_filter_block1d16_v2_ssse3): | ||||||
|     push        rbp |     push        rbp | ||||||
|     mov         rbp, rsp |     mov         rbp, rsp | ||||||
|     SHADOW_ARGS_TO_STACK 6 |     SHADOW_ARGS_TO_STACK 6 | ||||||
| @@ -186,8 +186,8 @@ sym(vp9_filter_block1d16_v2_ssse3): | |||||||
|     pop         rbp |     pop         rbp | ||||||
|     ret |     ret | ||||||
| 
 | 
 | ||||||
| global sym(vp9_filter_block1d4_v2_avg_ssse3) PRIVATE | global sym(vpx_filter_block1d4_v2_avg_ssse3) PRIVATE | ||||||
| sym(vp9_filter_block1d4_v2_avg_ssse3): | sym(vpx_filter_block1d4_v2_avg_ssse3): | ||||||
|     push        rbp |     push        rbp | ||||||
|     mov         rbp, rsp |     mov         rbp, rsp | ||||||
|     SHADOW_ARGS_TO_STACK 6 |     SHADOW_ARGS_TO_STACK 6 | ||||||
| @@ -210,8 +210,8 @@ sym(vp9_filter_block1d4_v2_avg_ssse3): | |||||||
|     pop         rbp |     pop         rbp | ||||||
|     ret |     ret | ||||||
| 
 | 
 | ||||||
| global sym(vp9_filter_block1d8_v2_avg_ssse3) PRIVATE | global sym(vpx_filter_block1d8_v2_avg_ssse3) PRIVATE | ||||||
| sym(vp9_filter_block1d8_v2_avg_ssse3): | sym(vpx_filter_block1d8_v2_avg_ssse3): | ||||||
|     push        rbp |     push        rbp | ||||||
|     mov         rbp, rsp |     mov         rbp, rsp | ||||||
|     SHADOW_ARGS_TO_STACK 6 |     SHADOW_ARGS_TO_STACK 6 | ||||||
| @@ -236,8 +236,8 @@ sym(vp9_filter_block1d8_v2_avg_ssse3): | |||||||
|     pop         rbp |     pop         rbp | ||||||
|     ret |     ret | ||||||
| 
 | 
 | ||||||
| global sym(vp9_filter_block1d16_v2_avg_ssse3) PRIVATE | global sym(vpx_filter_block1d16_v2_avg_ssse3) PRIVATE | ||||||
| sym(vp9_filter_block1d16_v2_avg_ssse3): | sym(vpx_filter_block1d16_v2_avg_ssse3): | ||||||
|     push        rbp |     push        rbp | ||||||
|     mov         rbp, rsp |     mov         rbp, rsp | ||||||
|     SHADOW_ARGS_TO_STACK 6 |     SHADOW_ARGS_TO_STACK 6 | ||||||
| @@ -263,8 +263,8 @@ sym(vp9_filter_block1d16_v2_avg_ssse3): | |||||||
|     pop         rbp |     pop         rbp | ||||||
|     ret |     ret | ||||||
| 
 | 
 | ||||||
| global sym(vp9_filter_block1d4_h2_ssse3) PRIVATE | global sym(vpx_filter_block1d4_h2_ssse3) PRIVATE | ||||||
| sym(vp9_filter_block1d4_h2_ssse3): | sym(vpx_filter_block1d4_h2_ssse3): | ||||||
|     push        rbp |     push        rbp | ||||||
|     mov         rbp, rsp |     mov         rbp, rsp | ||||||
|     SHADOW_ARGS_TO_STACK 6 |     SHADOW_ARGS_TO_STACK 6 | ||||||
| @@ -288,8 +288,8 @@ sym(vp9_filter_block1d4_h2_ssse3): | |||||||
|     pop         rbp |     pop         rbp | ||||||
|     ret |     ret | ||||||
| 
 | 
 | ||||||
| global sym(vp9_filter_block1d8_h2_ssse3) PRIVATE | global sym(vpx_filter_block1d8_h2_ssse3) PRIVATE | ||||||
| sym(vp9_filter_block1d8_h2_ssse3): | sym(vpx_filter_block1d8_h2_ssse3): | ||||||
|     push        rbp |     push        rbp | ||||||
|     mov         rbp, rsp |     mov         rbp, rsp | ||||||
|     SHADOW_ARGS_TO_STACK 6 |     SHADOW_ARGS_TO_STACK 6 | ||||||
| @@ -315,8 +315,8 @@ sym(vp9_filter_block1d8_h2_ssse3): | |||||||
|     pop         rbp |     pop         rbp | ||||||
|     ret |     ret | ||||||
| 
 | 
 | ||||||
| global sym(vp9_filter_block1d16_h2_ssse3) PRIVATE | global sym(vpx_filter_block1d16_h2_ssse3) PRIVATE | ||||||
| sym(vp9_filter_block1d16_h2_ssse3): | sym(vpx_filter_block1d16_h2_ssse3): | ||||||
|     push        rbp |     push        rbp | ||||||
|     mov         rbp, rsp |     mov         rbp, rsp | ||||||
|     SHADOW_ARGS_TO_STACK 6 |     SHADOW_ARGS_TO_STACK 6 | ||||||
| @@ -342,8 +342,8 @@ sym(vp9_filter_block1d16_h2_ssse3): | |||||||
|     pop         rbp |     pop         rbp | ||||||
|     ret |     ret | ||||||
| 
 | 
 | ||||||
| global sym(vp9_filter_block1d4_h2_avg_ssse3) PRIVATE | global sym(vpx_filter_block1d4_h2_avg_ssse3) PRIVATE | ||||||
| sym(vp9_filter_block1d4_h2_avg_ssse3): | sym(vpx_filter_block1d4_h2_avg_ssse3): | ||||||
|     push        rbp |     push        rbp | ||||||
|     mov         rbp, rsp |     mov         rbp, rsp | ||||||
|     SHADOW_ARGS_TO_STACK 6 |     SHADOW_ARGS_TO_STACK 6 | ||||||
| @@ -367,8 +367,8 @@ sym(vp9_filter_block1d4_h2_avg_ssse3): | |||||||
|     pop         rbp |     pop         rbp | ||||||
|     ret |     ret | ||||||
| 
 | 
 | ||||||
| global sym(vp9_filter_block1d8_h2_avg_ssse3) PRIVATE | global sym(vpx_filter_block1d8_h2_avg_ssse3) PRIVATE | ||||||
| sym(vp9_filter_block1d8_h2_avg_ssse3): | sym(vpx_filter_block1d8_h2_avg_ssse3): | ||||||
|     push        rbp |     push        rbp | ||||||
|     mov         rbp, rsp |     mov         rbp, rsp | ||||||
|     SHADOW_ARGS_TO_STACK 6 |     SHADOW_ARGS_TO_STACK 6 | ||||||
| @@ -394,8 +394,8 @@ sym(vp9_filter_block1d8_h2_avg_ssse3): | |||||||
|     pop         rbp |     pop         rbp | ||||||
|     ret |     ret | ||||||
| 
 | 
 | ||||||
| global sym(vp9_filter_block1d16_h2_avg_ssse3) PRIVATE | global sym(vpx_filter_block1d16_h2_avg_ssse3) PRIVATE | ||||||
| sym(vp9_filter_block1d16_h2_avg_ssse3): | sym(vpx_filter_block1d16_h2_avg_ssse3): | ||||||
|     push        rbp |     push        rbp | ||||||
|     mov         rbp, rsp |     mov         rbp, rsp | ||||||
|     SHADOW_ARGS_TO_STACK 6 |     SHADOW_ARGS_TO_STACK 6 | ||||||
		Reference in New Issue
	
	Block a user
	 Zoe Liu
					Zoe Liu