vpx/vp9/common/x86/vp9_asm_stubs.c

/*
 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <assert.h>

#include "./vpx_config.h"
#include "./vp9_rtcd.h"
#include "vpx_ports/mem.h"
// Bilinear filter coefficient table: 16 rows of 8 shorts.
// Every row holds one tap value repeated four times followed by the
// complementary tap repeated four times; the two taps always sum to 128
// and the second tap grows in steps of 8 from 0 (row 0) to 120 (row 15).
// NOTE(review): the "_mmx" suffix and the four-fold replication suggest
// the rows are laid out for packed 16-bit SIMD loads - confirm against
// the consumers of this table.
#define VP9_BILINEAR_TAP_ROW(t) \
  { 128 - (t), 128 - (t), 128 - (t), 128 - (t), (t), (t), (t), (t) }
DECLARE_ALIGNED(16, const short, vp9_bilinear_filters_mmx[16][8]) = {
  VP9_BILINEAR_TAP_ROW(0),
  VP9_BILINEAR_TAP_ROW(8),
  VP9_BILINEAR_TAP_ROW(16),
  VP9_BILINEAR_TAP_ROW(24),
  VP9_BILINEAR_TAP_ROW(32),
  VP9_BILINEAR_TAP_ROW(40),
  VP9_BILINEAR_TAP_ROW(48),
  VP9_BILINEAR_TAP_ROW(56),
  VP9_BILINEAR_TAP_ROW(64),
  VP9_BILINEAR_TAP_ROW(72),
  VP9_BILINEAR_TAP_ROW(80),
  VP9_BILINEAR_TAP_ROW(88),
  VP9_BILINEAR_TAP_ROW(96),
  VP9_BILINEAR_TAP_ROW(104),
  VP9_BILINEAR_TAP_ROW(112),
  VP9_BILINEAR_TAP_ROW(120)
};
#undef VP9_BILINEAR_TAP_ROW

#if HAVE_SSSE3
// Prototypes for the SSSE3 1-D filter kernels used by the convolve wrappers
// in this file.  No definition is visible here.
// NOTE(review): presumably implemented in a separate assembly source -
// confirm where they are defined.
//
// All four share one signature: source pointer and pitch, destination
// pointer and pitch, the number of rows to produce, and the filter taps.
// The wrappers in this file advance src/dst by 16 after each "1d16" call
// and by 8 after each "1d8" call.

// Kernel used with filter_y on 16-column strips.  Callers in this file pass
// a source pointer that is already three rows above the first output row
// (src - src_stride * 3).
void vp9_filter_block1d16_v8_ssse3(const unsigned char *src_ptr,
                                   const unsigned int src_pitch,
                                   unsigned char *output_ptr,
                                   unsigned int out_pitch,
                                   unsigned int output_height,
                                   const short *filter);

// Kernel used with filter_x on 16-column strips.  Callers in this file pass
// the source pointer without any column adjustment.
void vp9_filter_block1d16_h8_ssse3(const unsigned char *src_ptr,
                                   const unsigned int src_pitch,
                                   unsigned char *output_ptr,
                                   unsigned int out_pitch,
                                   unsigned int output_height,
                                   const short *filter);

// 8-column counterpart of vp9_filter_block1d16_v8_ssse3; called the same way.
void vp9_filter_block1d8_v8_ssse3(const unsigned char *src_ptr,
                                   const unsigned int src_pitch,
                                   unsigned char *output_ptr,
                                   unsigned int out_pitch,
                                   unsigned int output_height,
                                   const short *filter);

// 8-column counterpart of vp9_filter_block1d16_h8_ssse3; called the same way.
void vp9_filter_block1d8_h8_ssse3(const unsigned char *src_ptr,
                                   const unsigned int src_pitch,
                                   unsigned char *output_ptr,
                                   unsigned int out_pitch,
                                   unsigned int output_height,
                                   const short *filter);

// Horizontal convolution of a w x h block, using the SSSE3 kernels where
// they apply and the C implementation for everything else.
//
// Fast path (x_step_q4 == 16 and filter_x[3] != 128): the block is consumed
// left to right, first in 16-column strips and then in 8-column strips.
// Whatever width is left over - or the whole block when the fast path does
// not apply - is passed to vp9_convolve8_horiz_c together with the advanced
// src/dst pointers.  filter_y and y_step_q4 are only forwarded to that
// fallback.
// NOTE(review): a tap value of 128 at index 3 presumably identifies a
// kernel the SSSE3 code cannot take (e.g. the pass-through filter) - confirm
// against the assembly.
void vp9_convolve8_horiz_ssse3(const uint8_t *src, int src_stride,
                               uint8_t *dst, int dst_stride,
                               const int16_t *filter_x, int x_step_q4,
                               const int16_t *filter_y, int y_step_q4,
                               int w, int h) {
  int remaining = w;  // columns not yet filtered

  if (x_step_q4 == 16 && filter_x[3] != 128) {
    // 16-column strips.
    for (; remaining >= 16; remaining -= 16, src += 16, dst += 16) {
      vp9_filter_block1d16_h8_ssse3(src, src_stride, dst, dst_stride,
                                    h, filter_x);
    }
    // 8-column strips.
    for (; remaining >= 8; remaining -= 8, src += 8, dst += 8) {
      vp9_filter_block1d8_h8_ssse3(src, src_stride, dst, dst_stride,
                                   h, filter_x);
    }
  }

  if (remaining == 0) {
    return;
  }

  // Leftover columns go through the generic C path.
  vp9_convolve8_horiz_c(src, src_stride, dst, dst_stride,
                        filter_x, x_step_q4, filter_y, y_step_q4,
                        remaining, h);
}

// Vertical convolution of a w x h block, using the SSSE3 kernels where they
// apply and the C implementation for everything else.
//
// Fast path (y_step_q4 == 16 and filter_y[3] != 128): the block is consumed
// left to right, first in 16-column strips and then in 8-column strips.
// Each kernel call receives a source pointer three rows above the current
// strip (src - src_stride * 3).  Whatever width is left over - or the whole
// block when the fast path does not apply - is passed to
// vp9_convolve8_vert_c with the advanced, unadjusted src/dst pointers.
// filter_x and x_step_q4 are only forwarded to that fallback.
// NOTE(review): a tap value of 128 at index 3 presumably identifies a
// kernel the SSSE3 code cannot take (e.g. the pass-through filter) - confirm
// against the assembly.
void vp9_convolve8_vert_ssse3(const uint8_t *src, int src_stride,
                              uint8_t *dst, int dst_stride,
                              const int16_t *filter_x, int x_step_q4,
                              const int16_t *filter_y, int y_step_q4,
                              int w, int h) {
  int remaining = w;  // columns not yet filtered

  if (y_step_q4 == 16 && filter_y[3] != 128) {
    // 16-column strips.
    for (; remaining >= 16; remaining -= 16, src += 16, dst += 16) {
      vp9_filter_block1d16_v8_ssse3(src - src_stride * 3, src_stride,
                                    dst, dst_stride, h, filter_y);
    }
    // 8-column strips.
    for (; remaining >= 8; remaining -= 8, src += 8, dst += 8) {
      vp9_filter_block1d8_v8_ssse3(src - src_stride * 3, src_stride,
                                   dst, dst_stride, h, filter_y);
    }
  }

  if (remaining == 0) {
    return;
  }

  // Leftover columns go through the generic C path.
  vp9_convolve8_vert_c(src, src_stride, dst, dst_stride,
                       filter_x, x_step_q4, filter_y, y_step_q4,
                       remaining, h);
}

// Two-pass (horizontal, then vertical) convolution of a w x h block.
//
// Fast path: taken only when both steps are 16, neither filter has 128 at
// tap index 3, h is at most 16, and w is exactly 16 or 8.  The horizontal
// kernel filters h + 7 rows, starting three rows above src, into the
// scratch buffer fdata2 (16 bytes per row); the vertical kernel then reads
// fdata2 and writes h rows to dst.
//
// Every other case is delegated to vp9_convolve8_c.
// NOTE(review): a tap value of 128 at index 3 presumably identifies a
// kernel the SSSE3 code cannot take (e.g. the pass-through filter) - confirm
// against the assembly.
void vp9_convolve8_ssse3(const uint8_t *src, int src_stride,
                         uint8_t *dst, int dst_stride,
                         const int16_t *filter_x, int x_step_q4,
                         const int16_t *filter_y, int y_step_q4,
                         int w, int h) {
  // Scratch for the horizontal pass: 23 rows of 16 bytes, i.e. room for
  // the largest allowed h (16) plus the 7 extra rows the fast path requests.
  DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 16*23);

  // check w/h due to fixed size fdata2 array
  assert(w <= 16);
  assert(h <= 16);

  // The asserts above disappear when NDEBUG is defined, so h is checked
  // again here: the horizontal pass writes h + 7 rows and fdata2 holds only
  // 23.  Width needs no extra guard because only w == 16 and w == 8 reach
  // the kernels.  Oversized blocks fall through to the C implementation.
  if (h <= 16 &&
      x_step_q4 == 16 && y_step_q4 == 16 &&
      filter_x[3] != 128 && filter_y[3] != 128) {
    if (w == 16) {
      vp9_filter_block1d16_h8_ssse3(src - 3 * src_stride, src_stride,
                                    fdata2, 16,
                                    h + 7, filter_x);
      vp9_filter_block1d16_v8_ssse3(fdata2, 16,
                                    dst, dst_stride,
                                    h, filter_y);
      return;
    }
    if (w == 8) {
      // Same scratch pitch (16) as the 16-wide case; only the first 8
      // bytes of each row are produced by the 8-column kernel call.
      vp9_filter_block1d8_h8_ssse3(src - 3 * src_stride, src_stride,
                                   fdata2, 16,
                                   h + 7, filter_x);
      vp9_filter_block1d8_v8_ssse3(fdata2, 16,
                                   dst, dst_stride,
                                   h, filter_y);
      return;
    }
  }
  vp9_convolve8_c(src, src_stride, dst, dst_stride,
                  filter_x, x_step_q4, filter_y, y_step_q4,
                  w, h);
}
#endif
Initial WebM release 2010-05-18 17:58:33 +02:00			`/*`
Use WebM in copyright notice for consistency Changes 'The VP8 project' to 'The WebM project', for consistency with other webmproject.org repositories. Fixes issue #97. Change-Id: I37c13ed5fbdb9d334ceef71c6350e9febed9bbba 2010-09-09 14:16:39 +02:00			`* Copyright (c) 2010 The WebM project authors. All Rights Reserved.`
Initial WebM release 2010-05-18 17:58:33 +02:00			`*`
cosmetics: trim trailing whitespace When the license headers were updated, they accidentally contained trailing whitespace, so unfortunately we have to touch all the files again. Change-Id: I236c05fade06589e417179c0444cb39b09e4200d 2010-06-18 18:39:21 +02:00			`* Use of this source code is governed by a BSD-style license`
LICENSE: update with latest text Change-Id: Ieebea089095d9073b3a94932791099f614ce120c 2010-06-04 22:19:40 +02:00			`* that can be found in the LICENSE file in the root of the source`
			`* tree. An additional intellectual property rights grant can be found`
cosmetics: trim trailing whitespace When the license headers were updated, they accidentally contained trailing whitespace, so unfortunately we have to touch all the files again. Change-Id: I236c05fade06589e417179c0444cb39b09e4200d 2010-06-18 18:39:21 +02:00			`* in the file PATENTS. All contributing project authors may`
LICENSE: update with latest text Change-Id: Ieebea089095d9073b3a94932791099f614ce120c 2010-06-04 22:19:40 +02:00			`* be found in the AUTHORS file in the root of the source tree.`
Initial WebM release 2010-05-18 17:58:33 +02:00			`*/`

Restore SSSE3 subpixel filters in new convolve framework This commit adds the 8 tap SSSE3 subpixel filters back into the code underneath the convolve API. The C code is still called for 4x4 blocks, as well as compound prediction modes. This restores the encode performance to be within about 8% of the baseline. Change-Id: Ife0d81477075ae33c05b53c65003951efdc8b09c 2013-02-08 02:00:37 +01:00			`#include <assert.h>`
Initial WebM release 2010-05-18 17:58:33 +02:00
Build fixes to merge vp9-preview into master Various fixups to resolve issues when building vp9-preview under the more stringent checks placed on the experimental branch. Change-Id: I21749de83552e1e75c799003f849e6a0f1a35b07 2012-12-23 16:20:10 +01:00			`#include "./vpx_config.h"`
Restore SSSE3 subpixel filters in new convolve framework This commit adds the 8 tap SSSE3 subpixel filters back into the code underneath the convolve API. The C code is still called for 4x4 blocks, as well as compound prediction modes. This restores the encode performance to be within about 8% of the baseline. Change-Id: Ife0d81477075ae33c05b53c65003951efdc8b09c 2013-02-08 02:00:37 +01:00			`#include "./vp9_rtcd.h"`
Initial WebM release 2010-05-18 17:58:33 +02:00			`#include "vpx_ports/mem.h"`
fixes --disable-vp9-encoder Change-Id: I467bf0fdf3b35326bcce58d5459e6d2dbfd6c5e5 2012-12-03 21:21:16 +01:00			`///////////////////////////////////////////////////////////////////////////`
			`// the mmx function that does the bilinear filtering and var calculation //`
			`// int one pass //`
			`///////////////////////////////////////////////////////////////////////////`
			`DECLARE_ALIGNED(16, const short, vp9_bilinear_filters_mmx[16][8]) = {`
			`{ 128, 128, 128, 128, 0, 0, 0, 0 },`
			`{ 120, 120, 120, 120, 8, 8, 8, 8 },`
			`{ 112, 112, 112, 112, 16, 16, 16, 16 },`
			`{ 104, 104, 104, 104, 24, 24, 24, 24 },`
			`{ 96, 96, 96, 96, 32, 32, 32, 32 },`
			`{ 88, 88, 88, 88, 40, 40, 40, 40 },`
			`{ 80, 80, 80, 80, 48, 48, 48, 48 },`
			`{ 72, 72, 72, 72, 56, 56, 56, 56 },`
			`{ 64, 64, 64, 64, 64, 64, 64, 64 },`
			`{ 56, 56, 56, 56, 72, 72, 72, 72 },`
			`{ 48, 48, 48, 48, 80, 80, 80, 80 },`
			`{ 40, 40, 40, 40, 88, 88, 88, 88 },`
			`{ 32, 32, 32, 32, 96, 96, 96, 96 },`
			`{ 24, 24, 24, 24, 104, 104, 104, 104 },`
			`{ 16, 16, 16, 16, 112, 112, 112, 112 },`
			`{ 8, 8, 8, 8, 120, 120, 120, 120 }`
			`};`
Restore SSSE3 subpixel filters in new convolve framework This commit adds the 8 tap SSSE3 subpixel filters back into the code underneath the convolve API. The C code is still called for 4x4 blocks, as well as compound prediction modes. This restores the encode performance to be within about 8% of the baseline. Change-Id: Ife0d81477075ae33c05b53c65003951efdc8b09c 2013-02-08 02:00:37 +01:00
			`#if HAVE_SSSE3`
			`void vp9_filter_block1d16_v8_ssse3(const unsigned char *src_ptr,`
			`const unsigned int src_pitch,`
			`unsigned char *output_ptr,`
			`unsigned int out_pitch,`
			`unsigned int output_height,`
			`const short *filter);`

			`void vp9_filter_block1d16_h8_ssse3(const unsigned char *src_ptr,`
			`const unsigned int src_pitch,`
			`unsigned char *output_ptr,`
			`unsigned int out_pitch,`
			`unsigned int output_height,`
			`const short *filter);`

			`void vp9_filter_block1d8_v8_ssse3(const unsigned char *src_ptr,`
			`const unsigned int src_pitch,`
			`unsigned char *output_ptr,`
			`unsigned int out_pitch,`
			`unsigned int output_height,`
			`const short *filter);`

			`void vp9_filter_block1d8_h8_ssse3(const unsigned char *src_ptr,`
			`const unsigned int src_pitch,`
			`unsigned char *output_ptr,`
			`unsigned int out_pitch,`
			`unsigned int output_height,`
			`const short *filter);`

			`void vp9_convolve8_horiz_ssse3(const uint8_t *src, int src_stride,`
			`uint8_t *dst, int dst_stride,`
			`const int16_t *filter_x, int x_step_q4,`
			`const int16_t *filter_y, int y_step_q4,`
			`int w, int h) {`
			`if (x_step_q4 == 16 && filter_x[3] != 128) {`
			`while (w >= 16) {`
			`vp9_filter_block1d16_h8_ssse3(src, src_stride,`
			`dst, dst_stride,`
			`h, filter_x);`
			`src += 16;`
			`dst += 16;`
			`w -= 16;`
			`}`
			`while (w >= 8) {`
			`vp9_filter_block1d8_h8_ssse3(src, src_stride,`
			`dst, dst_stride,`
			`h, filter_x);`
			`src += 8;`
			`dst += 8;`
			`w -= 8;`
			`}`
			`}`
			`if (w) {`
			`vp9_convolve8_horiz_c(src, src_stride, dst, dst_stride,`
			`filter_x, x_step_q4, filter_y, y_step_q4,`
			`w, h);`
			`}`
			`}`

			`void vp9_convolve8_vert_ssse3(const uint8_t *src, int src_stride,`
			`uint8_t *dst, int dst_stride,`
			`const int16_t *filter_x, int x_step_q4,`
			`const int16_t *filter_y, int y_step_q4,`
			`int w, int h) {`
			`if (y_step_q4 == 16 && filter_y[3] != 128) {`
			`while (w >= 16) {`
			`vp9_filter_block1d16_v8_ssse3(src - src_stride * 3, src_stride,`
			`dst, dst_stride,`
			`h, filter_y);`
			`src += 16;`
			`dst += 16;`
			`w -= 16;`
			`}`
			`while (w >= 8) {`
			`vp9_filter_block1d8_v8_ssse3(src - src_stride * 3, src_stride,`
			`dst, dst_stride,`
			`h, filter_y);`
			`src += 8;`
			`dst += 8;`
			`w -= 8;`
			`}`
			`}`
			`if (w) {`
			`vp9_convolve8_vert_c(src, src_stride, dst, dst_stride,`
			`filter_x, x_step_q4, filter_y, y_step_q4,`
			`w, h);`
			`}`
			`}`

			`void vp9_convolve8_ssse3(const uint8_t *src, int src_stride,`
			`uint8_t *dst, int dst_stride,`
			`const int16_t *filter_x, int x_step_q4,`
			`const int16_t *filter_y, int y_step_q4,`
			`int w, int h) {`
			`DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 16*23);`

			`// check w/h due to fixed size fdata2 array`
			`assert(w <= 16);`
			`assert(h <= 16);`

			`if (x_step_q4 == 16 && y_step_q4 == 16 &&`
			`filter_x[3] != 128 && filter_y[3] != 128) {`
			`if (w == 16) {`
			`vp9_filter_block1d16_h8_ssse3(src - 3 * src_stride, src_stride,`
			`fdata2, 16,`
			`h + 7, filter_x);`
			`vp9_filter_block1d16_v8_ssse3(fdata2, 16,`
			`dst, dst_stride,`
			`h, filter_y);`
			`return;`
			`}`
			`if (w == 8) {`
			`vp9_filter_block1d8_h8_ssse3(src - 3 * src_stride, src_stride,`
			`fdata2, 16,`
			`h + 7, filter_x);`
			`vp9_filter_block1d8_v8_ssse3(fdata2, 16,`
			`dst, dst_stride,`
			`h, filter_y);`
			`return;`
			`}`
			`}`
			`vp9_convolve8_c(src, src_stride, dst, dst_stride,`
			`filter_x, x_step_q4, filter_y, y_step_q4,`
			`w, h);`
			`}`
			`#endif`