Merge changes Ife0d8147,I7d469716,Ic9a5615f into experimental
* changes:
    Restore SSSE3 subpixel filters in new convolve framework
    Convert subpixel filters to use convolve framework
    Add 8-tap generic convolver
This commit is contained in:
		
							
								
								
									
										527
									
								
								test/convolve_test.cc
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										527
									
								
								test/convolve_test.cc
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,527 @@
 | 
				
			|||||||
 | 
					/*
 | 
				
			||||||
 | 
					 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 | 
				
			||||||
 | 
					 *
 | 
				
			||||||
 | 
					 *  Use of this source code is governed by a BSD-style license
 | 
				
			||||||
 | 
					 *  that can be found in the LICENSE file in the root of the source
 | 
				
			||||||
 | 
					 *  tree. An additional intellectual property rights grant can be found
 | 
				
			||||||
 | 
					 *  in the file PATENTS.  All contributing project authors may
 | 
				
			||||||
 | 
					 *  be found in the AUTHORS file in the root of the source tree.
 | 
				
			||||||
 | 
					 */
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					extern "C" {
 | 
				
			||||||
 | 
					#include "./vpx_config.h"
 | 
				
			||||||
 | 
					#include "./vp9_rtcd.h"
 | 
				
			||||||
 | 
					#include "vpx_mem/vpx_mem.h"
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					#include "third_party/googletest/src/include/gtest/gtest.h"
 | 
				
			||||||
 | 
					#include "test/acm_random.h"
 | 
				
			||||||
 | 
					#include "test/register_state_check.h"
 | 
				
			||||||
 | 
					#include "test/util.h"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					namespace {
 | 
				
			||||||
 | 
					typedef void (*convolve_fn_t)(const uint8_t *src, int src_stride,
 | 
				
			||||||
 | 
					                              uint8_t *dst, int dst_stride,
 | 
				
			||||||
 | 
					                              const int16_t *filter_x, int filter_x_stride,
 | 
				
			||||||
 | 
					                              const int16_t *filter_y, int filter_y_stride,
 | 
				
			||||||
 | 
					                              int w, int h);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					struct ConvolveFunctions {
 | 
				
			||||||
 | 
					  ConvolveFunctions(convolve_fn_t h8, convolve_fn_t h8_avg,
 | 
				
			||||||
 | 
					                    convolve_fn_t v8, convolve_fn_t v8_avg,
 | 
				
			||||||
 | 
					                    convolve_fn_t hv8, convolve_fn_t hv8_avg)
 | 
				
			||||||
 | 
					      : h8_(h8), v8_(v8), hv8_(hv8), h8_avg_(h8_avg), v8_avg_(v8_avg),
 | 
				
			||||||
 | 
					        hv8_avg_(hv8_avg) {}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  convolve_fn_t h8_;
 | 
				
			||||||
 | 
					  convolve_fn_t v8_;
 | 
				
			||||||
 | 
					  convolve_fn_t hv8_;
 | 
				
			||||||
 | 
					  convolve_fn_t h8_avg_;
 | 
				
			||||||
 | 
					  convolve_fn_t v8_avg_;
 | 
				
			||||||
 | 
					  convolve_fn_t hv8_avg_;
 | 
				
			||||||
 | 
					};
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					// Reference 8-tap subpixel filter, slightly modified to fit into this test.
 | 
				
			||||||
 | 
					#define VP9_FILTER_WEIGHT 128
 | 
				
			||||||
 | 
					#define VP9_FILTER_SHIFT 7
 | 
				
			||||||
 | 
					// Saturates an integer to the 8-bit pixel range [0, 255].
					static uint8_t clip_pixel(int x) {
					  if (x < 0) {
					    return 0;
					  }
					  if (x > 255) {
					    return 255;
					  }
					  return (uint8_t)x;
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					static void filter_block2d_8_c(const uint8_t *src_ptr,
 | 
				
			||||||
 | 
					                               const unsigned int src_stride,
 | 
				
			||||||
 | 
					                               const int16_t *HFilter,
 | 
				
			||||||
 | 
					                               const int16_t *VFilter,
 | 
				
			||||||
 | 
					                               uint8_t *dst_ptr,
 | 
				
			||||||
 | 
					                               unsigned int dst_stride,
 | 
				
			||||||
 | 
					                               unsigned int output_width,
 | 
				
			||||||
 | 
					                               unsigned int output_height) {
 | 
				
			||||||
 | 
					  // Between passes, we use an intermediate buffer whose height is extended to
 | 
				
			||||||
 | 
					  // have enough horizontally filtered values as input for the vertical pass.
 | 
				
			||||||
 | 
					  // This buffer is allocated to be big enough for the largest block type we
 | 
				
			||||||
 | 
					  // support.
 | 
				
			||||||
 | 
					  const int kInterp_Extend = 4;
 | 
				
			||||||
 | 
					  const unsigned int intermediate_height =
 | 
				
			||||||
 | 
					    (kInterp_Extend - 1) +     output_height + kInterp_Extend;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  /* Size of intermediate_buffer is max_intermediate_height * filter_max_width,
 | 
				
			||||||
 | 
					   * where max_intermediate_height = (kInterp_Extend - 1) + filter_max_height
 | 
				
			||||||
 | 
					   *                                 + kInterp_Extend
 | 
				
			||||||
 | 
					   *                               = 3 + 16 + 4
 | 
				
			||||||
 | 
					   *                               = 23
 | 
				
			||||||
 | 
					   * and filter_max_width = 16
 | 
				
			||||||
 | 
					   */
 | 
				
			||||||
 | 
					  uint8_t intermediate_buffer[23 * 16];
 | 
				
			||||||
 | 
					  const int intermediate_next_stride = 1 - intermediate_height * output_width;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  // Horizontal pass (src -> transposed intermediate).
 | 
				
			||||||
 | 
					  {
 | 
				
			||||||
 | 
					    uint8_t *output_ptr = intermediate_buffer;
 | 
				
			||||||
 | 
					    const int src_next_row_stride = src_stride - output_width;
 | 
				
			||||||
 | 
					    unsigned int i, j;
 | 
				
			||||||
 | 
					    src_ptr -= (kInterp_Extend - 1) * src_stride + (kInterp_Extend - 1);
 | 
				
			||||||
 | 
					    for (i = 0; i < intermediate_height; ++i) {
 | 
				
			||||||
 | 
					      for (j = 0; j < output_width; ++j) {
 | 
				
			||||||
 | 
					        // Apply filter...
 | 
				
			||||||
 | 
					        int temp = ((int)src_ptr[0] * HFilter[0]) +
 | 
				
			||||||
 | 
					                   ((int)src_ptr[1] * HFilter[1]) +
 | 
				
			||||||
 | 
					                   ((int)src_ptr[2] * HFilter[2]) +
 | 
				
			||||||
 | 
					                   ((int)src_ptr[3] * HFilter[3]) +
 | 
				
			||||||
 | 
					                   ((int)src_ptr[4] * HFilter[4]) +
 | 
				
			||||||
 | 
					                   ((int)src_ptr[5] * HFilter[5]) +
 | 
				
			||||||
 | 
					                   ((int)src_ptr[6] * HFilter[6]) +
 | 
				
			||||||
 | 
					                   ((int)src_ptr[7] * HFilter[7]) +
 | 
				
			||||||
 | 
					                   (VP9_FILTER_WEIGHT >> 1);  // Rounding
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        // Normalize back to 0-255...
 | 
				
			||||||
 | 
					        *output_ptr = clip_pixel(temp >> VP9_FILTER_SHIFT);
 | 
				
			||||||
 | 
					        ++src_ptr;
 | 
				
			||||||
 | 
					        output_ptr += intermediate_height;
 | 
				
			||||||
 | 
					      }
 | 
				
			||||||
 | 
					      src_ptr += src_next_row_stride;
 | 
				
			||||||
 | 
					      output_ptr += intermediate_next_stride;
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  // Vertical pass (transposed intermediate -> dst).
 | 
				
			||||||
 | 
					  {
 | 
				
			||||||
 | 
					    uint8_t *src_ptr = intermediate_buffer;
 | 
				
			||||||
 | 
					    const int dst_next_row_stride = dst_stride - output_width;
 | 
				
			||||||
 | 
					    unsigned int i, j;
 | 
				
			||||||
 | 
					    for (i = 0; i < output_height; ++i) {
 | 
				
			||||||
 | 
					      for (j = 0; j < output_width; ++j) {
 | 
				
			||||||
 | 
					        // Apply filter...
 | 
				
			||||||
 | 
					        int temp = ((int)src_ptr[0] * VFilter[0]) +
 | 
				
			||||||
 | 
					                   ((int)src_ptr[1] * VFilter[1]) +
 | 
				
			||||||
 | 
					                   ((int)src_ptr[2] * VFilter[2]) +
 | 
				
			||||||
 | 
					                   ((int)src_ptr[3] * VFilter[3]) +
 | 
				
			||||||
 | 
					                   ((int)src_ptr[4] * VFilter[4]) +
 | 
				
			||||||
 | 
					                   ((int)src_ptr[5] * VFilter[5]) +
 | 
				
			||||||
 | 
					                   ((int)src_ptr[6] * VFilter[6]) +
 | 
				
			||||||
 | 
					                   ((int)src_ptr[7] * VFilter[7]) +
 | 
				
			||||||
 | 
					                   (VP9_FILTER_WEIGHT >> 1);  // Rounding
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        // Normalize back to 0-255...
 | 
				
			||||||
 | 
					        *dst_ptr++ = clip_pixel(temp >> VP9_FILTER_SHIFT);
 | 
				
			||||||
 | 
					        src_ptr += intermediate_height;
 | 
				
			||||||
 | 
					      }
 | 
				
			||||||
 | 
					      src_ptr += intermediate_next_stride;
 | 
				
			||||||
 | 
					      dst_ptr += dst_next_row_stride;
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					// Blends src into the destination in place: every destination pixel becomes
					// (dst + src + 1) >> 1, i.e. the mean of the two rounded up.
					static void block2d_average_c(uint8_t *src,
					                              unsigned int src_stride,
					                              uint8_t *output_ptr,
					                              unsigned int output_stride,
					                              unsigned int output_width,
					                              unsigned int output_height) {
					  for (unsigned int row = 0; row < output_height; ++row) {
					    const uint8_t *const src_row = src + row * src_stride;
					    uint8_t *const out_row = output_ptr + row * output_stride;

					    for (unsigned int col = 0; col < output_width; ++col) {
					      const int sum = out_row[col] + src_row[col] + 1;
					      out_row[col] = (uint8_t)(sum >> 1);
					    }
					  }
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					static void filter_average_block2d_8_c(const uint8_t *src_ptr,
 | 
				
			||||||
 | 
					                                       const unsigned int src_stride,
 | 
				
			||||||
 | 
					                                       const int16_t *HFilter,
 | 
				
			||||||
 | 
					                                       const int16_t *VFilter,
 | 
				
			||||||
 | 
					                                       uint8_t *dst_ptr,
 | 
				
			||||||
 | 
					                                       unsigned int dst_stride,
 | 
				
			||||||
 | 
					                                       unsigned int output_width,
 | 
				
			||||||
 | 
					                                       unsigned int output_height) {
 | 
				
			||||||
 | 
					  uint8_t tmp[16*16];
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  assert(output_width <= 16);
 | 
				
			||||||
 | 
					  assert(output_height <= 16);
 | 
				
			||||||
 | 
					  filter_block2d_8_c(src_ptr, src_stride, HFilter, VFilter, tmp, 16,
 | 
				
			||||||
 | 
					                     output_width, output_height);
 | 
				
			||||||
 | 
					  block2d_average_c(tmp, 16, dst_ptr, dst_stride,
 | 
				
			||||||
 | 
					                    output_width, output_height);
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					// Fixture parameterized on (block width, block height, functions under test).
					//
					// input_ and output_ are kOuterBlockSize x kOuterBlockSize byte planes shared
					// by every test in the case. The block being filtered lies inside them; the
					// remainder of output_ is filled with 255 and acts as a guard area that the
					// functions under test must not modify.
					class ConvolveTest : public PARAMS(int, int, const ConvolveFunctions*) {
					 public:
					  // Allocates the two shared planes once per test case.
					  static void SetUpTestCase() {
					    // Force input_ to be unaligned, output to be 16 byte aligned.
					    input_ = reinterpret_cast<uint8_t*>(
					        vpx_memalign(kDataAlignment, kOuterBlockSize * kOuterBlockSize + 1))
					        + 1;
					    output_ = reinterpret_cast<uint8_t*>(
					        vpx_memalign(kDataAlignment, kOuterBlockSize * kOuterBlockSize));
					  }

					  // Releases the shared planes.
					  static void TearDownTestCase() {
					    // input_ points one byte past the start of its allocation.
					    vpx_free(input_ - 1);
					    input_ = NULL;
					    vpx_free(output_);
					    output_ = NULL;
					  }

					 protected:
					  static const int kDataAlignment = 16;
					  static const int kOuterBlockSize = 32;
					  static const int kInputStride = kOuterBlockSize;
					  static const int kOutputStride = kOuterBlockSize;
					  static const int kMaxDimension = 16;

					  int Width() const { return GET_PARAM(0); }
					  int Height() const { return GET_PARAM(1); }

					  // Left margin of the inner block: the centering offset rounded up to a
					  // multiple of kDataAlignment.
					  int BorderLeft() const {
					    const int center = (kOuterBlockSize - Width()) / 2;
					    return (center + (kDataAlignment - 1)) & ~(kDataAlignment - 1);
					  }

					  // Top margin of the inner block (vertically centered).
					  int BorderTop() const { return (kOuterBlockSize - Height()) / 2; }

					  // Returns true when linear plane index i lies outside the inner block,
					  // i.e. in a row above/below it or a column left/right of it.
					  bool IsIndexInBorder(int i) const {
					    return (i < BorderTop() * kOuterBlockSize ||
					            i >= (BorderTop() + Height()) * kOuterBlockSize ||
					            i % kOuterBlockSize < BorderLeft() ||
					            i % kOuterBlockSize >= (BorderLeft() + Width()));
					  }

					  // Resets both planes before every test: guard pattern in output_, random
					  // data in input_.
					  virtual void SetUp() {
					    UUT_ = GET_PARAM(2);
					    // input_ is a pointer, so sizeof(input_) would only be the size of the
					    // pointer itself; pass the plane size explicitly.
					    memset(input_, 0, kOuterBlockSize * kOuterBlockSize);
					    /* Set up guard blocks for an inner block centered in the outer block */
					    for (int i = 0; i < kOuterBlockSize * kOuterBlockSize; ++i) {
					      if (IsIndexInBorder(i))
					        output_[i] = 255;
					      else
					        output_[i] = 0;
					    }

					    ::libvpx_test::ACMRandom prng;
					    for (int i = 0; i < kOuterBlockSize * kOuterBlockSize; ++i)
					      input_[i] = prng.Rand8();
					  }

					  // Expects every guard byte around the inner block of output_ to still
					  // hold 255.
					  void CheckGuardBlocks() {
					    for (int i = 0; i < kOuterBlockSize * kOuterBlockSize; ++i) {
					      if (IsIndexInBorder(i))
					        EXPECT_EQ(255, output_[i]);
					    }
					  }

					  // Top-left pixel of the inner block within input_.
					  uint8_t* input() {
					    return input_ + BorderTop() * kOuterBlockSize + BorderLeft();
					  }

					  // Top-left pixel of the inner block within output_.
					  uint8_t* output() {
					    return output_ + BorderTop() * kOuterBlockSize + BorderLeft();
					  }

					  const ConvolveFunctions* UUT_;  // Functions under test (param 2).
					  static uint8_t* input_;
					  static uint8_t* output_;
					};
 | 
				
			||||||
 | 
					uint8_t* ConvolveTest::input_ = NULL;
 | 
				
			||||||
 | 
					uint8_t* ConvolveTest::output_ = NULL;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					// Sanity check of the fixture itself: the guard pattern must be intact
					// before any function under test has been called.
					TEST_P(ConvolveTest, GuardBlocks) {
					  this->CheckGuardBlocks();
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					// With a kernel whose only non-zero tap is 128 at index 3, h8_ must
					// reproduce the input block exactly and leave the guard area untouched.
					TEST_P(ConvolveTest, CopyHoriz) {
					  const int16_t kCopyKernel[8] = {0, 0, 0, 128, 0, 0, 0, 0};
					  uint8_t* const in = input();
					  uint8_t* const out = output();

					  REGISTER_STATE_CHECK(
					      UUT_->h8_(in, kInputStride, out, kOutputStride,
					                kCopyKernel, 16, kCopyKernel, 16,
					                Width(), Height()));

					  CheckGuardBlocks();

					  for (int y = 0; y < Height(); ++y) {
					    for (int x = 0; x < Width(); ++x) {
					      ASSERT_EQ(out[y * kOutputStride + x], in[y * kInputStride + x])
					          << "(" << x << "," << y << ")";
					    }
					  }
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					// With a kernel whose only non-zero tap is 128 at index 3, v8_ must
					// reproduce the input block exactly and leave the guard area untouched.
					TEST_P(ConvolveTest, CopyVert) {
					  const int16_t kCopyKernel[8] = {0, 0, 0, 128, 0, 0, 0, 0};
					  uint8_t* const in = input();
					  uint8_t* const out = output();

					  REGISTER_STATE_CHECK(
					      UUT_->v8_(in, kInputStride, out, kOutputStride,
					                kCopyKernel, 16, kCopyKernel, 16,
					                Width(), Height()));

					  CheckGuardBlocks();

					  for (int y = 0; y < Height(); ++y) {
					    for (int x = 0; x < Width(); ++x) {
					      ASSERT_EQ(out[y * kOutputStride + x], in[y * kInputStride + x])
					          << "(" << x << "," << y << ")";
					    }
					  }
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					// With a kernel whose only non-zero tap is 128 at index 3 in both
					// directions, hv8_ must reproduce the input block exactly and leave the
					// guard area untouched.
					TEST_P(ConvolveTest, Copy2D) {
					  const int16_t kCopyKernel[8] = {0, 0, 0, 128, 0, 0, 0, 0};
					  uint8_t* const in = input();
					  uint8_t* const out = output();

					  REGISTER_STATE_CHECK(
					      UUT_->hv8_(in, kInputStride, out, kOutputStride,
					                 kCopyKernel, 16, kCopyKernel, 16,
					                 Width(), Height()));

					  CheckGuardBlocks();

					  for (int y = 0; y < Height(); ++y) {
					    for (int x = 0; x < Width(); ++x) {
					      ASSERT_EQ(out[y * kOutputStride + x], in[y * kInputStride + x])
					          << "(" << x << "," << y << ")";
					    }
					  }
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					// For every (horizontal, vertical) pair from a 16-entry kernel table, the
					// function under test must match the reference filter_block2d_8_c output
					// pixel for pixel. Index 0 of the table is the copy kernel, so:
					//   both indices non-zero  -> hv8_
					//   only filter_y non-zero -> v8_
					//   otherwise              -> h8_
					TEST_P(ConvolveTest, MatchesReferenceSubpixelFilter) {
					  const int16_t filters[][8] = {
					    { 0,   0,   0, 128,   0,   0,   0,  0},
					    { 0,   1,  -5, 126,   8,  -3,   1,  0},
					    { -1,   3, -10, 122,  18,  -6,   2,  0},
					    { -1,   4, -13, 118,  27,  -9,   3, -1},
					    { -1,   4, -16, 112,  37, -11,   4, -1},
					    { -1,   5, -18, 105,  48, -14,   4, -1},
					    { -1,   5, -19,  97,  58, -16,   5, -1},
					    { -1,   6, -19,  88,  68, -18,   5, -1},
					    { -1,   6, -19,  78,  78, -19,   6, -1},
					    { -1,   5, -18,  68,  88, -19,   6, -1},
					    { -1,   5, -16,  58,  97, -19,   5, -1},
					    { -1,   4, -14,  48, 105, -18,   5, -1},
					    { -1,   4, -11,  37, 112, -16,   4, -1},
					    { -1,   3,  -9,  27, 118, -13,   4, -1},
					    { 0,   2,  -6,  18, 122, -10,   3, -1},
					    { 0,   1,  -3,   8, 126,  -5,   1,  0}
					  };
					  const int kNumFilters = sizeof(filters) / sizeof(*filters);

					  uint8_t* const in = input();
					  uint8_t* const out = output();
					  uint8_t ref[kOutputStride * kMaxDimension];

					  for (int filter_x = 0; filter_x < kNumFilters; ++filter_x) {
					    for (int filter_y = 0; filter_y < kNumFilters; ++filter_y) {
					      const int16_t* const kernel_x = filters[filter_x];
					      const int16_t* const kernel_y = filters[filter_y];

					      // Expected result from the reference implementation.
					      filter_block2d_8_c(in, kInputStride, kernel_x, kernel_y,
					                         ref, kOutputStride, Width(), Height());

					      // Pick the entry point matching the non-trivial directions.
					      convolve_fn_t convolve = UUT_->h8_;
					      if (filter_x && filter_y) {
					        convolve = UUT_->hv8_;
					      } else if (filter_y) {
					        convolve = UUT_->v8_;
					      }

					      REGISTER_STATE_CHECK(
					          convolve(in, kInputStride, out, kOutputStride,
					                   kernel_x, 16, kernel_y, 16,
					                   Width(), Height()));

					      CheckGuardBlocks();

					      for (int y = 0; y < Height(); ++y) {
					        for (int x = 0; x < Width(); ++x) {
					          ASSERT_EQ(ref[y * kOutputStride + x], out[y * kOutputStride + x])
					              << "mismatch at (" << x << "," << y << "), "
					              << "filters (" << filter_x << "," << filter_y << ")";
					        }
					      }
					    }
					  }
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					// Averaging counterpart of the reference comparison: ref and out start from
					// the same random block, then for every kernel pair both are updated (ref by
					// filter_average_block2d_8_c, out by the function under test) and must stay
					// identical. Dispatch mirrors the non-averaging test:
					//   both indices non-zero  -> hv8_avg_
					//   only filter_y non-zero -> v8_avg_
					//   otherwise              -> h8_avg_
					TEST_P(ConvolveTest, MatchesReferenceAveragingSubpixelFilter) {
					  uint8_t* const in = input();
					  uint8_t* const out = output();
					  uint8_t ref[kOutputStride * kMaxDimension];

					  // Populate ref and out with some random data
					  ::libvpx_test::ACMRandom prng;
					  for (int y = 0; y < Height(); ++y) {
					    for (int x = 0; x < Width(); ++x) {
					      const uint8_t r = prng.Rand8();
					      const int offset = y * kOutputStride + x;

					      out[offset] = r;
					      ref[offset] = r;
					    }
					  }

					  const int16_t filters[][8] = {
					    { 0,   0,   0, 128,   0,   0,   0,  0},
					    { 0,   1,  -5, 126,   8,  -3,   1,  0},
					    { -1,   3, -10, 122,  18,  -6,   2,  0},
					    { -1,   4, -13, 118,  27,  -9,   3, -1},
					    { -1,   4, -16, 112,  37, -11,   4, -1},
					    { -1,   5, -18, 105,  48, -14,   4, -1},
					    { -1,   5, -19,  97,  58, -16,   5, -1},
					    { -1,   6, -19,  88,  68, -18,   5, -1},
					    { -1,   6, -19,  78,  78, -19,   6, -1},
					    { -1,   5, -18,  68,  88, -19,   6, -1},
					    { -1,   5, -16,  58,  97, -19,   5, -1},
					    { -1,   4, -14,  48, 105, -18,   5, -1},
					    { -1,   4, -11,  37, 112, -16,   4, -1},
					    { -1,   3,  -9,  27, 118, -13,   4, -1},
					    { 0,   2,  -6,  18, 122, -10,   3, -1},
					    { 0,   1,  -3,   8, 126,  -5,   1,  0}
					  };
					  const int kNumFilters = sizeof(filters) / sizeof(*filters);

					  for (int filter_x = 0; filter_x < kNumFilters; ++filter_x) {
					    for (int filter_y = 0; filter_y < kNumFilters; ++filter_y) {
					      const int16_t* const kernel_x = filters[filter_x];
					      const int16_t* const kernel_y = filters[filter_y];

					      // Expected result from the reference implementation.
					      filter_average_block2d_8_c(in, kInputStride, kernel_x, kernel_y,
					                                 ref, kOutputStride, Width(), Height());

					      // Pick the entry point matching the non-trivial directions.
					      convolve_fn_t convolve_avg = UUT_->h8_avg_;
					      if (filter_x && filter_y) {
					        convolve_avg = UUT_->hv8_avg_;
					      } else if (filter_y) {
					        convolve_avg = UUT_->v8_avg_;
					      }

					      REGISTER_STATE_CHECK(
					          convolve_avg(in, kInputStride, out, kOutputStride,
					                       kernel_x, 16, kernel_y, 16,
					                       Width(), Height()));

					      CheckGuardBlocks();

					      for (int y = 0; y < Height(); ++y) {
					        for (int x = 0; x < Width(); ++x) {
					          ASSERT_EQ(ref[y * kOutputStride + x], out[y * kOutputStride + x])
					              << "mismatch at (" << x << "," << y << "), "
					              << "filters (" << filter_x << "," << filter_y << ")";
					        }
					      }
					    }
					  }
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					// Calls the convolvers with a filter step of 17 instead of 16 on a table in
					// which each successive kernel has its single 128 tap one position earlier.
					// The test expects the first 8 outputs along the stepped direction (4 when
					// that dimension is 4) to all equal the same input sample.
					TEST_P(ConvolveTest, ChangeFilterWorks) {
					  const int16_t filters[][8] = {
					    { 0,   0,   0,   0,   0,   0,   0, 128},
					    { 0,   0,   0,   0,   0,   0, 128},
					    { 0,   0,   0,   0,   0, 128},
					    { 0,   0,   0,   0, 128},
					    { 0,   0,   0, 128},
					    { 0,   0, 128},
					    { 0, 128},
					    { 128},
					    { 0,   0,   0,   0,   0,   0,   0, 128},
					    { 0,   0,   0,   0,   0,   0, 128},
					    { 0,   0,   0,   0,   0, 128},
					    { 0,   0,   0,   0, 128},
					    { 0,   0,   0, 128},
					    { 0,   0, 128},
					    { 0, 128},
					    { 128},
					    { 0,   0,   0,   0,   0,   0,   0, 128},
					    { 0,   0,   0,   0,   0,   0, 128},
					    { 0,   0,   0,   0,   0, 128},
					    { 0,   0,   0,   0, 128},
					    { 0,   0,   0, 128},
					    { 0,   0, 128},
					    { 0, 128},
					    { 128},
					  };

					  uint8_t* const in = input();
					  uint8_t* const out = output();

					  // Number of leading columns / rows inspected after each call.
					  const int cols_to_check = Width() > 4 ? 8 : 4;
					  const int rows_to_check = Height() > 4 ? 8 : 4;

					  // Horizontal: stepped x kernel, fixed y kernel.
					  REGISTER_STATE_CHECK(UUT_->h8_(in, kInputStride, out, kOutputStride,
					                                 filters[0], 17, filters[4], 16,
					                                 Width(), Height()));

					  for (int x = 0; x < cols_to_check; ++x) {
					    ASSERT_EQ(in[4], out[x]) << "x == " << x;
					  }

					  // Vertical: fixed x kernel, stepped y kernel.
					  REGISTER_STATE_CHECK(UUT_->v8_(in, kInputStride, out, kOutputStride,
					                                 filters[4], 16, filters[0], 17,
					                                 Width(), Height()));

					  for (int y = 0; y < rows_to_check; ++y) {
					    ASSERT_EQ(in[4 * kInputStride], out[y * kOutputStride]) << "y == " << y;
					  }

					  // 2-D: both kernels stepped.
					  REGISTER_STATE_CHECK(UUT_->hv8_(in, kInputStride, out, kOutputStride,
					                                  filters[0], 17, filters[0], 17,
					                                  Width(), Height()));

					  for (int y = 0; y < rows_to_check; ++y) {
					    for (int x = 0; x < cols_to_check; ++x) {
					      ASSERT_EQ(in[4 * kInputStride + 4], out[y * kOutputStride + x])
					          << "x == " << x << ", y == " << y;
					    }
					  }
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					using std::tr1::make_tuple;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					const ConvolveFunctions convolve8_2d_only_c(
 | 
				
			||||||
 | 
					    vp9_convolve8_c, vp9_convolve8_avg_c,
 | 
				
			||||||
 | 
					    vp9_convolve8_c, vp9_convolve8_avg_c,
 | 
				
			||||||
 | 
					    vp9_convolve8_c, vp9_convolve8_avg_c);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					const ConvolveFunctions convolve8_c(
 | 
				
			||||||
 | 
					    vp9_convolve8_horiz_c, vp9_convolve8_avg_horiz_c,
 | 
				
			||||||
 | 
					    vp9_convolve8_vert_c, vp9_convolve8_avg_vert_c,
 | 
				
			||||||
 | 
					    vp9_convolve8_c, vp9_convolve8_avg_c);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					INSTANTIATE_TEST_CASE_P(C, ConvolveTest, ::testing::Values(
 | 
				
			||||||
 | 
					    make_tuple(4, 4, &convolve8_2d_only_c),
 | 
				
			||||||
 | 
					    make_tuple(8, 4, &convolve8_2d_only_c),
 | 
				
			||||||
 | 
					    make_tuple(8, 8, &convolve8_2d_only_c),
 | 
				
			||||||
 | 
					    make_tuple(16, 16, &convolve8_2d_only_c),
 | 
				
			||||||
 | 
					    make_tuple(4, 4, &convolve8_c),
 | 
				
			||||||
 | 
					    make_tuple(8, 4, &convolve8_c),
 | 
				
			||||||
 | 
					    make_tuple(8, 8, &convolve8_c),
 | 
				
			||||||
 | 
					    make_tuple(16, 16, &convolve8_c)));
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#if HAVE_SSSE3
 | 
				
			||||||
 | 
					const ConvolveFunctions convolve8_ssse3(
 | 
				
			||||||
 | 
					    vp9_convolve8_horiz_ssse3, vp9_convolve8_avg_horiz_c,
 | 
				
			||||||
 | 
					    vp9_convolve8_vert_ssse3, vp9_convolve8_avg_vert_c,
 | 
				
			||||||
 | 
					    vp9_convolve8_ssse3, vp9_convolve8_avg_c);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					INSTANTIATE_TEST_CASE_P(SSSE3, ConvolveTest, ::testing::Values(
 | 
				
			||||||
 | 
					    make_tuple(4, 4, &convolve8_ssse3),
 | 
				
			||||||
 | 
					    make_tuple(8, 4, &convolve8_ssse3),
 | 
				
			||||||
 | 
					    make_tuple(8, 8, &convolve8_ssse3),
 | 
				
			||||||
 | 
					    make_tuple(16, 16, &convolve8_ssse3)));
 | 
				
			||||||
 | 
					#endif
 | 
				
			||||||
@@ -70,6 +70,7 @@ LIBVPX_TEST_SRCS-yes                   += idct8x8_test.cc
 | 
				
			|||||||
LIBVPX_TEST_SRCS-yes                   += tile_independence_test.cc
 | 
					LIBVPX_TEST_SRCS-yes                   += tile_independence_test.cc
 | 
				
			||||||
endif
 | 
					endif
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					LIBVPX_TEST_SRCS-$(CONFIG_VP9)         += convolve_test.cc
 | 
				
			||||||
LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += fdct4x4_test.cc
 | 
					LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += fdct4x4_test.cc
 | 
				
			||||||
LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += fdct8x8_test.cc
 | 
					LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += fdct8x8_test.cc
 | 
				
			||||||
LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += dct16x16_test.cc
 | 
					LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += dct16x16_test.cc
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -11,8 +11,6 @@
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
#include "./vpx_config.h"
 | 
					#include "./vpx_config.h"
 | 
				
			||||||
#include "vp9_rtcd.h"
 | 
					#include "vp9_rtcd.h"
 | 
				
			||||||
#include "vp9/common/vp9_subpixel.h"
 | 
					 | 
				
			||||||
#include "vp9/common/vp9_loopfilter.h"
 | 
					 | 
				
			||||||
#include "vp9/common/vp9_onyxc_int.h"
 | 
					#include "vp9/common/vp9_onyxc_int.h"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
void vp9_machine_specific_config(VP9_COMMON *ctx) {
 | 
					void vp9_machine_specific_config(VP9_COMMON *ctx) {
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -8,7 +8,6 @@
 | 
				
			|||||||
 *  be found in the AUTHORS file in the root of the source tree.
 | 
					 *  be found in the AUTHORS file in the root of the source tree.
 | 
				
			||||||
 */
 | 
					 */
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#include "vp9/common/vp9_subpixel.h"
 | 
					 | 
				
			||||||
#include "vp9/common/vp9_loopfilter.h"
 | 
					#include "vp9/common/vp9_loopfilter.h"
 | 
				
			||||||
#include "recon.h"
 | 
					#include "recon.h"
 | 
				
			||||||
#include "vp9/common/vp9_onyxc_int.h"
 | 
					#include "vp9/common/vp9_onyxc_int.h"
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -16,9 +16,9 @@ void vpx_log(const char *format, ...);
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
#include "./vpx_config.h"
 | 
					#include "./vpx_config.h"
 | 
				
			||||||
#include "vpx_scale/yv12config.h"
 | 
					#include "vpx_scale/yv12config.h"
 | 
				
			||||||
 | 
					#include "vp9/common/vp9_convolve.h"
 | 
				
			||||||
#include "vp9/common/vp9_mv.h"
 | 
					#include "vp9/common/vp9_mv.h"
 | 
				
			||||||
#include "vp9/common/vp9_treecoder.h"
 | 
					#include "vp9/common/vp9_treecoder.h"
 | 
				
			||||||
#include "vp9/common/vp9_subpixel.h"
 | 
					 | 
				
			||||||
#include "vpx_ports/mem.h"
 | 
					#include "vpx_ports/mem.h"
 | 
				
			||||||
#include "vp9/common/vp9_common.h"
 | 
					#include "vp9/common/vp9_common.h"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -394,15 +394,8 @@ typedef struct macroblockd {
 | 
				
			|||||||
  void (*inv_walsh4x4_1)(int16_t *in, int16_t *out);
 | 
					  void (*inv_walsh4x4_1)(int16_t *in, int16_t *out);
 | 
				
			||||||
  void (*inv_walsh4x4_lossless)(int16_t *in, int16_t *out);
 | 
					  void (*inv_walsh4x4_lossless)(int16_t *in, int16_t *out);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  struct subpix_fn_table  subpix;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  vp9_subpix_fn_t  subpixel_predict4x4;
 | 
					 | 
				
			||||||
  vp9_subpix_fn_t  subpixel_predict8x4;
 | 
					 | 
				
			||||||
  vp9_subpix_fn_t  subpixel_predict8x8;
 | 
					 | 
				
			||||||
  vp9_subpix_fn_t  subpixel_predict16x16;
 | 
					 | 
				
			||||||
  vp9_subpix_fn_t  subpixel_predict_avg4x4;
 | 
					 | 
				
			||||||
  vp9_subpix_fn_t  subpixel_predict_avg8x4;
 | 
					 | 
				
			||||||
  vp9_subpix_fn_t  subpixel_predict_avg8x8;
 | 
					 | 
				
			||||||
  vp9_subpix_fn_t  subpixel_predict_avg16x16;
 | 
					 | 
				
			||||||
  int allow_high_precision_mv;
 | 
					  int allow_high_precision_mv;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  int corrupted;
 | 
					  int corrupted;
 | 
				
			||||||
 
 | 
				
			|||||||
							
								
								
									
										345
									
								
								vp9/common/vp9_convolve.c
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										345
									
								
								vp9/common/vp9_convolve.c
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,345 @@
 | 
				
			|||||||
 | 
					/*
 | 
				
			||||||
 | 
					 *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
 | 
				
			||||||
 | 
					 *
 | 
				
			||||||
 | 
					 *  Use of this source code is governed by a BSD-style license
 | 
				
			||||||
 | 
					 *  that can be found in the LICENSE file in the root of the source
 | 
				
			||||||
 | 
					 *  tree. An additional intellectual property rights grant can be found
 | 
				
			||||||
 | 
					 *  in the file PATENTS.  All contributing project authors may
 | 
				
			||||||
 | 
					 *  be found in the AUTHORS file in the root of the source tree.
 | 
				
			||||||
 | 
					 */
 | 
				
			||||||
 | 
					#include <assert.h>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#include "./vpx_config.h"
 | 
				
			||||||
 | 
					#include "./vp9_rtcd.h"
 | 
				
			||||||
 | 
					#include "vp9/common/vp9_common.h"
 | 
				
			||||||
 | 
					#include "vpx/vpx_integer.h"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#define VP9_FILTER_WEIGHT 128
 | 
				
			||||||
 | 
					#define VP9_FILTER_SHIFT  7
 | 
				
			||||||
 | 
					#define ALIGN_FILTERS_256 0
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					/* Assume a bank of 16 filters to choose from. There are three implementations
 | 
				
			||||||
 | 
					 * for filter wrapping behavior, since we want to be able to pick which filter
 | 
				
			||||||
 | 
					 * to start with. We could either:
 | 
				
			||||||
 | 
					 *
 | 
				
			||||||
 | 
					 * 1) make filter_ a pointer to the base of the filter array, and then add an
 | 
				
			||||||
 | 
					 *    additional offset parameter, to choose the starting filter.
 | 
				
			||||||
 | 
					 * 2) use a pointer to 2 periods worth of filters, so that even if the original
 | 
				
			||||||
 | 
					 *    phase offset is at 15/16, we'll have valid data to read. The filter
 | 
				
			||||||
 | 
					 *    tables become [32][8], and the second half is duplicated.
 | 
				
			||||||
 | 
					 * 3) fix the alignment of the filter tables, so that we know the 0/16 is
 | 
				
			||||||
 | 
					 *    always 256 byte aligned.
 | 
				
			||||||
 | 
					 *
 | 
				
			||||||
 | 
					 * Implementations 2 and 3 are likely preferable, as they avoid an extra 2
 | 
				
			||||||
 | 
					 * parameters, and switching between them is trivial.
 | 
				
			||||||
 | 
					 */
 | 
				
			||||||
 | 
					static void convolve_horiz_c(const uint8_t *src, int src_stride,
 | 
				
			||||||
 | 
					                             uint8_t *dst, int dst_stride,
 | 
				
			||||||
 | 
					                             const int16_t *filter_x0, int x_step_q4,
 | 
				
			||||||
 | 
					                             const int16_t *filter_y, int y_step_q4,
 | 
				
			||||||
 | 
					                             int w, int h, int taps) {
 | 
				
			||||||
 | 
					  int x, y, k, sum;
 | 
				
			||||||
 | 
					  const int16_t *filter_x_base = filter_x0;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#if ALIGN_FILTERS_256
 | 
				
			||||||
 | 
					  filter_x_base = (const int16_t *)(((intptr_t)filter_x0) & ~(intptr_t)0xff);
 | 
				
			||||||
 | 
					#endif
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  /* Adjust base pointer address for this source line */
 | 
				
			||||||
 | 
					  src -= taps / 2 - 1;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  for (y = 0; y < h; ++y) {
 | 
				
			||||||
 | 
					    /* Pointer to filter to use */
 | 
				
			||||||
 | 
					    const int16_t *filter_x = filter_x0;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    /* Initial phase offset */
 | 
				
			||||||
 | 
					    int x_q4 = (filter_x - filter_x_base) / taps;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    for (x = 0; x < w; ++x) {
 | 
				
			||||||
 | 
					      /* Per-pixel src offset */
 | 
				
			||||||
 | 
					      int src_x = x_q4 >> 4;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					      for (sum = 0, k = 0; k < taps; ++k) {
 | 
				
			||||||
 | 
					        sum += src[src_x + k] * filter_x[k];
 | 
				
			||||||
 | 
					      }
 | 
				
			||||||
 | 
					      sum += (VP9_FILTER_WEIGHT >> 1);
 | 
				
			||||||
 | 
					      dst[x] = clip_pixel(sum >> VP9_FILTER_SHIFT);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					      /* Adjust source and filter to use for the next pixel */
 | 
				
			||||||
 | 
					      x_q4 += x_step_q4;
 | 
				
			||||||
 | 
					      filter_x = filter_x_base + (x_q4 & 0xf) * taps;
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					    src += src_stride;
 | 
				
			||||||
 | 
					    dst += dst_stride;
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					static void convolve_avg_horiz_c(const uint8_t *src, int src_stride,
 | 
				
			||||||
 | 
					                                 uint8_t *dst, int dst_stride,
 | 
				
			||||||
 | 
					                                 const int16_t *filter_x0, int x_step_q4,
 | 
				
			||||||
 | 
					                                 const int16_t *filter_y, int y_step_q4,
 | 
				
			||||||
 | 
					                                 int w, int h, int taps) {
 | 
				
			||||||
 | 
					  int x, y, k, sum;
 | 
				
			||||||
 | 
					  const int16_t *filter_x_base = filter_x0;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#if ALIGN_FILTERS_256
 | 
				
			||||||
 | 
					  filter_x_base = (const int16_t *)(((intptr_t)filter_x0) & ~(intptr_t)0xff);
 | 
				
			||||||
 | 
					#endif
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  /* Adjust base pointer address for this source line */
 | 
				
			||||||
 | 
					  src -= taps / 2 - 1;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  for (y = 0; y < h; ++y) {
 | 
				
			||||||
 | 
					    /* Pointer to filter to use */
 | 
				
			||||||
 | 
					    const int16_t *filter_x = filter_x0;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    /* Initial phase offset */
 | 
				
			||||||
 | 
					    int x_q4 = (filter_x - filter_x_base) / taps;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    for (x = 0; x < w; ++x) {
 | 
				
			||||||
 | 
					      /* Per-pixel src offset */
 | 
				
			||||||
 | 
					      int src_x = x_q4 >> 4;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					      for (sum = 0, k = 0; k < taps; ++k) {
 | 
				
			||||||
 | 
					        sum += src[src_x + k] * filter_x[k];
 | 
				
			||||||
 | 
					      }
 | 
				
			||||||
 | 
					      sum += (VP9_FILTER_WEIGHT >> 1);
 | 
				
			||||||
 | 
					      dst[x] = (dst[x] + clip_pixel(sum >> VP9_FILTER_SHIFT) + 1) >> 1;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					      /* Adjust source and filter to use for the next pixel */
 | 
				
			||||||
 | 
					      x_q4 += x_step_q4;
 | 
				
			||||||
 | 
					      filter_x = filter_x_base + (x_q4 & 0xf) * taps;
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					    src += src_stride;
 | 
				
			||||||
 | 
					    dst += dst_stride;
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					static void convolve_vert_c(const uint8_t *src, int src_stride,
 | 
				
			||||||
 | 
					                            uint8_t *dst, int dst_stride,
 | 
				
			||||||
 | 
					                            const int16_t *filter_x, int x_step_q4,
 | 
				
			||||||
 | 
					                            const int16_t *filter_y0, int y_step_q4,
 | 
				
			||||||
 | 
					                            int w, int h, int taps) {
 | 
				
			||||||
 | 
					  int x, y, k, sum;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  const int16_t *filter_y_base = filter_y0;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#if ALIGN_FILTERS_256
 | 
				
			||||||
 | 
					  filter_y_base = (const int16_t *)(((intptr_t)filter_y0) & ~(intptr_t)0xff);
 | 
				
			||||||
 | 
					#endif
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  /* Adjust base pointer address for this source column */
 | 
				
			||||||
 | 
					  src -= src_stride * (taps / 2 - 1);
 | 
				
			||||||
 | 
					  for (x = 0; x < w; ++x) {
 | 
				
			||||||
 | 
					    /* Pointer to filter to use */
 | 
				
			||||||
 | 
					    const int16_t *filter_y = filter_y0;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    /* Initial phase offset */
 | 
				
			||||||
 | 
					    int y_q4 = (filter_y - filter_y_base) / taps;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    for (y = 0; y < h; ++y) {
 | 
				
			||||||
 | 
					      /* Per-pixel src offset */
 | 
				
			||||||
 | 
					      int src_y = y_q4 >> 4;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					      for (sum = 0, k = 0; k < taps; ++k) {
 | 
				
			||||||
 | 
					        sum += src[(src_y + k) * src_stride] * filter_y[k];
 | 
				
			||||||
 | 
					      }
 | 
				
			||||||
 | 
					      sum += (VP9_FILTER_WEIGHT >> 1);
 | 
				
			||||||
 | 
					      dst[y * dst_stride] = clip_pixel(sum >> VP9_FILTER_SHIFT);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					      /* Adjust source and filter to use for the next pixel */
 | 
				
			||||||
 | 
					      y_q4 += y_step_q4;
 | 
				
			||||||
 | 
					      filter_y = filter_y_base + (y_q4 & 0xf) * taps;
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					    ++src;
 | 
				
			||||||
 | 
					    ++dst;
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					static void convolve_avg_vert_c(const uint8_t *src, int src_stride,
 | 
				
			||||||
 | 
					                                uint8_t *dst, int dst_stride,
 | 
				
			||||||
 | 
					                                const int16_t *filter_x, int x_step_q4,
 | 
				
			||||||
 | 
					                                const int16_t *filter_y0, int y_step_q4,
 | 
				
			||||||
 | 
					                                int w, int h, int taps) {
 | 
				
			||||||
 | 
					  int x, y, k, sum;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  const int16_t *filter_y_base = filter_y0;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#if ALIGN_FILTERS_256
 | 
				
			||||||
 | 
					  filter_y_base = (const int16_t *)(((intptr_t)filter_y0) & ~(intptr_t)0xff);
 | 
				
			||||||
 | 
					#endif
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  /* Adjust base pointer address for this source column */
 | 
				
			||||||
 | 
					  src -= src_stride * (taps / 2 - 1);
 | 
				
			||||||
 | 
					  for (x = 0; x < w; ++x) {
 | 
				
			||||||
 | 
					    /* Pointer to filter to use */
 | 
				
			||||||
 | 
					    const int16_t *filter_y = filter_y0;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    /* Initial phase offset */
 | 
				
			||||||
 | 
					    int y_q4 = (filter_y - filter_y_base) / taps;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    for (y = 0; y < h; ++y) {
 | 
				
			||||||
 | 
					      /* Per-pixel src offset */
 | 
				
			||||||
 | 
					      int src_y = y_q4 >> 4;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					      for (sum = 0, k = 0; k < taps; ++k) {
 | 
				
			||||||
 | 
					        sum += src[(src_y + k) * src_stride] * filter_y[k];
 | 
				
			||||||
 | 
					      }
 | 
				
			||||||
 | 
					      sum += (VP9_FILTER_WEIGHT >> 1);
 | 
				
			||||||
 | 
					      dst[y * dst_stride] =
 | 
				
			||||||
 | 
					          (dst[y * dst_stride] + clip_pixel(sum >> VP9_FILTER_SHIFT) + 1) >> 1;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					      /* Adjust source and filter to use for the next pixel */
 | 
				
			||||||
 | 
					      y_q4 += y_step_q4;
 | 
				
			||||||
 | 
					      filter_y = filter_y_base + (y_q4 & 0xf) * taps;
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					    ++src;
 | 
				
			||||||
 | 
					    ++dst;
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					/* Separable 2-D convolution: a horizontal pass into a 16-wide scratch
					 * buffer followed by a vertical pass from that buffer into dst.
					 *
					 * The scratch buffer holds 23 rows of 16 pixels, which bounds the block to
					 * at most 16x16 with kernels of at most 8 taps when stepping one pixel per
					 * output (step 16).
					 */
					static void convolve_c(const uint8_t *src, int src_stride,
					                       uint8_t *dst, int dst_stride,
					                       const int16_t *filter_x, int x_step_q4,
					                       const int16_t *filter_y, int y_step_q4,
					                       int w, int h, int taps) {
					  /* Fixed size intermediate buffer places limits on parameters. */
					  uint8_t temp[16 * 23];
					  /* Number of intermediate rows the vertical pass can touch: the last output
					   * row starts at most ((h - 1) * y_step_q4 + 15) >> 4 rows down (15 covers
					   * any initial sub-pixel phase) and reads `taps` rows from there. This is
					   * h + taps - 1 when y_step_q4 == 16.
					   */
					  const int intermediate_height = (((h - 1) * y_step_q4 + 15) >> 4) + taps;

					  assert(w <= 16);
					  assert(h <= 16);
					  assert(taps <= 8);
					  /* Every row the vertical pass reads must exist in temp. */
					  assert(intermediate_height <= 23);

					  /* Start taps/2 - 1 rows above the block so the vertical kernel has the
					   * context it needs.
					   */
					  convolve_horiz_c(src - src_stride * (taps / 2 - 1), src_stride,
					                   temp, 16,
					                   filter_x, x_step_q4, filter_y, y_step_q4,
					                   w, intermediate_height, taps);
					  /* Point at the row that corresponds to the first output row; the vertical
					   * filter steps back by the same amount itself.
					   */
					  convolve_vert_c(temp + 16 * (taps / 2 - 1), 16, dst, dst_stride,
					                  filter_x, x_step_q4, filter_y, y_step_q4,
					                  w, h, taps);
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					/* Separable 2-D convolution whose result is averaged with dst: a plain
					 * horizontal pass into a 16-wide scratch buffer followed by an averaging
					 * vertical pass from that buffer into dst.
					 *
					 * The scratch buffer holds 23 rows of 16 pixels, which bounds the block to
					 * at most 16x16 with kernels of at most 8 taps when stepping one pixel per
					 * output (step 16).
					 */
					static void convolve_avg_c(const uint8_t *src, int src_stride,
					                           uint8_t *dst, int dst_stride,
					                           const int16_t *filter_x, int x_step_q4,
					                           const int16_t *filter_y, int y_step_q4,
					                           int w, int h, int taps) {
					  /* Fixed size intermediate buffer places limits on parameters. */
					  uint8_t temp[16 * 23];
					  /* Number of intermediate rows the vertical pass can touch: the last output
					   * row starts at most ((h - 1) * y_step_q4 + 15) >> 4 rows down (15 covers
					   * any initial sub-pixel phase) and reads `taps` rows from there. This is
					   * h + taps - 1 when y_step_q4 == 16.
					   */
					  const int intermediate_height = (((h - 1) * y_step_q4 + 15) >> 4) + taps;

					  assert(w <= 16);
					  assert(h <= 16);
					  assert(taps <= 8);
					  /* Every row the vertical pass reads must exist in temp. */
					  assert(intermediate_height <= 23);

					  /* Start taps/2 - 1 rows above the block so the vertical kernel has the
					   * context it needs.
					   */
					  convolve_horiz_c(src - src_stride * (taps / 2 - 1), src_stride,
					                   temp, 16,
					                   filter_x, x_step_q4, filter_y, y_step_q4,
					                   w, intermediate_height, taps);
					  /* Only the final pass averages with the existing destination pixels. */
					  convolve_avg_vert_c(temp + 16 * (taps / 2 - 1), 16, dst, dst_stride,
					                      filter_x, x_step_q4, filter_y, y_step_q4,
					                      w, h, taps);
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					/* Public 8-tap horizontal filter: forwards to the generic horizontal
					 * convolver with the kernel length fixed at 8.
					 */
					void vp9_convolve8_horiz_c(const uint8_t *src, int src_stride,
					                           uint8_t *dst, int dst_stride,
					                           const int16_t *filter_x, int x_step_q4,
					                           const int16_t *filter_y, int y_step_q4,
					                           int w, int h) {
					  enum { kTaps = 8 };

					  convolve_horiz_c(src, src_stride,
					                   dst, dst_stride,
					                   filter_x, x_step_q4,
					                   filter_y, y_step_q4,
					                   w, h, kTaps);
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					/* Public 8-tap averaging horizontal filter: forwards to the generic
					 * averaging horizontal convolver with the kernel length fixed at 8.
					 */
					void vp9_convolve8_avg_horiz_c(const uint8_t *src, int src_stride,
					                               uint8_t *dst, int dst_stride,
					                               const int16_t *filter_x, int x_step_q4,
					                               const int16_t *filter_y, int y_step_q4,
					                               int w, int h) {
					  enum { kTaps = 8 };

					  convolve_avg_horiz_c(src, src_stride,
					                       dst, dst_stride,
					                       filter_x, x_step_q4,
					                       filter_y, y_step_q4,
					                       w, h, kTaps);
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					/* Public 8-tap vertical filter: forwards to the generic vertical convolver
					 * with the kernel length fixed at 8.
					 */
					void vp9_convolve8_vert_c(const uint8_t *src, int src_stride,
					                          uint8_t *dst, int dst_stride,
					                          const int16_t *filter_x, int x_step_q4,
					                          const int16_t *filter_y, int y_step_q4,
					                          int w, int h) {
					  enum { kTaps = 8 };

					  convolve_vert_c(src, src_stride,
					                  dst, dst_stride,
					                  filter_x, x_step_q4,
					                  filter_y, y_step_q4,
					                  w, h, kTaps);
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					/* Public 8-tap averaging vertical filter: forwards to the generic averaging
					 * vertical convolver with the kernel length fixed at 8.
					 */
					void vp9_convolve8_avg_vert_c(const uint8_t *src, int src_stride,
					                              uint8_t *dst, int dst_stride,
					                              const int16_t *filter_x, int x_step_q4,
					                              const int16_t *filter_y, int y_step_q4,
					                              int w, int h) {
					  enum { kTaps = 8 };

					  convolve_avg_vert_c(src, src_stride,
					                      dst, dst_stride,
					                      filter_x, x_step_q4,
					                      filter_y, y_step_q4,
					                      w, h, kTaps);
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					/* Public 8-tap 2-D filter: forwards to the generic two-pass convolver with
					 * the kernel length fixed at 8.
					 */
					void vp9_convolve8_c(const uint8_t *src, int src_stride,
					                     uint8_t *dst, int dst_stride,
					                     const int16_t *filter_x, int x_step_q4,
					                     const int16_t *filter_y, int y_step_q4,
					                     int w, int h) {
					  enum { kTaps = 8 };

					  convolve_c(src, src_stride,
					             dst, dst_stride,
					             filter_x, x_step_q4,
					             filter_y, y_step_q4,
					             w, h, kTaps);
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					/* Public 8-tap averaging 2-D filter: forwards to the generic averaging
					 * two-pass convolver with the kernel length fixed at 8.
					 */
					void vp9_convolve8_avg_c(const uint8_t *src, int src_stride,
					                         uint8_t *dst, int dst_stride,
					                         const int16_t *filter_x, int x_step_q4,
					                         const int16_t *filter_y, int y_step_q4,
					                         int w, int h) {
					  enum { kTaps = 8 };

					  convolve_avg_c(src, src_stride,
					                 dst, dst_stride,
					                 filter_x, x_step_q4,
					                 filter_y, y_step_q4,
					                 w, h, kTaps);
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					/* Block copy with the convolve prototype; the filter arguments are ignored.
					 *
					 * 16x16, 8x8 and 8x4 blocks are handed to the dedicated copy routines. Any
					 * other size (including 4x4) is copied row by row. A fast path is taken
					 * only when BOTH dimensions match, so an unexpected w/h combination can no
					 * longer be routed to a routine that copies a differently sized block.
					 */
					void vp9_convolve_copy(const uint8_t *src, int src_stride,
					                       uint8_t *dst, int dst_stride,
					                       const int16_t *filter_x, int filter_x_stride,
					                       const int16_t *filter_y, int filter_y_stride,
					                       int w, int h) {
					  if (w == 16 && h == 16) {
					    vp9_copy_mem16x16(src, src_stride, dst, dst_stride);
					  } else if (w == 8 && h == 8) {
					    vp9_copy_mem8x8(src, src_stride, dst, dst_stride);
					  } else if (w == 8 && h == 4) {
					    vp9_copy_mem8x4(src, src_stride, dst, dst_stride);
					  } else {
					    /* Generic fallback, byte by byte: this avoids reading the uint8_t
					     * buffers through a uint32_t pointer and works for any w and h.
					     */
					    int r, c;

					    for (r = 0; r < h; ++r) {
					      for (c = 0; c < w; ++c) {
					        dst[c] = src[c];
					      }
					      src += src_stride;
					      dst += dst_stride;
					    }
					  }
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					/* Block average with the convolve prototype; the filter arguments are
					 * ignored. Every pixel of the w x h destination block is replaced by the
					 * rounded-up mean of itself and the corresponding source pixel.
					 */
					void vp9_convolve_avg(const uint8_t *src, int src_stride,
					                      uint8_t *dst, int dst_stride,
					                      const int16_t *filter_x, int filter_x_stride,
					                      const int16_t *filter_y, int filter_y_stride,
					                      int w, int h) {
					  int rows_left;

					  for (rows_left = h; rows_left > 0;
					       --rows_left, src += src_stride, dst += dst_stride) {
					    int col = 0;

					    while (col < w) {
					      /* +1 makes the halving round to nearest, ties upward. */
					      const int blended = dst[col] + src[col] + 1;
					      dst[col] = (uint8_t)(blended >> 1);
					      ++col;
					    }
					  }
					}
 | 
				
			||||||
							
								
								
									
										43
									
								
								vp9/common/vp9_convolve.h
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										43
									
								
								vp9/common/vp9_convolve.h
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,43 @@
 | 
				
			|||||||
 | 
					/*
 | 
				
			||||||
 | 
					 *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
 | 
				
			||||||
 | 
					 *
 | 
				
			||||||
 | 
					 *  Use of this source code is governed by a BSD-style license
 | 
				
			||||||
 | 
					 *  that can be found in the LICENSE file in the root of the source
 | 
				
			||||||
 | 
					 *  tree. An additional intellectual property rights grant can be found
 | 
				
			||||||
 | 
					 *  in the file PATENTS.  All contributing project authors may
 | 
				
			||||||
 | 
					 *  be found in the AUTHORS file in the root of the source tree.
 | 
				
			||||||
 | 
					 */
 | 
				
			||||||
 | 
					#ifndef VP9_COMMON_VP9_CONVOLVE_H_
					#define VP9_COMMON_VP9_CONVOLVE_H_

					#include "vpx/vpx_integer.h"

					// Common prototype for all block prediction kernels.
					//   src, src_stride      source pixels and the distance between source rows
					//   dst, dst_stride      destination pixels and the distance between rows
					//   filter_x, x_step_q4  horizontal kernel and step in 1/16 pixel units
					//   filter_y, y_step_q4  vertical kernel and step in 1/16 pixel units
					//   w, h                 size of the block to produce
					typedef void (*convolve_fn_t)(const uint8_t *src, int src_stride,
					                              uint8_t *dst, int dst_stride,
					                              const int16_t *filter_x, int x_step_q4,
					                              const int16_t *filter_y, int y_step_q4,
					                              int w, int h);

					// Not a convolution, a block copy conforming to the convolution prototype
					void vp9_convolve_copy(const uint8_t *src, int src_stride,
					                       uint8_t *dst, int dst_stride,
					                       const int16_t *filter_x, int x_step_q4,
					                       const int16_t *filter_y, int y_step_q4,
					                       int w, int h);

					// Not a convolution, a block average conforming to the convolution prototype
					void vp9_convolve_avg(const uint8_t *src, int src_stride,
					                      uint8_t *dst, int dst_stride,
					                      const int16_t *filter_x, int x_step_q4,
					                      const int16_t *filter_y, int y_step_q4,
					                      int w, int h);

					// Prediction kernels plus the filter banks and steps to call them with.
					struct subpix_fn_table {
					  convolve_fn_t predict[2][2][2];  // horiz, vert, avg
					  const int16_t (*filter_x)[8];    // horizontal kernels, 8 coefficients each
					  const int16_t (*filter_y)[8];    // vertical kernels, 8 coefficients each
					  int x_step_q4;
					  int y_step_q4;
					};

					#endif  // VP9_COMMON_VP9_CONVOLVE_H_
 | 
				
			||||||
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							@@ -21,10 +21,17 @@
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
#define SUBPEL_SHIFTS 16
 | 
					#define SUBPEL_SHIFTS 16
 | 
				
			||||||
 | 
					
 | 
				
			||||||
extern const int16_t vp9_bilinear_filters[SUBPEL_SHIFTS][2];
 | 
					extern const int16_t vp9_bilinear_filters[SUBPEL_SHIFTS][8];
 | 
				
			||||||
extern const int16_t vp9_sub_pel_filters_6[SUBPEL_SHIFTS][6];
 | 
					extern const int16_t vp9_sub_pel_filters_6[SUBPEL_SHIFTS][8];
 | 
				
			||||||
extern const int16_t vp9_sub_pel_filters_8[SUBPEL_SHIFTS][8];
 | 
					extern const int16_t vp9_sub_pel_filters_8[SUBPEL_SHIFTS][8];
 | 
				
			||||||
extern const int16_t vp9_sub_pel_filters_8s[SUBPEL_SHIFTS][8];
 | 
					extern const int16_t vp9_sub_pel_filters_8s[SUBPEL_SHIFTS][8];
 | 
				
			||||||
extern const int16_t vp9_sub_pel_filters_8lp[SUBPEL_SHIFTS][8];
 | 
					extern const int16_t vp9_sub_pel_filters_8lp[SUBPEL_SHIFTS][8];
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					// The VP9_BILINEAR_FILTERS_2TAP macro returns a pointer to the bilinear
 | 
				
			||||||
 | 
					// filter kernel as a 2 tap filter.
 | 
				
			||||||
 | 
					#define BF_LENGTH (sizeof(vp9_bilinear_filters[0]) / \
 | 
				
			||||||
 | 
					                   sizeof(vp9_bilinear_filters[0][0]))
 | 
				
			||||||
 | 
					#define BF_OFFSET (BF_LENGTH / 2 - 1)
 | 
				
			||||||
 | 
					#define VP9_BILINEAR_FILTERS_2TAP(x) (vp9_bilinear_filters[x] + BF_OFFSET)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#endif  // VP9_COMMON_VP9_FILTER_H_
 | 
					#endif  // VP9_COMMON_VP9_FILTER_H_
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -87,8 +87,8 @@ unsigned int vp9_sub_pixel_variance16x2_c(const uint8_t *src_ptr,
 | 
				
			|||||||
  uint8_t temp2[2 * 16];
 | 
					  uint8_t temp2[2 * 16];
 | 
				
			||||||
  const int16_t *HFilter, *VFilter;
 | 
					  const int16_t *HFilter, *VFilter;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  HFilter = vp9_bilinear_filters[xoffset];
 | 
					  HFilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
 | 
				
			||||||
  VFilter = vp9_bilinear_filters[yoffset];
 | 
					  VFilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  var_filter_block2d_bil_first_pass(src_ptr, FData3,
 | 
					  var_filter_block2d_bil_first_pass(src_ptr, FData3,
 | 
				
			||||||
                                    src_pixels_per_line, 1, 3, 16, HFilter);
 | 
					                                    src_pixels_per_line, 1, 3, 16, HFilter);
 | 
				
			||||||
@@ -108,8 +108,8 @@ unsigned int vp9_sub_pixel_variance2x16_c(const uint8_t *src_ptr,
 | 
				
			|||||||
  uint8_t temp2[2 * 16];
 | 
					  uint8_t temp2[2 * 16];
 | 
				
			||||||
  const int16_t *HFilter, *VFilter;
 | 
					  const int16_t *HFilter, *VFilter;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  HFilter = vp9_bilinear_filters[xoffset];
 | 
					  HFilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
 | 
				
			||||||
  VFilter = vp9_bilinear_filters[yoffset];
 | 
					  VFilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  var_filter_block2d_bil_first_pass(src_ptr, FData3,
 | 
					  var_filter_block2d_bil_first_pass(src_ptr, FData3,
 | 
				
			||||||
                                    src_pixels_per_line, 1, 17, 2, HFilter);
 | 
					                                    src_pixels_per_line, 1, 17, 2, HFilter);
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -8,66 +8,58 @@
 | 
				
			|||||||
 *  be found in the AUTHORS file in the root of the source tree.
 | 
					 *  be found in the AUTHORS file in the root of the source tree.
 | 
				
			||||||
 */
 | 
					 */
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#include <assert.h>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#include "./vpx_config.h"
 | 
					#include "./vpx_config.h"
 | 
				
			||||||
#include "vpx/vpx_integer.h"
 | 
					#include "vpx/vpx_integer.h"
 | 
				
			||||||
#include "vp9/common/vp9_blockd.h"
 | 
					#include "vp9/common/vp9_blockd.h"
 | 
				
			||||||
 | 
					#include "vp9/common/vp9_filter.h"
 | 
				
			||||||
#include "vp9/common/vp9_reconinter.h"
 | 
					#include "vp9/common/vp9_reconinter.h"
 | 
				
			||||||
#include "vp9/common/vp9_reconintra.h"
 | 
					#include "vp9/common/vp9_reconintra.h"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
void vp9_setup_interp_filters(MACROBLOCKD *xd,
 | 
					void vp9_setup_interp_filters(MACROBLOCKD *xd,
 | 
				
			||||||
                              INTERPOLATIONFILTERTYPE mcomp_filter_type,
 | 
					                              INTERPOLATIONFILTERTYPE mcomp_filter_type,
 | 
				
			||||||
                              VP9_COMMON *cm) {
 | 
					                              VP9_COMMON *cm) {
 | 
				
			||||||
 | 
					  // TODO(agrange): Investigate the best choice of functions to use here
 | 
				
			||||||
 | 
					  // for EIGHTTAP_SMOOTH. Since it is not interpolating, need to choose what
 | 
				
			||||||
 | 
					  // to do at full-pel offsets. The current selection, where the filter is
 | 
				
			||||||
 | 
					  // applied in one direction only, and not at all for 0,0, seems to give the
 | 
				
			||||||
 | 
					  // best quality, but it may be worth trying an additional mode that does
 | 
				
			||||||
 | 
					  // do the filtering on full-pel.
 | 
				
			||||||
 | 
					  xd->subpix.predict[0][0][0] = vp9_convolve_copy;
 | 
				
			||||||
 | 
					  xd->subpix.predict[0][0][1] = vp9_convolve_avg;
 | 
				
			||||||
 | 
					  xd->subpix.predict[0][1][0] = vp9_convolve8_vert;
 | 
				
			||||||
 | 
					  xd->subpix.predict[0][1][1] = vp9_convolve8_avg_vert;
 | 
				
			||||||
 | 
					  xd->subpix.predict[1][0][0] = vp9_convolve8_horiz;
 | 
				
			||||||
 | 
					  xd->subpix.predict[1][0][1] = vp9_convolve8_avg_horiz;
 | 
				
			||||||
 | 
					  xd->subpix.predict[1][1][0] = vp9_convolve8;
 | 
				
			||||||
 | 
					  xd->subpix.predict[1][1][1] = vp9_convolve8_avg;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  xd->subpix.x_step_q4 = 16;
 | 
				
			||||||
 | 
					  xd->subpix.y_step_q4 = 16;
 | 
				
			||||||
 | 
					  switch (mcomp_filter_type) {
 | 
				
			||||||
 | 
					    case EIGHTTAP:
 | 
				
			||||||
 | 
					    case SWITCHABLE:
 | 
				
			||||||
 | 
					      xd->subpix.filter_x = xd->subpix.filter_y = vp9_sub_pel_filters_8;
 | 
				
			||||||
 | 
					      break;
 | 
				
			||||||
 | 
					    case EIGHTTAP_SMOOTH:
 | 
				
			||||||
 | 
					      xd->subpix.filter_x = xd->subpix.filter_y = vp9_sub_pel_filters_8lp;
 | 
				
			||||||
 | 
					      break;
 | 
				
			||||||
 | 
					    case EIGHTTAP_SHARP:
 | 
				
			||||||
 | 
					      xd->subpix.filter_x = xd->subpix.filter_y = vp9_sub_pel_filters_8s;
 | 
				
			||||||
 | 
					      break;
 | 
				
			||||||
 | 
					    case BILINEAR:
 | 
				
			||||||
 | 
					      xd->subpix.filter_x = xd->subpix.filter_y = vp9_bilinear_filters;
 | 
				
			||||||
 | 
					      break;
 | 
				
			||||||
#if CONFIG_ENABLE_6TAP
 | 
					#if CONFIG_ENABLE_6TAP
 | 
				
			||||||
  if (mcomp_filter_type == SIXTAP) {
 | 
					    case SIXTAP:
 | 
				
			||||||
    xd->subpixel_predict4x4     = vp9_sixtap_predict4x4;
 | 
					      xd->subpix.filter_x = xd->subpix.filter_y = vp9_sub_pel_filters_6;
 | 
				
			||||||
    xd->subpixel_predict8x4     = vp9_sixtap_predict8x4;
 | 
					      break;
 | 
				
			||||||
    xd->subpixel_predict8x8     = vp9_sixtap_predict8x8;
 | 
					 | 
				
			||||||
    xd->subpixel_predict16x16   = vp9_sixtap_predict16x16;
 | 
					 | 
				
			||||||
    xd->subpixel_predict_avg4x4 = vp9_sixtap_predict_avg4x4;
 | 
					 | 
				
			||||||
    xd->subpixel_predict_avg8x8 = vp9_sixtap_predict_avg8x8;
 | 
					 | 
				
			||||||
    xd->subpixel_predict_avg16x16 = vp9_sixtap_predict_avg16x16;
 | 
					 | 
				
			||||||
  } else {
 | 
					 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
  if (mcomp_filter_type == EIGHTTAP || mcomp_filter_type == SWITCHABLE) {
 | 
					 | 
				
			||||||
    xd->subpixel_predict4x4     = vp9_eighttap_predict4x4;
 | 
					 | 
				
			||||||
    xd->subpixel_predict8x4     = vp9_eighttap_predict8x4;
 | 
					 | 
				
			||||||
    xd->subpixel_predict8x8     = vp9_eighttap_predict8x8;
 | 
					 | 
				
			||||||
    xd->subpixel_predict16x16   = vp9_eighttap_predict16x16;
 | 
					 | 
				
			||||||
    xd->subpixel_predict_avg4x4 = vp9_eighttap_predict_avg4x4;
 | 
					 | 
				
			||||||
    xd->subpixel_predict_avg8x8 = vp9_eighttap_predict_avg8x8;
 | 
					 | 
				
			||||||
    xd->subpixel_predict_avg16x16 = vp9_eighttap_predict_avg16x16;
 | 
					 | 
				
			||||||
  } else if (mcomp_filter_type == EIGHTTAP_SMOOTH) {
 | 
					 | 
				
			||||||
    xd->subpixel_predict4x4     = vp9_eighttap_predict4x4_smooth;
 | 
					 | 
				
			||||||
    xd->subpixel_predict8x4     = vp9_eighttap_predict8x4_smooth;
 | 
					 | 
				
			||||||
    xd->subpixel_predict8x8     = vp9_eighttap_predict8x8_smooth;
 | 
					 | 
				
			||||||
    xd->subpixel_predict16x16   = vp9_eighttap_predict16x16_smooth;
 | 
					 | 
				
			||||||
    xd->subpixel_predict_avg4x4 = vp9_eighttap_predict_avg4x4_smooth;
 | 
					 | 
				
			||||||
    xd->subpixel_predict_avg8x8 = vp9_eighttap_predict_avg8x8_smooth;
 | 
					 | 
				
			||||||
    xd->subpixel_predict_avg16x16 = vp9_eighttap_predict_avg16x16_smooth;
 | 
					 | 
				
			||||||
  } else if (mcomp_filter_type == EIGHTTAP_SHARP) {
 | 
					 | 
				
			||||||
    xd->subpixel_predict4x4     = vp9_eighttap_predict4x4_sharp;
 | 
					 | 
				
			||||||
    xd->subpixel_predict8x4     = vp9_eighttap_predict8x4_sharp;
 | 
					 | 
				
			||||||
    xd->subpixel_predict8x8     = vp9_eighttap_predict8x8_sharp;
 | 
					 | 
				
			||||||
    xd->subpixel_predict16x16   = vp9_eighttap_predict16x16_sharp;
 | 
					 | 
				
			||||||
    xd->subpixel_predict_avg4x4 = vp9_eighttap_predict_avg4x4_sharp;
 | 
					 | 
				
			||||||
    xd->subpixel_predict_avg8x8 = vp9_eighttap_predict_avg8x8_sharp;
 | 
					 | 
				
			||||||
    xd->subpixel_predict_avg16x16 = vp9_eighttap_predict_avg16x16_sharp_c;
 | 
					 | 
				
			||||||
  } else {
 | 
					 | 
				
			||||||
    xd->subpixel_predict4x4     = vp9_bilinear_predict4x4;
 | 
					 | 
				
			||||||
    xd->subpixel_predict8x4     = vp9_bilinear_predict8x4;
 | 
					 | 
				
			||||||
    xd->subpixel_predict8x8     = vp9_bilinear_predict8x8;
 | 
					 | 
				
			||||||
    xd->subpixel_predict16x16   = vp9_bilinear_predict16x16;
 | 
					 | 
				
			||||||
    xd->subpixel_predict_avg4x4 = vp9_bilinear_predict_avg4x4;
 | 
					 | 
				
			||||||
    xd->subpixel_predict_avg8x8 = vp9_bilinear_predict_avg8x8;
 | 
					 | 
				
			||||||
    xd->subpixel_predict_avg16x16 = vp9_bilinear_predict_avg16x16;
 | 
					 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
#if CONFIG_ENABLE_6TAP
 | 
					 | 
				
			||||||
  }
 | 
					 | 
				
			||||||
#endif
 | 
					 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
void vp9_copy_mem16x16_c(uint8_t *src,
 | 
					void vp9_copy_mem16x16_c(const uint8_t *src,
 | 
				
			||||||
                         int src_stride,
 | 
					                         int src_stride,
 | 
				
			||||||
                         uint8_t *dst,
 | 
					                         uint8_t *dst,
 | 
				
			||||||
                         int dst_stride) {
 | 
					                         int dst_stride) {
 | 
				
			||||||
@@ -93,10 +85,10 @@ void vp9_copy_mem16x16_c(uint8_t *src,
 | 
				
			|||||||
    dst[15] = src[15];
 | 
					    dst[15] = src[15];
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#else
 | 
					#else
 | 
				
			||||||
    ((uint32_t *)dst)[0] = ((uint32_t *)src)[0];
 | 
					    ((uint32_t *)dst)[0] = ((const uint32_t *)src)[0];
 | 
				
			||||||
    ((uint32_t *)dst)[1] = ((uint32_t *)src)[1];
 | 
					    ((uint32_t *)dst)[1] = ((const uint32_t *)src)[1];
 | 
				
			||||||
    ((uint32_t *)dst)[2] = ((uint32_t *)src)[2];
 | 
					    ((uint32_t *)dst)[2] = ((const uint32_t *)src)[2];
 | 
				
			||||||
    ((uint32_t *)dst)[3] = ((uint32_t *)src)[3];
 | 
					    ((uint32_t *)dst)[3] = ((const uint32_t *)src)[3];
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
    src += src_stride;
 | 
					    src += src_stride;
 | 
				
			||||||
@@ -104,25 +96,7 @@ void vp9_copy_mem16x16_c(uint8_t *src,
 | 
				
			|||||||
  }
 | 
					  }
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
void vp9_avg_mem16x16_c(uint8_t *src,
 | 
					void vp9_copy_mem8x8_c(const uint8_t *src,
 | 
				
			||||||
                        int src_stride,
 | 
					 | 
				
			||||||
                        uint8_t *dst,
 | 
					 | 
				
			||||||
                        int dst_stride) {
 | 
					 | 
				
			||||||
  int r;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  for (r = 0; r < 16; r++) {
 | 
					 | 
				
			||||||
    int n;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    for (n = 0; n < 16; n++) {
 | 
					 | 
				
			||||||
      dst[n] = (dst[n] + src[n] + 1) >> 1;
 | 
					 | 
				
			||||||
    }
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    src += src_stride;
 | 
					 | 
				
			||||||
    dst += dst_stride;
 | 
					 | 
				
			||||||
  }
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
void vp9_copy_mem8x8_c(uint8_t *src,
 | 
					 | 
				
			||||||
                       int src_stride,
 | 
					                       int src_stride,
 | 
				
			||||||
                       uint8_t *dst,
 | 
					                       uint8_t *dst,
 | 
				
			||||||
                       int dst_stride) {
 | 
					                       int dst_stride) {
 | 
				
			||||||
@@ -139,33 +113,15 @@ void vp9_copy_mem8x8_c(uint8_t *src,
 | 
				
			|||||||
    dst[6] = src[6];
 | 
					    dst[6] = src[6];
 | 
				
			||||||
    dst[7] = src[7];
 | 
					    dst[7] = src[7];
 | 
				
			||||||
#else
 | 
					#else
 | 
				
			||||||
    ((uint32_t *)dst)[0] = ((uint32_t *)src)[0];
 | 
					    ((uint32_t *)dst)[0] = ((const uint32_t *)src)[0];
 | 
				
			||||||
    ((uint32_t *)dst)[1] = ((uint32_t *)src)[1];
 | 
					    ((uint32_t *)dst)[1] = ((const uint32_t *)src)[1];
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
    src += src_stride;
 | 
					    src += src_stride;
 | 
				
			||||||
    dst += dst_stride;
 | 
					    dst += dst_stride;
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
void vp9_avg_mem8x8_c(uint8_t *src,
 | 
					void vp9_copy_mem8x4_c(const uint8_t *src,
 | 
				
			||||||
                      int src_stride,
 | 
					 | 
				
			||||||
                      uint8_t *dst,
 | 
					 | 
				
			||||||
                      int dst_stride) {
 | 
					 | 
				
			||||||
  int r;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  for (r = 0; r < 8; r++) {
 | 
					 | 
				
			||||||
    int n;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    for (n = 0; n < 8; n++) {
 | 
					 | 
				
			||||||
      dst[n] = (dst[n] + src[n] + 1) >> 1;
 | 
					 | 
				
			||||||
    }
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    src += src_stride;
 | 
					 | 
				
			||||||
    dst += dst_stride;
 | 
					 | 
				
			||||||
  }
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
void vp9_copy_mem8x4_c(uint8_t *src,
 | 
					 | 
				
			||||||
                       int src_stride,
 | 
					                       int src_stride,
 | 
				
			||||||
                       uint8_t *dst,
 | 
					                       uint8_t *dst,
 | 
				
			||||||
                       int dst_stride) {
 | 
					                       int dst_stride) {
 | 
				
			||||||
@@ -182,16 +138,16 @@ void vp9_copy_mem8x4_c(uint8_t *src,
 | 
				
			|||||||
    dst[6] = src[6];
 | 
					    dst[6] = src[6];
 | 
				
			||||||
    dst[7] = src[7];
 | 
					    dst[7] = src[7];
 | 
				
			||||||
#else
 | 
					#else
 | 
				
			||||||
    ((uint32_t *)dst)[0] = ((uint32_t *)src)[0];
 | 
					    ((uint32_t *)dst)[0] = ((const uint32_t *)src)[0];
 | 
				
			||||||
    ((uint32_t *)dst)[1] = ((uint32_t *)src)[1];
 | 
					    ((uint32_t *)dst)[1] = ((const uint32_t *)src)[1];
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
    src += src_stride;
 | 
					    src += src_stride;
 | 
				
			||||||
    dst += dst_stride;
 | 
					    dst += dst_stride;
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
void vp9_build_inter_predictors_b(BLOCKD *d, int pitch, vp9_subpix_fn_t sppf) {
 | 
					void vp9_build_inter_predictors_b(BLOCKD *d, int pitch,
 | 
				
			||||||
  int r;
 | 
					                                  struct subpix_fn_table *subpix) {
 | 
				
			||||||
  uint8_t *ptr_base;
 | 
					  uint8_t *ptr_base;
 | 
				
			||||||
  uint8_t *ptr;
 | 
					  uint8_t *ptr;
 | 
				
			||||||
  uint8_t *pred_ptr = d->predictor;
 | 
					  uint8_t *pred_ptr = d->predictor;
 | 
				
			||||||
@@ -199,30 +155,14 @@ void vp9_build_inter_predictors_b(BLOCKD *d, int pitch, vp9_subpix_fn_t sppf) {
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
  ptr_base = *(d->base_pre);
 | 
					  ptr_base = *(d->base_pre);
 | 
				
			||||||
  mv.as_int = d->bmi.as_mv.first.as_int;
 | 
					  mv.as_int = d->bmi.as_mv.first.as_int;
 | 
				
			||||||
 | 
					  ptr = ptr_base + d->pre + (mv.as_mv.row >> 3) * d->pre_stride +
 | 
				
			||||||
 | 
					        (mv.as_mv.col >> 3);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  if (mv.as_mv.row & 7 || mv.as_mv.col & 7) {
 | 
					  subpix->predict[!!(mv.as_mv.col & 7)][!!(mv.as_mv.row & 7)][0](
 | 
				
			||||||
    ptr = ptr_base + d->pre + (mv.as_mv.row >> 3) * d->pre_stride +
 | 
					      ptr, d->pre_stride, pred_ptr, pitch,
 | 
				
			||||||
          (mv.as_mv.col >> 3);
 | 
					      subpix->filter_x[(mv.as_mv.col & 7) << 1], subpix->x_step_q4,
 | 
				
			||||||
    sppf(ptr, d->pre_stride, (mv.as_mv.col & 7) << 1, (mv.as_mv.row & 7) << 1,
 | 
					      subpix->filter_y[(mv.as_mv.row & 7) << 1], subpix->y_step_q4,
 | 
				
			||||||
         pred_ptr, pitch);
 | 
					      4, 4);
 | 
				
			||||||
  } else {
 | 
					 | 
				
			||||||
    ptr_base += d->pre + (mv.as_mv.row >> 3) * d->pre_stride +
 | 
					 | 
				
			||||||
                (mv.as_mv.col >> 3);
 | 
					 | 
				
			||||||
    ptr = ptr_base;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    for (r = 0; r < 4; r++) {
 | 
					 | 
				
			||||||
#if !(CONFIG_FAST_UNALIGNED)
 | 
					 | 
				
			||||||
      pred_ptr[0]  = ptr[0];
 | 
					 | 
				
			||||||
      pred_ptr[1]  = ptr[1];
 | 
					 | 
				
			||||||
      pred_ptr[2]  = ptr[2];
 | 
					 | 
				
			||||||
      pred_ptr[3]  = ptr[3];
 | 
					 | 
				
			||||||
#else
 | 
					 | 
				
			||||||
      *(uint32_t *)pred_ptr = *(uint32_t *)ptr;
 | 
					 | 
				
			||||||
#endif
 | 
					 | 
				
			||||||
      pred_ptr     += pitch;
 | 
					 | 
				
			||||||
      ptr         += d->pre_stride;
 | 
					 | 
				
			||||||
    }
 | 
					 | 
				
			||||||
  }
 | 
					 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
/*
 | 
					/*
 | 
				
			||||||
@@ -232,8 +172,7 @@ void vp9_build_inter_predictors_b(BLOCKD *d, int pitch, vp9_subpix_fn_t sppf) {
 | 
				
			|||||||
 * predictor of the second reference frame / motion vector.
 | 
					 * predictor of the second reference frame / motion vector.
 | 
				
			||||||
 */
 | 
					 */
 | 
				
			||||||
void vp9_build_2nd_inter_predictors_b(BLOCKD *d, int pitch,
 | 
					void vp9_build_2nd_inter_predictors_b(BLOCKD *d, int pitch,
 | 
				
			||||||
                                      vp9_subpix_fn_t sppf) {
 | 
					                                      struct subpix_fn_table *subpix) {
 | 
				
			||||||
  int r;
 | 
					 | 
				
			||||||
  uint8_t *ptr_base;
 | 
					  uint8_t *ptr_base;
 | 
				
			||||||
  uint8_t *ptr;
 | 
					  uint8_t *ptr;
 | 
				
			||||||
  uint8_t *pred_ptr = d->predictor;
 | 
					  uint8_t *pred_ptr = d->predictor;
 | 
				
			||||||
@@ -241,26 +180,14 @@ void vp9_build_2nd_inter_predictors_b(BLOCKD *d, int pitch,
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
  ptr_base = *(d->base_second_pre);
 | 
					  ptr_base = *(d->base_second_pre);
 | 
				
			||||||
  mv.as_int = d->bmi.as_mv.second.as_int;
 | 
					  mv.as_int = d->bmi.as_mv.second.as_int;
 | 
				
			||||||
 | 
					  ptr = ptr_base + d->pre + (mv.as_mv.row >> 3) * d->pre_stride +
 | 
				
			||||||
 | 
					        (mv.as_mv.col >> 3);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  if (mv.as_mv.row & 7 || mv.as_mv.col & 7) {
 | 
					  subpix->predict[!!(mv.as_mv.col & 7)][!!(mv.as_mv.row & 7)][1](
 | 
				
			||||||
    ptr = ptr_base + d->pre + (mv.as_mv.row >> 3) * d->pre_stride +
 | 
					      ptr, d->pre_stride, pred_ptr, pitch,
 | 
				
			||||||
          (mv.as_mv.col >> 3);
 | 
					      subpix->filter_x[(mv.as_mv.col & 7) << 1], subpix->x_step_q4,
 | 
				
			||||||
    sppf(ptr, d->pre_stride, (mv.as_mv.col & 7) << 1, (mv.as_mv.row & 7) << 1,
 | 
					      subpix->filter_y[(mv.as_mv.row & 7) << 1], subpix->y_step_q4,
 | 
				
			||||||
         pred_ptr, pitch);
 | 
					      4, 4);
 | 
				
			||||||
  } else {
 | 
					 | 
				
			||||||
    ptr_base += d->pre + (mv.as_mv.row >> 3) * d->pre_stride +
 | 
					 | 
				
			||||||
                (mv.as_mv.col >> 3);
 | 
					 | 
				
			||||||
    ptr = ptr_base;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    for (r = 0; r < 4; r++) {
 | 
					 | 
				
			||||||
      pred_ptr[0]  = (pred_ptr[0] + ptr[0] + 1) >> 1;
 | 
					 | 
				
			||||||
      pred_ptr[1]  = (pred_ptr[1] + ptr[1] + 1) >> 1;
 | 
					 | 
				
			||||||
      pred_ptr[2]  = (pred_ptr[2] + ptr[2] + 1) >> 1;
 | 
					 | 
				
			||||||
      pred_ptr[3]  = (pred_ptr[3] + ptr[3] + 1) >> 1;
 | 
					 | 
				
			||||||
      pred_ptr    += pitch;
 | 
					 | 
				
			||||||
      ptr         += d->pre_stride;
 | 
					 | 
				
			||||||
    }
 | 
					 | 
				
			||||||
  }
 | 
					 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
void vp9_build_inter_predictors4b(MACROBLOCKD *xd, BLOCKD *d, int pitch) {
 | 
					void vp9_build_inter_predictors4b(MACROBLOCKD *xd, BLOCKD *d, int pitch) {
 | 
				
			||||||
@@ -274,12 +201,11 @@ void vp9_build_inter_predictors4b(MACROBLOCKD *xd, BLOCKD *d, int pitch) {
 | 
				
			|||||||
  ptr = ptr_base + d->pre + (mv.as_mv.row >> 3) * d->pre_stride +
 | 
					  ptr = ptr_base + d->pre + (mv.as_mv.row >> 3) * d->pre_stride +
 | 
				
			||||||
        (mv.as_mv.col >> 3);
 | 
					        (mv.as_mv.col >> 3);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  if (mv.as_mv.row & 7 || mv.as_mv.col & 7) {
 | 
					  xd->subpix.predict[!!(mv.as_mv.col & 7)][!!(mv.as_mv.row & 7)][0](
 | 
				
			||||||
    xd->subpixel_predict8x8(ptr, d->pre_stride, (mv.as_mv.col & 7) << 1,
 | 
					      ptr, d->pre_stride, pred_ptr, pitch,
 | 
				
			||||||
                            (mv.as_mv.row & 7) << 1, pred_ptr, pitch);
 | 
					      xd->subpix.filter_x[(mv.as_mv.col & 7) << 1], xd->subpix.x_step_q4,
 | 
				
			||||||
  } else {
 | 
					      xd->subpix.filter_y[(mv.as_mv.row & 7) << 1], xd->subpix.y_step_q4,
 | 
				
			||||||
    vp9_copy_mem8x8(ptr, d->pre_stride, pred_ptr, pitch);
 | 
					      8, 8);
 | 
				
			||||||
  }
 | 
					 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
/*
 | 
					/*
 | 
				
			||||||
@@ -300,12 +226,11 @@ void vp9_build_2nd_inter_predictors4b(MACROBLOCKD *xd,
 | 
				
			|||||||
  ptr = ptr_base + d->pre + (mv.as_mv.row >> 3) * d->pre_stride +
 | 
					  ptr = ptr_base + d->pre + (mv.as_mv.row >> 3) * d->pre_stride +
 | 
				
			||||||
        (mv.as_mv.col >> 3);
 | 
					        (mv.as_mv.col >> 3);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  if (mv.as_mv.row & 7 || mv.as_mv.col & 7) {
 | 
					  xd->subpix.predict[!!(mv.as_mv.col & 7)][!!(mv.as_mv.row & 7)][1](
 | 
				
			||||||
    xd->subpixel_predict_avg8x8(ptr, d->pre_stride, (mv.as_mv.col & 7) << 1,
 | 
					      ptr, d->pre_stride, pred_ptr, pitch,
 | 
				
			||||||
                               (mv.as_mv.row & 7) << 1, pred_ptr, pitch);
 | 
					      xd->subpix.filter_x[(mv.as_mv.col & 7) << 1], xd->subpix.x_step_q4,
 | 
				
			||||||
  } else {
 | 
					      xd->subpix.filter_y[(mv.as_mv.row & 7) << 1], xd->subpix.y_step_q4,
 | 
				
			||||||
    vp9_avg_mem8x8(ptr, d->pre_stride, pred_ptr, pitch);
 | 
					      8, 8);
 | 
				
			||||||
  }
 | 
					 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
static void build_inter_predictors2b(MACROBLOCKD *xd, BLOCKD *d, int pitch) {
 | 
					static void build_inter_predictors2b(MACROBLOCKD *xd, BLOCKD *d, int pitch) {
 | 
				
			||||||
@@ -319,12 +244,11 @@ static void build_inter_predictors2b(MACROBLOCKD *xd, BLOCKD *d, int pitch) {
 | 
				
			|||||||
  ptr = ptr_base + d->pre + (mv.as_mv.row >> 3) * d->pre_stride +
 | 
					  ptr = ptr_base + d->pre + (mv.as_mv.row >> 3) * d->pre_stride +
 | 
				
			||||||
        (mv.as_mv.col >> 3);
 | 
					        (mv.as_mv.col >> 3);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  if (mv.as_mv.row & 7 || mv.as_mv.col & 7) {
 | 
					  xd->subpix.predict[!!(mv.as_mv.col & 7)][!!(mv.as_mv.row & 7)][0](
 | 
				
			||||||
    xd->subpixel_predict8x4(ptr, d->pre_stride, (mv.as_mv.col & 7) << 1,
 | 
					      ptr, d->pre_stride, pred_ptr, pitch,
 | 
				
			||||||
                           (mv.as_mv.row & 7) << 1, pred_ptr, pitch);
 | 
					      xd->subpix.filter_x[(mv.as_mv.col & 7) << 1], xd->subpix.x_step_q4,
 | 
				
			||||||
  } else {
 | 
					      xd->subpix.filter_y[(mv.as_mv.row & 7) << 1], xd->subpix.y_step_q4,
 | 
				
			||||||
    vp9_copy_mem8x4(ptr, d->pre_stride, pred_ptr, pitch);
 | 
					      8, 4);
 | 
				
			||||||
  }
 | 
					 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
/*encoder only*/
 | 
					/*encoder only*/
 | 
				
			||||||
@@ -411,13 +335,13 @@ void vp9_build_inter4x4_predictors_mbuv(MACROBLOCKD *xd) {
 | 
				
			|||||||
    if (d0->bmi.as_mv.first.as_int == d1->bmi.as_mv.first.as_int)
 | 
					    if (d0->bmi.as_mv.first.as_int == d1->bmi.as_mv.first.as_int)
 | 
				
			||||||
      build_inter_predictors2b(xd, d0, 8);
 | 
					      build_inter_predictors2b(xd, d0, 8);
 | 
				
			||||||
    else {
 | 
					    else {
 | 
				
			||||||
      vp9_build_inter_predictors_b(d0, 8, xd->subpixel_predict4x4);
 | 
					      vp9_build_inter_predictors_b(d0, 8, &xd->subpix);
 | 
				
			||||||
      vp9_build_inter_predictors_b(d1, 8, xd->subpixel_predict4x4);
 | 
					      vp9_build_inter_predictors_b(d1, 8, &xd->subpix);
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    if (xd->mode_info_context->mbmi.second_ref_frame > 0) {
 | 
					    if (xd->mode_info_context->mbmi.second_ref_frame > 0) {
 | 
				
			||||||
      vp9_build_2nd_inter_predictors_b(d0, 8, xd->subpixel_predict_avg4x4);
 | 
					      vp9_build_2nd_inter_predictors_b(d0, 8, &xd->subpix);
 | 
				
			||||||
      vp9_build_2nd_inter_predictors_b(d1, 8, xd->subpixel_predict_avg4x4);
 | 
					      vp9_build_2nd_inter_predictors_b(d1, 8, &xd->subpix);
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
@@ -475,14 +399,11 @@ void vp9_build_1st_inter16x16_predictors_mby(MACROBLOCKD *xd,
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
  ptr = ptr_base + (ymv.as_mv.row >> 3) * pre_stride + (ymv.as_mv.col >> 3);
 | 
					  ptr = ptr_base + (ymv.as_mv.row >> 3) * pre_stride + (ymv.as_mv.col >> 3);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    if ((ymv.as_mv.row | ymv.as_mv.col) & 7) {
 | 
					  xd->subpix.predict[!!(ymv.as_mv.col & 7)][!!(ymv.as_mv.row & 7)][0](
 | 
				
			||||||
      xd->subpixel_predict16x16(ptr, pre_stride,
 | 
					      ptr, pre_stride, dst_y, dst_ystride,
 | 
				
			||||||
                                (ymv.as_mv.col & 7) << 1,
 | 
					      xd->subpix.filter_x[(ymv.as_mv.col & 7) << 1], xd->subpix.x_step_q4,
 | 
				
			||||||
                                (ymv.as_mv.row & 7) << 1,
 | 
					      xd->subpix.filter_y[(ymv.as_mv.row & 7) << 1], xd->subpix.y_step_q4,
 | 
				
			||||||
                                dst_y, dst_ystride);
 | 
					      16, 16);
 | 
				
			||||||
    } else {
 | 
					 | 
				
			||||||
      vp9_copy_mem16x16(ptr, pre_stride, dst_y, dst_ystride);
 | 
					 | 
				
			||||||
    }
 | 
					 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
void vp9_build_1st_inter16x16_predictors_mbuv(MACROBLOCKD *xd,
 | 
					void vp9_build_1st_inter16x16_predictors_mbuv(MACROBLOCKD *xd,
 | 
				
			||||||
@@ -523,15 +444,19 @@ void vp9_build_1st_inter16x16_predictors_mbuv(MACROBLOCKD *xd,
 | 
				
			|||||||
  uptr = xd->pre.u_buffer + offset;
 | 
					  uptr = xd->pre.u_buffer + offset;
 | 
				
			||||||
  vptr = xd->pre.v_buffer + offset;
 | 
					  vptr = xd->pre.v_buffer + offset;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    if (_o16x16mv.as_int & 0x000f000f) {
 | 
					  xd->subpix.predict[!!(_o16x16mv.as_mv.col & 15)]
 | 
				
			||||||
      xd->subpixel_predict8x8(uptr, pre_stride, _o16x16mv.as_mv.col & 15,
 | 
					                    [!!(_o16x16mv.as_mv.row & 15)][0](
 | 
				
			||||||
                              _o16x16mv.as_mv.row & 15, dst_u, dst_uvstride);
 | 
					      uptr, pre_stride, dst_u, dst_uvstride,
 | 
				
			||||||
      xd->subpixel_predict8x8(vptr, pre_stride, _o16x16mv.as_mv.col & 15,
 | 
					      xd->subpix.filter_x[_o16x16mv.as_mv.col & 15], xd->subpix.x_step_q4,
 | 
				
			||||||
                              _o16x16mv.as_mv.row & 15, dst_v, dst_uvstride);
 | 
					      xd->subpix.filter_y[_o16x16mv.as_mv.row & 15], xd->subpix.y_step_q4,
 | 
				
			||||||
    } else {
 | 
					      8, 8);
 | 
				
			||||||
      vp9_copy_mem8x8(uptr, pre_stride, dst_u, dst_uvstride);
 | 
					
 | 
				
			||||||
      vp9_copy_mem8x8(vptr, pre_stride, dst_v, dst_uvstride);
 | 
					  xd->subpix.predict[!!(_o16x16mv.as_mv.col & 15)]
 | 
				
			||||||
    }
 | 
					                    [!!(_o16x16mv.as_mv.row & 15)][0](
 | 
				
			||||||
 | 
					      vptr, pre_stride, dst_v, dst_uvstride,
 | 
				
			||||||
 | 
					      xd->subpix.filter_x[_o16x16mv.as_mv.col & 15], xd->subpix.x_step_q4,
 | 
				
			||||||
 | 
					      xd->subpix.filter_y[_o16x16mv.as_mv.row & 15], xd->subpix.y_step_q4,
 | 
				
			||||||
 | 
					      8, 8);
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -714,12 +639,11 @@ void vp9_build_2nd_inter16x16_predictors_mby(MACROBLOCKD *xd,
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
  ptr = ptr_base + (mv_row >> 3) * pre_stride + (mv_col >> 3);
 | 
					  ptr = ptr_base + (mv_row >> 3) * pre_stride + (mv_col >> 3);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  if ((mv_row | mv_col) & 7) {
 | 
					  xd->subpix.predict[!!(mv_col & 7)][!!(mv_row & 7)][1](
 | 
				
			||||||
    xd->subpixel_predict_avg16x16(ptr, pre_stride, (mv_col & 7) << 1,
 | 
					      ptr, pre_stride, dst_y, dst_ystride,
 | 
				
			||||||
                                  (mv_row & 7) << 1, dst_y, dst_ystride);
 | 
					      xd->subpix.filter_x[(mv_col & 7) << 1], xd->subpix.x_step_q4,
 | 
				
			||||||
  } else {
 | 
					      xd->subpix.filter_y[(mv_row & 7) << 1], xd->subpix.y_step_q4,
 | 
				
			||||||
    vp9_avg_mem16x16(ptr, pre_stride, dst_y, dst_ystride);
 | 
					      16, 16);
 | 
				
			||||||
  }
 | 
					 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
void vp9_build_2nd_inter16x16_predictors_mbuv(MACROBLOCKD *xd,
 | 
					void vp9_build_2nd_inter16x16_predictors_mbuv(MACROBLOCKD *xd,
 | 
				
			||||||
@@ -758,15 +682,17 @@ void vp9_build_2nd_inter16x16_predictors_mbuv(MACROBLOCKD *xd,
 | 
				
			|||||||
  uptr = xd->second_pre.u_buffer + offset;
 | 
					  uptr = xd->second_pre.u_buffer + offset;
 | 
				
			||||||
  vptr = xd->second_pre.v_buffer + offset;
 | 
					  vptr = xd->second_pre.v_buffer + offset;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    if ((omv_row | omv_col) & 15) {
 | 
					  xd->subpix.predict[!!(omv_col & 15)][!!(omv_row & 15)][1](
 | 
				
			||||||
      xd->subpixel_predict_avg8x8(uptr, pre_stride, omv_col & 15,
 | 
					      uptr, pre_stride, dst_u, dst_uvstride,
 | 
				
			||||||
                                  omv_row & 15, dst_u, dst_uvstride);
 | 
					      xd->subpix.filter_x[omv_col & 15], xd->subpix.x_step_q4,
 | 
				
			||||||
      xd->subpixel_predict_avg8x8(vptr, pre_stride, omv_col & 15,
 | 
					      xd->subpix.filter_y[omv_row & 15], xd->subpix.y_step_q4,
 | 
				
			||||||
                                  omv_row & 15, dst_v, dst_uvstride);
 | 
					      8, 8);
 | 
				
			||||||
    } else {
 | 
					
 | 
				
			||||||
      vp9_avg_mem8x8(uptr, pre_stride, dst_u, dst_uvstride);
 | 
					  xd->subpix.predict[!!(omv_col & 15)][!!(omv_row & 15)][1](
 | 
				
			||||||
      vp9_avg_mem8x8(vptr, pre_stride, dst_v, dst_uvstride);
 | 
					      vptr, pre_stride, dst_v, dst_uvstride,
 | 
				
			||||||
    }
 | 
					      xd->subpix.filter_x[omv_col & 15], xd->subpix.x_step_q4,
 | 
				
			||||||
 | 
					      xd->subpix.filter_y[omv_row & 15], xd->subpix.y_step_q4,
 | 
				
			||||||
 | 
					      8, 8);
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
void vp9_build_2nd_inter16x16_predictors_mb(MACROBLOCKD *xd,
 | 
					void vp9_build_2nd_inter16x16_predictors_mb(MACROBLOCKD *xd,
 | 
				
			||||||
@@ -835,13 +761,13 @@ static void build_inter4x4_predictors_mb(MACROBLOCKD *xd) {
 | 
				
			|||||||
      if (d0->bmi.as_mv.first.as_int == d1->bmi.as_mv.first.as_int)
 | 
					      if (d0->bmi.as_mv.first.as_int == d1->bmi.as_mv.first.as_int)
 | 
				
			||||||
        build_inter_predictors2b(xd, d0, 16);
 | 
					        build_inter_predictors2b(xd, d0, 16);
 | 
				
			||||||
      else {
 | 
					      else {
 | 
				
			||||||
        vp9_build_inter_predictors_b(d0, 16, xd->subpixel_predict4x4);
 | 
					        vp9_build_inter_predictors_b(d0, 16, &xd->subpix);
 | 
				
			||||||
        vp9_build_inter_predictors_b(d1, 16, xd->subpixel_predict4x4);
 | 
					        vp9_build_inter_predictors_b(d1, 16, &xd->subpix);
 | 
				
			||||||
      }
 | 
					      }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
      if (mbmi->second_ref_frame > 0) {
 | 
					      if (mbmi->second_ref_frame > 0) {
 | 
				
			||||||
        vp9_build_2nd_inter_predictors_b(d0, 16, xd->subpixel_predict_avg4x4);
 | 
					        vp9_build_2nd_inter_predictors_b(d0, 16, &xd->subpix);
 | 
				
			||||||
        vp9_build_2nd_inter_predictors_b(d1, 16, xd->subpixel_predict_avg4x4);
 | 
					        vp9_build_2nd_inter_predictors_b(d1, 16, &xd->subpix);
 | 
				
			||||||
      }
 | 
					      }
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
@@ -853,13 +779,13 @@ static void build_inter4x4_predictors_mb(MACROBLOCKD *xd) {
 | 
				
			|||||||
    if (d0->bmi.as_mv.first.as_int == d1->bmi.as_mv.first.as_int)
 | 
					    if (d0->bmi.as_mv.first.as_int == d1->bmi.as_mv.first.as_int)
 | 
				
			||||||
      build_inter_predictors2b(xd, d0, 8);
 | 
					      build_inter_predictors2b(xd, d0, 8);
 | 
				
			||||||
    else {
 | 
					    else {
 | 
				
			||||||
      vp9_build_inter_predictors_b(d0, 8, xd->subpixel_predict4x4);
 | 
					      vp9_build_inter_predictors_b(d0, 8, &xd->subpix);
 | 
				
			||||||
      vp9_build_inter_predictors_b(d1, 8, xd->subpixel_predict4x4);
 | 
					      vp9_build_inter_predictors_b(d1, 8, &xd->subpix);
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    if (mbmi->second_ref_frame > 0) {
 | 
					    if (mbmi->second_ref_frame > 0) {
 | 
				
			||||||
      vp9_build_2nd_inter_predictors_b(d0, 8, xd->subpixel_predict_avg4x4);
 | 
					      vp9_build_2nd_inter_predictors_b(d0, 8, &xd->subpix);
 | 
				
			||||||
      vp9_build_2nd_inter_predictors_b(d1, 8, xd->subpixel_predict_avg4x4);
 | 
					      vp9_build_2nd_inter_predictors_b(d1, 8, &xd->subpix);
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -14,6 +14,8 @@
 | 
				
			|||||||
#include "vpx/vpx_integer.h"
 | 
					#include "vpx/vpx_integer.h"
 | 
				
			||||||
#include "vp9/common/vp9_onyxc_int.h"
 | 
					#include "vp9/common/vp9_onyxc_int.h"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					struct subpix_fn_table;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
extern void vp9_build_1st_inter16x16_predictors_mby(MACROBLOCKD *xd,
 | 
					extern void vp9_build_1st_inter16x16_predictors_mby(MACROBLOCKD *xd,
 | 
				
			||||||
                                                    uint8_t *dst_y,
 | 
					                                                    uint8_t *dst_y,
 | 
				
			||||||
                                                    int dst_ystride,
 | 
					                                                    int dst_ystride,
 | 
				
			||||||
@@ -64,10 +66,10 @@ extern void vp9_build_inter64x64_predictors_sb(MACROBLOCKD *x,
 | 
				
			|||||||
extern void vp9_build_inter_predictors_mb(MACROBLOCKD *xd);
 | 
					extern void vp9_build_inter_predictors_mb(MACROBLOCKD *xd);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
extern void vp9_build_inter_predictors_b(BLOCKD *d, int pitch,
 | 
					extern void vp9_build_inter_predictors_b(BLOCKD *d, int pitch,
 | 
				
			||||||
                                         vp9_subpix_fn_t sppf);
 | 
					                                         struct subpix_fn_table *sppf);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
extern void vp9_build_2nd_inter_predictors_b(BLOCKD *d, int pitch,
 | 
					extern void vp9_build_2nd_inter_predictors_b(BLOCKD *d, int pitch,
 | 
				
			||||||
                                             vp9_subpix_fn_t sppf);
 | 
					                                             struct subpix_fn_table *sppf);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
extern void vp9_build_inter_predictors4b(MACROBLOCKD *xd, BLOCKD *d,
 | 
					extern void vp9_build_inter_predictors4b(MACROBLOCKD *xd, BLOCKD *d,
 | 
				
			||||||
                                         int pitch);
 | 
					                                         int pitch);
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -23,21 +23,6 @@ EOF
 | 
				
			|||||||
}
 | 
					}
 | 
				
			||||||
forward_decls vp9_common_forward_decls
 | 
					forward_decls vp9_common_forward_decls
 | 
				
			||||||
 | 
					
 | 
				
			||||||
prototype void vp9_filter_block2d_4x4_8 "const uint8_t *src_ptr, const unsigned int src_stride, const int16_t *HFilter_aligned16, const int16_t *VFilter_aligned16, uint8_t *dst_ptr, unsigned int dst_stride"
 | 
					 | 
				
			||||||
prototype void vp9_filter_block2d_8x4_8 "const uint8_t *src_ptr, const unsigned int src_stride, const int16_t *HFilter_aligned16, const int16_t *VFilter_aligned16, uint8_t *dst_ptr, unsigned int dst_stride"
 | 
					 | 
				
			||||||
prototype void vp9_filter_block2d_8x8_8 "const uint8_t *src_ptr, const unsigned int src_stride, const int16_t *HFilter_aligned16, const int16_t *VFilter_aligned16, uint8_t *dst_ptr, unsigned int dst_stride"
 | 
					 | 
				
			||||||
prototype void vp9_filter_block2d_16x16_8 "const uint8_t *src_ptr, const unsigned int src_stride, const int16_t *HFilter_aligned16, const int16_t *VFilter_aligned16, uint8_t *dst_ptr, unsigned int dst_stride"
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
# At the very least, MSVC 2008 has compiler bug exhibited by this code; code
 | 
					 | 
				
			||||||
# compiles warning free but a dissassembly of generated code show bugs. To be
 | 
					 | 
				
			||||||
# on the safe side, only enabled when compiled with 'gcc'.
 | 
					 | 
				
			||||||
if [ "$CONFIG_GCC" = "yes" ]; then
 | 
					 | 
				
			||||||
    specialize vp9_filter_block2d_4x4_8 sse4_1 sse2
 | 
					 | 
				
			||||||
fi
 | 
					 | 
				
			||||||
    specialize vp9_filter_block2d_8x4_8 ssse3 #sse4_1 sse2
 | 
					 | 
				
			||||||
    specialize vp9_filter_block2d_8x8_8 ssse3 #sse4_1 sse2
 | 
					 | 
				
			||||||
    specialize vp9_filter_block2d_16x16_8 ssse3 #sse4_1 sse2
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#
 | 
					#
 | 
				
			||||||
# Dequant
 | 
					# Dequant
 | 
				
			||||||
#
 | 
					#
 | 
				
			||||||
@@ -86,27 +71,17 @@ specialize vp9_dequant_idct_add_uv_block_16x16
 | 
				
			|||||||
#
 | 
					#
 | 
				
			||||||
# RECON
 | 
					# RECON
 | 
				
			||||||
#
 | 
					#
 | 
				
			||||||
prototype void vp9_copy_mem16x16 "uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch"
 | 
					prototype void vp9_copy_mem16x16 "const uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch"
 | 
				
			||||||
specialize vp9_copy_mem16x16 mmx sse2 dspr2
 | 
					specialize vp9_copy_mem16x16 mmx sse2 dspr2
 | 
				
			||||||
vp9_copy_mem16x16_dspr2=vp9_copy_mem16x16_dspr2
 | 
					vp9_copy_mem16x16_dspr2=vp9_copy_mem16x16_dspr2
 | 
				
			||||||
 | 
					
 | 
				
			||||||
prototype void vp9_copy_mem8x8 "uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch"
 | 
					prototype void vp9_copy_mem8x8 "const uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch"
 | 
				
			||||||
specialize vp9_copy_mem8x8 mmx dspr2
 | 
					specialize vp9_copy_mem8x8 mmx dspr2
 | 
				
			||||||
vp9_copy_mem8x8_dspr2=vp9_copy_mem8x8_dspr2
 | 
					vp9_copy_mem8x8_dspr2=vp9_copy_mem8x8_dspr2
 | 
				
			||||||
 | 
					
 | 
				
			||||||
prototype void vp9_copy_mem8x4 "uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch"
 | 
					prototype void vp9_copy_mem8x4 "const uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch"
 | 
				
			||||||
specialize vp9_copy_mem8x4 mmx
 | 
					specialize vp9_copy_mem8x4 mmx
 | 
				
			||||||
 | 
					
 | 
				
			||||||
prototype void vp9_avg_mem16x16 "uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch"
 | 
					 | 
				
			||||||
specialize vp9_avg_mem16x16
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
prototype void vp9_avg_mem8x8 "uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch"
 | 
					 | 
				
			||||||
specialize vp9_avg_mem8x8
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
prototype void vp9_copy_mem8x4 "uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch"
 | 
					 | 
				
			||||||
specialize vp9_copy_mem8x4 mmx dspr2
 | 
					 | 
				
			||||||
vp9_copy_mem8x4_dspr2=vp9_copy_mem8x4_dspr2
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
prototype void vp9_recon_b "uint8_t *pred_ptr, int16_t *diff_ptr, uint8_t *dst_ptr, int stride"
 | 
					prototype void vp9_recon_b "uint8_t *pred_ptr, int16_t *diff_ptr, uint8_t *dst_ptr, int stride"
 | 
				
			||||||
specialize vp9_recon_b
 | 
					specialize vp9_recon_b
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -269,110 +244,23 @@ specialize vp9_sub_pixel_variance16x2 sse2
 | 
				
			|||||||
#
 | 
					#
 | 
				
			||||||
# Sub Pixel Filters
 | 
					# Sub Pixel Filters
 | 
				
			||||||
#
 | 
					#
 | 
				
			||||||
prototype void vp9_eighttap_predict16x16 "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"
 | 
					prototype void vp9_convolve8 "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
 | 
				
			||||||
specialize vp9_eighttap_predict16x16
 | 
					specialize vp9_convolve8 ssse3
 | 
				
			||||||
 | 
					
 | 
				
			||||||
prototype void vp9_eighttap_predict8x8 "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"
 | 
					prototype void vp9_convolve8_horiz "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
 | 
				
			||||||
specialize vp9_eighttap_predict8x8
 | 
					specialize vp9_convolve8_horiz ssse3
 | 
				
			||||||
 | 
					
 | 
				
			||||||
prototype void vp9_eighttap_predict_avg16x16 "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"
 | 
					prototype void vp9_convolve8_vert "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
 | 
				
			||||||
specialize vp9_eighttap_predict_avg16x16
 | 
					specialize vp9_convolve8_vert ssse3
 | 
				
			||||||
 | 
					
 | 
				
			||||||
prototype void vp9_eighttap_predict_avg8x8 "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"
 | 
					prototype void vp9_convolve8_avg "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
 | 
				
			||||||
specialize vp9_eighttap_predict_avg8x8
 | 
					specialize vp9_convolve8_avg
 | 
				
			||||||
 | 
					
 | 
				
			||||||
prototype void vp9_eighttap_predict_avg4x4 "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"
 | 
					prototype void vp9_convolve8_avg_horiz "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
 | 
				
			||||||
specialize vp9_eighttap_predict_avg4x4
 | 
					specialize vp9_convolve8_avg_horiz
 | 
				
			||||||
 | 
					
 | 
				
			||||||
prototype void vp9_eighttap_predict8x4 "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"
 | 
					prototype void vp9_convolve8_avg_vert "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
 | 
				
			||||||
specialize vp9_eighttap_predict8x4
 | 
					specialize vp9_convolve8_avg_vert
 | 
				
			||||||
 | 
					 | 
				
			||||||
prototype void vp9_eighttap_predict4x4 "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"
 | 
					 | 
				
			||||||
specialize vp9_eighttap_predict4x4
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
prototype void vp9_eighttap_predict16x16_sharp "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"
 | 
					 | 
				
			||||||
specialize vp9_eighttap_predict16x16_sharp
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
prototype void vp9_eighttap_predict8x8_sharp "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"
 | 
					 | 
				
			||||||
specialize vp9_eighttap_predict8x8_sharp
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
prototype void vp9_eighttap_predict_avg16x16_sharp "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"
 | 
					 | 
				
			||||||
specialize vp9_eighttap_predict_avg16x16_sharp
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
prototype void vp9_eighttap_predict_avg8x8_sharp "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"
 | 
					 | 
				
			||||||
specialize vp9_eighttap_predict_avg8x8_sharp
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
prototype void vp9_eighttap_predict_avg4x4_sharp "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"
 | 
					 | 
				
			||||||
specialize vp9_eighttap_predict_avg4x4_sharp
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
prototype void vp9_eighttap_predict8x4_sharp "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"
 | 
					 | 
				
			||||||
specialize vp9_eighttap_predict8x4_sharp
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
prototype void vp9_eighttap_predict4x4_sharp "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"
 | 
					 | 
				
			||||||
specialize vp9_eighttap_predict4x4_sharp
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
prototype void vp9_eighttap_predict16x16_smooth "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"
 | 
					 | 
				
			||||||
specialize vp9_eighttap_predict16x16_smooth
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
prototype void vp9_eighttap_predict8x8_smooth "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"
 | 
					 | 
				
			||||||
specialize vp9_eighttap_predict8x8_smooth
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
prototype void vp9_eighttap_predict_avg16x16_smooth "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"
 | 
					 | 
				
			||||||
specialize vp9_eighttap_predict_avg16x16_smooth
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
prototype void vp9_eighttap_predict_avg8x8_smooth "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"
 | 
					 | 
				
			||||||
specialize vp9_eighttap_predict_avg8x8_smooth
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
prototype void vp9_eighttap_predict_avg4x4_smooth "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"
 | 
					 | 
				
			||||||
specialize vp9_eighttap_predict_avg4x4_smooth
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
prototype void vp9_eighttap_predict8x4_smooth "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"
 | 
					 | 
				
			||||||
specialize vp9_eighttap_predict8x4_smooth
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
prototype void vp9_eighttap_predict4x4_smooth "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"
 | 
					 | 
				
			||||||
specialize vp9_eighttap_predict4x4_smooth
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
prototype void vp9_sixtap_predict16x16 "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"
 | 
					 | 
				
			||||||
specialize vp9_sixtap_predict16x16
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
prototype void vp9_sixtap_predict8x8 "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"
 | 
					 | 
				
			||||||
specialize vp9_sixtap_predict8x8
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
prototype void vp9_sixtap_predict_avg16x16 "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"
 | 
					 | 
				
			||||||
specialize vp9_sixtap_predict_avg16x16
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
prototype void vp9_sixtap_predict_avg8x8 "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"
 | 
					 | 
				
			||||||
specialize vp9_sixtap_predict_avg8x8
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
prototype void vp9_sixtap_predict8x4 "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"
 | 
					 | 
				
			||||||
specialize vp9_sixtap_predict8x4
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
prototype void vp9_sixtap_predict4x4 "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"
 | 
					 | 
				
			||||||
specialize vp9_sixtap_predict4x4
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
prototype void vp9_sixtap_predict_avg4x4 "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"
 | 
					 | 
				
			||||||
specialize vp9_sixtap_predict_avg4x4
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
prototype void vp9_bilinear_predict16x16 "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"
 | 
					 | 
				
			||||||
specialize vp9_bilinear_predict16x16 sse2
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
prototype void vp9_bilinear_predict8x8 "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"
 | 
					 | 
				
			||||||
specialize vp9_bilinear_predict8x8 sse2
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
prototype void vp9_bilinear_predict_avg16x16 "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"
 | 
					 | 
				
			||||||
specialize vp9_bilinear_predict_avg16x16
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
prototype void vp9_bilinear_predict_avg8x8 "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"
 | 
					 | 
				
			||||||
specialize vp9_bilinear_predict_avg8x8
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
prototype void vp9_bilinear_predict8x4 "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"
 | 
					 | 
				
			||||||
specialize vp9_bilinear_predict8x4
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
prototype void vp9_bilinear_predict4x4 "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"
 | 
					 | 
				
			||||||
specialize vp9_bilinear_predict4x4
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
prototype void vp9_bilinear_predict_avg4x4 "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"
 | 
					 | 
				
			||||||
specialize vp9_bilinear_predict_avg4x4
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
#
 | 
					#
 | 
				
			||||||
# dct
 | 
					# dct
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -1,20 +0,0 @@
 | 
				
			|||||||
/*
 | 
					 | 
				
			||||||
 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 | 
					 | 
				
			||||||
 *
 | 
					 | 
				
			||||||
 *  Use of this source code is governed by a BSD-style license
 | 
					 | 
				
			||||||
 *  that can be found in the LICENSE file in the root of the source
 | 
					 | 
				
			||||||
 *  tree. An additional intellectual property rights grant can be found
 | 
					 | 
				
			||||||
 *  in the file PATENTS.  All contributing project authors may
 | 
					 | 
				
			||||||
 *  be found in the AUTHORS file in the root of the source tree.
 | 
					 | 
				
			||||||
 */
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#ifndef VP9_COMMON_VP9_SUBPIXEL_H_
 | 
					 | 
				
			||||||
#define VP9_COMMON_VP9_SUBPIXEL_H_
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#define prototype_subpixel_predict(sym) \
 | 
					 | 
				
			||||||
  void sym(uint8_t *src, int src_pitch, int xofst, int yofst, \
 | 
					 | 
				
			||||||
           uint8_t *dst, int dst_pitch)
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
typedef prototype_subpixel_predict((*vp9_subpix_fn_t));
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#endif  // VP9_COMMON_VP9_SUBPIXEL_H_
 | 
					 | 
				
			||||||
@@ -8,91 +8,11 @@
 | 
				
			|||||||
 *  be found in the AUTHORS file in the root of the source tree.
 | 
					 *  be found in the AUTHORS file in the root of the source tree.
 | 
				
			||||||
 */
 | 
					 */
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#include <assert.h>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#include "./vpx_config.h"
 | 
					#include "./vpx_config.h"
 | 
				
			||||||
 | 
					#include "./vp9_rtcd.h"
 | 
				
			||||||
#include "vpx_ports/mem.h"
 | 
					#include "vpx_ports/mem.h"
 | 
				
			||||||
#include "vp9/common/vp9_subpixel.h"
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
extern const short vp9_six_tap_mmx[8][6 * 8];
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
extern void vp9_filter_block1d_h6_mmx(unsigned char   *src_ptr,
 | 
					 | 
				
			||||||
                                      unsigned short  *output_ptr,
 | 
					 | 
				
			||||||
                                      unsigned int     src_pixels_per_line,
 | 
					 | 
				
			||||||
                                      unsigned int     pixel_step,
 | 
					 | 
				
			||||||
                                      unsigned int     output_height,
 | 
					 | 
				
			||||||
                                      unsigned int     output_width,
 | 
					 | 
				
			||||||
                                      const short     *vp9_filter);
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
extern void vp9_filter_block1dc_v6_mmx(unsigned short *src_ptr,
 | 
					 | 
				
			||||||
                                       unsigned char  *output_ptr,
 | 
					 | 
				
			||||||
                                       int             output_pitch,
 | 
					 | 
				
			||||||
                                       unsigned int    pixels_per_line,
 | 
					 | 
				
			||||||
                                       unsigned int    pixel_step,
 | 
					 | 
				
			||||||
                                       unsigned int    output_height,
 | 
					 | 
				
			||||||
                                       unsigned int    output_width,
 | 
					 | 
				
			||||||
                                       const short    *vp9_filter);
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
extern void vp9_filter_block1d8_h6_sse2(unsigned char  *src_ptr,
 | 
					 | 
				
			||||||
                                        unsigned short *output_ptr,
 | 
					 | 
				
			||||||
                                        unsigned int    src_pixels_per_line,
 | 
					 | 
				
			||||||
                                        unsigned int    pixel_step,
 | 
					 | 
				
			||||||
                                        unsigned int    output_height,
 | 
					 | 
				
			||||||
                                        unsigned int    output_width,
 | 
					 | 
				
			||||||
                                        const short    *vp9_filter);
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
extern void vp9_filter_block1d16_h6_sse2(unsigned char  *src_ptr,
 | 
					 | 
				
			||||||
                                         unsigned short *output_ptr,
 | 
					 | 
				
			||||||
                                         unsigned int    src_pixels_per_line,
 | 
					 | 
				
			||||||
                                         unsigned int    pixel_step,
 | 
					 | 
				
			||||||
                                         unsigned int    output_height,
 | 
					 | 
				
			||||||
                                         unsigned int    output_width,
 | 
					 | 
				
			||||||
                                         const short    *vp9_filter);
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
extern void vp9_filter_block1d8_v6_sse2(unsigned short *src_ptr,
 | 
					 | 
				
			||||||
                                        unsigned char *output_ptr,
 | 
					 | 
				
			||||||
                                        int dst_ptich,
 | 
					 | 
				
			||||||
                                        unsigned int pixels_per_line,
 | 
					 | 
				
			||||||
                                        unsigned int pixel_step,
 | 
					 | 
				
			||||||
                                        unsigned int output_height,
 | 
					 | 
				
			||||||
                                        unsigned int output_width,
 | 
					 | 
				
			||||||
                                        const short    *vp9_filter);
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
extern void vp9_filter_block1d16_v6_sse2(unsigned short *src_ptr,
 | 
					 | 
				
			||||||
                                         unsigned char *output_ptr,
 | 
					 | 
				
			||||||
                                         int dst_ptich,
 | 
					 | 
				
			||||||
                                         unsigned int pixels_per_line,
 | 
					 | 
				
			||||||
                                         unsigned int pixel_step,
 | 
					 | 
				
			||||||
                                         unsigned int output_height,
 | 
					 | 
				
			||||||
                                         unsigned int output_width,
 | 
					 | 
				
			||||||
                                         const short    *vp9_filter);
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
extern void vp9_unpack_block1d16_h6_sse2(unsigned char  *src_ptr,
 | 
					 | 
				
			||||||
                                         unsigned short *output_ptr,
 | 
					 | 
				
			||||||
                                         unsigned int    src_pixels_per_line,
 | 
					 | 
				
			||||||
                                         unsigned int    output_height,
 | 
					 | 
				
			||||||
                                         unsigned int    output_width);
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
extern void vp9_filter_block1d8_h6_only_sse2(unsigned char *src_ptr,
 | 
					 | 
				
			||||||
                                             unsigned int   src_pixels_per_line,
 | 
					 | 
				
			||||||
                                             unsigned char *output_ptr,
 | 
					 | 
				
			||||||
                                             int            dst_pitch,
 | 
					 | 
				
			||||||
                                             unsigned int   output_height,
 | 
					 | 
				
			||||||
                                             const short   *vp9_filter);
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
extern void vp9_filter_block1d16_h6_only_sse2(unsigned char *src_ptr,
 | 
					 | 
				
			||||||
                                              unsigned int   src_pixels_per_lin,
 | 
					 | 
				
			||||||
                                              unsigned char *output_ptr,
 | 
					 | 
				
			||||||
                                              int            dst_pitch,
 | 
					 | 
				
			||||||
                                              unsigned int   output_height,
 | 
					 | 
				
			||||||
                                              const short   *vp9_filter);
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
extern void vp9_filter_block1d8_v6_only_sse2(unsigned char *src_ptr,
 | 
					 | 
				
			||||||
                                             unsigned int   src_pixels_per_line,
 | 
					 | 
				
			||||||
                                             unsigned char *output_ptr,
 | 
					 | 
				
			||||||
                                             int            dst_pitch,
 | 
					 | 
				
			||||||
                                             unsigned int   output_height,
 | 
					 | 
				
			||||||
                                             const short   *vp9_filter);
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
///////////////////////////////////////////////////////////////////////////
 | 
					///////////////////////////////////////////////////////////////////////////
 | 
				
			||||||
// the mmx function that does the bilinear filtering and var calculation //
 | 
					// the mmx function that does the bilinear filtering and var calculation //
 | 
				
			||||||
// int one pass                                                          //
 | 
					// int one pass                                                          //
 | 
				
			||||||
@@ -116,389 +36,7 @@ DECLARE_ALIGNED(16, const short, vp9_bilinear_filters_mmx[16][8]) = {
 | 
				
			|||||||
  {   8,  8,  8,  8, 120, 120, 120, 120 }
 | 
					  {   8,  8,  8,  8, 120, 120, 120, 120 }
 | 
				
			||||||
};
 | 
					};
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#if HAVE_MMX
 | 
					 | 
				
			||||||
void vp9_sixtap_predict4x4_mmx(unsigned char  *src_ptr,
 | 
					 | 
				
			||||||
                               int  src_pixels_per_line,
 | 
					 | 
				
			||||||
                               int  xoffset,
 | 
					 | 
				
			||||||
                               int  yoffset,
 | 
					 | 
				
			||||||
                               unsigned char *dst_ptr,
 | 
					 | 
				
			||||||
                               int  dst_pitch) {
 | 
					 | 
				
			||||||
#ifdef ANNOUNCE_FUNCTION
 | 
					 | 
				
			||||||
  printf("vp9_sixtap_predict4x4_mmx\n");
 | 
					 | 
				
			||||||
#endif
 | 
					 | 
				
			||||||
  /* Temp data bufffer used in filtering */
 | 
					 | 
				
			||||||
  DECLARE_ALIGNED_ARRAY(16, unsigned short, fdata2, 16 * 16);
 | 
					 | 
				
			||||||
  const short *hfilter, *vfilter;
 | 
					 | 
				
			||||||
  hfilter = vp9_six_tap_mmx[xoffset];
 | 
					 | 
				
			||||||
  vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line), fdata2,
 | 
					 | 
				
			||||||
                            src_pixels_per_line, 1, 9, 8, hfilter);
 | 
					 | 
				
			||||||
  vfilter = vp9_six_tap_mmx[yoffset];
 | 
					 | 
				
			||||||
  vp9_filter_block1dc_v6_mmx(fdata2 + 8, dst_ptr, dst_pitch,
 | 
					 | 
				
			||||||
                             8, 4, 4, 4, vfilter);
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
void vp9_sixtap_predict16x16_mmx(unsigned char  *src_ptr,
 | 
					 | 
				
			||||||
                                 int  src_pixels_per_line,
 | 
					 | 
				
			||||||
                                 int  xoffset,
 | 
					 | 
				
			||||||
                                 int  yoffset,
 | 
					 | 
				
			||||||
                                 unsigned char *dst_ptr,
 | 
					 | 
				
			||||||
                                 int dst_pitch) {
 | 
					 | 
				
			||||||
#ifdef ANNOUNCE_FUNCTION
 | 
					 | 
				
			||||||
  printf("vp9_sixtap_predict16x16_mmx\n");
 | 
					 | 
				
			||||||
#endif
 | 
					 | 
				
			||||||
  /* Temp data bufffer used in filtering */
 | 
					 | 
				
			||||||
  DECLARE_ALIGNED_ARRAY(16, unsigned short, fdata2, 24 * 24);
 | 
					 | 
				
			||||||
  const short *hfilter, *vfilter;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  hfilter = vp9_six_tap_mmx[xoffset];
 | 
					 | 
				
			||||||
  vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line),
 | 
					 | 
				
			||||||
                            fdata2,   src_pixels_per_line, 1, 21, 32,
 | 
					 | 
				
			||||||
                            hfilter);
 | 
					 | 
				
			||||||
  vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 4,
 | 
					 | 
				
			||||||
                            fdata2 + 4, src_pixels_per_line, 1, 21, 32,
 | 
					 | 
				
			||||||
                            hfilter);
 | 
					 | 
				
			||||||
  vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 8,
 | 
					 | 
				
			||||||
                            fdata2 + 8, src_pixels_per_line, 1, 21, 32,
 | 
					 | 
				
			||||||
                            hfilter);
 | 
					 | 
				
			||||||
  vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 12,
 | 
					 | 
				
			||||||
                            fdata2 + 12, src_pixels_per_line, 1, 21, 32,
 | 
					 | 
				
			||||||
                            hfilter);
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  vfilter = vp9_six_tap_mmx[yoffset];
 | 
					 | 
				
			||||||
  vp9_filter_block1dc_v6_mmx(fdata2 + 32, dst_ptr,      dst_pitch,
 | 
					 | 
				
			||||||
                             32, 16, 16, 16, vfilter);
 | 
					 | 
				
			||||||
  vp9_filter_block1dc_v6_mmx(fdata2 + 36, dst_ptr + 4,  dst_pitch,
 | 
					 | 
				
			||||||
                             32, 16, 16, 16, vfilter);
 | 
					 | 
				
			||||||
  vp9_filter_block1dc_v6_mmx(fdata2 + 40, dst_ptr + 8,  dst_pitch,
 | 
					 | 
				
			||||||
                             32, 16, 16, 16, vfilter);
 | 
					 | 
				
			||||||
  vp9_filter_block1dc_v6_mmx(fdata2 + 44, dst_ptr + 12, dst_pitch,
 | 
					 | 
				
			||||||
                             32, 16, 16, 16, vfilter);
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
void vp9_sixtap_predict8x8_mmx(unsigned char  *src_ptr,
 | 
					 | 
				
			||||||
                               int  src_pixels_per_line,
 | 
					 | 
				
			||||||
                               int  xoffset,
 | 
					 | 
				
			||||||
                               int  yoffset,
 | 
					 | 
				
			||||||
                               unsigned char *dst_ptr,
 | 
					 | 
				
			||||||
                               int  dst_pitch) {
 | 
					 | 
				
			||||||
#ifdef ANNOUNCE_FUNCTION
 | 
					 | 
				
			||||||
  printf("vp9_sixtap_predict8x8_mmx\n");
 | 
					 | 
				
			||||||
#endif
 | 
					 | 
				
			||||||
  /* Temp data bufffer used in filtering */
 | 
					 | 
				
			||||||
  DECLARE_ALIGNED_ARRAY(16, unsigned short, fdata2, 256);
 | 
					 | 
				
			||||||
  const short *hfilter, *vfilter;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  hfilter = vp9_six_tap_mmx[xoffset];
 | 
					 | 
				
			||||||
  vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line),
 | 
					 | 
				
			||||||
                            fdata2,   src_pixels_per_line, 1, 13, 16,
 | 
					 | 
				
			||||||
                            hfilter);
 | 
					 | 
				
			||||||
  vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 4,
 | 
					 | 
				
			||||||
                            fdata2 + 4, src_pixels_per_line, 1, 13, 16,
 | 
					 | 
				
			||||||
                            hfilter);
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  vfilter = vp9_six_tap_mmx[yoffset];
 | 
					 | 
				
			||||||
  vp9_filter_block1dc_v6_mmx(fdata2 + 16, dst_ptr,     dst_pitch,
 | 
					 | 
				
			||||||
                             16, 8, 8, 8, vfilter);
 | 
					 | 
				
			||||||
  vp9_filter_block1dc_v6_mmx(fdata2 + 20, dst_ptr + 4, dst_pitch,
 | 
					 | 
				
			||||||
                             16, 8, 8, 8, vfilter);
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
void vp9_sixtap_predict8x4_mmx(unsigned char  *src_ptr,
 | 
					 | 
				
			||||||
                               int  src_pixels_per_line,
 | 
					 | 
				
			||||||
                               int  xoffset,
 | 
					 | 
				
			||||||
                               int  yoffset,
 | 
					 | 
				
			||||||
                               unsigned char *dst_ptr,
 | 
					 | 
				
			||||||
                               int  dst_pitch) {
 | 
					 | 
				
			||||||
#ifdef ANNOUNCE_FUNCTION
 | 
					 | 
				
			||||||
  printf("vp9_sixtap_predict8x4_mmx\n");
 | 
					 | 
				
			||||||
#endif
 | 
					 | 
				
			||||||
  /* Temp data bufffer used in filtering */
 | 
					 | 
				
			||||||
  DECLARE_ALIGNED_ARRAY(16, unsigned short, fdata2, 256);
 | 
					 | 
				
			||||||
  const short *hfilter, *vfilter;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  hfilter = vp9_six_tap_mmx[xoffset];
 | 
					 | 
				
			||||||
  vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line),
 | 
					 | 
				
			||||||
                            fdata2,   src_pixels_per_line, 1, 9, 16, hfilter);
 | 
					 | 
				
			||||||
  vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 4,
 | 
					 | 
				
			||||||
                            fdata2 + 4, src_pixels_per_line, 1, 9, 16, hfilter);
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  vfilter = vp9_six_tap_mmx[yoffset];
 | 
					 | 
				
			||||||
  vp9_filter_block1dc_v6_mmx(fdata2 + 16, dst_ptr,     dst_pitch,
 | 
					 | 
				
			||||||
                             16, 8, 4, 8, vfilter);
 | 
					 | 
				
			||||||
  vp9_filter_block1dc_v6_mmx(fdata2 + 20, dst_ptr + 4, dst_pitch,
 | 
					 | 
				
			||||||
                             16, 8, 4, 8, vfilter);
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
#endif
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#if HAVE_SSE2
 | 
					 | 
				
			||||||
void vp9_sixtap_predict16x16_sse2(unsigned char  *src_ptr,
 | 
					 | 
				
			||||||
                                  int  src_pixels_per_line,
 | 
					 | 
				
			||||||
                                  int  xoffset,
 | 
					 | 
				
			||||||
                                  int  yoffset,
 | 
					 | 
				
			||||||
                                  unsigned char *dst_ptr,
 | 
					 | 
				
			||||||
                                  int  dst_pitch) {
 | 
					 | 
				
			||||||
  /* Temp data bufffer used in filtering */
 | 
					 | 
				
			||||||
  DECLARE_ALIGNED_ARRAY(16, unsigned short, fdata2, 24 * 24);
 | 
					 | 
				
			||||||
  const short *hfilter, *vfilter;
 | 
					 | 
				
			||||||
#ifdef ANNOUNCE_FUNCTION
 | 
					 | 
				
			||||||
  printf("vp9_sixtap_predict16x16_sse2\n");
 | 
					 | 
				
			||||||
#endif
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  if (xoffset) {
 | 
					 | 
				
			||||||
    if (yoffset) {
 | 
					 | 
				
			||||||
      hfilter = vp9_six_tap_mmx[xoffset];
 | 
					 | 
				
			||||||
      vp9_filter_block1d16_h6_sse2(src_ptr - (2 * src_pixels_per_line), fdata2,
 | 
					 | 
				
			||||||
                                   src_pixels_per_line, 1, 21, 32, hfilter);
 | 
					 | 
				
			||||||
      vfilter = vp9_six_tap_mmx[yoffset];
 | 
					 | 
				
			||||||
      vp9_filter_block1d16_v6_sse2(fdata2 + 32, dst_ptr, dst_pitch,
 | 
					 | 
				
			||||||
                                   32, 16, 16, dst_pitch, vfilter);
 | 
					 | 
				
			||||||
    } else {
 | 
					 | 
				
			||||||
      /* First-pass only */
 | 
					 | 
				
			||||||
      hfilter = vp9_six_tap_mmx[xoffset];
 | 
					 | 
				
			||||||
      vp9_filter_block1d16_h6_only_sse2(src_ptr, src_pixels_per_line,
 | 
					 | 
				
			||||||
                                        dst_ptr, dst_pitch, 16, hfilter);
 | 
					 | 
				
			||||||
    }
 | 
					 | 
				
			||||||
  } else {
 | 
					 | 
				
			||||||
    /* Second-pass only */
 | 
					 | 
				
			||||||
    vfilter = vp9_six_tap_mmx[yoffset];
 | 
					 | 
				
			||||||
    vp9_unpack_block1d16_h6_sse2(src_ptr - (2 * src_pixels_per_line), fdata2,
 | 
					 | 
				
			||||||
                                 src_pixels_per_line, 21, 32);
 | 
					 | 
				
			||||||
    vp9_filter_block1d16_v6_sse2(fdata2 + 32, dst_ptr, dst_pitch,
 | 
					 | 
				
			||||||
                                 32, 16, 16, dst_pitch, vfilter);
 | 
					 | 
				
			||||||
  }
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
void vp9_sixtap_predict8x8_sse2(unsigned char  *src_ptr,
 | 
					 | 
				
			||||||
                                int  src_pixels_per_line,
 | 
					 | 
				
			||||||
                                int  xoffset,
 | 
					 | 
				
			||||||
                                int  yoffset,
 | 
					 | 
				
			||||||
                                unsigned char *dst_ptr,
 | 
					 | 
				
			||||||
                                int  dst_pitch) {
 | 
					 | 
				
			||||||
  /* Temp data bufffer used in filtering */
 | 
					 | 
				
			||||||
  DECLARE_ALIGNED_ARRAY(16, unsigned short, fdata2, 256);
 | 
					 | 
				
			||||||
  const short *hfilter, *vfilter;
 | 
					 | 
				
			||||||
#ifdef ANNOUNCE_FUNCTION
 | 
					 | 
				
			||||||
  printf("vp9_sixtap_predict8x8_sse2\n");
 | 
					 | 
				
			||||||
#endif
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  if (xoffset) {
 | 
					 | 
				
			||||||
    if (yoffset) {
 | 
					 | 
				
			||||||
      hfilter = vp9_six_tap_mmx[xoffset];
 | 
					 | 
				
			||||||
      vp9_filter_block1d8_h6_sse2(src_ptr - (2 * src_pixels_per_line), fdata2,
 | 
					 | 
				
			||||||
                                  src_pixels_per_line, 1, 13, 16, hfilter);
 | 
					 | 
				
			||||||
      vfilter = vp9_six_tap_mmx[yoffset];
 | 
					 | 
				
			||||||
      vp9_filter_block1d8_v6_sse2(fdata2 + 16, dst_ptr, dst_pitch,
 | 
					 | 
				
			||||||
                                  16, 8, 8, dst_pitch, vfilter);
 | 
					 | 
				
			||||||
    } else {
 | 
					 | 
				
			||||||
      /* First-pass only */
 | 
					 | 
				
			||||||
      hfilter = vp9_six_tap_mmx[xoffset];
 | 
					 | 
				
			||||||
      vp9_filter_block1d8_h6_only_sse2(src_ptr, src_pixels_per_line,
 | 
					 | 
				
			||||||
                                       dst_ptr, dst_pitch, 8, hfilter);
 | 
					 | 
				
			||||||
    }
 | 
					 | 
				
			||||||
  } else {
 | 
					 | 
				
			||||||
    /* Second-pass only */
 | 
					 | 
				
			||||||
    vfilter = vp9_six_tap_mmx[yoffset];
 | 
					 | 
				
			||||||
    vp9_filter_block1d8_v6_only_sse2(src_ptr - (2 * src_pixels_per_line),
 | 
					 | 
				
			||||||
                                     src_pixels_per_line,
 | 
					 | 
				
			||||||
                                     dst_ptr, dst_pitch, 8, vfilter);
 | 
					 | 
				
			||||||
  }
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
void vp9_sixtap_predict8x4_sse2(unsigned char  *src_ptr,
 | 
					 | 
				
			||||||
                                int  src_pixels_per_line,
 | 
					 | 
				
			||||||
                                int  xoffset,
 | 
					 | 
				
			||||||
                                int  yoffset,
 | 
					 | 
				
			||||||
                                unsigned char *dst_ptr,
 | 
					 | 
				
			||||||
                                int  dst_pitch) {
 | 
					 | 
				
			||||||
  /* Temp data bufffer used in filtering */
 | 
					 | 
				
			||||||
  DECLARE_ALIGNED_ARRAY(16, unsigned short, fdata2, 256);
 | 
					 | 
				
			||||||
  const short *hfilter, *vfilter;
 | 
					 | 
				
			||||||
#ifdef ANNOUNCE_FUNCTION
 | 
					 | 
				
			||||||
  printf("vp9_sixtap_predict8x4_sse2\n");
 | 
					 | 
				
			||||||
#endif
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  if (xoffset) {
 | 
					 | 
				
			||||||
    if (yoffset) {
 | 
					 | 
				
			||||||
      hfilter = vp9_six_tap_mmx[xoffset];
 | 
					 | 
				
			||||||
      vp9_filter_block1d8_h6_sse2(src_ptr - (2 * src_pixels_per_line), fdata2,
 | 
					 | 
				
			||||||
                                  src_pixels_per_line, 1, 9, 16, hfilter);
 | 
					 | 
				
			||||||
      vfilter = vp9_six_tap_mmx[yoffset];
 | 
					 | 
				
			||||||
      vp9_filter_block1d8_v6_sse2(fdata2 + 16, dst_ptr, dst_pitch,
 | 
					 | 
				
			||||||
                                  16, 8, 4, dst_pitch, vfilter);
 | 
					 | 
				
			||||||
    } else {
 | 
					 | 
				
			||||||
      /* First-pass only */
 | 
					 | 
				
			||||||
      hfilter = vp9_six_tap_mmx[xoffset];
 | 
					 | 
				
			||||||
      vp9_filter_block1d8_h6_only_sse2(src_ptr, src_pixels_per_line,
 | 
					 | 
				
			||||||
                                       dst_ptr, dst_pitch, 4, hfilter);
 | 
					 | 
				
			||||||
    }
 | 
					 | 
				
			||||||
  } else {
 | 
					 | 
				
			||||||
    /* Second-pass only */
 | 
					 | 
				
			||||||
    vfilter = vp9_six_tap_mmx[yoffset];
 | 
					 | 
				
			||||||
    vp9_filter_block1d8_v6_only_sse2(src_ptr - (2 * src_pixels_per_line),
 | 
					 | 
				
			||||||
                                     src_pixels_per_line,
 | 
					 | 
				
			||||||
                                     dst_ptr, dst_pitch, 4, vfilter);
 | 
					 | 
				
			||||||
  }
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
#endif
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#if HAVE_SSSE3
 | 
					#if HAVE_SSSE3
 | 
				
			||||||
extern void vp9_filter_block1d8_h6_ssse3(unsigned char  *src_ptr,
 | 
					 | 
				
			||||||
                                         unsigned int    src_pixels_per_line,
 | 
					 | 
				
			||||||
                                         unsigned char  *output_ptr,
 | 
					 | 
				
			||||||
                                         unsigned int    output_pitch,
 | 
					 | 
				
			||||||
                                         unsigned int    output_height,
 | 
					 | 
				
			||||||
                                         unsigned int    vp9_filter_index);
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
extern void vp9_filter_block1d16_h6_ssse3(unsigned char  *src_ptr,
 | 
					 | 
				
			||||||
                                          unsigned int    src_pixels_per_line,
 | 
					 | 
				
			||||||
                                          unsigned char  *output_ptr,
 | 
					 | 
				
			||||||
                                          unsigned int    output_pitch,
 | 
					 | 
				
			||||||
                                          unsigned int    output_height,
 | 
					 | 
				
			||||||
                                          unsigned int    vp9_filter_index);
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
extern void vp9_filter_block1d16_v6_ssse3(unsigned char *src_ptr,
 | 
					 | 
				
			||||||
                                          unsigned int   src_pitch,
 | 
					 | 
				
			||||||
                                          unsigned char *output_ptr,
 | 
					 | 
				
			||||||
                                          unsigned int   out_pitch,
 | 
					 | 
				
			||||||
                                          unsigned int   output_height,
 | 
					 | 
				
			||||||
                                          unsigned int   vp9_filter_index);
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
extern void vp9_filter_block1d8_v6_ssse3(unsigned char *src_ptr,
 | 
					 | 
				
			||||||
                                         unsigned int   src_pitch,
 | 
					 | 
				
			||||||
                                         unsigned char *output_ptr,
 | 
					 | 
				
			||||||
                                         unsigned int   out_pitch,
 | 
					 | 
				
			||||||
                                         unsigned int   output_height,
 | 
					 | 
				
			||||||
                                         unsigned int   vp9_filter_index);
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
extern void vp9_filter_block1d4_h6_ssse3(unsigned char  *src_ptr,
 | 
					 | 
				
			||||||
                                         unsigned int    src_pixels_per_line,
 | 
					 | 
				
			||||||
                                         unsigned char  *output_ptr,
 | 
					 | 
				
			||||||
                                         unsigned int    output_pitch,
 | 
					 | 
				
			||||||
                                         unsigned int    output_height,
 | 
					 | 
				
			||||||
                                         unsigned int    vp9_filter_index);
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
extern void vp9_filter_block1d4_v6_ssse3(unsigned char *src_ptr,
 | 
					 | 
				
			||||||
                                         unsigned int   src_pitch,
 | 
					 | 
				
			||||||
                                         unsigned char *output_ptr,
 | 
					 | 
				
			||||||
                                         unsigned int   out_pitch,
 | 
					 | 
				
			||||||
                                         unsigned int   output_height,
 | 
					 | 
				
			||||||
                                         unsigned int   vp9_filter_index);
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
void vp9_sixtap_predict16x16_ssse3(unsigned char  *src_ptr,
 | 
					 | 
				
			||||||
                                   int  src_pixels_per_line,
 | 
					 | 
				
			||||||
                                   int  xoffset,
 | 
					 | 
				
			||||||
                                   int  yoffset,
 | 
					 | 
				
			||||||
                                   unsigned char *dst_ptr,
 | 
					 | 
				
			||||||
                                   int  dst_pitch) {
 | 
					 | 
				
			||||||
  DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 24 * 24);
 | 
					 | 
				
			||||||
#ifdef ANNOUNCE_FUNCTION
 | 
					 | 
				
			||||||
  printf("vp9_sixtap_predict16x16_ssse3\n");
 | 
					 | 
				
			||||||
#endif
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  if (xoffset) {
 | 
					 | 
				
			||||||
    if (yoffset) {
 | 
					 | 
				
			||||||
      vp9_filter_block1d16_h6_ssse3(src_ptr - (2 * src_pixels_per_line),
 | 
					 | 
				
			||||||
                                    src_pixels_per_line,
 | 
					 | 
				
			||||||
                                    fdata2, 16, 21, xoffset);
 | 
					 | 
				
			||||||
      vp9_filter_block1d16_v6_ssse3(fdata2, 16, dst_ptr, dst_pitch,
 | 
					 | 
				
			||||||
                                    16, yoffset);
 | 
					 | 
				
			||||||
    } else {
 | 
					 | 
				
			||||||
      /* First-pass only */
 | 
					 | 
				
			||||||
      vp9_filter_block1d16_h6_ssse3(src_ptr, src_pixels_per_line,
 | 
					 | 
				
			||||||
                                    dst_ptr, dst_pitch, 16, xoffset);
 | 
					 | 
				
			||||||
    }
 | 
					 | 
				
			||||||
  } else {
 | 
					 | 
				
			||||||
    /* Second-pass only */
 | 
					 | 
				
			||||||
    vp9_filter_block1d16_v6_ssse3(src_ptr - (2 * src_pixels_per_line),
 | 
					 | 
				
			||||||
                                  src_pixels_per_line,
 | 
					 | 
				
			||||||
                                  dst_ptr, dst_pitch, 16, yoffset);
 | 
					 | 
				
			||||||
  }
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
void vp9_sixtap_predict8x8_ssse3(unsigned char  *src_ptr,
 | 
					 | 
				
			||||||
                                 int  src_pixels_per_line,
 | 
					 | 
				
			||||||
                                 int  xoffset,
 | 
					 | 
				
			||||||
                                 int  yoffset,
 | 
					 | 
				
			||||||
                                 unsigned char *dst_ptr,
 | 
					 | 
				
			||||||
                                 int  dst_pitch) {
 | 
					 | 
				
			||||||
  DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 256);
 | 
					 | 
				
			||||||
#ifdef ANNOUNCE_FUNCTION
 | 
					 | 
				
			||||||
  printf("vp9_sixtap_predict8x8_ssse3\n");
 | 
					 | 
				
			||||||
#endif
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  if (xoffset) {
 | 
					 | 
				
			||||||
    if (yoffset) {
 | 
					 | 
				
			||||||
      vp9_filter_block1d8_h6_ssse3(src_ptr - (2 * src_pixels_per_line),
 | 
					 | 
				
			||||||
                                   src_pixels_per_line, fdata2, 8, 13, xoffset);
 | 
					 | 
				
			||||||
      vp9_filter_block1d8_v6_ssse3(fdata2, 8, dst_ptr, dst_pitch, 8, yoffset);
 | 
					 | 
				
			||||||
    } else {
 | 
					 | 
				
			||||||
      vp9_filter_block1d8_h6_ssse3(src_ptr, src_pixels_per_line,
 | 
					 | 
				
			||||||
                                   dst_ptr, dst_pitch, 8, xoffset);
 | 
					 | 
				
			||||||
    }
 | 
					 | 
				
			||||||
  } else {
 | 
					 | 
				
			||||||
    /* Second-pass only */
 | 
					 | 
				
			||||||
    vp9_filter_block1d8_v6_ssse3(src_ptr - (2 * src_pixels_per_line),
 | 
					 | 
				
			||||||
                                 src_pixels_per_line,
 | 
					 | 
				
			||||||
                                 dst_ptr, dst_pitch, 8, yoffset);
 | 
					 | 
				
			||||||
  }
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
void vp9_sixtap_predict8x4_ssse3(unsigned char  *src_ptr,
 | 
					 | 
				
			||||||
                                 int  src_pixels_per_line,
 | 
					 | 
				
			||||||
                                 int  xoffset,
 | 
					 | 
				
			||||||
                                 int  yoffset,
 | 
					 | 
				
			||||||
                                 unsigned char *dst_ptr,
 | 
					 | 
				
			||||||
                                 int  dst_pitch) {
 | 
					 | 
				
			||||||
  DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 256);
 | 
					 | 
				
			||||||
#ifdef ANNOUNCE_FUNCTION
 | 
					 | 
				
			||||||
  printf("vp9_sixtap_predict8x4_ssse3\n");
 | 
					 | 
				
			||||||
#endif
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  if (xoffset) {
 | 
					 | 
				
			||||||
    if (yoffset) {
 | 
					 | 
				
			||||||
      vp9_filter_block1d8_h6_ssse3(src_ptr - (2 * src_pixels_per_line),
 | 
					 | 
				
			||||||
                                   src_pixels_per_line, fdata2, 8, 9, xoffset);
 | 
					 | 
				
			||||||
      vp9_filter_block1d8_v6_ssse3(fdata2, 8, dst_ptr, dst_pitch, 4, yoffset);
 | 
					 | 
				
			||||||
    } else {
 | 
					 | 
				
			||||||
      /* First-pass only */
 | 
					 | 
				
			||||||
      vp9_filter_block1d8_h6_ssse3(src_ptr, src_pixels_per_line,
 | 
					 | 
				
			||||||
                                   dst_ptr, dst_pitch, 4, xoffset);
 | 
					 | 
				
			||||||
    }
 | 
					 | 
				
			||||||
  } else {
 | 
					 | 
				
			||||||
    /* Second-pass only */
 | 
					 | 
				
			||||||
    vp9_filter_block1d8_v6_ssse3(src_ptr - (2 * src_pixels_per_line),
 | 
					 | 
				
			||||||
                                 src_pixels_per_line,
 | 
					 | 
				
			||||||
                                 dst_ptr, dst_pitch, 4, yoffset);
 | 
					 | 
				
			||||||
  }
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
void vp9_sixtap_predict4x4_ssse3(unsigned char  *src_ptr,
 | 
					 | 
				
			||||||
                                 int   src_pixels_per_line,
 | 
					 | 
				
			||||||
                                 int  xoffset,
 | 
					 | 
				
			||||||
                                 int  yoffset,
 | 
					 | 
				
			||||||
                                 unsigned char *dst_ptr,
 | 
					 | 
				
			||||||
                                 int dst_pitch) {
 | 
					 | 
				
			||||||
  DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 4 * 9);
 | 
					 | 
				
			||||||
#ifdef ANNOUNCE_FUNCTION
 | 
					 | 
				
			||||||
  printf("vp9_sixtap_predict4x4_ssse3\n");
 | 
					 | 
				
			||||||
#endif
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  if (xoffset) {
 | 
					 | 
				
			||||||
    if (yoffset) {
 | 
					 | 
				
			||||||
      vp9_filter_block1d4_h6_ssse3(src_ptr - (2 * src_pixels_per_line),
 | 
					 | 
				
			||||||
                                   src_pixels_per_line, fdata2, 4, 9, xoffset);
 | 
					 | 
				
			||||||
      vp9_filter_block1d4_v6_ssse3(fdata2, 4, dst_ptr, dst_pitch, 4, yoffset);
 | 
					 | 
				
			||||||
    } else {
 | 
					 | 
				
			||||||
      vp9_filter_block1d4_h6_ssse3(src_ptr, src_pixels_per_line,
 | 
					 | 
				
			||||||
                                   dst_ptr, dst_pitch, 4, xoffset);
 | 
					 | 
				
			||||||
    }
 | 
					 | 
				
			||||||
  } else {
 | 
					 | 
				
			||||||
    vp9_filter_block1d4_v6_ssse3(src_ptr - (2 * src_pixels_per_line),
 | 
					 | 
				
			||||||
                                 src_pixels_per_line,
 | 
					 | 
				
			||||||
                                 dst_ptr, dst_pitch, 4, yoffset);
 | 
					 | 
				
			||||||
  }
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
void vp9_filter_block1d16_v8_ssse3(const unsigned char *src_ptr,
 | 
					void vp9_filter_block1d16_v8_ssse3(const unsigned char *src_ptr,
 | 
				
			||||||
                                   const unsigned int src_pitch,
 | 
					                                   const unsigned int src_pitch,
 | 
				
			||||||
                                   unsigned char *output_ptr,
 | 
					                                   unsigned char *output_ptr,
 | 
				
			||||||
@@ -513,30 +51,6 @@ void vp9_filter_block1d16_h8_ssse3(const unsigned char *src_ptr,
 | 
				
			|||||||
                                   unsigned int output_height,
 | 
					                                   unsigned int output_height,
 | 
				
			||||||
                                   const short *filter);
 | 
					                                   const short *filter);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
void vp9_filter_block2d_16x16_8_ssse3(const unsigned char *src_ptr,
 | 
					 | 
				
			||||||
                                      const unsigned int src_stride,
 | 
					 | 
				
			||||||
                                      const short *hfilter_aligned16,
 | 
					 | 
				
			||||||
                                      const short *vfilter_aligned16,
 | 
					 | 
				
			||||||
                                      unsigned char *dst_ptr,
 | 
					 | 
				
			||||||
                                      unsigned int dst_stride) {
 | 
					 | 
				
			||||||
  if (hfilter_aligned16[3] != 128 && vfilter_aligned16[3] != 128) {
 | 
					 | 
				
			||||||
    DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 23 * 16);
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    vp9_filter_block1d16_h8_ssse3(src_ptr - (3 * src_stride), src_stride,
 | 
					 | 
				
			||||||
                                  fdata2, 16, 23, hfilter_aligned16);
 | 
					 | 
				
			||||||
    vp9_filter_block1d16_v8_ssse3(fdata2, 16, dst_ptr, dst_stride, 16,
 | 
					 | 
				
			||||||
                                  vfilter_aligned16);
 | 
					 | 
				
			||||||
  } else {
 | 
					 | 
				
			||||||
    if (hfilter_aligned16[3] != 128) {
 | 
					 | 
				
			||||||
      vp9_filter_block1d16_h8_ssse3(src_ptr, src_stride, dst_ptr, dst_stride,
 | 
					 | 
				
			||||||
                                    16, hfilter_aligned16);
 | 
					 | 
				
			||||||
    } else {
 | 
					 | 
				
			||||||
      vp9_filter_block1d16_v8_ssse3(src_ptr - (3 * src_stride), src_stride,
 | 
					 | 
				
			||||||
                                    dst_ptr, dst_stride, 16, vfilter_aligned16);
 | 
					 | 
				
			||||||
    }
 | 
					 | 
				
			||||||
  }
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
void vp9_filter_block1d8_v8_ssse3(const unsigned char *src_ptr,
 | 
					void vp9_filter_block1d8_v8_ssse3(const unsigned char *src_ptr,
 | 
				
			||||||
                                   const unsigned int src_pitch,
 | 
					                                   const unsigned int src_pitch,
 | 
				
			||||||
                                   unsigned char *output_ptr,
 | 
					                                   unsigned char *output_ptr,
 | 
				
			||||||
@@ -551,51 +65,100 @@ void vp9_filter_block1d8_h8_ssse3(const unsigned char *src_ptr,
 | 
				
			|||||||
                                   unsigned int output_height,
 | 
					                                   unsigned int output_height,
 | 
				
			||||||
                                   const short *filter);
 | 
					                                   const short *filter);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
void vp9_filter_block2d_8x8_8_ssse3(const unsigned char *src_ptr,
 | 
					void vp9_convolve8_horiz_ssse3(const uint8_t *src, int src_stride,
 | 
				
			||||||
                                    const unsigned int src_stride,
 | 
					                               uint8_t *dst, int dst_stride,
 | 
				
			||||||
                                    const short *hfilter_aligned16,
 | 
					                               const int16_t *filter_x, int x_step_q4,
 | 
				
			||||||
                                    const short *vfilter_aligned16,
 | 
					                               const int16_t *filter_y, int y_step_q4,
 | 
				
			||||||
                                    unsigned char *dst_ptr,
 | 
					                               int w, int h) {
 | 
				
			||||||
                                    unsigned int dst_stride) {
 | 
					  if (x_step_q4 == 16 && filter_x[3] != 128) {
 | 
				
			||||||
  if (hfilter_aligned16[3] != 128 && vfilter_aligned16[3] != 128) {
 | 
					    while (w >= 16) {
 | 
				
			||||||
    DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 23 * 16);
 | 
					      vp9_filter_block1d16_h8_ssse3(src, src_stride,
 | 
				
			||||||
 | 
					                                    dst, dst_stride,
 | 
				
			||||||
    vp9_filter_block1d8_h8_ssse3(src_ptr - (3 * src_stride), src_stride,
 | 
					                                    h, filter_x);
 | 
				
			||||||
                                 fdata2, 16, 15, hfilter_aligned16);
 | 
					      src += 16;
 | 
				
			||||||
    vp9_filter_block1d8_v8_ssse3(fdata2, 16, dst_ptr, dst_stride, 8,
 | 
					      dst += 16;
 | 
				
			||||||
                                 vfilter_aligned16);
 | 
					      w -= 16;
 | 
				
			||||||
  } else {
 | 
					 | 
				
			||||||
    if (hfilter_aligned16[3] != 128) {
 | 
					 | 
				
			||||||
      vp9_filter_block1d8_h8_ssse3(src_ptr, src_stride, dst_ptr, dst_stride, 8,
 | 
					 | 
				
			||||||
                                   hfilter_aligned16);
 | 
					 | 
				
			||||||
    } else {
 | 
					 | 
				
			||||||
      vp9_filter_block1d8_v8_ssse3(src_ptr - (3 * src_stride), src_stride,
 | 
					 | 
				
			||||||
                                   dst_ptr, dst_stride, 8, vfilter_aligned16);
 | 
					 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
 | 
					    while (w >= 8) {
 | 
				
			||||||
 | 
					      vp9_filter_block1d8_h8_ssse3(src, src_stride,
 | 
				
			||||||
 | 
					                                   dst, dst_stride,
 | 
				
			||||||
 | 
					                                   h, filter_x);
 | 
				
			||||||
 | 
					      src += 8;
 | 
				
			||||||
 | 
					      dst += 8;
 | 
				
			||||||
 | 
					      w -= 8;
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					  if (w) {
 | 
				
			||||||
 | 
					    vp9_convolve8_horiz_c(src, src_stride, dst, dst_stride,
 | 
				
			||||||
 | 
					                          filter_x, x_step_q4, filter_y, y_step_q4,
 | 
				
			||||||
 | 
					                          w, h);
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
void vp9_filter_block2d_8x4_8_ssse3(const unsigned char *src_ptr,
 | 
					void vp9_convolve8_vert_ssse3(const uint8_t *src, int src_stride,
 | 
				
			||||||
                                    const unsigned int src_stride,
 | 
					                              uint8_t *dst, int dst_stride,
 | 
				
			||||||
                                    const short *hfilter_aligned16,
 | 
					                              const int16_t *filter_x, int x_step_q4,
 | 
				
			||||||
                                    const short *vfilter_aligned16,
 | 
					                              const int16_t *filter_y, int y_step_q4,
 | 
				
			||||||
                                    unsigned char *dst_ptr,
 | 
					                              int w, int h) {
 | 
				
			||||||
                                    unsigned int dst_stride) {
 | 
					  if (y_step_q4 == 16 && filter_y[3] != 128) {
 | 
				
			||||||
  if (hfilter_aligned16[3] !=128 && vfilter_aligned16[3] != 128) {
 | 
					    while (w >= 16) {
 | 
				
			||||||
      DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 23 * 16);
 | 
					      vp9_filter_block1d16_v8_ssse3(src - src_stride * 3, src_stride,
 | 
				
			||||||
 | 
					                                    dst, dst_stride,
 | 
				
			||||||
      vp9_filter_block1d8_h8_ssse3(src_ptr - (3 * src_stride), src_stride,
 | 
					                                    h, filter_y);
 | 
				
			||||||
                                   fdata2, 16, 11, hfilter_aligned16);
 | 
					      src += 16;
 | 
				
			||||||
      vp9_filter_block1d8_v8_ssse3(fdata2, 16, dst_ptr, dst_stride, 4,
 | 
					      dst += 16;
 | 
				
			||||||
                                   vfilter_aligned16);
 | 
					      w -= 16;
 | 
				
			||||||
  } else {
 | 
					    }
 | 
				
			||||||
    if (hfilter_aligned16[3] != 128) {
 | 
					    while (w >= 8) {
 | 
				
			||||||
      vp9_filter_block1d8_h8_ssse3(src_ptr, src_stride, dst_ptr, dst_stride, 4,
 | 
					      vp9_filter_block1d8_v8_ssse3(src - src_stride * 3, src_stride,
 | 
				
			||||||
                                   hfilter_aligned16);
 | 
					                                   dst, dst_stride,
 | 
				
			||||||
    } else {
 | 
					                                   h, filter_y);
 | 
				
			||||||
      vp9_filter_block1d8_v8_ssse3(src_ptr - (3 * src_stride), src_stride,
 | 
					      src += 8;
 | 
				
			||||||
                                   dst_ptr, dst_stride, 4, vfilter_aligned16);
 | 
					      dst += 8;
 | 
				
			||||||
 | 
					      w -= 8;
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
 | 
					  if (w) {
 | 
				
			||||||
 | 
					    vp9_convolve8_vert_c(src, src_stride, dst, dst_stride,
 | 
				
			||||||
 | 
					                         filter_x, x_step_q4, filter_y, y_step_q4,
 | 
				
			||||||
 | 
					                         w, h);
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					void vp9_convolve8_ssse3(const uint8_t *src, int src_stride,
 | 
				
			||||||
 | 
					                         uint8_t *dst, int dst_stride,
 | 
				
			||||||
 | 
					                         const int16_t *filter_x, int x_step_q4,
 | 
				
			||||||
 | 
					                         const int16_t *filter_y, int y_step_q4,
 | 
				
			||||||
 | 
					                         int w, int h) {
 | 
				
			||||||
 | 
					  DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 16*23);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  // check w/h due to fixed size fdata2 array
 | 
				
			||||||
 | 
					  assert(w <= 16);
 | 
				
			||||||
 | 
					  assert(h <= 16);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  if (x_step_q4 == 16 && y_step_q4 == 16 &&
 | 
				
			||||||
 | 
					      filter_x[3] != 128 && filter_y[3] != 128) {
 | 
				
			||||||
 | 
					    if (w == 16) {
 | 
				
			||||||
 | 
					      vp9_filter_block1d16_h8_ssse3(src - 3 * src_stride, src_stride,
 | 
				
			||||||
 | 
					                                    fdata2, 16,
 | 
				
			||||||
 | 
					                                    h + 7, filter_x);
 | 
				
			||||||
 | 
					      vp9_filter_block1d16_v8_ssse3(fdata2, 16,
 | 
				
			||||||
 | 
					                                    dst, dst_stride,
 | 
				
			||||||
 | 
					                                    h, filter_y);
 | 
				
			||||||
 | 
					      return;
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					    if (w == 8) {
 | 
				
			||||||
 | 
					      vp9_filter_block1d8_h8_ssse3(src - 3 * src_stride, src_stride,
 | 
				
			||||||
 | 
					                                   fdata2, 16,
 | 
				
			||||||
 | 
					                                   h + 7, filter_x);
 | 
				
			||||||
 | 
					      vp9_filter_block1d8_v8_ssse3(fdata2, 16,
 | 
				
			||||||
 | 
					                                   dst, dst_stride,
 | 
				
			||||||
 | 
					                                   h, filter_y);
 | 
				
			||||||
 | 
					      return;
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					  vp9_convolve8_c(src, src_stride, dst, dst_stride,
 | 
				
			||||||
 | 
					                  filter_x, x_step_q4, filter_y, y_step_q4,
 | 
				
			||||||
 | 
					                  w, h);
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -1,290 +0,0 @@
 | 
				
			|||||||
/*
 | 
					 | 
				
			||||||
 *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
 | 
					 | 
				
			||||||
 *
 | 
					 | 
				
			||||||
 *  Use of this source code is governed by a BSD-style license
 | 
					 | 
				
			||||||
 *  that can be found in the LICENSE file in the root of the source
 | 
					 | 
				
			||||||
 *  tree. An additional intellectual property rights grant can be found
 | 
					 | 
				
			||||||
 *  in the file PATENTS.  All contributing project authors may
 | 
					 | 
				
			||||||
 *  be found in the AUTHORS file in the root of the source tree.
 | 
					 | 
				
			||||||
 */
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#include <assert.h> // for alignment checks
 | 
					 | 
				
			||||||
#include <emmintrin.h> // SSE2
 | 
					 | 
				
			||||||
#include "vp9/common/vp9_filter.h"
 | 
					 | 
				
			||||||
#include "vpx_ports/emmintrin_compat.h"
 | 
					 | 
				
			||||||
#include "vpx_ports/mem.h" // for DECLARE_ALIGNED
 | 
					 | 
				
			||||||
#include "vp9_rtcd.h"
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
// TODO(cd): After cleanup, commit faster versions for non 4x4 size. This is
 | 
					 | 
				
			||||||
//           just a quick partial snapshot so that other can already use some
 | 
					 | 
				
			||||||
//           speedup.
 | 
					 | 
				
			||||||
// TODO(cd): Use vectorized 8 tap filtering code as speedup to pure C 6 tap
 | 
					 | 
				
			||||||
//           filtering.
 | 
					 | 
				
			||||||
// TODO(cd): Add some comments, better variable naming.
 | 
					 | 
				
			||||||
// TODO(cd): Maybe use _mm_maddubs_epi16 if smaller filter coeficients (no sum
 | 
					 | 
				
			||||||
//           of positive above 128), or have higher precision filter
 | 
					 | 
				
			||||||
//           coefficients.
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
DECLARE_ALIGNED(16, static const unsigned int, rounding_c[4]) = {
 | 
					 | 
				
			||||||
  VP9_FILTER_WEIGHT >> 1,
 | 
					 | 
				
			||||||
  VP9_FILTER_WEIGHT >> 1,
 | 
					 | 
				
			||||||
  VP9_FILTER_WEIGHT >> 1,
 | 
					 | 
				
			||||||
  VP9_FILTER_WEIGHT >> 1,
 | 
					 | 
				
			||||||
};
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
// Creating a macro to do more than four pixels at once to hide instruction
 | 
					 | 
				
			||||||
// latency is actually slower :-(
 | 
					 | 
				
			||||||
#define DO_FOUR_PIXELS(result, src_ptr, offset)                                \
 | 
					 | 
				
			||||||
  {                                                                            \
 | 
					 | 
				
			||||||
  /* Do shifted load to achieve require shuffles through unpacking */          \
 | 
					 | 
				
			||||||
  const __m128i src0  = _mm_loadu_si128((const __m128i *)(src_ptr + offset + 0)); \
 | 
					 | 
				
			||||||
  const __m128i src1  = _mm_loadu_si128((const __m128i *)(src_ptr + offset + 1)); \
 | 
					 | 
				
			||||||
  const __m128i src2  = _mm_loadu_si128((const __m128i *)(src_ptr + offset + 2)); \
 | 
					 | 
				
			||||||
  const __m128i src3  = _mm_loadu_si128((const __m128i *)(src_ptr + offset + 3)); \
 | 
					 | 
				
			||||||
  const __m128i src01 = _mm_unpacklo_epi8(src0, src1);                         \
 | 
					 | 
				
			||||||
  const __m128i src01_16 = _mm_unpacklo_epi8(src01, zero);                     \
 | 
					 | 
				
			||||||
  const __m128i src23 = _mm_unpacklo_epi8(src2, src3);                         \
 | 
					 | 
				
			||||||
  const __m128i src23_16 = _mm_unpacklo_epi8(src23, zero);                     \
 | 
					 | 
				
			||||||
  /* Shit by 4 bytes through suffle to get additional shifted loads */         \
 | 
					 | 
				
			||||||
  const __m128i src4  = _mm_shuffle_epi32(src0, _MM_SHUFFLE(3, 3, 2, 1));      \
 | 
					 | 
				
			||||||
  const __m128i src5  = _mm_shuffle_epi32(src1, _MM_SHUFFLE(3, 3, 2, 1));      \
 | 
					 | 
				
			||||||
  const __m128i src6  = _mm_shuffle_epi32(src2, _MM_SHUFFLE(3, 3, 2, 1));      \
 | 
					 | 
				
			||||||
  const __m128i src7  = _mm_shuffle_epi32(src3, _MM_SHUFFLE(3, 3, 2, 1));      \
 | 
					 | 
				
			||||||
  const __m128i src45 = _mm_unpacklo_epi8(src4, src5);                         \
 | 
					 | 
				
			||||||
  const __m128i src45_16 = _mm_unpacklo_epi8(src45, zero);                     \
 | 
					 | 
				
			||||||
  const __m128i src67 = _mm_unpacklo_epi8(src6, src7);                         \
 | 
					 | 
				
			||||||
  const __m128i src67_16 = _mm_unpacklo_epi8(src67, zero);                     \
 | 
					 | 
				
			||||||
  /* multiply accumulate them */                                               \
 | 
					 | 
				
			||||||
  const __m128i mad01 = _mm_madd_epi16(src01_16, fil01);                       \
 | 
					 | 
				
			||||||
  const __m128i mad23 = _mm_madd_epi16(src23_16, fil23);                       \
 | 
					 | 
				
			||||||
  const __m128i mad45 = _mm_madd_epi16(src45_16, fil45);                       \
 | 
					 | 
				
			||||||
  const __m128i mad67 = _mm_madd_epi16(src67_16, fil67);                       \
 | 
					 | 
				
			||||||
  const __m128i mad0123 = _mm_add_epi32(mad01, mad23);                         \
 | 
					 | 
				
			||||||
  const __m128i mad4567 = _mm_add_epi32(mad45, mad67);                         \
 | 
					 | 
				
			||||||
  __m128i mad_all = _mm_add_epi32(mad0123, mad4567);                           \
 | 
					 | 
				
			||||||
  mad_all = _mm_add_epi32(mad_all, rounding);                                  \
 | 
					 | 
				
			||||||
  result = _mm_srai_epi32(mad_all, VP9_FILTER_SHIFT);                          \
 | 
					 | 
				
			||||||
  }
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
void vp9_filter_block2d_4x4_8_sse2
 | 
					 | 
				
			||||||
(
 | 
					 | 
				
			||||||
 const unsigned char *src_ptr, const unsigned int src_stride,
 | 
					 | 
				
			||||||
 const short *HFilter_aligned16, const short *VFilter_aligned16,
 | 
					 | 
				
			||||||
 unsigned char *dst_ptr, unsigned int dst_stride
 | 
					 | 
				
			||||||
) {
 | 
					 | 
				
			||||||
  __m128i intermediateA, intermediateB, intermediateC;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  const int kInterp_Extend = 4;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  const __m128i zero = _mm_set1_epi16(0);
 | 
					 | 
				
			||||||
  const __m128i rounding = _mm_load_si128((const __m128i *)rounding_c);
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  // check alignment
 | 
					 | 
				
			||||||
  assert(0 == ((long)HFilter_aligned16)%16);
 | 
					 | 
				
			||||||
  assert(0 == ((long)VFilter_aligned16)%16);
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  {
 | 
					 | 
				
			||||||
    __m128i transpose3_0;
 | 
					 | 
				
			||||||
    __m128i transpose3_1;
 | 
					 | 
				
			||||||
    __m128i transpose3_2;
 | 
					 | 
				
			||||||
    __m128i transpose3_3;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    // Horizontal pass (src -> intermediate).
 | 
					 | 
				
			||||||
    {
 | 
					 | 
				
			||||||
      const __m128i HFilter = _mm_load_si128((const __m128i *)HFilter_aligned16);
 | 
					 | 
				
			||||||
      // get first two columns filter coefficients
 | 
					 | 
				
			||||||
      __m128i fil01 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(0, 0, 0, 0));
 | 
					 | 
				
			||||||
      __m128i fil23 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(1, 1, 1, 1));
 | 
					 | 
				
			||||||
      __m128i fil45 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(2, 2, 2, 2));
 | 
					 | 
				
			||||||
      __m128i fil67 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(3, 3, 3, 3));
 | 
					 | 
				
			||||||
      src_ptr -= (kInterp_Extend - 1) * src_stride + (kInterp_Extend - 1);
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
      {
 | 
					 | 
				
			||||||
        __m128i mad_all0;
 | 
					 | 
				
			||||||
        __m128i mad_all1;
 | 
					 | 
				
			||||||
        __m128i mad_all2;
 | 
					 | 
				
			||||||
        __m128i mad_all3;
 | 
					 | 
				
			||||||
        DO_FOUR_PIXELS(mad_all0, src_ptr, 0*src_stride)
 | 
					 | 
				
			||||||
        DO_FOUR_PIXELS(mad_all1, src_ptr, 1*src_stride)
 | 
					 | 
				
			||||||
        DO_FOUR_PIXELS(mad_all2, src_ptr, 2*src_stride)
 | 
					 | 
				
			||||||
        DO_FOUR_PIXELS(mad_all3, src_ptr, 3*src_stride)
 | 
					 | 
				
			||||||
        mad_all0 = _mm_packs_epi32(mad_all0, mad_all1);
 | 
					 | 
				
			||||||
        mad_all2 = _mm_packs_epi32(mad_all2, mad_all3);
 | 
					 | 
				
			||||||
        intermediateA = _mm_packus_epi16(mad_all0, mad_all2);
 | 
					 | 
				
			||||||
        // --
 | 
					 | 
				
			||||||
        src_ptr += src_stride*4;
 | 
					 | 
				
			||||||
        // --
 | 
					 | 
				
			||||||
        DO_FOUR_PIXELS(mad_all0, src_ptr, 0*src_stride)
 | 
					 | 
				
			||||||
        DO_FOUR_PIXELS(mad_all1, src_ptr, 1*src_stride)
 | 
					 | 
				
			||||||
        DO_FOUR_PIXELS(mad_all2, src_ptr, 2*src_stride)
 | 
					 | 
				
			||||||
        DO_FOUR_PIXELS(mad_all3, src_ptr, 3*src_stride)
 | 
					 | 
				
			||||||
        mad_all0 = _mm_packs_epi32(mad_all0, mad_all1);
 | 
					 | 
				
			||||||
        mad_all2 = _mm_packs_epi32(mad_all2, mad_all3);
 | 
					 | 
				
			||||||
        intermediateB = _mm_packus_epi16(mad_all0, mad_all2);
 | 
					 | 
				
			||||||
        // --
 | 
					 | 
				
			||||||
        src_ptr += src_stride*4;
 | 
					 | 
				
			||||||
        // --
 | 
					 | 
				
			||||||
        DO_FOUR_PIXELS(mad_all0, src_ptr, 0*src_stride)
 | 
					 | 
				
			||||||
        DO_FOUR_PIXELS(mad_all1, src_ptr, 1*src_stride)
 | 
					 | 
				
			||||||
        DO_FOUR_PIXELS(mad_all2, src_ptr, 2*src_stride)
 | 
					 | 
				
			||||||
        mad_all0 = _mm_packs_epi32(mad_all0, mad_all1);
 | 
					 | 
				
			||||||
        mad_all2 = _mm_packs_epi32(mad_all2, mad_all2);
 | 
					 | 
				
			||||||
        intermediateC = _mm_packus_epi16(mad_all0, mad_all2);
 | 
					 | 
				
			||||||
      }
 | 
					 | 
				
			||||||
    }
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    // Transpose result (intermediate -> transpose3_x)
 | 
					 | 
				
			||||||
    {
 | 
					 | 
				
			||||||
      // 00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33
 | 
					 | 
				
			||||||
      // 40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73
 | 
					 | 
				
			||||||
      // 80 81 82 83 90 91 92 93 A0 A1 A2 A3 xx xx xx xx
 | 
					 | 
				
			||||||
      const __m128i transpose0_0 = _mm_unpacklo_epi8(intermediateA, intermediateB);
 | 
					 | 
				
			||||||
      const __m128i transpose0_1 = _mm_unpackhi_epi8(intermediateA, intermediateB);
 | 
					 | 
				
			||||||
      const __m128i transpose0_2 = _mm_unpacklo_epi8(intermediateC, intermediateC);
 | 
					 | 
				
			||||||
      const __m128i transpose0_3 = _mm_unpackhi_epi8(intermediateC, intermediateC);
 | 
					 | 
				
			||||||
      // 00 40 01 41 02 42 03 43 10 50 11 51 12 52 13 53
 | 
					 | 
				
			||||||
      // 20 60 21 61 22 62 23 63 30 70 31 71 32 72 33 73
 | 
					 | 
				
			||||||
      // 80 xx 81 xx 82 xx 83 xx 90 xx 91 xx 92 xx 93 xx
 | 
					 | 
				
			||||||
      // A0 xx A1 xx A2 xx A3 xx xx xx xx xx xx xx xx xx
 | 
					 | 
				
			||||||
      const __m128i transpose1_0 = _mm_unpacklo_epi8(transpose0_0, transpose0_1);
 | 
					 | 
				
			||||||
      const __m128i transpose1_1 = _mm_unpackhi_epi8(transpose0_0, transpose0_1);
 | 
					 | 
				
			||||||
      const __m128i transpose1_2 = _mm_unpacklo_epi8(transpose0_2, transpose0_3);
 | 
					 | 
				
			||||||
      const __m128i transpose1_3 = _mm_unpackhi_epi8(transpose0_2, transpose0_3);
 | 
					 | 
				
			||||||
      // 00 20 40 60 01 21 41 61 02 22 42 62 03 23 43 63
 | 
					 | 
				
			||||||
      // 10 30 50 70 11 31 51 71 12 32 52 72 13 33 53 73
 | 
					 | 
				
			||||||
      // 80 A0 xx xx 81 A1 xx xx 82 A2 xx xx 83 A3 xx xx
 | 
					 | 
				
			||||||
      // 90 xx xx xx 91 xx xx xx 92 xx xx xx 93 xx xx xx
 | 
					 | 
				
			||||||
      const __m128i transpose2_0 = _mm_unpacklo_epi8(transpose1_0, transpose1_1);
 | 
					 | 
				
			||||||
      const __m128i transpose2_1 = _mm_unpackhi_epi8(transpose1_0, transpose1_1);
 | 
					 | 
				
			||||||
      const __m128i transpose2_2 = _mm_unpacklo_epi8(transpose1_2, transpose1_3);
 | 
					 | 
				
			||||||
      const __m128i transpose2_3 = _mm_unpackhi_epi8(transpose1_2, transpose1_3);
 | 
					 | 
				
			||||||
      // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
 | 
					 | 
				
			||||||
      // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
 | 
					 | 
				
			||||||
      // 80 90 A0 xx xx xx xx xx 81 91 A1 xx xx xx xx xx
 | 
					 | 
				
			||||||
      // 82 92 A2 xx xx xx xx xx 83 93 A3 xx xx xx xx xx
 | 
					 | 
				
			||||||
      transpose3_0 = _mm_castps_si128(
 | 
					 | 
				
			||||||
                            _mm_shuffle_ps(_mm_castsi128_ps(transpose2_0),
 | 
					 | 
				
			||||||
                                           _mm_castsi128_ps(transpose2_2),
 | 
					 | 
				
			||||||
                                           _MM_SHUFFLE(1, 0, 1, 0)));
 | 
					 | 
				
			||||||
      transpose3_1 = _mm_castps_si128(
 | 
					 | 
				
			||||||
                            _mm_shuffle_ps(_mm_castsi128_ps(transpose2_0),
 | 
					 | 
				
			||||||
                                           _mm_castsi128_ps(transpose2_2),
 | 
					 | 
				
			||||||
                                           _MM_SHUFFLE(3, 2, 3, 2)));
 | 
					 | 
				
			||||||
      transpose3_2 = _mm_castps_si128(
 | 
					 | 
				
			||||||
                            _mm_shuffle_ps(_mm_castsi128_ps(transpose2_1),
 | 
					 | 
				
			||||||
                                           _mm_castsi128_ps(transpose2_3),
 | 
					 | 
				
			||||||
                                           _MM_SHUFFLE(1, 0, 1, 0)));
 | 
					 | 
				
			||||||
      transpose3_3 = _mm_castps_si128(
 | 
					 | 
				
			||||||
                            _mm_shuffle_ps(_mm_castsi128_ps(transpose2_1),
 | 
					 | 
				
			||||||
                                           _mm_castsi128_ps(transpose2_3),
 | 
					 | 
				
			||||||
                                           _MM_SHUFFLE(3, 2, 3, 2)));
 | 
					 | 
				
			||||||
      // 00 10 20 30 40 50 60 70 80 90 A0 xx xx xx xx xx
 | 
					 | 
				
			||||||
      // 01 11 21 31 41 51 61 71 81 91 A1 xx xx xx xx xx
 | 
					 | 
				
			||||||
      // 02 12 22 32 42 52 62 72 82 92 A2 xx xx xx xx xx
 | 
					 | 
				
			||||||
      // 03 13 23 33 43 53 63 73 83 93 A3 xx xx xx xx xx
 | 
					 | 
				
			||||||
    }
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    // Vertical pass (transpose3_x -> dst).
 | 
					 | 
				
			||||||
    {
 | 
					 | 
				
			||||||
      const __m128i VFilter = _mm_load_si128((const __m128i *)VFilter_aligned16);
 | 
					 | 
				
			||||||
      // get first two columns filter coefficients
 | 
					 | 
				
			||||||
      __m128i fil01 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(0, 0, 0, 0));
 | 
					 | 
				
			||||||
      __m128i fil23 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(1, 1, 1, 1));
 | 
					 | 
				
			||||||
      __m128i fil45 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(2, 2, 2, 2));
 | 
					 | 
				
			||||||
      __m128i fil67 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(3, 3, 3, 3));
 | 
					 | 
				
			||||||
      __m128i col0, col1, col2, col3;
 | 
					 | 
				
			||||||
        DECLARE_ALIGNED(16, unsigned char, temp[32]);
 | 
					 | 
				
			||||||
      {
 | 
					 | 
				
			||||||
        _mm_store_si128((__m128i *)temp, transpose3_0);
 | 
					 | 
				
			||||||
        DO_FOUR_PIXELS(col0, temp, 0);
 | 
					 | 
				
			||||||
      }
 | 
					 | 
				
			||||||
      {
 | 
					 | 
				
			||||||
        _mm_store_si128((__m128i *)temp, transpose3_1);
 | 
					 | 
				
			||||||
        DO_FOUR_PIXELS(col1, temp, 0);
 | 
					 | 
				
			||||||
      }
 | 
					 | 
				
			||||||
      {
 | 
					 | 
				
			||||||
        _mm_store_si128((__m128i *)temp, transpose3_2);
 | 
					 | 
				
			||||||
        DO_FOUR_PIXELS(col2, temp, 0);
 | 
					 | 
				
			||||||
      }
 | 
					 | 
				
			||||||
      {
 | 
					 | 
				
			||||||
        _mm_store_si128((__m128i *)temp, transpose3_3);
 | 
					 | 
				
			||||||
        DO_FOUR_PIXELS(col3, temp, 0);
 | 
					 | 
				
			||||||
      }
 | 
					 | 
				
			||||||
      // transpose
 | 
					 | 
				
			||||||
      {
 | 
					 | 
				
			||||||
        __m128i T0 = _mm_unpacklo_epi32(col0, col1);
 | 
					 | 
				
			||||||
        __m128i T1 = _mm_unpacklo_epi32(col2, col3);
 | 
					 | 
				
			||||||
        __m128i T2 = _mm_unpackhi_epi32(col0, col1);
 | 
					 | 
				
			||||||
        __m128i T3 = _mm_unpackhi_epi32(col2, col3);
 | 
					 | 
				
			||||||
        col0 = _mm_unpacklo_epi64(T0, T1);
 | 
					 | 
				
			||||||
        col1 = _mm_unpackhi_epi64(T0, T1);
 | 
					 | 
				
			||||||
        col2 = _mm_unpacklo_epi64(T2, T3);
 | 
					 | 
				
			||||||
        col3 = _mm_unpackhi_epi64(T2, T3);
 | 
					 | 
				
			||||||
      }
 | 
					 | 
				
			||||||
      // saturate to 8 bit
 | 
					 | 
				
			||||||
      {
 | 
					 | 
				
			||||||
        col0 = _mm_packs_epi32(col0, col0);
 | 
					 | 
				
			||||||
        col0 = _mm_packus_epi16(col0, col0);
 | 
					 | 
				
			||||||
        col1 = _mm_packs_epi32(col1, col1);
 | 
					 | 
				
			||||||
        col1 = _mm_packus_epi16(col1, col1);
 | 
					 | 
				
			||||||
        col2 = _mm_packs_epi32 (col2, col2);
 | 
					 | 
				
			||||||
        col2 = _mm_packus_epi16(col2, col2);
 | 
					 | 
				
			||||||
        col3 = _mm_packs_epi32 (col3, col3);
 | 
					 | 
				
			||||||
        col3 = _mm_packus_epi16(col3, col3);
 | 
					 | 
				
			||||||
      }
 | 
					 | 
				
			||||||
      // store
 | 
					 | 
				
			||||||
      {
 | 
					 | 
				
			||||||
        *((unsigned int *)&dst_ptr[dst_stride * 0]) = _mm_cvtsi128_si32(col0);
 | 
					 | 
				
			||||||
        *((unsigned int *)&dst_ptr[dst_stride * 1]) = _mm_cvtsi128_si32(col1);
 | 
					 | 
				
			||||||
        *((unsigned int *)&dst_ptr[dst_stride * 2]) = _mm_cvtsi128_si32(col2);
 | 
					 | 
				
			||||||
        *((unsigned int *)&dst_ptr[dst_stride * 3]) = _mm_cvtsi128_si32(col3);
 | 
					 | 
				
			||||||
      }
 | 
					 | 
				
			||||||
    }
 | 
					 | 
				
			||||||
  }
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
void vp9_filter_block2d_8x4_8_sse2
 | 
					 | 
				
			||||||
(
 | 
					 | 
				
			||||||
 const unsigned char *src_ptr, const unsigned int src_stride,
 | 
					 | 
				
			||||||
 const short *HFilter_aligned16, const short *VFilter_aligned16,
 | 
					 | 
				
			||||||
 unsigned char *dst_ptr, unsigned int dst_stride
 | 
					 | 
				
			||||||
) {
 | 
					 | 
				
			||||||
  int j;
 | 
					 | 
				
			||||||
  for (j=0; j<8; j+=4) {
 | 
					 | 
				
			||||||
    vp9_filter_block2d_4x4_8_sse2(src_ptr + j, src_stride,
 | 
					 | 
				
			||||||
                                  HFilter_aligned16, VFilter_aligned16,
 | 
					 | 
				
			||||||
                                  dst_ptr + j, dst_stride);
 | 
					 | 
				
			||||||
  }
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
void vp9_filter_block2d_8x8_8_sse2
 | 
					 | 
				
			||||||
(
 | 
					 | 
				
			||||||
 const unsigned char *src_ptr, const unsigned int src_stride,
 | 
					 | 
				
			||||||
 const short *HFilter_aligned16, const short *VFilter_aligned16,
 | 
					 | 
				
			||||||
 unsigned char *dst_ptr, unsigned int dst_stride
 | 
					 | 
				
			||||||
) {
 | 
					 | 
				
			||||||
  int i, j;
 | 
					 | 
				
			||||||
  for (i=0; i<8; i+=4) {
 | 
					 | 
				
			||||||
    for (j=0; j<8; j+=4) {
 | 
					 | 
				
			||||||
      vp9_filter_block2d_4x4_8_sse2(src_ptr + j + i*src_stride, src_stride,
 | 
					 | 
				
			||||||
                                    HFilter_aligned16, VFilter_aligned16,
 | 
					 | 
				
			||||||
                                    dst_ptr + j + i*dst_stride, dst_stride);
 | 
					 | 
				
			||||||
    }
 | 
					 | 
				
			||||||
  }
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
void vp9_filter_block2d_16x16_8_sse2
 | 
					 | 
				
			||||||
(
 | 
					 | 
				
			||||||
 const unsigned char *src_ptr, const unsigned int src_stride,
 | 
					 | 
				
			||||||
 const short *HFilter_aligned16, const short *VFilter_aligned16,
 | 
					 | 
				
			||||||
 unsigned char *dst_ptr, unsigned int dst_stride
 | 
					 | 
				
			||||||
) {
 | 
					 | 
				
			||||||
  int i, j;
 | 
					 | 
				
			||||||
  for (i=0; i<16; i+=4) {
 | 
					 | 
				
			||||||
    for (j=0; j<16; j+=4) {
 | 
					 | 
				
			||||||
      vp9_filter_block2d_4x4_8_sse2(src_ptr + j + i*src_stride, src_stride,
 | 
					 | 
				
			||||||
                                    HFilter_aligned16, VFilter_aligned16,
 | 
					 | 
				
			||||||
                                    dst_ptr + j + i*dst_stride, dst_stride);
 | 
					 | 
				
			||||||
    }
 | 
					 | 
				
			||||||
  }
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
@@ -1,362 +0,0 @@
 | 
				
			|||||||
/*
 | 
					 | 
				
			||||||
 *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
 | 
					 | 
				
			||||||
 *
 | 
					 | 
				
			||||||
 *  Use of this source code is governed by a BSD-style license
 | 
					 | 
				
			||||||
 *  that can be found in the LICENSE file in the root of the source
 | 
					 | 
				
			||||||
 *  tree. An additional intellectual property rights grant can be found
 | 
					 | 
				
			||||||
 *  in the file PATENTS.  All contributing project authors may
 | 
					 | 
				
			||||||
 *  be found in the AUTHORS file in the root of the source tree.
 | 
					 | 
				
			||||||
 */
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#include <assert.h> // for alignment checks
 | 
					 | 
				
			||||||
#include <smmintrin.h> // SSE4.1
 | 
					 | 
				
			||||||
#include "vp9/common/vp9_filter.h"
 | 
					 | 
				
			||||||
#include "vpx_ports/mem.h" // for DECLARE_ALIGNED
 | 
					 | 
				
			||||||
#include "vp9_rtcd.h"
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
// TODO(cd): After cleanup, commit faster versions for non 4x4 size. This is
 | 
					 | 
				
			||||||
//           just a quick partial snapshot so that other can already use some
 | 
					 | 
				
			||||||
//           speedup.
 | 
					 | 
				
			||||||
// TODO(cd): Use vectorized 8 tap filtering code as speedup to pure C 6 tap
 | 
					 | 
				
			||||||
//           filtering.
 | 
					 | 
				
			||||||
// TODO(cd): Reduce source size by using macros instead of current code
 | 
					 | 
				
			||||||
//           duplication.
 | 
					 | 
				
			||||||
// TODO(cd): Add some comments, better variable naming.
 | 
					 | 
				
			||||||
// TODO(cd): Maybe use _mm_maddubs_epi16 if smaller filter coeficients (no sum
 | 
					 | 
				
			||||||
//           of positive above 128), or have higher precision filter
 | 
					 | 
				
			||||||
//           coefficients.
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
DECLARE_ALIGNED(16, static const unsigned char, mask0123_c[16]) = {
 | 
					 | 
				
			||||||
  0x00, 0x01,
 | 
					 | 
				
			||||||
  0x01, 0x02,
 | 
					 | 
				
			||||||
  0x02, 0x03,
 | 
					 | 
				
			||||||
  0x03, 0x04,
 | 
					 | 
				
			||||||
  0x02, 0x03,
 | 
					 | 
				
			||||||
  0x03, 0x04,
 | 
					 | 
				
			||||||
  0x04, 0x05,
 | 
					 | 
				
			||||||
  0x05, 0x06,
 | 
					 | 
				
			||||||
};
 | 
					 | 
				
			||||||
DECLARE_ALIGNED(16, static const unsigned char, mask4567_c[16]) = {
 | 
					 | 
				
			||||||
  0x04, 0x05,
 | 
					 | 
				
			||||||
  0x05, 0x06,
 | 
					 | 
				
			||||||
  0x06, 0x07,
 | 
					 | 
				
			||||||
  0x07, 0x08,
 | 
					 | 
				
			||||||
  0x06, 0x07,
 | 
					 | 
				
			||||||
  0x07, 0x08,
 | 
					 | 
				
			||||||
  0x08, 0x09,
 | 
					 | 
				
			||||||
  0x09, 0x0A,
 | 
					 | 
				
			||||||
};
 | 
					 | 
				
			||||||
DECLARE_ALIGNED(16, static const unsigned int, rounding_c[4]) = {
 | 
					 | 
				
			||||||
  VP9_FILTER_WEIGHT >> 1,
 | 
					 | 
				
			||||||
  VP9_FILTER_WEIGHT >> 1,
 | 
					 | 
				
			||||||
  VP9_FILTER_WEIGHT >> 1,
 | 
					 | 
				
			||||||
  VP9_FILTER_WEIGHT >> 1,
 | 
					 | 
				
			||||||
};
 | 
					 | 
				
			||||||
DECLARE_ALIGNED(16, static const unsigned char, transpose_c[16]) = {
 | 
					 | 
				
			||||||
  0, 4,  8, 12,
 | 
					 | 
				
			||||||
  1, 5,  9, 13,
 | 
					 | 
				
			||||||
  2, 6, 10, 14,
 | 
					 | 
				
			||||||
  3, 7, 11, 15
 | 
					 | 
				
			||||||
};
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
// Creating a macro to do more than four pixels at once to hide instruction
 | 
					 | 
				
			||||||
// latency is actually slower :-(
 | 
					 | 
				
			||||||
// Filters four horizontally adjacent pixels of one source row with an 8-tap
// kernel.
//   result: __m128i lvalue that receives four 32-bit sums, each rounded and
//           arithmetically shifted right by VP9_FILTER_SHIFT. The sums are
//           not clamped to 8 bits here.
//   offset: offset added to src_ptr to reach the first byte to load; 16 bytes
//           are read from that address with an unaligned load.
// The expansion uses these names from the surrounding scope: src_ptr, zero,
// mask0123, mask4567, rounding, fil01, fil23, fil45 and fil67.
// Both arguments are parenthesized in the expansion so that any expression
// can be passed safely. The expansion is deliberately a bare brace block
// (not do/while) because existing call sites omit the trailing semicolon.
#define DO_FOUR_PIXELS(result, offset) \
  { \
    /* load 16 source bytes starting at the requested offset */ \
    __m128i src = _mm_loadu_si128((const __m128i *)(src_ptr + (offset))); \
    /* gather the overlapping byte pairs each output pixel needs */ \
    __m128i src0123 = _mm_shuffle_epi8(src, mask0123); \
    __m128i src4567 = _mm_shuffle_epi8(src, mask4567); \
    /* widen the bytes to 16 bits by interleaving with zero */ \
    __m128i src01_16 = _mm_unpacklo_epi8(src0123, zero); \
    __m128i src23_16 = _mm_unpackhi_epi8(src0123, zero); \
    __m128i src45_16 = _mm_unpacklo_epi8(src4567, zero); \
    __m128i src67_16 = _mm_unpackhi_epi8(src4567, zero); \
    /* multiply by the coefficient pairs and add adjacent products */ \
    __m128i mad01 = _mm_madd_epi16(src01_16, fil01); \
    __m128i mad23 = _mm_madd_epi16(src23_16, fil23); \
    __m128i mad45 = _mm_madd_epi16(src45_16, fil45); \
    __m128i mad67 = _mm_madd_epi16(src67_16, fil67); \
    /* combine the four partial sums into the full 8-tap sum */ \
    __m128i mad0123 = _mm_add_epi32(mad01, mad23); \
    __m128i mad4567 = _mm_add_epi32(mad45, mad67); \
    __m128i mad_all = _mm_add_epi32(mad0123, mad4567); \
    /* round, then scale back down */ \
    mad_all = _mm_add_epi32(mad_all, rounding); \
    (result) = _mm_srai_epi32(mad_all, VP9_FILTER_SHIFT); \
  }
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
void vp9_filter_block2d_4x4_8_sse4_1
 | 
					 | 
				
			||||||
(
 | 
					 | 
				
			||||||
 const unsigned char *src_ptr, const unsigned int src_stride,
 | 
					 | 
				
			||||||
 const short *HFilter_aligned16, const short *VFilter_aligned16,
 | 
					 | 
				
			||||||
 unsigned char *dst_ptr, unsigned int dst_stride
 | 
					 | 
				
			||||||
) {
 | 
					 | 
				
			||||||
  __m128i intermediateA, intermediateB, intermediateC;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  const int kInterp_Extend = 4;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  const __m128i zero = _mm_set1_epi16(0);
 | 
					 | 
				
			||||||
  const __m128i mask0123 = _mm_load_si128((const __m128i *)mask0123_c);
 | 
					 | 
				
			||||||
  const __m128i mask4567 = _mm_load_si128((const __m128i *)mask4567_c);
 | 
					 | 
				
			||||||
  const __m128i rounding = _mm_load_si128((const __m128i *)rounding_c);
 | 
					 | 
				
			||||||
  const __m128i transpose = _mm_load_si128((const __m128i *)transpose_c);
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  // check alignment
 | 
					 | 
				
			||||||
  assert(0 == ((long)HFilter_aligned16)%16);
 | 
					 | 
				
			||||||
  assert(0 == ((long)VFilter_aligned16)%16);
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  {
 | 
					 | 
				
			||||||
    __m128i transpose3_0;
 | 
					 | 
				
			||||||
    __m128i transpose3_1;
 | 
					 | 
				
			||||||
    __m128i transpose3_2;
 | 
					 | 
				
			||||||
    __m128i transpose3_3;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    // Horizontal pass (src -> intermediate).
 | 
					 | 
				
			||||||
    {
 | 
					 | 
				
			||||||
      const __m128i HFilter = _mm_load_si128((const __m128i *)HFilter_aligned16);
 | 
					 | 
				
			||||||
      // get first two columns filter coefficients
 | 
					 | 
				
			||||||
      __m128i fil01 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(0, 0, 0, 0));
 | 
					 | 
				
			||||||
      __m128i fil23 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(1, 1, 1, 1));
 | 
					 | 
				
			||||||
      __m128i fil45 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(2, 2, 2, 2));
 | 
					 | 
				
			||||||
      __m128i fil67 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(3, 3, 3, 3));
 | 
					 | 
				
			||||||
      src_ptr -= (kInterp_Extend - 1) * src_stride + (kInterp_Extend - 1);
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
      {
 | 
					 | 
				
			||||||
        __m128i mad_all0;
 | 
					 | 
				
			||||||
        __m128i mad_all1;
 | 
					 | 
				
			||||||
        __m128i mad_all2;
 | 
					 | 
				
			||||||
        __m128i mad_all3;
 | 
					 | 
				
			||||||
        DO_FOUR_PIXELS(mad_all0, 0*src_stride)
 | 
					 | 
				
			||||||
        DO_FOUR_PIXELS(mad_all1, 1*src_stride)
 | 
					 | 
				
			||||||
        DO_FOUR_PIXELS(mad_all2, 2*src_stride)
 | 
					 | 
				
			||||||
        DO_FOUR_PIXELS(mad_all3, 3*src_stride)
 | 
					 | 
				
			||||||
        mad_all0 = _mm_packs_epi32(mad_all0, mad_all1);
 | 
					 | 
				
			||||||
        mad_all2 = _mm_packs_epi32(mad_all2, mad_all3);
 | 
					 | 
				
			||||||
        intermediateA = _mm_packus_epi16(mad_all0, mad_all2);
 | 
					 | 
				
			||||||
        // --
 | 
					 | 
				
			||||||
        src_ptr += src_stride*4;
 | 
					 | 
				
			||||||
        // --
 | 
					 | 
				
			||||||
        DO_FOUR_PIXELS(mad_all0, 0*src_stride)
 | 
					 | 
				
			||||||
        DO_FOUR_PIXELS(mad_all1, 1*src_stride)
 | 
					 | 
				
			||||||
        DO_FOUR_PIXELS(mad_all2, 2*src_stride)
 | 
					 | 
				
			||||||
        DO_FOUR_PIXELS(mad_all3, 3*src_stride)
 | 
					 | 
				
			||||||
        mad_all0 = _mm_packs_epi32(mad_all0, mad_all1);
 | 
					 | 
				
			||||||
        mad_all2 = _mm_packs_epi32(mad_all2, mad_all3);
 | 
					 | 
				
			||||||
        intermediateB = _mm_packus_epi16(mad_all0, mad_all2);
 | 
					 | 
				
			||||||
        // --
 | 
					 | 
				
			||||||
        src_ptr += src_stride*4;
 | 
					 | 
				
			||||||
        // --
 | 
					 | 
				
			||||||
        DO_FOUR_PIXELS(mad_all0, 0*src_stride)
 | 
					 | 
				
			||||||
        DO_FOUR_PIXELS(mad_all1, 1*src_stride)
 | 
					 | 
				
			||||||
        DO_FOUR_PIXELS(mad_all2, 2*src_stride)
 | 
					 | 
				
			||||||
        mad_all0 = _mm_packs_epi32(mad_all0, mad_all1);
 | 
					 | 
				
			||||||
        mad_all2 = _mm_packs_epi32(mad_all2, mad_all2);
 | 
					 | 
				
			||||||
        intermediateC = _mm_packus_epi16(mad_all0, mad_all2);
 | 
					 | 
				
			||||||
      }
 | 
					 | 
				
			||||||
    }
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    // Transpose result (intermediate -> transpose3_x)
 | 
					 | 
				
			||||||
    {
 | 
					 | 
				
			||||||
      // 00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33
 | 
					 | 
				
			||||||
      // 40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73
 | 
					 | 
				
			||||||
      // 80 81 82 83 90 91 92 93 A0 A1 A2 A3 xx xx xx xx
 | 
					 | 
				
			||||||
      const __m128i transpose1_0 = _mm_shuffle_epi8(intermediateA, transpose);
 | 
					 | 
				
			||||||
      const __m128i transpose1_1 = _mm_shuffle_epi8(intermediateB, transpose);
 | 
					 | 
				
			||||||
      const __m128i transpose1_2 = _mm_shuffle_epi8(intermediateC, transpose);
 | 
					 | 
				
			||||||
      // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
 | 
					 | 
				
			||||||
      // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
 | 
					 | 
				
			||||||
      // 80 90 A0 xx 81 91 A1 xx 82 92 A2 xx 83 93 A3 xx
 | 
					 | 
				
			||||||
      const __m128i transpose2_0 = _mm_unpacklo_epi32(transpose1_0, transpose1_1);
 | 
					 | 
				
			||||||
      const __m128i transpose2_1 = _mm_unpackhi_epi32(transpose1_0, transpose1_1);
 | 
					 | 
				
			||||||
      // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
 | 
					 | 
				
			||||||
      // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
 | 
					 | 
				
			||||||
      transpose3_0 = _mm_castps_si128(
 | 
					 | 
				
			||||||
                            _mm_shuffle_ps(_mm_castsi128_ps(transpose2_0),
 | 
					 | 
				
			||||||
                                           _mm_castsi128_ps(transpose1_2),
 | 
					 | 
				
			||||||
                                           _MM_SHUFFLE(0, 0, 1, 0)));
 | 
					 | 
				
			||||||
      transpose3_1 = _mm_castps_si128(
 | 
					 | 
				
			||||||
                            _mm_shuffle_ps(_mm_castsi128_ps(transpose2_0),
 | 
					 | 
				
			||||||
                                           _mm_castsi128_ps(transpose1_2),
 | 
					 | 
				
			||||||
                                           _MM_SHUFFLE(1, 1, 3, 2)));
 | 
					 | 
				
			||||||
      transpose3_2 = _mm_castps_si128(
 | 
					 | 
				
			||||||
                            _mm_shuffle_ps(_mm_castsi128_ps(transpose2_1),
 | 
					 | 
				
			||||||
                                           _mm_castsi128_ps(transpose1_2),
 | 
					 | 
				
			||||||
                                           _MM_SHUFFLE(2, 2, 1, 0)));
 | 
					 | 
				
			||||||
      transpose3_3 = _mm_castps_si128(
 | 
					 | 
				
			||||||
                            _mm_shuffle_ps(_mm_castsi128_ps(transpose2_1),
 | 
					 | 
				
			||||||
                                           _mm_castsi128_ps(transpose1_2),
 | 
					 | 
				
			||||||
                                           _MM_SHUFFLE(3, 3, 3, 2)));
 | 
					 | 
				
			||||||
      // 00 10 20 30 40 50 60 70 80 90 A0 xx xx xx xx xx
 | 
					 | 
				
			||||||
      // 01 11 21 31 41 51 61 71 81 91 A1 xx xx xx xx xx
 | 
					 | 
				
			||||||
      // 02 12 22 32 42 52 62 72 82 92 A2 xx xx xx xx xx
 | 
					 | 
				
			||||||
      // 03 13 23 33 43 53 63 73 83 93 A3 xx xx xx xx xx
 | 
					 | 
				
			||||||
    }
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    // Vertical pass (transpose3_x -> dst).
 | 
					 | 
				
			||||||
    {
 | 
					 | 
				
			||||||
      const __m128i VFilter = _mm_load_si128((const __m128i *)VFilter_aligned16);
 | 
					 | 
				
			||||||
      // get first two columns filter coefficients
 | 
					 | 
				
			||||||
      __m128i fil01 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(0, 0, 0, 0));
 | 
					 | 
				
			||||||
      __m128i fil23 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(1, 1, 1, 1));
 | 
					 | 
				
			||||||
      __m128i fil45 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(2, 2, 2, 2));
 | 
					 | 
				
			||||||
      __m128i fil67 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(3, 3, 3, 3));
 | 
					 | 
				
			||||||
      __m128i col0, col1, col2, col3;
 | 
					 | 
				
			||||||
      {
 | 
					 | 
				
			||||||
        //load pixels
 | 
					 | 
				
			||||||
        __m128i src  = transpose3_0;
 | 
					 | 
				
			||||||
        // extract the ones used for first column
 | 
					 | 
				
			||||||
        __m128i src0123 = _mm_shuffle_epi8(src, mask0123);
 | 
					 | 
				
			||||||
        __m128i src4567 = _mm_shuffle_epi8(src, mask4567);
 | 
					 | 
				
			||||||
        __m128i src01_16 = _mm_unpacklo_epi8(src0123, zero);
 | 
					 | 
				
			||||||
        __m128i src23_16 = _mm_unpackhi_epi8(src0123, zero);
 | 
					 | 
				
			||||||
        __m128i src45_16 = _mm_unpacklo_epi8(src4567, zero);
 | 
					 | 
				
			||||||
        __m128i src67_16 = _mm_unpackhi_epi8(src4567, zero);
 | 
					 | 
				
			||||||
        // multiply accumulate them
 | 
					 | 
				
			||||||
        __m128i mad01 = _mm_madd_epi16(src01_16, fil01);
 | 
					 | 
				
			||||||
        __m128i mad23 = _mm_madd_epi16(src23_16, fil23);
 | 
					 | 
				
			||||||
        __m128i mad45 = _mm_madd_epi16(src45_16, fil45);
 | 
					 | 
				
			||||||
        __m128i mad67 = _mm_madd_epi16(src67_16, fil67);
 | 
					 | 
				
			||||||
        __m128i mad0123 = _mm_add_epi32(mad01, mad23);
 | 
					 | 
				
			||||||
        __m128i mad4567 = _mm_add_epi32(mad45, mad67);
 | 
					 | 
				
			||||||
        __m128i mad_all = _mm_add_epi32(mad0123, mad4567);
 | 
					 | 
				
			||||||
        mad_all = _mm_add_epi32(mad_all, rounding);
 | 
					 | 
				
			||||||
        mad_all = _mm_srai_epi32(mad_all, VP9_FILTER_SHIFT);
 | 
					 | 
				
			||||||
        mad_all = _mm_packs_epi32(mad_all, mad_all);
 | 
					 | 
				
			||||||
        col0 = _mm_packus_epi16(mad_all, mad_all);
 | 
					 | 
				
			||||||
      }
 | 
					 | 
				
			||||||
      {
 | 
					 | 
				
			||||||
        //load pixels
 | 
					 | 
				
			||||||
        __m128i src  = transpose3_1;
 | 
					 | 
				
			||||||
        // extract the ones used for first column
 | 
					 | 
				
			||||||
        __m128i src0123 = _mm_shuffle_epi8(src, mask0123);
 | 
					 | 
				
			||||||
        __m128i src4567 = _mm_shuffle_epi8(src, mask4567);
 | 
					 | 
				
			||||||
        __m128i src01_16 = _mm_unpacklo_epi8(src0123, zero);
 | 
					 | 
				
			||||||
        __m128i src23_16 = _mm_unpackhi_epi8(src0123, zero);
 | 
					 | 
				
			||||||
        __m128i src45_16 = _mm_unpacklo_epi8(src4567, zero);
 | 
					 | 
				
			||||||
        __m128i src67_16 = _mm_unpackhi_epi8(src4567, zero);
 | 
					 | 
				
			||||||
        // multiply accumulate them
 | 
					 | 
				
			||||||
        __m128i mad01 = _mm_madd_epi16(src01_16, fil01);
 | 
					 | 
				
			||||||
        __m128i mad23 = _mm_madd_epi16(src23_16, fil23);
 | 
					 | 
				
			||||||
        __m128i mad45 = _mm_madd_epi16(src45_16, fil45);
 | 
					 | 
				
			||||||
        __m128i mad67 = _mm_madd_epi16(src67_16, fil67);
 | 
					 | 
				
			||||||
        __m128i mad0123 = _mm_add_epi32(mad01, mad23);
 | 
					 | 
				
			||||||
        __m128i mad4567 = _mm_add_epi32(mad45, mad67);
 | 
					 | 
				
			||||||
        __m128i mad_all = _mm_add_epi32(mad0123, mad4567);
 | 
					 | 
				
			||||||
        mad_all = _mm_add_epi32(mad_all, rounding);
 | 
					 | 
				
			||||||
        mad_all = _mm_srai_epi32(mad_all, VP9_FILTER_SHIFT);
 | 
					 | 
				
			||||||
        mad_all = _mm_packs_epi32(mad_all, mad_all);
 | 
					 | 
				
			||||||
        col1 = _mm_packus_epi16(mad_all, mad_all);
 | 
					 | 
				
			||||||
      }
 | 
					 | 
				
			||||||
      {
 | 
					 | 
				
			||||||
        //load pixels
 | 
					 | 
				
			||||||
        __m128i src  = transpose3_2;
 | 
					 | 
				
			||||||
        // extract the ones used for first column
 | 
					 | 
				
			||||||
        __m128i src0123 = _mm_shuffle_epi8(src, mask0123);
 | 
					 | 
				
			||||||
        __m128i src4567 = _mm_shuffle_epi8(src, mask4567);
 | 
					 | 
				
			||||||
        __m128i src01_16 = _mm_unpacklo_epi8(src0123, zero);
 | 
					 | 
				
			||||||
        __m128i src23_16 = _mm_unpackhi_epi8(src0123, zero);
 | 
					 | 
				
			||||||
        __m128i src45_16 = _mm_unpacklo_epi8(src4567, zero);
 | 
					 | 
				
			||||||
        __m128i src67_16 = _mm_unpackhi_epi8(src4567, zero);
 | 
					 | 
				
			||||||
        // multiply accumulate them
 | 
					 | 
				
			||||||
        __m128i mad01 = _mm_madd_epi16(src01_16, fil01);
 | 
					 | 
				
			||||||
        __m128i mad23 = _mm_madd_epi16(src23_16, fil23);
 | 
					 | 
				
			||||||
        __m128i mad45 = _mm_madd_epi16(src45_16, fil45);
 | 
					 | 
				
			||||||
        __m128i mad67 = _mm_madd_epi16(src67_16, fil67);
 | 
					 | 
				
			||||||
        __m128i mad0123 = _mm_add_epi32(mad01, mad23);
 | 
					 | 
				
			||||||
        __m128i mad4567 = _mm_add_epi32(mad45, mad67);
 | 
					 | 
				
			||||||
        __m128i mad_all = _mm_add_epi32(mad0123, mad4567);
 | 
					 | 
				
			||||||
        mad_all = _mm_add_epi32(mad_all, rounding);
 | 
					 | 
				
			||||||
        mad_all = _mm_srai_epi32(mad_all, VP9_FILTER_SHIFT);
 | 
					 | 
				
			||||||
        mad_all = _mm_packs_epi32(mad_all, mad_all);
 | 
					 | 
				
			||||||
        col2 = _mm_packus_epi16(mad_all, mad_all);
 | 
					 | 
				
			||||||
      }
 | 
					 | 
				
			||||||
      {
 | 
					 | 
				
			||||||
        //load pixels
 | 
					 | 
				
			||||||
        __m128i src  = transpose3_3;
 | 
					 | 
				
			||||||
        // extract the ones used for first column
 | 
					 | 
				
			||||||
        __m128i src0123 = _mm_shuffle_epi8(src, mask0123);
 | 
					 | 
				
			||||||
        __m128i src4567 = _mm_shuffle_epi8(src, mask4567);
 | 
					 | 
				
			||||||
        __m128i src01_16 = _mm_unpacklo_epi8(src0123, zero);
 | 
					 | 
				
			||||||
        __m128i src23_16 = _mm_unpackhi_epi8(src0123, zero);
 | 
					 | 
				
			||||||
        __m128i src45_16 = _mm_unpacklo_epi8(src4567, zero);
 | 
					 | 
				
			||||||
        __m128i src67_16 = _mm_unpackhi_epi8(src4567, zero);
 | 
					 | 
				
			||||||
        // multiply accumulate them
 | 
					 | 
				
			||||||
        __m128i mad01 = _mm_madd_epi16(src01_16, fil01);
 | 
					 | 
				
			||||||
        __m128i mad23 = _mm_madd_epi16(src23_16, fil23);
 | 
					 | 
				
			||||||
        __m128i mad45 = _mm_madd_epi16(src45_16, fil45);
 | 
					 | 
				
			||||||
        __m128i mad67 = _mm_madd_epi16(src67_16, fil67);
 | 
					 | 
				
			||||||
        __m128i mad0123 = _mm_add_epi32(mad01, mad23);
 | 
					 | 
				
			||||||
        __m128i mad4567 = _mm_add_epi32(mad45, mad67);
 | 
					 | 
				
			||||||
        __m128i mad_all = _mm_add_epi32(mad0123, mad4567);
 | 
					 | 
				
			||||||
        mad_all = _mm_add_epi32(mad_all, rounding);
 | 
					 | 
				
			||||||
        mad_all = _mm_srai_epi32(mad_all, VP9_FILTER_SHIFT);
 | 
					 | 
				
			||||||
        mad_all = _mm_packs_epi32(mad_all, mad_all);
 | 
					 | 
				
			||||||
        col3 = _mm_packus_epi16(mad_all, mad_all);
 | 
					 | 
				
			||||||
      }
 | 
					 | 
				
			||||||
      {
 | 
					 | 
				
			||||||
        __m128i col01 = _mm_unpacklo_epi8(col0, col1);
 | 
					 | 
				
			||||||
        __m128i col23 = _mm_unpacklo_epi8(col2, col3);
 | 
					 | 
				
			||||||
        __m128i col0123 = _mm_unpacklo_epi16(col01, col23);
 | 
					 | 
				
			||||||
        //TODO(cd): look into Ronald's comment:
 | 
					 | 
				
			||||||
        //    Future suggestion: I believe here, too, you can merge the
 | 
					 | 
				
			||||||
        //    packs_epi32() and pacus_epi16() for the 4 cols above, so that
 | 
					 | 
				
			||||||
        //    you get the data in a single register, and then use pshufb
 | 
					 | 
				
			||||||
        //    (shuffle_epi8()) instead of the unpacks here. Should be
 | 
					 | 
				
			||||||
        //    2+3+2 instructions faster.
 | 
					 | 
				
			||||||
        *((unsigned int *)&dst_ptr[dst_stride * 0]) =
 | 
					 | 
				
			||||||
            _mm_extract_epi32(col0123, 0);
 | 
					 | 
				
			||||||
        *((unsigned int *)&dst_ptr[dst_stride * 1]) =
 | 
					 | 
				
			||||||
            _mm_extract_epi32(col0123, 1);
 | 
					 | 
				
			||||||
        *((unsigned int *)&dst_ptr[dst_stride * 2]) =
 | 
					 | 
				
			||||||
            _mm_extract_epi32(col0123, 2);
 | 
					 | 
				
			||||||
        *((unsigned int *)&dst_ptr[dst_stride * 3]) =
 | 
					 | 
				
			||||||
            _mm_extract_epi32(col0123, 3);
 | 
					 | 
				
			||||||
      }
 | 
					 | 
				
			||||||
    }
 | 
					 | 
				
			||||||
  }
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
// Filters a block 8 pixels wide and 4 pixels high by running the 4x4 kernel
// on two side-by-side tiles (column offsets 0 and 4). Parameters have the
// same meaning as for vp9_filter_block2d_4x4_8_sse4_1.
void vp9_filter_block2d_8x4_8_sse4_1(const unsigned char *src_ptr,
                                     const unsigned int src_stride,
                                     const short *HFilter_aligned16,
                                     const short *VFilter_aligned16,
                                     unsigned char *dst_ptr,
                                     unsigned int dst_stride) {
  int x = 0;
  while (x < 8) {
    vp9_filter_block2d_4x4_8_sse4_1(&src_ptr[x], src_stride,
                                    HFilter_aligned16, VFilter_aligned16,
                                    &dst_ptr[x], dst_stride);
    x += 4;
  }
}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
// Filters an 8x8 block as a 2x2 grid of 4x4 tiles, visiting the tiles row by
// row and left to right. Parameters have the same meaning as for
// vp9_filter_block2d_4x4_8_sse4_1.
void vp9_filter_block2d_8x8_8_sse4_1(const unsigned char *src_ptr,
                                     const unsigned int src_stride,
                                     const short *HFilter_aligned16,
                                     const short *VFilter_aligned16,
                                     unsigned char *dst_ptr,
                                     unsigned int dst_stride) {
  int y, x;
  for (y = 0; y < 8; y += 4) {
    // First pixel of this band of tiles in the source and the destination.
    const unsigned char *const src_row = src_ptr + y * src_stride;
    unsigned char *const dst_row = dst_ptr + y * dst_stride;
    for (x = 0; x < 8; x += 4) {
      vp9_filter_block2d_4x4_8_sse4_1(src_row + x, src_stride,
                                      HFilter_aligned16, VFilter_aligned16,
                                      dst_row + x, dst_stride);
    }
  }
}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
// Filters a 16x16 block as a 4x4 grid of 4x4 tiles, visiting the tiles row by
// row and left to right. Parameters have the same meaning as for
// vp9_filter_block2d_4x4_8_sse4_1.
void vp9_filter_block2d_16x16_8_sse4_1(const unsigned char *src_ptr,
                                       const unsigned int src_stride,
                                       const short *HFilter_aligned16,
                                       const short *VFilter_aligned16,
                                       unsigned char *dst_ptr,
                                       unsigned int dst_stride) {
  int y, x;
  for (y = 0; y < 16; y += 4) {
    // First pixel of this band of tiles in the source and the destination.
    const unsigned char *const src_row = src_ptr + y * src_stride;
    unsigned char *const dst_row = dst_ptr + y * dst_stride;
    for (x = 0; x < 16; x += 4) {
      vp9_filter_block2d_4x4_8_sse4_1(src_row + x, src_stride,
                                      HFilter_aligned16, VFilter_aligned16,
                                      dst_row + x, dst_stride);
    }
  }
}
 | 
					 | 
				
			||||||
@@ -1,268 +0,0 @@
 | 
				
			|||||||
;
 | 
					 | 
				
			||||||
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 | 
					 | 
				
			||||||
;
 | 
					 | 
				
			||||||
;  Use of this source code is governed by a BSD-style license
 | 
					 | 
				
			||||||
;  that can be found in the LICENSE file in the root of the source
 | 
					 | 
				
			||||||
;  tree. An additional intellectual property rights grant can be found
 | 
					 | 
				
			||||||
;  in the file PATENTS.  All contributing project authors may
 | 
					 | 
				
			||||||
;  be found in the AUTHORS file in the root of the source tree.
 | 
					 | 
				
			||||||
;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
%include "vpx_ports/x86_abi_support.asm"
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
%define BLOCK_HEIGHT_WIDTH 4
 | 
					 | 
				
			||||||
%define vp9_filter_weight 128
 | 
					 | 
				
			||||||
%define VP9_FILTER_SHIFT  7
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
;void vp9_filter_block1d_h6_mmx
;(
;    unsigned char   *src_ptr,              arg(0)
;    unsigned short  *output_ptr,           arg(1)
;    unsigned int    src_pixels_per_line,   arg(2)
;    unsigned int    pixel_step,            arg(3), not read by this routine
;    unsigned int    output_height,         arg(4)
;    unsigned int    output_width,          arg(5)
;    short           * vp9_filter           arg(6)
;)
;
; Horizontal 6-tap filter. Each loop iteration produces four outputs from the
; source bytes p-2..p6 around rsi and stores them as four 16-bit words at
; [rdi]. The six tap vectors are read as four words each from vp9_filter at
; byte offsets 0, 16, 32, 48, 64 and 80, and are applied to the pixels starting
; at p-2, p-1, p0, p1, p2 and p3 respectively. Products are accumulated with
; saturating 16-bit adds, the rounding constant rd is added, and the sum is
; shifted right by VP9_FILTER_SHIFT and clamped to 0..255.
; After each row, rsi advances by src_pixels_per_line and rdi by the
; output_width argument (added to the address as-is). The loop counter is
; output_height.
global sym(vp9_filter_block1d_h6_mmx) PRIVATE
sym(vp9_filter_block1d_h6_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

        mov         rdx,    arg(6)              ; rdx = vp9_filter

        ; keep the four middle tap vectors in registers for the whole loop
        movq        mm1,    [rdx + 16]          ; taps applied to p-1..p2
        movq        mm2,    [rdx + 32]          ; taps applied to p0..p3
        movq        mm6,    [rdx + 48]          ; taps applied to p1..p4
        movq        mm7,    [rdx + 64]          ; taps applied to p2..p5

        mov         rdi,    arg(1)              ; rdi = output_ptr
        mov         rsi,    arg(0)              ; rsi = src_ptr
        movsxd      rcx,    dword ptr arg(4)    ; rcx = output_height (row count)
        movsxd      rax,    dword ptr arg(5)    ; rax = output_width (output step)
        pxor        mm0,    mm0                 ; mm0 = 0, used to widen bytes

.nextrow:
        movq        mm3,    [rsi-2]             ; mm3 = bytes p-2..p5
        movq        mm4,    mm3                 ; mm4 = copy of p-2..p5
        psrlq       mm3,    8                   ; drop p-2
        punpcklbw   mm3,    mm0                 ; mm3 = words p-1..p2
        pmullw      mm3,    mm1                 ; mm3 = p-1..p2 * taps at +16

        movq        mm5,    mm4                 ; mm5 = copy of p-2..p5
        punpckhbw   mm4,    mm0                 ; mm4 = words p2..p5
        pmullw      mm4,    mm7                 ; mm4 = p2..p5 * taps at +64
        paddsw      mm3,    mm4                 ; accumulate (saturating)

        movq        mm4,    mm5                 ; mm4 = copy of p-2..p5
        psrlq       mm5,    16                  ; drop p-2 and p-1
        punpcklbw   mm5,    mm0                 ; mm5 = words p0..p3
        pmullw      mm5,    mm2                 ; mm5 = p0..p3 * taps at +32
        paddsw      mm3,    mm5                 ; accumulate (saturating)

        movq        mm5,    mm4                 ; mm5 = copy of p-2..p5
        psrlq       mm4,    24                  ; drop p-2, p-1 and p0
        punpcklbw   mm4,    mm0                 ; mm4 = words p1..p4
        pmullw      mm4,    mm6                 ; mm4 = p1..p4 * taps at +48
        paddsw      mm3,    mm4                 ; accumulate (saturating)

        ; the two outer taps are multiplied straight from memory
        movd        mm4,    [rsi+3]             ; mm4 = bytes p3..p6
        punpcklbw   mm4,    mm0                 ; mm4 = words p3..p6
        pmullw      mm4,    [rdx+80]            ; mm4 = p3..p6 * taps at +80
        paddsw      mm3,    mm4                 ; accumulate (saturating)

        punpcklbw   mm5,    mm0                 ; mm5 = words p-2..p1
        pmullw      mm5,    [rdx]               ; mm5 = p-2..p1 * taps at +0
        paddsw      mm3,    mm5                 ; accumulate (saturating)

        paddsw      mm3,    [GLOBAL(rd)]        ; add the rounding constant
        psraw       mm3,    VP9_FILTER_SHIFT    ; scale the sum back down
        packuswb    mm3,    mm0                 ; clamp to 0..255 ...
        punpcklbw   mm3,    mm0                 ; ... and widen back to words

        movq        [rdi],  mm3                 ; store four 16-bit results

        add         rdi,    rax                 ; advance the output pointer
%if ABI_IS_32BIT
        add         rsi,    dword ptr arg(2)    ; next source row
%else
        movsxd      r8,     dword ptr arg(2)    ; r8 = src_pixels_per_line
        add         rsi,    r8                  ; next source row
%endif

        dec         rcx                         ; one row done
        jnz         .nextrow                    ; loop until the count hits zero

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
;void vp9_filter_block1dc_v6_mmx
 | 
					 | 
				
			||||||
;(
 | 
					 | 
				
			||||||
;   short *src_ptr,
 | 
					 | 
				
			||||||
;   unsigned char *output_ptr,
 | 
					 | 
				
			||||||
;    int output_pitch,
 | 
					 | 
				
			||||||
;   unsigned int pixels_per_line,
 | 
					 | 
				
			||||||
;   unsigned int pixel_step,
 | 
					 | 
				
			||||||
;   unsigned int output_height,
 | 
					 | 
				
			||||||
;   unsigned int output_width,
 | 
					 | 
				
			||||||
;   short * vp9_filter
 | 
					 | 
				
			||||||
;)
 | 
					 | 
				
			||||||
global sym(vp9_filter_block1dc_v6_mmx) PRIVATE
 | 
					 | 
				
			||||||
sym(vp9_filter_block1dc_v6_mmx):
 | 
					 | 
				
			||||||
    push        rbp
 | 
					 | 
				
			||||||
    mov         rbp, rsp
 | 
					 | 
				
			||||||
    SHADOW_ARGS_TO_STACK 8
 | 
					 | 
				
			||||||
    GET_GOT     rbx
 | 
					 | 
				
			||||||
    push        rsi
 | 
					 | 
				
			||||||
    push        rdi
 | 
					 | 
				
			||||||
    ; end prolog
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
        movq      mm5, [GLOBAL(rd)]
 | 
					 | 
				
			||||||
        push        rbx
 | 
					 | 
				
			||||||
        mov         rbx, arg(7) ;vp9_filter
 | 
					 | 
				
			||||||
        movq      mm1, [rbx + 16]             ; do both the negative taps first!!!
 | 
					 | 
				
			||||||
        movq      mm2, [rbx + 32]         ;
 | 
					 | 
				
			||||||
        movq      mm6, [rbx + 48]        ;
 | 
					 | 
				
			||||||
        movq      mm7, [rbx + 64]        ;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
        movsxd      rdx, dword ptr arg(3) ;pixels_per_line
 | 
					 | 
				
			||||||
        mov         rdi, arg(1) ;output_ptr
 | 
					 | 
				
			||||||
        mov         rsi, arg(0) ;src_ptr
 | 
					 | 
				
			||||||
        sub         rsi, rdx
 | 
					 | 
				
			||||||
        sub         rsi, rdx
 | 
					 | 
				
			||||||
        movsxd      rcx, DWORD PTR arg(5) ;output_height
 | 
					 | 
				
			||||||
        movsxd      rax, DWORD PTR arg(2) ;output_pitch      ; destination pitch?
 | 
					 | 
				
			||||||
        pxor        mm0, mm0              ; mm0 = 00000000
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
.nextrow_cv:
 | 
					 | 
				
			||||||
        movq        mm3, [rsi+rdx]        ; mm3 = p0..p8  = row -1
 | 
					 | 
				
			||||||
        pmullw      mm3, mm1              ; mm3 *= kernel 1 modifiers.
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
        movq        mm4, [rsi + 4*rdx]      ; mm4 = p0..p3  = row 2
 | 
					 | 
				
			||||||
        pmullw      mm4, mm7              ; mm4 *= kernel 4 modifiers.
 | 
					 | 
				
			||||||
        paddsw      mm3, mm4              ; mm3 += mm4
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
        movq        mm4, [rsi + 2*rdx]           ; mm4 = p0..p3  = row 0
 | 
					 | 
				
			||||||
        pmullw      mm4, mm2              ; mm4 *= kernel 2 modifiers.
 | 
					 | 
				
			||||||
        paddsw      mm3, mm4              ; mm3 += mm4
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
        movq        mm4, [rsi]            ; mm4 = p0..p3  = row -2
 | 
					 | 
				
			||||||
        pmullw      mm4, [rbx]            ; mm4 *= kernel 0 modifiers.
 | 
					 | 
				
			||||||
        paddsw      mm3, mm4              ; mm3 += mm4
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
        add         rsi, rdx              ; move source forward 1 line to avoid 3 * pitch
 | 
					 | 
				
			||||||
        movq        mm4, [rsi + 2*rdx]     ; mm4 = p0..p3  = row 1
 | 
					 | 
				
			||||||
        pmullw      mm4, mm6              ; mm4 *= kernel 3 modifiers.
 | 
					 | 
				
			||||||
        paddsw      mm3, mm4              ; mm3 += mm4
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
        movq        mm4, [rsi + 4*rdx]    ; mm4 = p0..p3  = row 3
 | 
					 | 
				
			||||||
        pmullw      mm4, [rbx +80]        ; mm4 *= kernel 3 modifiers.
 | 
					 | 
				
			||||||
        paddsw      mm3, mm4              ; mm3 += mm4
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
        paddsw      mm3, mm5               ; mm3 += round value
 | 
					 | 
				
			||||||
        psraw       mm3, VP9_FILTER_SHIFT     ; mm3 /= 128
 | 
					 | 
				
			||||||
        packuswb    mm3, mm0              ; pack and saturate
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
        movd        [rdi],mm3             ; store the results in the destination
 | 
					 | 
				
			||||||
        ; the subsequent iterations repeat 3 out of 4 of these reads.  Since the
 | 
					 | 
				
			||||||
        ; recon block should be in cache this shouldn't cost much.  Its obviously
 | 
					 | 
				
			||||||
        ; avoidable!!!.
 | 
					 | 
				
			||||||
        lea         rdi,  [rdi+rax] ;
 | 
					 | 
				
			||||||
        dec         rcx                   ; decrement count
 | 
					 | 
				
			||||||
        jnz         .nextrow_cv           ; next row
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
        pop         rbx
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    ; begin epilog
 | 
					 | 
				
			||||||
    pop rdi
 | 
					 | 
				
			||||||
    pop rsi
 | 
					 | 
				
			||||||
    RESTORE_GOT
 | 
					 | 
				
			||||||
    UNSHADOW_ARGS
 | 
					 | 
				
			||||||
    pop         rbp
 | 
					 | 
				
			||||||
    ret
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
SECTION_RODATA
 | 
					 | 
				
			||||||
align 16
 | 
					 | 
				
			||||||
rd:
 | 
					 | 
				
			||||||
    times 4 dw 0x40
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
align 16
 | 
					 | 
				
			||||||
global HIDDEN_DATA(sym(vp9_six_tap_mmx))
 | 
					 | 
				
			||||||
sym(vp9_six_tap_mmx):
 | 
					 | 
				
			||||||
    times 8 dw 0
 | 
					 | 
				
			||||||
    times 8 dw 0
 | 
					 | 
				
			||||||
    times 8 dw 128
 | 
					 | 
				
			||||||
    times 8 dw 0
 | 
					 | 
				
			||||||
    times 8 dw 0
 | 
					 | 
				
			||||||
    times 8 dw 0
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    times 8 dw 0
 | 
					 | 
				
			||||||
    times 8 dw -6
 | 
					 | 
				
			||||||
    times 8 dw 123
 | 
					 | 
				
			||||||
    times 8 dw 12
 | 
					 | 
				
			||||||
    times 8 dw -1
 | 
					 | 
				
			||||||
    times 8 dw 0
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    times 8 dw 2
 | 
					 | 
				
			||||||
    times 8 dw -11
 | 
					 | 
				
			||||||
    times 8 dw 108
 | 
					 | 
				
			||||||
    times 8 dw 36
 | 
					 | 
				
			||||||
    times 8 dw -8
 | 
					 | 
				
			||||||
    times 8 dw 1
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    times 8 dw 0
 | 
					 | 
				
			||||||
    times 8 dw -9
 | 
					 | 
				
			||||||
    times 8 dw 93
 | 
					 | 
				
			||||||
    times 8 dw 50
 | 
					 | 
				
			||||||
    times 8 dw -6
 | 
					 | 
				
			||||||
    times 8 dw 0
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    times 8 dw 3
 | 
					 | 
				
			||||||
    times 8 dw -16
 | 
					 | 
				
			||||||
    times 8 dw 77
 | 
					 | 
				
			||||||
    times 8 dw 77
 | 
					 | 
				
			||||||
    times 8 dw -16
 | 
					 | 
				
			||||||
    times 8 dw 3
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    times 8 dw 0
 | 
					 | 
				
			||||||
    times 8 dw -6
 | 
					 | 
				
			||||||
    times 8 dw 50
 | 
					 | 
				
			||||||
    times 8 dw 93
 | 
					 | 
				
			||||||
    times 8 dw -9
 | 
					 | 
				
			||||||
    times 8 dw 0
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    times 8 dw 1
 | 
					 | 
				
			||||||
    times 8 dw -8
 | 
					 | 
				
			||||||
    times 8 dw 36
 | 
					 | 
				
			||||||
    times 8 dw 108
 | 
					 | 
				
			||||||
    times 8 dw -11
 | 
					 | 
				
			||||||
    times 8 dw 2
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    times 8 dw 0
 | 
					 | 
				
			||||||
    times 8 dw -1
 | 
					 | 
				
			||||||
    times 8 dw 12
 | 
					 | 
				
			||||||
    times 8 dw 123
 | 
					 | 
				
			||||||
    times 8 dw -6
 | 
					 | 
				
			||||||
    times 8 dw 0
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							@@ -1,109 +0,0 @@
 | 
				
			|||||||
/*
 | 
					 | 
				
			||||||
 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 | 
					 | 
				
			||||||
 *
 | 
					 | 
				
			||||||
 *  Use of this source code is governed by a BSD-style license
 | 
					 | 
				
			||||||
 *  that can be found in the LICENSE file in the root of the source
 | 
					 | 
				
			||||||
 *  tree. An additional intellectual property rights grant can be found
 | 
					 | 
				
			||||||
 *  in the file PATENTS.  All contributing project authors may
 | 
					 | 
				
			||||||
 *  be found in the AUTHORS file in the root of the source tree.
 | 
					 | 
				
			||||||
 */
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#ifndef VP9_COMMON_X86_VP9_SUBPIXEL_X86_H_
 | 
					 | 
				
			||||||
#define VP9_COMMON_X86_VP9_SUBPIXEL_X86_H_
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/* Note:
 | 
					 | 
				
			||||||
 *
 | 
					 | 
				
			||||||
 * This platform is commonly built for runtime CPU detection. If you modify
 | 
					 | 
				
			||||||
 * any of the function mappings present in this file, be sure to also update
 | 
					 | 
				
			||||||
 * them in the function pointer initialization code
 | 
					 | 
				
			||||||
 */
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#if HAVE_MMX
 | 
					 | 
				
			||||||
extern prototype_subpixel_predict(vp9_sixtap_predict16x16_mmx);
 | 
					 | 
				
			||||||
extern prototype_subpixel_predict(vp9_sixtap_predict8x8_mmx);
 | 
					 | 
				
			||||||
extern prototype_subpixel_predict(vp9_sixtap_predict8x4_mmx);
 | 
					 | 
				
			||||||
extern prototype_subpixel_predict(vp9_sixtap_predict4x4_mmx);
 | 
					 | 
				
			||||||
extern prototype_subpixel_predict(vp9_bilinear_predict16x16_mmx);
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#if !CONFIG_RUNTIME_CPU_DETECT
 | 
					 | 
				
			||||||
#undef  vp9_subpix_sixtap16x16
 | 
					 | 
				
			||||||
#define vp9_subpix_sixtap16x16 vp9_sixtap_predict16x16_mmx
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#undef  vp9_subpix_sixtap8x8
 | 
					 | 
				
			||||||
#define vp9_subpix_sixtap8x8 vp9_sixtap_predict8x8_mmx
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#undef  vp9_subpix_sixtap8x4
 | 
					 | 
				
			||||||
#define vp9_subpix_sixtap8x4 vp9_sixtap_predict8x4_mmx
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#undef  vp9_subpix_sixtap4x4
 | 
					 | 
				
			||||||
#define vp9_subpix_sixtap4x4 vp9_sixtap_predict4x4_mmx
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#undef  vp9_subpix_bilinear16x16
 | 
					 | 
				
			||||||
#define vp9_subpix_bilinear16x16 vp9_bilinear_predict16x16_mmx
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#endif
 | 
					 | 
				
			||||||
#endif
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#if HAVE_SSE2
 | 
					 | 
				
			||||||
extern prototype_subpixel_predict(vp9_sixtap_predict16x16_sse2);
 | 
					 | 
				
			||||||
extern prototype_subpixel_predict(vp9_sixtap_predict8x8_sse2);
 | 
					 | 
				
			||||||
extern prototype_subpixel_predict(vp9_sixtap_predict8x4_sse2);
 | 
					 | 
				
			||||||
extern prototype_subpixel_predict(vp9_bilinear_predict16x16_sse2);
 | 
					 | 
				
			||||||
extern prototype_subpixel_predict(vp9_bilinear_predict8x8_sse2);
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#if !CONFIG_RUNTIME_CPU_DETECT
 | 
					 | 
				
			||||||
#undef  vp9_subpix_sixtap16x16
 | 
					 | 
				
			||||||
#define vp9_subpix_sixtap16x16 vp9_sixtap_predict16x16_sse2
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#undef  vp9_subpix_sixtap8x8
 | 
					 | 
				
			||||||
#define vp9_subpix_sixtap8x8 vp9_sixtap_predict8x8_sse2
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#undef  vp9_subpix_sixtap8x4
 | 
					 | 
				
			||||||
#define vp9_subpix_sixtap8x4 vp9_sixtap_predict8x4_sse2
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#undef  vp9_subpix_bilinear16x16
 | 
					 | 
				
			||||||
#define vp9_subpix_bilinear16x16 vp9_bilinear_predict16x16_sse2
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#undef  vp9_subpix_bilinear8x8
 | 
					 | 
				
			||||||
#define vp9_subpix_bilinear8x8 vp9_bilinear_predict8x8_sse2
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#endif
 | 
					 | 
				
			||||||
#endif
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#if HAVE_SSSE3
 | 
					 | 
				
			||||||
extern prototype_subpixel_predict(vp9_sixtap_predict16x16_ssse3);
 | 
					 | 
				
			||||||
extern prototype_subpixel_predict(vp9_sixtap_predict8x8_ssse3);
 | 
					 | 
				
			||||||
extern prototype_subpixel_predict(vp9_sixtap_predict8x4_ssse3);
 | 
					 | 
				
			||||||
extern prototype_subpixel_predict(vp9_sixtap_predict4x4_ssse3);
 | 
					 | 
				
			||||||
extern prototype_subpixel_predict(vp9_bilinear_predict16x16_ssse3);
 | 
					 | 
				
			||||||
extern prototype_subpixel_predict(vp9_bilinear_predict8x8_ssse3);
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#if !CONFIG_RUNTIME_CPU_DETECT
 | 
					 | 
				
			||||||
#undef  vp9_subpix_sixtap16x16
 | 
					 | 
				
			||||||
#define vp9_subpix_sixtap16x16 vp9_sixtap_predict16x16_ssse3
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#undef  vp9_subpix_sixtap8x8
 | 
					 | 
				
			||||||
#define vp9_subpix_sixtap8x8 vp9_sixtap_predict8x8_ssse3
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#undef  vp9_subpix_sixtap8x4
 | 
					 | 
				
			||||||
#define vp9_subpix_sixtap8x4 vp9_sixtap_predict8x4_ssse3
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#undef  vp9_subpix_sixtap4x4
 | 
					 | 
				
			||||||
#define vp9_subpix_sixtap4x4 vp9_sixtap_predict4x4_ssse3
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#undef  vp9_subpix_bilinear16x16
 | 
					 | 
				
			||||||
#define vp9_subpix_bilinear16x16 vp9_bilinear_predict16x16_ssse3
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#undef  vp9_subpix_bilinear8x8
 | 
					 | 
				
			||||||
#define vp9_subpix_bilinear8x8 vp9_bilinear_predict8x8_ssse3
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#endif
 | 
					 | 
				
			||||||
#endif
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#endif
 | 
					 | 
				
			||||||
@@ -11,6 +11,7 @@
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
#include "vpx_config.h"
 | 
					#include "vpx_config.h"
 | 
				
			||||||
#include "vp9/common/vp9_onyxc_int.h"
 | 
					#include "vp9/common/vp9_onyxc_int.h"
 | 
				
			||||||
 | 
					#include "vp9/common/vp9_reconinter.h"
 | 
				
			||||||
#include "vp9/encoder/vp9_onyx_int.h"
 | 
					#include "vp9/encoder/vp9_onyx_int.h"
 | 
				
			||||||
#include "vp9/common/vp9_systemdependent.h"
 | 
					#include "vp9/common/vp9_systemdependent.h"
 | 
				
			||||||
#include "vp9/encoder/vp9_quantize.h"
 | 
					#include "vp9/encoder/vp9_quantize.h"
 | 
				
			||||||
@@ -3775,6 +3776,7 @@ int vp9_get_compressed_data(VP9_PTR ptr, unsigned int *frame_flags,
 | 
				
			|||||||
  cm->fb_idx_ref_cnt[cm->new_fb_idx]--;
 | 
					  cm->fb_idx_ref_cnt[cm->new_fb_idx]--;
 | 
				
			||||||
  cm->new_fb_idx = get_free_fb(cm);
 | 
					  cm->new_fb_idx = get_free_fb(cm);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  vp9_setup_interp_filters(&cpi->mb.e_mbd, DEFAULT_INTERP_FILTER, cm);
 | 
				
			||||||
  if (cpi->pass == 1) {
 | 
					  if (cpi->pass == 1) {
 | 
				
			||||||
    Pass1Encode(cpi, size, dest, frame_flags);
 | 
					    Pass1Encode(cpi, size, dest, frame_flags);
 | 
				
			||||||
  } else if (cpi->pass == 2) {
 | 
					  } else if (cpi->pass == 2) {
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -2237,9 +2237,9 @@ static int64_t encode_inter_mb_segment(MACROBLOCK *x,
 | 
				
			|||||||
      BLOCK *be = &x->block[i];
 | 
					      BLOCK *be = &x->block[i];
 | 
				
			||||||
      int thisdistortion;
 | 
					      int thisdistortion;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
      vp9_build_inter_predictors_b(bd, 16, xd->subpixel_predict4x4);
 | 
					      vp9_build_inter_predictors_b(bd, 16, &xd->subpix);
 | 
				
			||||||
      if (xd->mode_info_context->mbmi.second_ref_frame > 0)
 | 
					      if (xd->mode_info_context->mbmi.second_ref_frame > 0)
 | 
				
			||||||
        vp9_build_2nd_inter_predictors_b(bd, 16, xd->subpixel_predict_avg4x4);
 | 
					        vp9_build_2nd_inter_predictors_b(bd, 16, &xd->subpix);
 | 
				
			||||||
      vp9_subtract_b(be, bd, 16);
 | 
					      vp9_subtract_b(be, bd, 16);
 | 
				
			||||||
      x->vp9_short_fdct4x4(be->src_diff, be->coeff, 32);
 | 
					      x->vp9_short_fdct4x4(be->src_diff, be->coeff, 32);
 | 
				
			||||||
      x->quantize_b_4x4(be, bd);
 | 
					      x->quantize_b_4x4(be, bd);
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -50,12 +50,11 @@ static void temporal_filter_predictors_mb_c(MACROBLOCKD *xd,
 | 
				
			|||||||
  // Y
 | 
					  // Y
 | 
				
			||||||
  yptr = y_mb_ptr + (mv_row >> 3) * stride + (mv_col >> 3);
 | 
					  yptr = y_mb_ptr + (mv_row >> 3) * stride + (mv_col >> 3);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  if ((mv_row | mv_col) & 7) {
 | 
					  xd->subpix.predict[!!(mv_col & 7)][!!(mv_row & 7)][0](
 | 
				
			||||||
    xd->subpixel_predict16x16(yptr, stride,
 | 
					      yptr, stride, &pred[0], 16,
 | 
				
			||||||
                             (mv_col & 7) << 1, (mv_row & 7) << 1, &pred[0], 16);
 | 
					      xd->subpix.filter_x[(mv_col & 7) << 1], xd->subpix.x_step_q4,
 | 
				
			||||||
  } else {
 | 
					      xd->subpix.filter_y[(mv_row & 7) << 1], xd->subpix.y_step_q4,
 | 
				
			||||||
    vp9_copy_mem16x16(yptr, stride, &pred[0], 16);
 | 
					      16, 16);
 | 
				
			||||||
  }
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
  // U & V
 | 
					  // U & V
 | 
				
			||||||
  omv_row = mv_row;
 | 
					  omv_row = mv_row;
 | 
				
			||||||
@@ -67,15 +66,17 @@ static void temporal_filter_predictors_mb_c(MACROBLOCKD *xd,
 | 
				
			|||||||
  uptr = u_mb_ptr + offset;
 | 
					  uptr = u_mb_ptr + offset;
 | 
				
			||||||
  vptr = v_mb_ptr + offset;
 | 
					  vptr = v_mb_ptr + offset;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  if ((omv_row | omv_col) & 15) {
 | 
					  xd->subpix.predict[!!(omv_col & 15)][!!(omv_row & 15)][0](
 | 
				
			||||||
    xd->subpixel_predict8x8(uptr, stride,
 | 
					      uptr, stride, &pred[256], 8,
 | 
				
			||||||
                           (omv_col & 15), (omv_row & 15), &pred[256], 8);
 | 
					      xd->subpix.filter_x[(omv_col & 15)], xd->subpix.x_step_q4,
 | 
				
			||||||
    xd->subpixel_predict8x8(vptr, stride,
 | 
					      xd->subpix.filter_y[(omv_row & 15)], xd->subpix.y_step_q4,
 | 
				
			||||||
                           (omv_col & 15), (omv_row & 15), &pred[320], 8);
 | 
					      8, 8);
 | 
				
			||||||
  } else {
 | 
					
 | 
				
			||||||
    vp9_copy_mem8x8(uptr, stride, &pred[256], 8);
 | 
					  xd->subpix.predict[!!(omv_col & 15)][!!(omv_row & 15)][0](
 | 
				
			||||||
    vp9_copy_mem8x8(vptr, stride, &pred[320], 8);
 | 
					      vptr, stride, &pred[320], 8,
 | 
				
			||||||
  }
 | 
					      xd->subpix.filter_x[(omv_col & 15)], xd->subpix.x_step_q4,
 | 
				
			||||||
 | 
					      xd->subpix.filter_y[(omv_row & 15)], xd->subpix.y_step_q4,
 | 
				
			||||||
 | 
					      8, 8);
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
void vp9_temporal_filter_apply_c(uint8_t *frame1,
 | 
					void vp9_temporal_filter_apply_c(uint8_t *frame1,
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -142,8 +142,8 @@ unsigned int vp9_sub_pixel_variance4x4_c(const uint8_t *src_ptr,
 | 
				
			|||||||
  const int16_t *HFilter, *VFilter;
 | 
					  const int16_t *HFilter, *VFilter;
 | 
				
			||||||
  uint16_t FData3[5 * 4];  // Temp data bufffer used in filtering
 | 
					  uint16_t FData3[5 * 4];  // Temp data bufffer used in filtering
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  HFilter = vp9_bilinear_filters[xoffset];
 | 
					  HFilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
 | 
				
			||||||
  VFilter = vp9_bilinear_filters[yoffset];
 | 
					  VFilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  // First filter 1d Horizontal
 | 
					  // First filter 1d Horizontal
 | 
				
			||||||
  var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 5, 4, HFilter);
 | 
					  var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 5, 4, HFilter);
 | 
				
			||||||
@@ -166,8 +166,8 @@ unsigned int vp9_sub_pixel_variance8x8_c(const uint8_t *src_ptr,
 | 
				
			|||||||
  uint8_t temp2[20 * 16];
 | 
					  uint8_t temp2[20 * 16];
 | 
				
			||||||
  const int16_t *HFilter, *VFilter;
 | 
					  const int16_t *HFilter, *VFilter;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  HFilter = vp9_bilinear_filters[xoffset];
 | 
					  HFilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
 | 
				
			||||||
  VFilter = vp9_bilinear_filters[yoffset];
 | 
					  VFilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 9, 8, HFilter);
 | 
					  var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 9, 8, HFilter);
 | 
				
			||||||
  var_filter_block2d_bil_second_pass(FData3, temp2, 8, 8, 8, 8, VFilter);
 | 
					  var_filter_block2d_bil_second_pass(FData3, temp2, 8, 8, 8, 8, VFilter);
 | 
				
			||||||
@@ -186,8 +186,8 @@ unsigned int vp9_sub_pixel_variance16x16_c(const uint8_t *src_ptr,
 | 
				
			|||||||
  uint8_t temp2[20 * 16];
 | 
					  uint8_t temp2[20 * 16];
 | 
				
			||||||
  const int16_t *HFilter, *VFilter;
 | 
					  const int16_t *HFilter, *VFilter;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  HFilter = vp9_bilinear_filters[xoffset];
 | 
					  HFilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
 | 
				
			||||||
  VFilter = vp9_bilinear_filters[yoffset];
 | 
					  VFilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 17, 16, HFilter);
 | 
					  var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 17, 16, HFilter);
 | 
				
			||||||
  var_filter_block2d_bil_second_pass(FData3, temp2, 16, 16, 16, 16, VFilter);
 | 
					  var_filter_block2d_bil_second_pass(FData3, temp2, 16, 16, 16, 16, VFilter);
 | 
				
			||||||
@@ -206,8 +206,8 @@ unsigned int vp9_sub_pixel_variance64x64_c(const uint8_t *src_ptr,
 | 
				
			|||||||
  uint8_t temp2[68 * 64];
 | 
					  uint8_t temp2[68 * 64];
 | 
				
			||||||
  const int16_t *HFilter, *VFilter;
 | 
					  const int16_t *HFilter, *VFilter;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  HFilter = vp9_bilinear_filters[xoffset];
 | 
					  HFilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
 | 
				
			||||||
  VFilter = vp9_bilinear_filters[yoffset];
 | 
					  VFilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line,
 | 
					  var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line,
 | 
				
			||||||
                                    1, 65, 64, HFilter);
 | 
					                                    1, 65, 64, HFilter);
 | 
				
			||||||
@@ -227,8 +227,8 @@ unsigned int vp9_sub_pixel_variance32x32_c(const uint8_t *src_ptr,
 | 
				
			|||||||
  uint8_t temp2[36 * 32];
 | 
					  uint8_t temp2[36 * 32];
 | 
				
			||||||
  const int16_t *HFilter, *VFilter;
 | 
					  const int16_t *HFilter, *VFilter;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  HFilter = vp9_bilinear_filters[xoffset];
 | 
					  HFilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
 | 
				
			||||||
  VFilter = vp9_bilinear_filters[yoffset];
 | 
					  VFilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 33, 32, HFilter);
 | 
					  var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 33, 32, HFilter);
 | 
				
			||||||
  var_filter_block2d_bil_second_pass(FData3, temp2, 32, 32, 32, 32, VFilter);
 | 
					  var_filter_block2d_bil_second_pass(FData3, temp2, 32, 32, 32, 32, VFilter);
 | 
				
			||||||
@@ -367,8 +367,8 @@ unsigned int vp9_sub_pixel_variance16x8_c(const uint8_t *src_ptr,
 | 
				
			|||||||
  uint8_t temp2[20 * 16];
 | 
					  uint8_t temp2[20 * 16];
 | 
				
			||||||
  const int16_t *HFilter, *VFilter;
 | 
					  const int16_t *HFilter, *VFilter;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  HFilter = vp9_bilinear_filters[xoffset];
 | 
					  HFilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
 | 
				
			||||||
  VFilter = vp9_bilinear_filters[yoffset];
 | 
					  VFilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 9, 16, HFilter);
 | 
					  var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 9, 16, HFilter);
 | 
				
			||||||
  var_filter_block2d_bil_second_pass(FData3, temp2, 16, 16, 8, 16, VFilter);
 | 
					  var_filter_block2d_bil_second_pass(FData3, temp2, 16, 16, 8, 16, VFilter);
 | 
				
			||||||
@@ -387,8 +387,8 @@ unsigned int vp9_sub_pixel_variance8x16_c(const uint8_t *src_ptr,
 | 
				
			|||||||
  uint8_t temp2[20 * 16];
 | 
					  uint8_t temp2[20 * 16];
 | 
				
			||||||
  const int16_t *HFilter, *VFilter;
 | 
					  const int16_t *HFilter, *VFilter;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  HFilter = vp9_bilinear_filters[xoffset];
 | 
					  HFilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
 | 
				
			||||||
  VFilter = vp9_bilinear_filters[yoffset];
 | 
					  VFilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line,
 | 
					  var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line,
 | 
				
			||||||
                                    1, 17, 8, HFilter);
 | 
					                                    1, 17, 8, HFilter);
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -16,6 +16,8 @@ VP9_COMMON_SRCS-yes += common/vp9_alloccommon.c
 | 
				
			|||||||
VP9_COMMON_SRCS-yes += common/vp9_asm_com_offsets.c
 | 
					VP9_COMMON_SRCS-yes += common/vp9_asm_com_offsets.c
 | 
				
			||||||
VP9_COMMON_SRCS-yes += common/vp9_blockd.c
 | 
					VP9_COMMON_SRCS-yes += common/vp9_blockd.c
 | 
				
			||||||
VP9_COMMON_SRCS-yes += common/vp9_coefupdateprobs.h
 | 
					VP9_COMMON_SRCS-yes += common/vp9_coefupdateprobs.h
 | 
				
			||||||
 | 
					VP9_COMMON_SRCS-yes += common/vp9_convolve.c
 | 
				
			||||||
 | 
					VP9_COMMON_SRCS-yes += common/vp9_convolve.h
 | 
				
			||||||
VP9_COMMON_SRCS-yes += common/vp9_debugmodes.c
 | 
					VP9_COMMON_SRCS-yes += common/vp9_debugmodes.c
 | 
				
			||||||
VP9_COMMON_SRCS-yes += common/vp9_default_coef_probs.h
 | 
					VP9_COMMON_SRCS-yes += common/vp9_default_coef_probs.h
 | 
				
			||||||
VP9_COMMON_SRCS-yes += common/vp9_entropy.c
 | 
					VP9_COMMON_SRCS-yes += common/vp9_entropy.c
 | 
				
			||||||
@@ -54,7 +56,6 @@ VP9_COMMON_SRCS-yes += common/vp9_subpelvar.h
 | 
				
			|||||||
VP9_COMMON_SRCS-yes += common/vp9_seg_common.h
 | 
					VP9_COMMON_SRCS-yes += common/vp9_seg_common.h
 | 
				
			||||||
VP9_COMMON_SRCS-yes += common/vp9_seg_common.c
 | 
					VP9_COMMON_SRCS-yes += common/vp9_seg_common.c
 | 
				
			||||||
VP9_COMMON_SRCS-yes += common/vp9_setupintrarecon.h
 | 
					VP9_COMMON_SRCS-yes += common/vp9_setupintrarecon.h
 | 
				
			||||||
VP9_COMMON_SRCS-yes += common/vp9_subpixel.h
 | 
					 | 
				
			||||||
VP9_COMMON_SRCS-yes += common/vp9_swapyv12buffer.h
 | 
					VP9_COMMON_SRCS-yes += common/vp9_swapyv12buffer.h
 | 
				
			||||||
VP9_COMMON_SRCS-yes += common/vp9_systemdependent.h
 | 
					VP9_COMMON_SRCS-yes += common/vp9_systemdependent.h
 | 
				
			||||||
VP9_COMMON_SRCS-yes += common/vp9_textblit.h
 | 
					VP9_COMMON_SRCS-yes += common/vp9_textblit.h
 | 
				
			||||||
@@ -79,7 +80,6 @@ VP9_COMMON_SRCS-yes += common/vp9_treecoder.c
 | 
				
			|||||||
VP9_COMMON_SRCS-$(CONFIG_IMPLICIT_SEGMENTATION) += common/vp9_implicit_segmentation.c
 | 
					VP9_COMMON_SRCS-$(CONFIG_IMPLICIT_SEGMENTATION) += common/vp9_implicit_segmentation.c
 | 
				
			||||||
 | 
					
 | 
				
			||||||
VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_idct_x86.h
 | 
					VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_idct_x86.h
 | 
				
			||||||
VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_subpixel_x86.h
 | 
					 | 
				
			||||||
VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_loopfilter_x86.h
 | 
					VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_loopfilter_x86.h
 | 
				
			||||||
VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_postproc_x86.h
 | 
					VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_postproc_x86.h
 | 
				
			||||||
VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_asm_stubs.c
 | 
					VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_asm_stubs.c
 | 
				
			||||||
@@ -88,7 +88,6 @@ VP9_COMMON_SRCS-$(CONFIG_POSTPROC) += common/vp9_postproc.h
 | 
				
			|||||||
VP9_COMMON_SRCS-$(CONFIG_POSTPROC) += common/vp9_postproc.c
 | 
					VP9_COMMON_SRCS-$(CONFIG_POSTPROC) += common/vp9_postproc.c
 | 
				
			||||||
VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_iwalsh_mmx.asm
 | 
					VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_iwalsh_mmx.asm
 | 
				
			||||||
VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_recon_mmx.asm
 | 
					VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_recon_mmx.asm
 | 
				
			||||||
VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_subpixel_mmx.asm
 | 
					 | 
				
			||||||
VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_loopfilter_mmx.asm
 | 
					VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_loopfilter_mmx.asm
 | 
				
			||||||
VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_idctllm_sse2.asm
 | 
					VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_idctllm_sse2.asm
 | 
				
			||||||
VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_iwalsh_sse2.asm
 | 
					VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_iwalsh_sse2.asm
 | 
				
			||||||
@@ -96,10 +95,8 @@ VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_loopfilter_sse2.asm
 | 
				
			|||||||
VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_recon_sse2.asm
 | 
					VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_recon_sse2.asm
 | 
				
			||||||
VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_recon_wrapper_sse2.c
 | 
					VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_recon_wrapper_sse2.c
 | 
				
			||||||
VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_subpel_variance_impl_sse2.asm
 | 
					VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_subpel_variance_impl_sse2.asm
 | 
				
			||||||
VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_subpixel_sse2.asm
 | 
					 | 
				
			||||||
VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_subpixel_variance_sse2.c
 | 
					VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_subpixel_variance_sse2.c
 | 
				
			||||||
VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_subpixel_8t_ssse3.asm
 | 
					VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_subpixel_8t_ssse3.asm
 | 
				
			||||||
VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_subpixel_ssse3.asm
 | 
					 | 
				
			||||||
ifeq ($(CONFIG_POSTPROC),yes)
 | 
					ifeq ($(CONFIG_POSTPROC),yes)
 | 
				
			||||||
VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_postproc_mmx.asm
 | 
					VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_postproc_mmx.asm
 | 
				
			||||||
VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_postproc_sse2.asm
 | 
					VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_postproc_sse2.asm
 | 
				
			||||||
@@ -111,19 +108,10 @@ VP9_COMMON_SRCS-yes += common/vp9_maskingmv.c
 | 
				
			|||||||
VP9_COMMON_SRCS-$(HAVE_SSE3) += common/x86/vp9_mask_sse3.asm
 | 
					VP9_COMMON_SRCS-$(HAVE_SSE3) += common/x86/vp9_mask_sse3.asm
 | 
				
			||||||
endif
 | 
					endif
 | 
				
			||||||
 | 
					
 | 
				
			||||||
VP9_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/vp9_filter_sse4.c
 | 
					 | 
				
			||||||
ifeq ($(HAVE_SSE4_1),yes)
 | 
					 | 
				
			||||||
vp9/common/x86/vp9_filter_sse4.c.o: CFLAGS += -msse4
 | 
					 | 
				
			||||||
vp9/common/x86/vp9_filter_sse4.c.d: CFLAGS += -msse4
 | 
					 | 
				
			||||||
endif
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_filter_sse2.c
 | 
					 | 
				
			||||||
VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_sadmxn_x86.c
 | 
					VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_sadmxn_x86.c
 | 
				
			||||||
ifeq ($(HAVE_SSE2),yes)
 | 
					ifeq ($(HAVE_SSE2),yes)
 | 
				
			||||||
vp9/common/x86/vp9_filter_sse2.c.o: CFLAGS += -msse2
 | 
					 | 
				
			||||||
vp9/common/x86/vp9_loopfilter_x86.c.o: CFLAGS += -msse2
 | 
					vp9/common/x86/vp9_loopfilter_x86.c.o: CFLAGS += -msse2
 | 
				
			||||||
vp9/common/x86/vp9_sadmxn_x86.c.o: CFLAGS += -msse2
 | 
					vp9/common/x86/vp9_sadmxn_x86.c.o: CFLAGS += -msse2
 | 
				
			||||||
vp9/common/x86/vp9_filter_sse2.c.d: CFLAGS += -msse2
 | 
					 | 
				
			||||||
vp9/common/x86/vp9_loopfilter_x86.c.d: CFLAGS += -msse2
 | 
					vp9/common/x86/vp9_loopfilter_x86.c.d: CFLAGS += -msse2
 | 
				
			||||||
vp9/common/x86/vp9_sadmxn_x86.c.d: CFLAGS += -msse2
 | 
					vp9/common/x86/vp9_sadmxn_x86.c.d: CFLAGS += -msse2
 | 
				
			||||||
endif
 | 
					endif
 | 
				
			||||||
 
 | 
				
			|||||||
		Reference in New Issue
	
	Block a user