vpx_dsp: apply clang-format
Change-Id: I3ea3e77364879928bd916f2b0a7838073ade5975
@@ -198,27 +198,24 @@ int vpx_vector_var_neon(int16_t const *ref, int16_t const *src, const int bwl) {
   }
 }

-void vpx_minmax_8x8_neon(const uint8_t *a, int a_stride,
-                         const uint8_t *b, int b_stride,
-                         int *min, int *max) {
+void vpx_minmax_8x8_neon(const uint8_t *a, int a_stride, const uint8_t *b,
+                         int b_stride, int *min, int *max) {
   // Load and concatenate.
-  const uint8x16_t a01 = vcombine_u8(vld1_u8(a),
-                                     vld1_u8(a + a_stride));
-  const uint8x16_t a23 = vcombine_u8(vld1_u8(a + 2 * a_stride),
-                                     vld1_u8(a + 3 * a_stride));
-  const uint8x16_t a45 = vcombine_u8(vld1_u8(a + 4 * a_stride),
-                                     vld1_u8(a + 5 * a_stride));
-  const uint8x16_t a67 = vcombine_u8(vld1_u8(a + 6 * a_stride),
-                                     vld1_u8(a + 7 * a_stride));
+  const uint8x16_t a01 = vcombine_u8(vld1_u8(a), vld1_u8(a + a_stride));
+  const uint8x16_t a23 =
+      vcombine_u8(vld1_u8(a + 2 * a_stride), vld1_u8(a + 3 * a_stride));
+  const uint8x16_t a45 =
+      vcombine_u8(vld1_u8(a + 4 * a_stride), vld1_u8(a + 5 * a_stride));
+  const uint8x16_t a67 =
+      vcombine_u8(vld1_u8(a + 6 * a_stride), vld1_u8(a + 7 * a_stride));

-  const uint8x16_t b01 = vcombine_u8(vld1_u8(b),
-                                     vld1_u8(b + b_stride));
-  const uint8x16_t b23 = vcombine_u8(vld1_u8(b + 2 * b_stride),
-                                     vld1_u8(b + 3 * b_stride));
-  const uint8x16_t b45 = vcombine_u8(vld1_u8(b + 4 * b_stride),
-                                     vld1_u8(b + 5 * b_stride));
-  const uint8x16_t b67 = vcombine_u8(vld1_u8(b + 6 * b_stride),
-                                     vld1_u8(b + 7 * b_stride));
+  const uint8x16_t b01 = vcombine_u8(vld1_u8(b), vld1_u8(b + b_stride));
+  const uint8x16_t b23 =
+      vcombine_u8(vld1_u8(b + 2 * b_stride), vld1_u8(b + 3 * b_stride));
+  const uint8x16_t b45 =
+      vcombine_u8(vld1_u8(b + 4 * b_stride), vld1_u8(b + 5 * b_stride));
+  const uint8x16_t b67 =
+      vcombine_u8(vld1_u8(b + 6 * b_stride), vld1_u8(b + 7 * b_stride));

   // Absolute difference.
   const uint8x16_t ab01_diff = vabdq_u8(a01, b01);
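Note: vpx_minmax_8x8_neon computes the smallest and largest absolute
difference between two 8x8 blocks; the loads and vabdq_u8 above feed a
min/max reduction. A scalar sketch of that contract, matching my reading of
the C reference vpx_minmax_8x8_c (an illustration, not code from this
commit):

#include <stdint.h>
#include <stdlib.h>

static void minmax_8x8_ref(const uint8_t *a, int a_stride, const uint8_t *b,
                           int b_stride, int *min, int *max) {
  int i, j;
  *min = 255;
  *max = 0;
  for (i = 0; i < 8; ++i, a += a_stride, b += b_stride) {
    for (j = 0; j < 8; ++j) {
      const int diff = abs(a[j] - b[j]);  // same quantity as vabdq_u8
      if (diff < *min) *min = diff;
      if (diff > *max) *max = diff;
    }
  }
}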

@@ -131,14 +131,14 @@ void vpx_fdct8x8_neon(const int16_t *input, int16_t *final_output, int stride) {
       // 14 15 16 17 54 55 56 57
       // 24 25 26 27 64 65 66 67
       // 34 35 36 37 74 75 76 77
-      const int32x4x2_t r02_s32 = vtrnq_s32(vreinterpretq_s32_s16(out_0),
-                                            vreinterpretq_s32_s16(out_2));
-      const int32x4x2_t r13_s32 = vtrnq_s32(vreinterpretq_s32_s16(out_1),
-                                            vreinterpretq_s32_s16(out_3));
-      const int32x4x2_t r46_s32 = vtrnq_s32(vreinterpretq_s32_s16(out_4),
-                                            vreinterpretq_s32_s16(out_6));
-      const int32x4x2_t r57_s32 = vtrnq_s32(vreinterpretq_s32_s16(out_5),
-                                            vreinterpretq_s32_s16(out_7));
+      const int32x4x2_t r02_s32 =
+          vtrnq_s32(vreinterpretq_s32_s16(out_0), vreinterpretq_s32_s16(out_2));
+      const int32x4x2_t r13_s32 =
+          vtrnq_s32(vreinterpretq_s32_s16(out_1), vreinterpretq_s32_s16(out_3));
+      const int32x4x2_t r46_s32 =
+          vtrnq_s32(vreinterpretq_s32_s16(out_4), vreinterpretq_s32_s16(out_6));
+      const int32x4x2_t r57_s32 =
+          vtrnq_s32(vreinterpretq_s32_s16(out_5), vreinterpretq_s32_s16(out_7));
       const int16x8x2_t r01_s16 =
           vtrnq_s16(vreinterpretq_s16_s32(r02_s32.val[0]),
                     vreinterpretq_s16_s32(r13_s32.val[0]));

@@ -12,9 +12,8 @@

 #include "./vpx_dsp_rtcd.h"

-static void hadamard8x8_one_pass(int16x8_t *a0, int16x8_t *a1,
-                                 int16x8_t *a2, int16x8_t *a3,
-                                 int16x8_t *a4, int16x8_t *a5,
+static void hadamard8x8_one_pass(int16x8_t *a0, int16x8_t *a1, int16x8_t *a2,
+                                 int16x8_t *a3, int16x8_t *a4, int16x8_t *a5,
                                  int16x8_t *a6, int16x8_t *a7) {
   const int16x8_t b0 = vaddq_s16(*a0, *a1);
   const int16x8_t b1 = vsubq_s16(*a0, *a1);
@@ -47,9 +46,8 @@ static void hadamard8x8_one_pass(int16x8_t *a0, int16x8_t *a1,
 // TODO(johannkoenig): Make a transpose library and dedup with idct. Consider
 // reversing transpose order which may make it easier for the compiler to
 // reconcile the vtrn.64 moves.
-static void transpose8x8(int16x8_t *a0, int16x8_t *a1,
-                         int16x8_t *a2, int16x8_t *a3,
-                         int16x8_t *a4, int16x8_t *a5,
+static void transpose8x8(int16x8_t *a0, int16x8_t *a1, int16x8_t *a2,
+                         int16x8_t *a3, int16x8_t *a4, int16x8_t *a5,
                          int16x8_t *a6, int16x8_t *a7) {
   // Swap 64 bit elements. Goes from:
   // a0: 00 01 02 03 04 05 06 07
@@ -91,14 +89,14 @@ static void transpose8x8(int16x8_t *a0, int16x8_t *a1,
   // a1657_hi:
   // 12 13 28 29 44 45 60 61
   // 14 15 30 31 46 47 62 63
-  const int32x4x2_t a0246_lo = vtrnq_s32(vreinterpretq_s32_s16(a04_lo),
-                                         vreinterpretq_s32_s16(a26_lo));
-  const int32x4x2_t a1357_lo = vtrnq_s32(vreinterpretq_s32_s16(a15_lo),
-                                         vreinterpretq_s32_s16(a37_lo));
-  const int32x4x2_t a0246_hi = vtrnq_s32(vreinterpretq_s32_s16(a04_hi),
-                                         vreinterpretq_s32_s16(a26_hi));
-  const int32x4x2_t a1357_hi = vtrnq_s32(vreinterpretq_s32_s16(a15_hi),
-                                         vreinterpretq_s32_s16(a37_hi));
+  const int32x4x2_t a0246_lo =
+      vtrnq_s32(vreinterpretq_s32_s16(a04_lo), vreinterpretq_s32_s16(a26_lo));
+  const int32x4x2_t a1357_lo =
+      vtrnq_s32(vreinterpretq_s32_s16(a15_lo), vreinterpretq_s32_s16(a37_lo));
+  const int32x4x2_t a0246_hi =
+      vtrnq_s32(vreinterpretq_s32_s16(a04_hi), vreinterpretq_s32_s16(a26_hi));
+  const int32x4x2_t a1357_hi =
+      vtrnq_s32(vreinterpretq_s32_s16(a15_hi), vreinterpretq_s32_s16(a37_hi));

   // Swap 16 bit elements resulting in:
   // b0:
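Note: transpose8x8 here (and TRANSPOSE8X8 in the idct files below) use the
standard multi-level NEON transpose: swap 64-bit halves, then 32-bit pairs
with vtrnq_s32, then 16-bit pairs with vtrnq_s16. The same pattern in a
self-contained 4x4 form, where only the 16- and 32-bit levels are needed — a
hypothetical helper for illustration, not code from this commit:

#include <arm_neon.h>

static void transpose_4x4_s16(int16x4_t *r0, int16x4_t *r1, int16x4_t *r2,
                              int16x4_t *r3) {
  // Swap 16-bit elements within each pair of rows.
  const int16x4x2_t p01 = vtrn_s16(*r0, *r1);
  const int16x4x2_t p23 = vtrn_s16(*r2, *r3);
  // Swap 32-bit pairs across the two row pairs.
  const int32x2x2_t q02 = vtrn_s32(vreinterpret_s32_s16(p01.val[0]),
                                   vreinterpret_s32_s16(p23.val[0]));
  const int32x2x2_t q13 = vtrn_s32(vreinterpret_s32_s16(p01.val[1]),
                                   vreinterpret_s32_s16(p23.val[1]));
  *r0 = vreinterpret_s16_s32(q02.val[0]);  // column 0
  *r1 = vreinterpret_s16_s32(q13.val[0]);  // column 1
  *r2 = vreinterpret_s16_s32(q02.val[1]);  // column 2
  *r3 = vreinterpret_s16_s32(q13.val[1]);  // column 3
}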

@@ -13,10 +13,7 @@
 #include "vpx_dsp/inv_txfm.h"
 #include "vpx_ports/mem.h"

-void vpx_idct16x16_1_add_neon(
-        int16_t *input,
-        uint8_t *dest,
-        int dest_stride) {
+void vpx_idct16x16_1_add_neon(int16_t *input, uint8_t *dest, int dest_stride) {
   uint8x8_t d2u8, d3u8, d30u8, d31u8;
   uint64x1_t d2u64, d3u64, d4u64, d5u64;
   uint16x8_t q0u16, q9u16, q10u16, q11u16, q12u16;

@@ -13,15 +13,10 @@
 #include "./vpx_config.h"
 #include "vpx_dsp/txfm_common.h"

-static INLINE void TRANSPOSE8X8(
-        int16x8_t *q8s16,
-        int16x8_t *q9s16,
-        int16x8_t *q10s16,
-        int16x8_t *q11s16,
-        int16x8_t *q12s16,
-        int16x8_t *q13s16,
-        int16x8_t *q14s16,
-        int16x8_t *q15s16) {
+static INLINE void TRANSPOSE8X8(int16x8_t *q8s16, int16x8_t *q9s16,
+                                int16x8_t *q10s16, int16x8_t *q11s16,
+                                int16x8_t *q12s16, int16x8_t *q13s16,
+                                int16x8_t *q14s16, int16x8_t *q15s16) {
   int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
   int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
   int32x4x2_t q0x2s32, q1x2s32, q2x2s32, q3x2s32;
@@ -53,14 +48,14 @@ static INLINE void TRANSPOSE8X8(
   *q14s16 = vcombine_s16(d21s16, d29s16);
   *q15s16 = vcombine_s16(d23s16, d31s16);

-    q0x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q8s16),
-                        vreinterpretq_s32_s16(*q10s16));
-    q1x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q9s16),
-                        vreinterpretq_s32_s16(*q11s16));
-    q2x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q12s16),
-                        vreinterpretq_s32_s16(*q14s16));
-    q3x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q13s16),
-                        vreinterpretq_s32_s16(*q15s16));
+  q0x2s32 =
+      vtrnq_s32(vreinterpretq_s32_s16(*q8s16), vreinterpretq_s32_s16(*q10s16));
+  q1x2s32 =
+      vtrnq_s32(vreinterpretq_s32_s16(*q9s16), vreinterpretq_s32_s16(*q11s16));
+  q2x2s32 =
+      vtrnq_s32(vreinterpretq_s32_s16(*q12s16), vreinterpretq_s32_s16(*q14s16));
+  q3x2s32 =
+      vtrnq_s32(vreinterpretq_s32_s16(*q13s16), vreinterpretq_s32_s16(*q15s16));

   q0x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[0]),   // q8
                       vreinterpretq_s16_s32(q1x2s32.val[0]));  // q9
@@ -82,9 +77,7 @@ static INLINE void TRANSPOSE8X8(
   return;
 }

-void vpx_idct16x16_256_add_neon_pass1(
-        int16_t *in,
-        int16_t *out,
+void vpx_idct16x16_256_add_neon_pass1(int16_t *in, int16_t *out,
                                       int output_stride) {
   int16x4_t d0s16, d1s16, d2s16, d3s16;
   int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
@@ -122,8 +115,8 @@ void vpx_idct16x16_256_add_neon_pass1(
   q0x2s16 = vld2q_s16(in);
   q15s16 = q0x2s16.val[0];

-    TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16,
-                 &q12s16, &q13s16, &q14s16, &q15s16);
+  TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
+               &q15s16);

   d16s16 = vget_low_s16(q8s16);
   d17s16 = vget_high_s16(q8s16);
@@ -320,13 +313,9 @@ void vpx_idct16x16_256_add_neon_pass1(
   return;
 }

-void vpx_idct16x16_256_add_neon_pass2(
-        int16_t *src,
-        int16_t *out,
-        int16_t *pass1Output,
-        int16_t skip_adding,
-        uint8_t *dest,
-        int dest_stride) {
+void vpx_idct16x16_256_add_neon_pass2(int16_t *src, int16_t *out,
+                                      int16_t *pass1Output, int16_t skip_adding,
+                                      uint8_t *dest, int dest_stride) {
   uint8_t *d;
   uint8x8_t d12u8, d13u8;
   int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16;
@@ -367,8 +356,8 @@ void vpx_idct16x16_256_add_neon_pass2(
   q0x2s16 = vld2q_s16(src);
   q15s16 = q0x2s16.val[0];

-    TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16,
-                 &q12s16, &q13s16, &q14s16, &q15s16);
+  TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
+               &q15s16);

   d16s16 = vget_low_s16(q8s16);
   d17s16 = vget_high_s16(q8s16);
@@ -602,10 +591,10 @@ void vpx_idct16x16_256_add_neon_pass2(
     q13s16 = vaddq_s16(q1s16, q14s16);
     q12s16 = vrshrq_n_s16(q12s16, 6);
     q13s16 = vrshrq_n_s16(q13s16, 6);
-        q12u16 = vaddw_u8(vreinterpretq_u16_s16(q12s16),
-                          vreinterpret_u8_s64(d12s64));
-        q13u16 = vaddw_u8(vreinterpretq_u16_s16(q13s16),
-                          vreinterpret_u8_s64(d13s64));
+    q12u16 =
+        vaddw_u8(vreinterpretq_u16_s16(q12s16), vreinterpret_u8_s64(d12s64));
+    q13u16 =
+        vaddw_u8(vreinterpretq_u16_s16(q13s16), vreinterpret_u8_s64(d13s64));
     d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16));
     d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16));
     vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
@@ -627,10 +616,10 @@ void vpx_idct16x16_256_add_neon_pass2(
     q13s16 = vaddq_s16(q11s16, q4s16);
     q12s16 = vrshrq_n_s16(q12s16, 6);
     q13s16 = vrshrq_n_s16(q13s16, 6);
-        q12u16 = vaddw_u8(vreinterpretq_u16_s16(q12s16),
-                          vreinterpret_u8_s64(d12s64));
-        q13u16 = vaddw_u8(vreinterpretq_u16_s16(q13s16),
-                          vreinterpret_u8_s64(d13s64));
+    q12u16 =
+        vaddw_u8(vreinterpretq_u16_s16(q12s16), vreinterpret_u8_s64(d12s64));
+    q13u16 =
+        vaddw_u8(vreinterpretq_u16_s16(q13s16), vreinterpret_u8_s64(d13s64));
     d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16));
     d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16));
     vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
@@ -652,10 +641,10 @@ void vpx_idct16x16_256_add_neon_pass2(
     q13s16 = vaddq_s16(q1s16, q2s16);
     q12s16 = vrshrq_n_s16(q12s16, 6);
     q13s16 = vrshrq_n_s16(q13s16, 6);
-        q12u16 = vaddw_u8(vreinterpretq_u16_s16(q12s16),
-                          vreinterpret_u8_s64(d12s64));
-        q13u16 = vaddw_u8(vreinterpretq_u16_s16(q13s16),
-                          vreinterpret_u8_s64(d13s64));
+    q12u16 =
+        vaddw_u8(vreinterpretq_u16_s16(q12s16), vreinterpret_u8_s64(d12s64));
+    q13u16 =
+        vaddw_u8(vreinterpretq_u16_s16(q13s16), vreinterpret_u8_s64(d13s64));
     d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16));
     d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16));
     vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
@@ -676,10 +665,10 @@ void vpx_idct16x16_256_add_neon_pass2(
     q13s16 = vaddq_s16(q11s16, q8s16);
     q12s16 = vrshrq_n_s16(q12s16, 6);
     q13s16 = vrshrq_n_s16(q13s16, 6);
-        q12u16 = vaddw_u8(vreinterpretq_u16_s16(q12s16),
-                          vreinterpret_u8_s64(d12s64));
-        q13u16 = vaddw_u8(vreinterpretq_u16_s16(q13s16),
-                          vreinterpret_u8_s64(d13s64));
+    q12u16 =
+        vaddw_u8(vreinterpretq_u16_s16(q12s16), vreinterpret_u8_s64(d12s64));
+    q13u16 =
+        vaddw_u8(vreinterpretq_u16_s16(q13s16), vreinterpret_u8_s64(d13s64));
     d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16));
     d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16));
     vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
@@ -693,8 +682,7 @@ void vpx_idct16x16_256_add_neon_pass2(
     d12s64 = vld1_s64((int64_t *)dest);
     dest += dest_stride;
     q8s16 = vrshrq_n_s16(q8s16, 6);
-        q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16),
-                         vreinterpret_u8_s64(d12s64));
+    q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_s64(d12s64));
     d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
     vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
     d += dest_stride;
@@ -702,8 +690,7 @@ void vpx_idct16x16_256_add_neon_pass2(
     d12s64 = vld1_s64((int64_t *)dest);
     dest += dest_stride;
     q9s16 = vrshrq_n_s16(q9s16, 6);
-        q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16),
-                          vreinterpret_u8_s64(d12s64));
+    q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_s64(d12s64));
     d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
     vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
     d += dest_stride;
@@ -711,8 +698,7 @@ void vpx_idct16x16_256_add_neon_pass2(
     d12s64 = vld1_s64((int64_t *)dest);
     dest += dest_stride;
     q2s16 = vrshrq_n_s16(q2s16, 6);
-        q2u16 = vaddw_u8(vreinterpretq_u16_s16(q2s16),
-                          vreinterpret_u8_s64(d12s64));
+    q2u16 = vaddw_u8(vreinterpretq_u16_s16(q2s16), vreinterpret_u8_s64(d12s64));
     d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q2u16));
     vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
     d += dest_stride;
@@ -720,8 +706,7 @@ void vpx_idct16x16_256_add_neon_pass2(
     d12s64 = vld1_s64((int64_t *)dest);
     dest += dest_stride;
     q3s16 = vrshrq_n_s16(q3s16, 6);
-        q3u16 = vaddw_u8(vreinterpretq_u16_s16(q3s16),
-                         vreinterpret_u8_s64(d12s64));
+    q3u16 = vaddw_u8(vreinterpretq_u16_s16(q3s16), vreinterpret_u8_s64(d12s64));
     d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q3u16));
     vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
     d += dest_stride;
@@ -729,8 +714,7 @@ void vpx_idct16x16_256_add_neon_pass2(
     d12s64 = vld1_s64((int64_t *)dest);
     dest += dest_stride;
    q4s16 = vrshrq_n_s16(q4s16, 6);
-        q4u16 = vaddw_u8(vreinterpretq_u16_s16(q4s16),
-                         vreinterpret_u8_s64(d12s64));
+    q4u16 = vaddw_u8(vreinterpretq_u16_s16(q4s16), vreinterpret_u8_s64(d12s64));
     d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q4u16));
     vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
     d += dest_stride;
@@ -738,8 +722,7 @@ void vpx_idct16x16_256_add_neon_pass2(
     d12s64 = vld1_s64((int64_t *)dest);
     dest += dest_stride;
     q5s16 = vrshrq_n_s16(q5s16, 6);
-        q5u16 = vaddw_u8(vreinterpretq_u16_s16(q5s16),
-                         vreinterpret_u8_s64(d12s64));
+    q5u16 = vaddw_u8(vreinterpretq_u16_s16(q5s16), vreinterpret_u8_s64(d12s64));
     d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q5u16));
     vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
     d += dest_stride;
@@ -747,16 +730,16 @@ void vpx_idct16x16_256_add_neon_pass2(
     d12s64 = vld1_s64((int64_t *)dest);
     dest += dest_stride;
     q14s16 = vrshrq_n_s16(q14s16, 6);
-        q14u16 = vaddw_u8(vreinterpretq_u16_s16(q14s16),
-                          vreinterpret_u8_s64(d12s64));
+    q14u16 =
+        vaddw_u8(vreinterpretq_u16_s16(q14s16), vreinterpret_u8_s64(d12s64));
     d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q14u16));
     vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
     d += dest_stride;

     d12s64 = vld1_s64((int64_t *)dest);
     q15s16 = vrshrq_n_s16(q15s16, 6);
-        q15u16 = vaddw_u8(vreinterpretq_u16_s16(q15s16),
-                          vreinterpret_u8_s64(d12s64));
+    q15u16 =
+        vaddw_u8(vreinterpretq_u16_s16(q15s16), vreinterpret_u8_s64(d12s64));
     d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q15u16));
     vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
   } else {  // skip_adding_dest
@@ -879,9 +862,7 @@ void vpx_idct16x16_256_add_neon_pass2(
   return;
 }

-void vpx_idct16x16_10_add_neon_pass1(
-        int16_t *in,
-        int16_t *out,
+void vpx_idct16x16_10_add_neon_pass1(int16_t *in, int16_t *out,
                                      int output_stride) {
   int16x4_t d4s16;
   int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
@@ -917,8 +898,8 @@ void vpx_idct16x16_10_add_neon_pass1(
   q0x2s16 = vld2q_s16(in);
   q15s16 = q0x2s16.val[0];

-    TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16,
-                 &q12s16, &q13s16, &q14s16, &q15s16);
+  TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
+               &q15s16);

   // stage 3
   q0s16 = vdupq_n_s16(cospi_28_64 * 2);
@@ -1017,13 +998,9 @@ void vpx_idct16x16_10_add_neon_pass1(
   return;
 }

-void vpx_idct16x16_10_add_neon_pass2(
-        int16_t *src,
-        int16_t *out,
-        int16_t *pass1Output,
-        int16_t skip_adding,
-        uint8_t *dest,
-        int dest_stride) {
+void vpx_idct16x16_10_add_neon_pass2(int16_t *src, int16_t *out,
+                                     int16_t *pass1Output, int16_t skip_adding,
+                                     uint8_t *dest, int dest_stride) {
   int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16;
   int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
   int16x4_t d20s16, d21s16, d22s16, d23s16;
@@ -1064,8 +1041,8 @@ void vpx_idct16x16_10_add_neon_pass2(
   q0x2s16 = vld2q_s16(src);
   q15s16 = q0x2s16.val[0];

-    TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16,
-                 &q12s16, &q13s16, &q14s16, &q15s16);
+  TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
+               &q15s16);

   // stage 3
   q6s16 = vdupq_n_s16(cospi_30_64 * 2);
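Note: the vrshrq_n_s16(..., 6) / vaddw_u8 / vqmovun_s16 sequences in the
pass2 hunks above reconstruct pixels from residuals. Per pixel the three
intrinsics amount to the following scalar step (a sketch from my reading of
the intrinsics: vrshr adds 32 before the arithmetic shift, vqmovun saturates
to [0, 255]):

#include <stdint.h>

static uint8_t add_residual(uint8_t pred, int16_t res) {
  const int v = pred + ((res + 32) >> 6);  // vrshrq_n_s16(res, 6), vaddw_u8
  return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));  // vqmovun_s16
}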

@@ -10,24 +10,16 @@

 #include "vpx_dsp/vpx_dsp_common.h"

-void vpx_idct16x16_256_add_neon_pass1(const int16_t *input,
-                                      int16_t *output,
+void vpx_idct16x16_256_add_neon_pass1(const int16_t *input, int16_t *output,
                                       int output_stride);
-void vpx_idct16x16_256_add_neon_pass2(const int16_t *src,
-                                      int16_t *output,
-                                      int16_t *pass1Output,
-                                      int16_t skip_adding,
-                                      uint8_t *dest,
-                                      int dest_stride);
-void vpx_idct16x16_10_add_neon_pass1(const int16_t *input,
-                                     int16_t *output,
+void vpx_idct16x16_256_add_neon_pass2(const int16_t *src, int16_t *output,
+                                      int16_t *pass1Output, int16_t skip_adding,
+                                      uint8_t *dest, int dest_stride);
+void vpx_idct16x16_10_add_neon_pass1(const int16_t *input, int16_t *output,
                                      int output_stride);
-void vpx_idct16x16_10_add_neon_pass2(const int16_t *src,
-                                     int16_t *output,
-                                     int16_t *pass1Output,
-                                     int16_t skip_adding,
-                                     uint8_t *dest,
-                                     int dest_stride);
+void vpx_idct16x16_10_add_neon_pass2(const int16_t *src, int16_t *output,
+                                     int16_t *pass1Output, int16_t skip_adding,
+                                     uint8_t *dest, int dest_stride);

 #if HAVE_NEON_ASM
 /* For ARM NEON, d8-d15 are callee-saved registers, and need to be saved. */
@@ -35,8 +27,8 @@ extern void vpx_push_neon(int64_t *store);
 extern void vpx_pop_neon(int64_t *store);
 #endif  // HAVE_NEON_ASM

-void vpx_idct16x16_256_add_neon(const int16_t *input,
-                                uint8_t *dest, int dest_stride) {
+void vpx_idct16x16_256_add_neon(const int16_t *input, uint8_t *dest,
+                                int dest_stride) {
 #if HAVE_NEON_ASM
   int64_t store_reg[8];
 #endif
@@ -56,12 +48,8 @@ void vpx_idct16x16_256_add_neon(const int16_t *input,
   // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
   // with result in pass1(pass1_output) to calculate final result in stage 7
   // which will be saved into row_idct_output.
-  vpx_idct16x16_256_add_neon_pass2(input+1,
-                                     row_idct_output,
-                                     pass1_output,
-                                     0,
-                                     dest,
-                                     dest_stride);
+  vpx_idct16x16_256_add_neon_pass2(input + 1, row_idct_output, pass1_output, 0,
+                                   dest, dest_stride);

   /* Parallel idct on the lower 8 rows */
   // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
@@ -71,12 +59,8 @@ void vpx_idct16x16_256_add_neon(const int16_t *input,
   // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
   // with result in pass1(pass1_output) to calculate final result in stage 7
   // which will be saved into row_idct_output.
-  vpx_idct16x16_256_add_neon_pass2(input+8*16+1,
-                                     row_idct_output+8,
-                                     pass1_output,
-                                     0,
-                                     dest,
-                                     dest_stride);
+  vpx_idct16x16_256_add_neon_pass2(input + 8 * 16 + 1, row_idct_output + 8,
+                                   pass1_output, 0, dest, dest_stride);

   /* Parallel idct on the left 8 columns */
   // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
@@ -86,12 +70,8 @@ void vpx_idct16x16_256_add_neon(const int16_t *input,
   // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
   // with result in pass1(pass1_output) to calculate final result in stage 7.
   // Then add the result to the destination data.
-  vpx_idct16x16_256_add_neon_pass2(row_idct_output+1,
-                                     row_idct_output,
-                                     pass1_output,
-                                     1,
-                                     dest,
-                                     dest_stride);
+  vpx_idct16x16_256_add_neon_pass2(row_idct_output + 1, row_idct_output,
+                                   pass1_output, 1, dest, dest_stride);

   /* Parallel idct on the right 8 columns */
   // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
@@ -102,11 +82,8 @@ void vpx_idct16x16_256_add_neon(const int16_t *input,
   // with result in pass1(pass1_output) to calculate final result in stage 7.
   // Then add the result to the destination data.
   vpx_idct16x16_256_add_neon_pass2(row_idct_output + 8 * 16 + 1,
-                                     row_idct_output+8,
-                                     pass1_output,
-                                     1,
-                                     dest+8,
-                                     dest_stride);
+                                   row_idct_output + 8, pass1_output, 1,
+                                   dest + 8, dest_stride);

 #if HAVE_NEON_ASM
   // restore d8-d15 register values.
@@ -116,8 +93,8 @@ void vpx_idct16x16_256_add_neon(const int16_t *input,
   return;
 }

-void vpx_idct16x16_10_add_neon(const int16_t *input,
-                               uint8_t *dest, int dest_stride) {
+void vpx_idct16x16_10_add_neon(const int16_t *input, uint8_t *dest,
+                               int dest_stride) {
 #if HAVE_NEON_ASM
   int64_t store_reg[8];
 #endif
@@ -137,12 +114,8 @@ void vpx_idct16x16_10_add_neon(const int16_t *input,
   // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
   // with result in pass1(pass1_output) to calculate final result in stage 7
   // which will be saved into row_idct_output.
-  vpx_idct16x16_10_add_neon_pass2(input+1,
-                                        row_idct_output,
-                                        pass1_output,
-                                        0,
-                                        dest,
-                                        dest_stride);
+  vpx_idct16x16_10_add_neon_pass2(input + 1, row_idct_output, pass1_output, 0,
+                                  dest, dest_stride);

   /* Skip Parallel idct on the lower 8 rows as they are all 0s */

@@ -154,12 +127,8 @@ void vpx_idct16x16_10_add_neon(const int16_t *input,
   // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
   // with result in pass1(pass1_output) to calculate final result in stage 7.
   // Then add the result to the destination data.
-  vpx_idct16x16_256_add_neon_pass2(row_idct_output+1,
-                                     row_idct_output,
-                                     pass1_output,
-                                     1,
-                                     dest,
-                                     dest_stride);
+  vpx_idct16x16_256_add_neon_pass2(row_idct_output + 1, row_idct_output,
+                                   pass1_output, 1, dest, dest_stride);

   /* Parallel idct on the right 8 columns */
   // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
@@ -170,11 +139,8 @@ void vpx_idct16x16_10_add_neon(const int16_t *input,
   // with result in pass1(pass1_output) to calculate final result in stage 7.
   // Then add the result to the destination data.
   vpx_idct16x16_256_add_neon_pass2(row_idct_output + 8 * 16 + 1,
-                                     row_idct_output+8,
-                                     pass1_output,
-                                     1,
-                                     dest+8,
-                                     dest_stride);
+                                   row_idct_output + 8, pass1_output, 1,
+                                   dest + 8, dest_stride);

 #if HAVE_NEON_ASM
   // restore d8-d15 register values.

@@ -15,16 +15,10 @@
 #include "vpx_dsp/inv_txfm.h"
 #include "vpx_ports/mem.h"

-static INLINE void LD_16x8(
-        uint8_t *d,
-        int d_stride,
-        uint8x16_t *q8u8,
-        uint8x16_t *q9u8,
-        uint8x16_t *q10u8,
-        uint8x16_t *q11u8,
-        uint8x16_t *q12u8,
-        uint8x16_t *q13u8,
-        uint8x16_t *q14u8,
-        uint8x16_t *q15u8) {
+static INLINE void LD_16x8(uint8_t *d, int d_stride, uint8x16_t *q8u8,
+                           uint8x16_t *q9u8, uint8x16_t *q10u8,
+                           uint8x16_t *q11u8, uint8x16_t *q12u8,
+                           uint8x16_t *q13u8, uint8x16_t *q14u8,
+                           uint8x16_t *q15u8) {
   *q8u8 = vld1q_u8(d);
   d += d_stride;
@@ -44,15 +38,10 @@ static INLINE void LD_16x8(
   return;
 }

-static INLINE void ADD_DIFF_16x8(
-        uint8x16_t qdiffu8,
-        uint8x16_t *q8u8,
-        uint8x16_t *q9u8,
-        uint8x16_t *q10u8,
-        uint8x16_t *q11u8,
-        uint8x16_t *q12u8,
-        uint8x16_t *q13u8,
-        uint8x16_t *q14u8,
-        uint8x16_t *q15u8) {
+static INLINE void ADD_DIFF_16x8(uint8x16_t qdiffu8, uint8x16_t *q8u8,
+                                 uint8x16_t *q9u8, uint8x16_t *q10u8,
+                                 uint8x16_t *q11u8, uint8x16_t *q12u8,
+                                 uint8x16_t *q13u8, uint8x16_t *q14u8,
+                                 uint8x16_t *q15u8) {
   *q8u8 = vqaddq_u8(*q8u8, qdiffu8);
   *q9u8 = vqaddq_u8(*q9u8, qdiffu8);
@@ -65,15 +54,10 @@ static INLINE void ADD_DIFF_16x8(
   return;
 }

-static INLINE void SUB_DIFF_16x8(
-        uint8x16_t qdiffu8,
-        uint8x16_t *q8u8,
-        uint8x16_t *q9u8,
-        uint8x16_t *q10u8,
-        uint8x16_t *q11u8,
-        uint8x16_t *q12u8,
-        uint8x16_t *q13u8,
-        uint8x16_t *q14u8,
-        uint8x16_t *q15u8) {
+static INLINE void SUB_DIFF_16x8(uint8x16_t qdiffu8, uint8x16_t *q8u8,
+                                 uint8x16_t *q9u8, uint8x16_t *q10u8,
+                                 uint8x16_t *q11u8, uint8x16_t *q12u8,
+                                 uint8x16_t *q13u8, uint8x16_t *q14u8,
+                                 uint8x16_t *q15u8) {
   *q8u8 = vqsubq_u8(*q8u8, qdiffu8);
   *q9u8 = vqsubq_u8(*q9u8, qdiffu8);
@@ -86,16 +70,10 @@ static INLINE void SUB_DIFF_16x8(
   return;
 }

-static INLINE void ST_16x8(
-        uint8_t *d,
-        int d_stride,
-        uint8x16_t *q8u8,
-        uint8x16_t *q9u8,
-        uint8x16_t *q10u8,
-        uint8x16_t *q11u8,
-        uint8x16_t *q12u8,
-        uint8x16_t *q13u8,
-        uint8x16_t *q14u8,
-        uint8x16_t *q15u8) {
+static INLINE void ST_16x8(uint8_t *d, int d_stride, uint8x16_t *q8u8,
+                           uint8x16_t *q9u8, uint8x16_t *q10u8,
+                           uint8x16_t *q11u8, uint8x16_t *q12u8,
+                           uint8x16_t *q13u8, uint8x16_t *q14u8,
+                           uint8x16_t *q15u8) {
   vst1q_u8(d, *q8u8);
   d += d_stride;
@@ -115,10 +93,7 @@ static INLINE void ST_16x8(
   return;
 }

-void vpx_idct32x32_1_add_neon(
-        int16_t *input,
-        uint8_t *dest,
-        int dest_stride) {
+void vpx_idct32x32_1_add_neon(int16_t *input, uint8_t *dest, int dest_stride) {
   uint8x16_t q0u8, q8u8, q9u8, q10u8, q11u8, q12u8, q13u8, q14u8, q15u8;
   int i, j, dest_stride8;
   uint8_t *d;
@@ -135,12 +110,12 @@ void vpx_idct32x32_1_add_neon(
     for (i = 0; i < 2; i++, dest += 16) {  // diff_positive_32_32_loop
       d = dest;
       for (j = 0; j < 4; j++) {
-                LD_16x8(d, dest_stride, &q8u8, &q9u8, &q10u8, &q11u8,
-                                        &q12u8, &q13u8, &q14u8, &q15u8);
-                ADD_DIFF_16x8(q0u8, &q8u8, &q9u8, &q10u8, &q11u8,
-                                    &q12u8, &q13u8, &q14u8, &q15u8);
-                ST_16x8(d, dest_stride, &q8u8, &q9u8, &q10u8, &q11u8,
-                                        &q12u8, &q13u8, &q14u8, &q15u8);
+        LD_16x8(d, dest_stride, &q8u8, &q9u8, &q10u8, &q11u8, &q12u8, &q13u8,
+                &q14u8, &q15u8);
+        ADD_DIFF_16x8(q0u8, &q8u8, &q9u8, &q10u8, &q11u8, &q12u8, &q13u8,
+                      &q14u8, &q15u8);
+        ST_16x8(d, dest_stride, &q8u8, &q9u8, &q10u8, &q11u8, &q12u8, &q13u8,
+                &q14u8, &q15u8);
         d += dest_stride8;
       }
     }
@@ -151,12 +126,12 @@ void vpx_idct32x32_1_add_neon(
     for (i = 0; i < 2; i++, dest += 16) {  // diff_negative_32_32_loop
       d = dest;
       for (j = 0; j < 4; j++) {
-                LD_16x8(d, dest_stride, &q8u8, &q9u8, &q10u8, &q11u8,
-                                        &q12u8, &q13u8, &q14u8, &q15u8);
-                SUB_DIFF_16x8(q0u8, &q8u8, &q9u8, &q10u8, &q11u8,
-                                    &q12u8, &q13u8, &q14u8, &q15u8);
-                ST_16x8(d, dest_stride, &q8u8, &q9u8, &q10u8, &q11u8,
-                                        &q12u8, &q13u8, &q14u8, &q15u8);
+        LD_16x8(d, dest_stride, &q8u8, &q9u8, &q10u8, &q11u8, &q12u8, &q13u8,
+                &q14u8, &q15u8);
+        SUB_DIFF_16x8(q0u8, &q8u8, &q9u8, &q10u8, &q11u8, &q12u8, &q13u8,
+                      &q14u8, &q15u8);
+        ST_16x8(d, dest_stride, &q8u8, &q9u8, &q10u8, &q11u8, &q12u8, &q13u8,
+                &q14u8, &q15u8);
         d += dest_stride8;
       }
     }
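Note: vpx_idct32x32_1_add_neon handles the DC-only case, where the inverse
transform collapses to adding (or subtracting, via the saturating
vqaddq_u8/vqsubq_u8 in ADD_DIFF_16x8/SUB_DIFF_16x8) one constant to every
pixel. A scalar sketch matching my reading of the C reference
vpx_idct32x32_1_add_c; the constant 11585 (cospi_16_64) and the rounding
shifts come from that reference, not from this diff:

#include <stdint.h>

static void idct32x32_1_add_ref(int16_t dc, uint8_t *dest, int stride) {
  int i, j, out, a1;
  // Two Q14 multiplies by cospi_16_64 mirror dct_const_round_shift.
  out = ((int32_t)dc * 11585 + (1 << 13)) >> 14;
  out = (out * 11585 + (1 << 13)) >> 14;
  a1 = (out + 32) >> 6;  // final round(x / 64)
  for (i = 0; i < 32; ++i, dest += stride) {
    for (j = 0; j < 32; ++j) {
      const int v = dest[j] + a1;  // the NEON helpers clamp via vqaddq/vqsubq
      dest[j] = (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
    }
  }
}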

@@ -26,13 +26,9 @@
   vst1q_s16(out + second * 32, qB);

 #define STORE_COMBINE_CENTER_RESULTS(r10, r9) \
-       __STORE_COMBINE_CENTER_RESULTS(r10, r9, stride, \
-                                      q6s16, q7s16, q8s16, q9s16);
-static INLINE void __STORE_COMBINE_CENTER_RESULTS(
-        uint8_t *p1,
-        uint8_t *p2,
-        int stride,
-        int16x8_t q6s16,
-        int16x8_t q7s16,
-        int16x8_t q8s16,
-        int16x8_t q9s16) {
+  __STORE_COMBINE_CENTER_RESULTS(r10, r9, stride, q6s16, q7s16, q8s16, q9s16);
+static INLINE void __STORE_COMBINE_CENTER_RESULTS(uint8_t *p1, uint8_t *p2,
+                                                  int stride, int16x8_t q6s16,
+                                                  int16x8_t q7s16,
+                                                  int16x8_t q8s16,
+                                                  int16x8_t q9s16) {
@@ -50,14 +46,14 @@ static INLINE void __STORE_COMBINE_CENTER_RESULTS(
   q9s16 = vrshrq_n_s16(q9s16, 6);
   q6s16 = vrshrq_n_s16(q6s16, 6);

-    q7s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q7s16),
-                                           vreinterpret_u8_s16(d9s16)));
-    q8s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q8s16),
-                                           vreinterpret_u8_s16(d10s16)));
-    q9s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q9s16),
-                                           vreinterpret_u8_s16(d11s16)));
-    q6s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q6s16),
-                                           vreinterpret_u8_s16(d8s16)));
+  q7s16 = vreinterpretq_s16_u16(
+      vaddw_u8(vreinterpretq_u16_s16(q7s16), vreinterpret_u8_s16(d9s16)));
+  q8s16 = vreinterpretq_s16_u16(
+      vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_s16(d10s16)));
+  q9s16 = vreinterpretq_s16_u16(
+      vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_s16(d11s16)));
+  q6s16 = vreinterpretq_s16_u16(
+      vaddw_u8(vreinterpretq_u16_s16(q6s16), vreinterpret_u8_s16(d8s16)));

   d9s16 = vreinterpret_s16_u8(vqmovun_s16(q7s16));
   d10s16 = vreinterpret_s16_u8(vqmovun_s16(q8s16));
@@ -73,14 +69,10 @@ static INLINE void __STORE_COMBINE_CENTER_RESULTS(
   return;
 }

-#define  STORE_COMBINE_EXTREME_RESULTS(r7, r6); \
-       __STORE_COMBINE_EXTREME_RESULTS(r7, r6, stride, \
-                                      q4s16, q5s16, q6s16, q7s16);
-static INLINE void __STORE_COMBINE_EXTREME_RESULTS(
-        uint8_t *p1,
-        uint8_t *p2,
-        int stride,
-        int16x8_t q4s16,
-        int16x8_t q5s16,
-        int16x8_t q6s16,
-        int16x8_t q7s16) {
+#define STORE_COMBINE_EXTREME_RESULTS(r7, r6) \
+  __STORE_COMBINE_EXTREME_RESULTS(r7, r6, stride, q4s16, q5s16, q6s16, q7s16);
+static INLINE void __STORE_COMBINE_EXTREME_RESULTS(uint8_t *p1, uint8_t *p2,
+                                                   int stride, int16x8_t q4s16,
+                                                   int16x8_t q5s16,
+                                                   int16x8_t q6s16,
+                                                   int16x8_t q7s16) {
@@ -98,14 +90,14 @@ static INLINE void __STORE_COMBINE_EXTREME_RESULTS(
   q7s16 = vrshrq_n_s16(q7s16, 6);
   q4s16 = vrshrq_n_s16(q4s16, 6);

-    q5s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q5s16),
-                                           vreinterpret_u8_s16(d5s16)));
-    q6s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q6s16),
-                                           vreinterpret_u8_s16(d6s16)));
-    q7s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q7s16),
-                                           vreinterpret_u8_s16(d7s16)));
-    q4s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q4s16),
-                                           vreinterpret_u8_s16(d4s16)));
+  q5s16 = vreinterpretq_s16_u16(
+      vaddw_u8(vreinterpretq_u16_s16(q5s16), vreinterpret_u8_s16(d5s16)));
+  q6s16 = vreinterpretq_s16_u16(
+      vaddw_u8(vreinterpretq_u16_s16(q6s16), vreinterpret_u8_s16(d6s16)));
+  q7s16 = vreinterpretq_s16_u16(
+      vaddw_u8(vreinterpretq_u16_s16(q7s16), vreinterpret_u8_s16(d7s16)));
+  q4s16 = vreinterpretq_s16_u16(
+      vaddw_u8(vreinterpretq_u16_s16(q4s16), vreinterpret_u8_s16(d4s16)));

   d5s16 = vreinterpret_s16_u8(vqmovun_s16(q5s16));
   d6s16 = vreinterpret_s16_u8(vqmovun_s16(q6s16));
@@ -123,13 +115,9 @@ static INLINE void __STORE_COMBINE_EXTREME_RESULTS(

 #define DO_BUTTERFLY_STD(const_1, const_2, qA, qB) \
   DO_BUTTERFLY(q14s16, q13s16, const_1, const_2, qA, qB);
-static INLINE void DO_BUTTERFLY(
-        int16x8_t q14s16,
-        int16x8_t q13s16,
-        int16_t first_const,
-        int16_t second_const,
-        int16x8_t *qAs16,
-        int16x8_t *qBs16) {
+static INLINE void DO_BUTTERFLY(int16x8_t q14s16, int16x8_t q13s16,
+                                int16_t first_const, int16_t second_const,
+                                int16x8_t *qAs16, int16x8_t *qBs16) {
   int16x4_t d30s16, d31s16;
   int32x4_t q8s32, q9s32, q10s32, q11s32, q12s32, q15s32;
   int16x4_t dCs16, dDs16, dAs16, dBs16;
@@ -158,16 +146,12 @@ static INLINE void DO_BUTTERFLY(
   q11s32 = vaddq_s32(q12s32, q11s32);
   q10s32 = vaddq_s32(q10s32, q15s32);

-    *qAs16 = vcombine_s16(vqrshrn_n_s32(q8s32, 14),
-                          vqrshrn_n_s32(q9s32, 14));
-    *qBs16 = vcombine_s16(vqrshrn_n_s32(q11s32, 14),
-                          vqrshrn_n_s32(q10s32, 14));
+  *qAs16 = vcombine_s16(vqrshrn_n_s32(q8s32, 14), vqrshrn_n_s32(q9s32, 14));
+  *qBs16 = vcombine_s16(vqrshrn_n_s32(q11s32, 14), vqrshrn_n_s32(q10s32, 14));
   return;
 }

-static INLINE void idct32_transpose_pair(
-        int16_t *input,
-        int16_t *t_buf) {
+static INLINE void idct32_transpose_pair(int16_t *input, int16_t *t_buf) {
   int16_t *in;
   int i;
   const int stride = 32;
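Note: DO_BUTTERFLY is the rotation butterfly of the inverse transform. The
cospi constants are cos(k*pi/64) in Q14 (cospi_16_64 == 11585), which is why
each result is narrowed with vqrshrn_n_s32(..., 14). A single-lane scalar
sketch — the sign convention is my reading of the intrinsics, not taken
verbatim from this commit, and the narrowing saturation is omitted:

#include <stdint.h>

static void butterfly_ref(int16_t a, int16_t b, int16_t c1, int16_t c2,
                          int16_t *out_a, int16_t *out_b) {
  // round((a * c1 - b * c2) / 2^14) and round((a * c2 + b * c1) / 2^14),
  // mirroring vqrshrn_n_s32(x, 14).
  *out_a = (int16_t)(((int32_t)a * c1 - (int32_t)b * c2 + (1 << 13)) >> 14);
  *out_b = (int16_t)(((int32_t)a * c2 + (int32_t)b * c1 + (1 << 13)) >> 14);
}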
@@ -221,14 +205,14 @@ static INLINE void idct32_transpose_pair(
     q14s16 = vcombine_s16(d21s16, d29s16);
     q15s16 = vcombine_s16(d23s16, d31s16);

-        q0x2s32 = vtrnq_s32(vreinterpretq_s32_s16(q8s16),
-                            vreinterpretq_s32_s16(q10s16));
-        q1x2s32 = vtrnq_s32(vreinterpretq_s32_s16(q9s16),
-                            vreinterpretq_s32_s16(q11s16));
-        q2x2s32 = vtrnq_s32(vreinterpretq_s32_s16(q12s16),
-                            vreinterpretq_s32_s16(q14s16));
-        q3x2s32 = vtrnq_s32(vreinterpretq_s32_s16(q13s16),
-                            vreinterpretq_s32_s16(q15s16));
+    q0x2s32 =
+        vtrnq_s32(vreinterpretq_s32_s16(q8s16), vreinterpretq_s32_s16(q10s16));
+    q1x2s32 =
+        vtrnq_s32(vreinterpretq_s32_s16(q9s16), vreinterpretq_s32_s16(q11s16));
+    q2x2s32 =
+        vtrnq_s32(vreinterpretq_s32_s16(q12s16), vreinterpretq_s32_s16(q14s16));
+    q3x2s32 =
+        vtrnq_s32(vreinterpretq_s32_s16(q13s16), vreinterpretq_s32_s16(q15s16));

     q0x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[0]),   // q8
                         vreinterpretq_s16_s32(q1x2s32.val[0]));  // q9
@@ -259,19 +243,12 @@ static INLINE void idct32_transpose_pair(
   return;
 }

-static INLINE void idct32_bands_end_1st_pass(
-        int16_t *out,
-        int16x8_t q2s16,
-        int16x8_t q3s16,
-        int16x8_t q6s16,
-        int16x8_t q7s16,
-        int16x8_t q8s16,
-        int16x8_t q9s16,
-        int16x8_t q10s16,
-        int16x8_t q11s16,
-        int16x8_t q12s16,
-        int16x8_t q13s16,
-        int16x8_t q14s16,
-        int16x8_t q15s16) {
+static INLINE void idct32_bands_end_1st_pass(int16_t *out, int16x8_t q2s16,
+                                             int16x8_t q3s16, int16x8_t q6s16,
+                                             int16x8_t q7s16, int16x8_t q8s16,
+                                             int16x8_t q9s16, int16x8_t q10s16,
+                                             int16x8_t q11s16, int16x8_t q12s16,
+                                             int16x8_t q13s16, int16x8_t q14s16,
+                                             int16x8_t q15s16) {
   int16x8_t q0s16, q1s16, q4s16, q5s16;

@@ -355,21 +332,10 @@ static INLINE void idct32_bands_end_1st_pass(
 }

 static INLINE void idct32_bands_end_2nd_pass(
-        int16_t *out,
-        uint8_t *dest,
-        int stride,
-        int16x8_t q2s16,
-        int16x8_t q3s16,
-        int16x8_t q6s16,
-        int16x8_t q7s16,
-        int16x8_t q8s16,
-        int16x8_t q9s16,
-        int16x8_t q10s16,
-        int16x8_t q11s16,
-        int16x8_t q12s16,
-        int16x8_t q13s16,
-        int16x8_t q14s16,
-        int16x8_t q15s16) {
+    int16_t *out, uint8_t *dest, int stride, int16x8_t q2s16, int16x8_t q3s16,
+    int16x8_t q6s16, int16x8_t q7s16, int16x8_t q8s16, int16x8_t q9s16,
+    int16x8_t q10s16, int16x8_t q11s16, int16x8_t q12s16, int16x8_t q13s16,
+    int16x8_t q14s16, int16x8_t q15s16) {
   uint8_t *r6 = dest + 31 * stride;
   uint8_t *r7 = dest /* +  0 * stride*/;
   uint8_t *r9 = dest + 15 * stride;
@@ -378,7 +344,8 @@ static INLINE void idct32_bands_end_2nd_pass(
   int16x8_t q0s16, q1s16, q4s16, q5s16;

   STORE_COMBINE_CENTER_RESULTS(r10, r9);
-    r10 += str2; r9 -= str2;
+  r10 += str2;
+  r9 -= str2;

   LOAD_FROM_OUTPUT(17, 30, 31, q0s16, q1s16)
   q4s16 = vaddq_s16(q2s16, q1s16);
@@ -386,7 +353,8 @@ static INLINE void idct32_bands_end_2nd_pass(
   q6s16 = vsubq_s16(q3s16, q0s16);
   q7s16 = vsubq_s16(q2s16, q1s16);
   STORE_COMBINE_EXTREME_RESULTS(r7, r6);
-    r7 += str2; r6 -= str2;
+  r7 += str2;
+  r6 -= str2;

   LOAD_FROM_OUTPUT(31, 12, 13, q0s16, q1s16)
   q2s16 = vaddq_s16(q10s16, q1s16);
@@ -400,7 +368,8 @@ static INLINE void idct32_bands_end_2nd_pass(
   q6s16 = vsubq_s16(q5s16, q0s16);
   q7s16 = vsubq_s16(q4s16, q1s16);
   STORE_COMBINE_CENTER_RESULTS(r10, r9);
-    r10 += str2; r9 -= str2;
+  r10 += str2;
+  r9 -= str2;

   LOAD_FROM_OUTPUT(19, 28, 29, q0s16, q1s16)
   q4s16 = vaddq_s16(q2s16, q1s16);
@@ -408,7 +377,8 @@ static INLINE void idct32_bands_end_2nd_pass(
   q6s16 = vsubq_s16(q3s16, q0s16);
   q7s16 = vsubq_s16(q2s16, q1s16);
   STORE_COMBINE_EXTREME_RESULTS(r7, r6);
-    r7 += str2; r6 -= str2;
+  r7 += str2;
+  r6 -= str2;

   LOAD_FROM_OUTPUT(29, 10, 11, q0s16, q1s16)
   q2s16 = vaddq_s16(q12s16, q1s16);
@@ -422,7 +392,8 @@ static INLINE void idct32_bands_end_2nd_pass(
   q6s16 = vsubq_s16(q5s16, q0s16);
   q7s16 = vsubq_s16(q4s16, q1s16);
   STORE_COMBINE_CENTER_RESULTS(r10, r9);
-    r10 += str2; r9 -= str2;
+  r10 += str2;
+  r9 -= str2;

   LOAD_FROM_OUTPUT(21, 26, 27, q0s16, q1s16)
   q4s16 = vaddq_s16(q2s16, q1s16);
@@ -430,7 +401,8 @@ static INLINE void idct32_bands_end_2nd_pass(
   q6s16 = vsubq_s16(q3s16, q0s16);
   q7s16 = vsubq_s16(q2s16, q1s16);
   STORE_COMBINE_EXTREME_RESULTS(r7, r6);
-    r7 += str2; r6 -= str2;
+  r7 += str2;
+  r6 -= str2;

   LOAD_FROM_OUTPUT(27, 8, 9, q0s16, q1s16)
   q2s16 = vaddq_s16(q14s16, q1s16);
@@ -454,10 +426,7 @@ static INLINE void idct32_bands_end_2nd_pass(
   return;
 }

-void vpx_idct32x32_1024_add_neon(
-        int16_t *input,
-        uint8_t *dest,
-        int stride) {
+void vpx_idct32x32_1024_add_neon(int16_t *input, uint8_t *dest, int stride) {
   int i, idct32_pass_loop;
   int16_t trans_buf[32 * 8];
   int16_t pass1[32 * 32];
@@ -466,14 +435,11 @@ void vpx_idct32x32_1024_add_neon(
   int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
   int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;

-    for (idct32_pass_loop = 0, out = pass1;
-         idct32_pass_loop < 2;
-         idct32_pass_loop++,
-         input = pass1,  // the input of pass2 is the result of pass1
-         out = pass2) {
-        for (i = 0;
-             i < 4; i++,
-             input += 32 * 8, out += 8) {  // idct32_bands_loop
+  for (idct32_pass_loop = 0, out = pass1; idct32_pass_loop < 2;
+       idct32_pass_loop++,
+      input = pass1,  // the input of pass2 is the result of pass1
+       out = pass2) {
+    for (i = 0; i < 4; i++, input += 32 * 8, out += 8) {  // idct32_bands_loop
       idct32_transpose_pair(input, trans_buf);

       // -----------------------------------------
@@ -603,8 +569,7 @@ void vpx_idct32x32_1024_add_neon(
       // part of stage 7
       DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q13s16, &q14s16)
       STORE_IN_OUTPUT(29, 20, 27, q13s16, q14s16)
-            DO_BUTTERFLY(q0s16, q1s16, cospi_16_64, cospi_16_64,
-                                                         &q1s16, &q0s16);
+      DO_BUTTERFLY(q0s16, q1s16, cospi_16_64, cospi_16_64, &q1s16, &q0s16);
       STORE_IN_OUTPUT(27, 21, 26, q1s16, q0s16)

       // -----------------------------------------
@@ -704,13 +669,13 @@ void vpx_idct32x32_1024_add_neon(
       q7s16 = vsubq_s16(q4s16, q1s16);

       if (idct32_pass_loop == 0) {
-                idct32_bands_end_1st_pass(out,
-                         q2s16, q3s16, q6s16, q7s16, q8s16, q9s16,
-                         q10s16, q11s16, q12s16, q13s16, q14s16, q15s16);
+        idct32_bands_end_1st_pass(out, q2s16, q3s16, q6s16, q7s16, q8s16, q9s16,
+                                  q10s16, q11s16, q12s16, q13s16, q14s16,
+                                  q15s16);
       } else {
-                idct32_bands_end_2nd_pass(out, dest, stride,
-                         q2s16, q3s16, q6s16, q7s16, q8s16, q9s16,
-                         q10s16, q11s16, q12s16, q13s16, q14s16, q15s16);
+        idct32_bands_end_2nd_pass(out, dest, stride, q2s16, q3s16, q6s16, q7s16,
+                                  q8s16, q9s16, q10s16, q11s16, q12s16, q13s16,
+                                  q14s16, q15s16);
         dest += 8;
       }
    }
 | 
			
		||||
 
 | 
			
		||||
@@ -13,10 +13,7 @@
#include "vpx_dsp/inv_txfm.h"
#include "vpx_ports/mem.h"

void vpx_idct4x4_1_add_neon(
        int16_t *input,
        uint8_t *dest,
        int dest_stride) {
void vpx_idct4x4_1_add_neon(int16_t *input, uint8_t *dest, int dest_stride) {
  uint8x8_t d6u8;
  uint32x2_t d2u32 = vdup_n_u32(0);
  uint16x8_t q8u16;
@@ -37,8 +34,7 @@ void vpx_idct4x4_1_add_neon(
    d2u32 = vld1_lane_u32((const uint32_t *)d1, d2u32, 1);
    d1 += dest_stride;

        q8u16 = vaddw_u8(vreinterpretq_u16_s16(q0s16),
                         vreinterpret_u8_u32(d2u32));
    q8u16 = vaddw_u8(vreinterpretq_u16_s16(q0s16), vreinterpret_u8_u32(d2u32));
    d6u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));

    vst1_lane_u32((uint32_t *)d2, vreinterpret_u32_u8(d6u8), 0);

@@ -10,10 +10,7 @@

#include <arm_neon.h>

void vpx_idct4x4_16_add_neon(
        int16_t *input,
        uint8_t *dest,
        int dest_stride) {
void vpx_idct4x4_16_add_neon(int16_t *input, uint8_t *dest, int dest_stride) {
  uint8x8_t d26u8, d27u8;
  uint32x2_t d26u32, d27u32;
  uint16x8_t q8u16, q9u16;
@@ -46,8 +43,8 @@ void vpx_idct4x4_16_add_neon(
  d20s16 = vdup_n_s16(cospi_8_64);
  d21s16 = vdup_n_s16(cospi_16_64);

    q0x2s32 = vtrnq_s32(vreinterpretq_s32_s16(q8s16),
                        vreinterpretq_s32_s16(q9s16));
  q0x2s32 =
      vtrnq_s32(vreinterpretq_s32_s16(q8s16), vreinterpretq_s32_s16(q9s16));
  d16s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[0]));
  d17s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[0]));
  d18s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[1]));
@@ -88,8 +85,8 @@ void vpx_idct4x4_16_add_neon(
  q8s16 = vcombine_s16(d0x2s16.val[0], d0x2s16.val[1]);
  q9s16 = vcombine_s16(d1x2s16.val[0], d1x2s16.val[1]);

    q0x2s32 = vtrnq_s32(vreinterpretq_s32_s16(q8s16),
                        vreinterpretq_s32_s16(q9s16));
  q0x2s32 =
      vtrnq_s32(vreinterpretq_s32_s16(q8s16), vreinterpretq_s32_s16(q9s16));
  d16s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[0]));
  d17s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[0]));
  d18s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[1]));
@@ -131,10 +128,8 @@ void vpx_idct4x4_16_add_neon(
  d += dest_stride;
  d27u32 = vld1_lane_u32((const uint32_t *)d, d27u32, 0);

    q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16),
                     vreinterpret_u8_u32(d26u32));
    q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16),
                     vreinterpret_u8_u32(d27u32));
  q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u32(d26u32));
  q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u32(d27u32));

  d26u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
  d27u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));

@@ -13,10 +13,7 @@
#include "vpx_dsp/inv_txfm.h"
#include "vpx_ports/mem.h"

void vpx_idct8x8_1_add_neon(
        int16_t *input,
        uint8_t *dest,
        int dest_stride) {
void vpx_idct8x8_1_add_neon(int16_t *input, uint8_t *dest, int dest_stride) {
  uint8x8_t d2u8, d3u8, d30u8, d31u8;
  uint64x1_t d2u64, d3u64, d4u64, d5u64;
  uint16x8_t q0u16, q9u16, q10u16, q11u16, q12u16;

@@ -13,15 +13,10 @@
#include "./vpx_config.h"
#include "vpx_dsp/txfm_common.h"

static INLINE void TRANSPOSE8X8(
        int16x8_t *q8s16,
        int16x8_t *q9s16,
        int16x8_t *q10s16,
        int16x8_t *q11s16,
        int16x8_t *q12s16,
        int16x8_t *q13s16,
        int16x8_t *q14s16,
        int16x8_t *q15s16) {
static INLINE void TRANSPOSE8X8(int16x8_t *q8s16, int16x8_t *q9s16,
                                int16x8_t *q10s16, int16x8_t *q11s16,
                                int16x8_t *q12s16, int16x8_t *q13s16,
                                int16x8_t *q14s16, int16x8_t *q15s16) {
  int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
  int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
  int32x4x2_t q0x2s32, q1x2s32, q2x2s32, q3x2s32;
@@ -53,14 +48,14 @@ static INLINE void TRANSPOSE8X8(
  *q14s16 = vcombine_s16(d21s16, d29s16);
  *q15s16 = vcombine_s16(d23s16, d31s16);

    q0x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q8s16),
                        vreinterpretq_s32_s16(*q10s16));
    q1x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q9s16),
                        vreinterpretq_s32_s16(*q11s16));
    q2x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q12s16),
                        vreinterpretq_s32_s16(*q14s16));
    q3x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q13s16),
                        vreinterpretq_s32_s16(*q15s16));
  q0x2s32 =
      vtrnq_s32(vreinterpretq_s32_s16(*q8s16), vreinterpretq_s32_s16(*q10s16));
  q1x2s32 =
      vtrnq_s32(vreinterpretq_s32_s16(*q9s16), vreinterpretq_s32_s16(*q11s16));
  q2x2s32 =
      vtrnq_s32(vreinterpretq_s32_s16(*q12s16), vreinterpretq_s32_s16(*q14s16));
  q3x2s32 =
      vtrnq_s32(vreinterpretq_s32_s16(*q13s16), vreinterpretq_s32_s16(*q15s16));

  q0x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[0]),   // q8
                      vreinterpretq_s16_s32(q1x2s32.val[0]));  // q9
@@ -82,15 +77,10 @@ static INLINE void TRANSPOSE8X8(
  return;
}

static INLINE void IDCT8x8_1D(
        int16x8_t *q8s16,
        int16x8_t *q9s16,
        int16x8_t *q10s16,
        int16x8_t *q11s16,
        int16x8_t *q12s16,
        int16x8_t *q13s16,
        int16x8_t *q14s16,
        int16x8_t *q15s16) {
static INLINE void IDCT8x8_1D(int16x8_t *q8s16, int16x8_t *q9s16,
                              int16x8_t *q10s16, int16x8_t *q11s16,
                              int16x8_t *q12s16, int16x8_t *q13s16,
                              int16x8_t *q14s16, int16x8_t *q15s16) {
  int16x4_t d0s16, d1s16, d2s16, d3s16;
  int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
  int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
@@ -238,10 +228,7 @@ static INLINE void IDCT8x8_1D(
  return;
}

void vpx_idct8x8_64_add_neon(
        int16_t *input,
        uint8_t *dest,
        int dest_stride) {
void vpx_idct8x8_64_add_neon(int16_t *input, uint8_t *dest, int dest_stride) {
  uint8_t *d1, *d2;
  uint8x8_t d0u8, d1u8, d2u8, d3u8;
  uint64x1_t d0u64, d1u64, d2u64, d3u64;
@@ -257,17 +244,17 @@ void vpx_idct8x8_64_add_neon(
  q14s16 = vld1q_s16(input + 48);
  q15s16 = vld1q_s16(input + 56);

    TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16,
                 &q12s16, &q13s16, &q14s16, &q15s16);
  TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
               &q15s16);

    IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16,
               &q12s16, &q13s16, &q14s16, &q15s16);
  IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
             &q15s16);

    TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16,
                 &q12s16, &q13s16, &q14s16, &q15s16);
  TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
               &q15s16);

    IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16,
               &q12s16, &q13s16, &q14s16, &q15s16);
  IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
             &q15s16);

  q8s16 = vrshrq_n_s16(q8s16, 5);
  q9s16 = vrshrq_n_s16(q9s16, 5);
@@ -289,14 +276,10 @@ void vpx_idct8x8_64_add_neon(
  d3u64 = vld1_u64((uint64_t *)d1);
  d1 += dest_stride;

    q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16),
                     vreinterpret_u8_u64(d0u64));
    q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16),
                     vreinterpret_u8_u64(d1u64));
    q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16),
                      vreinterpret_u8_u64(d2u64));
    q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16),
                      vreinterpret_u8_u64(d3u64));
  q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u64(d0u64));
  q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u64(d1u64));
  q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16), vreinterpret_u8_u64(d2u64));
  q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16), vreinterpret_u8_u64(d3u64));

  d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
  d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
@@ -326,14 +309,10 @@ void vpx_idct8x8_64_add_neon(
  d3u64 = vld1_u64((uint64_t *)d1);
  d1 += dest_stride;

    q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16),
                     vreinterpret_u8_u64(d0u64));
    q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16),
                     vreinterpret_u8_u64(d1u64));
    q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16),
                      vreinterpret_u8_u64(d2u64));
    q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16),
                      vreinterpret_u8_u64(d3u64));
  q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u64(d0u64));
  q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u64(d1u64));
  q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16), vreinterpret_u8_u64(d2u64));
  q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16), vreinterpret_u8_u64(d3u64));

  d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
  d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
@@ -351,10 +330,7 @@ void vpx_idct8x8_64_add_neon(
  return;
}

void vpx_idct8x8_12_add_neon(
        int16_t *input,
        uint8_t *dest,
        int dest_stride) {
void vpx_idct8x8_12_add_neon(int16_t *input, uint8_t *dest, int dest_stride) {
  uint8_t *d1, *d2;
  uint8x8_t d0u8, d1u8, d2u8, d3u8;
  int16x4_t d10s16, d11s16, d12s16, d13s16, d16s16;
@@ -374,8 +350,8 @@ void vpx_idct8x8_12_add_neon(
  q14s16 = vld1q_s16(input + 48);
  q15s16 = vld1q_s16(input + 56);

    TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16,
                 &q12s16, &q13s16, &q14s16, &q15s16);
  TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
               &q15s16);

  // First transform rows
  // stage 1
@@ -451,11 +427,11 @@ void vpx_idct8x8_12_add_neon(
  q14s16 = vsubq_s16(q1s16, q6s16);
  q15s16 = vsubq_s16(q0s16, q7s16);

    TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16,
                 &q12s16, &q13s16, &q14s16, &q15s16);
  TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
               &q15s16);

    IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16,
               &q12s16, &q13s16, &q14s16, &q15s16);
  IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
             &q15s16);

  q8s16 = vrshrq_n_s16(q8s16, 5);
  q9s16 = vrshrq_n_s16(q9s16, 5);
@@ -477,14 +453,10 @@ void vpx_idct8x8_12_add_neon(
  d3u64 = vld1_u64((uint64_t *)d1);
  d1 += dest_stride;

    q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16),
                     vreinterpret_u8_u64(d0u64));
    q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16),
                     vreinterpret_u8_u64(d1u64));
    q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16),
                      vreinterpret_u8_u64(d2u64));
    q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16),
                      vreinterpret_u8_u64(d3u64));
  q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u64(d0u64));
  q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u64(d1u64));
  q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16), vreinterpret_u8_u64(d2u64));
  q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16), vreinterpret_u8_u64(d3u64));

  d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
  d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
@@ -514,14 +486,10 @@ void vpx_idct8x8_12_add_neon(
  d3u64 = vld1_u64((uint64_t *)d1);
  d1 += dest_stride;

    q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16),
                     vreinterpret_u8_u64(d0u64));
    q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16),
                     vreinterpret_u8_u64(d1u64));
    q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16),
                      vreinterpret_u8_u64(d2u64));
    q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16),
                      vreinterpret_u8_u64(d3u64));
  q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u64(d0u64));
  q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u64(d1u64));
  q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16), vreinterpret_u8_u64(d2u64));
  q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16), vreinterpret_u8_u64(d3u64));

  d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
  d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));

@@ -18,9 +18,8 @@
// DC 4x4

// 'do_above' and 'do_left' facilitate branch removal when inlined.
static INLINE void dc_4x4(uint8_t *dst, ptrdiff_t stride,
                          const uint8_t *above, const uint8_t *left,
                          int do_above, int do_left) {
static INLINE void dc_4x4(uint8_t *dst, ptrdiff_t stride, const uint8_t *above,
                          const uint8_t *left, int do_above, int do_left) {
  uint16x8_t sum_top;
  uint16x8_t sum_left;
  uint8x8_t dc0;
@@ -87,9 +86,8 @@ void vpx_dc_128_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
// DC 8x8

// 'do_above' and 'do_left' facilitate branch removal when inlined.
static INLINE void dc_8x8(uint8_t *dst, ptrdiff_t stride,
                          const uint8_t *above, const uint8_t *left,
                          int do_above, int do_left) {
static INLINE void dc_8x8(uint8_t *dst, ptrdiff_t stride, const uint8_t *above,
                          const uint8_t *left, int do_above, int do_left) {
  uint16x8_t sum_top;
  uint16x8_t sum_left;
  uint8x8_t dc0;
@@ -425,8 +423,7 @@ void vpx_v_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
  (void)left;

  d0u8 = vld1_u8(above);
  for (i = 0; i < 8; i++, dst += stride)
    vst1_u8(dst, d0u8);
  for (i = 0; i < 8; i++, dst += stride) vst1_u8(dst, d0u8);
}

void vpx_v_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
@@ -436,8 +433,7 @@ void vpx_v_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
  (void)left;

  q0u8 = vld1q_u8(above);
  for (i = 0; i < 16; i++, dst += stride)
    vst1q_u8(dst, q0u8);
  for (i = 0; i < 16; i++, dst += stride) vst1q_u8(dst, q0u8);
}

void vpx_v_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
@@ -608,8 +604,8 @@ void vpx_tm_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
  q3u16 = vsubl_u8(vreinterpret_u8_u32(d2u32), d0u8);
  for (i = 0; i < 4; i++, dst += stride) {
    q1u16 = vdupq_n_u16((uint16_t)left[i]);
    q1s16 = vaddq_s16(vreinterpretq_s16_u16(q1u16),
                      vreinterpretq_s16_u16(q3u16));
    q1s16 =
        vaddq_s16(vreinterpretq_s16_u16(q1u16), vreinterpretq_s16_u16(q3u16));
    d0u8 = vqmovun_s16(q1s16);
    vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
  }
@@ -631,26 +627,26 @@ void vpx_tm_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
  d20u16 = vget_low_u16(q10u16);
  for (j = 0; j < 2; j++, d20u16 = vget_high_u16(q10u16)) {
    q0u16 = vdupq_lane_u16(d20u16, 0);
    q0s16 = vaddq_s16(vreinterpretq_s16_u16(q3u16),
                      vreinterpretq_s16_u16(q0u16));
    q0s16 =
        vaddq_s16(vreinterpretq_s16_u16(q3u16), vreinterpretq_s16_u16(q0u16));
    d0u8 = vqmovun_s16(q0s16);
    vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8));
    dst += stride;
    q0u16 = vdupq_lane_u16(d20u16, 1);
    q0s16 = vaddq_s16(vreinterpretq_s16_u16(q3u16),
                      vreinterpretq_s16_u16(q0u16));
    q0s16 =
        vaddq_s16(vreinterpretq_s16_u16(q3u16), vreinterpretq_s16_u16(q0u16));
    d0u8 = vqmovun_s16(q0s16);
    vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8));
    dst += stride;
    q0u16 = vdupq_lane_u16(d20u16, 2);
    q0s16 = vaddq_s16(vreinterpretq_s16_u16(q3u16),
                      vreinterpretq_s16_u16(q0u16));
    q0s16 =
        vaddq_s16(vreinterpretq_s16_u16(q3u16), vreinterpretq_s16_u16(q0u16));
    d0u8 = vqmovun_s16(q0s16);
    vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8));
    dst += stride;
    q0u16 = vdupq_lane_u16(d20u16, 3);
    q0s16 = vaddq_s16(vreinterpretq_s16_u16(q3u16),
                      vreinterpretq_s16_u16(q0u16));
    q0s16 =
        vaddq_s16(vreinterpretq_s16_u16(q3u16), vreinterpretq_s16_u16(q0u16));
    d0u8 = vqmovun_s16(q0s16);
    vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8));
    dst += stride;
@@ -677,14 +673,14 @@ void vpx_tm_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
    for (j = 0; j < 2; j++, d20u16 = vget_high_u16(q10u16)) {
      q0u16 = vdupq_lane_u16(d20u16, 0);
      q8u16 = vdupq_lane_u16(d20u16, 1);
      q1s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
                        vreinterpretq_s16_u16(q2u16));
      q0s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
                        vreinterpretq_s16_u16(q3u16));
      q11s16 = vaddq_s16(vreinterpretq_s16_u16(q8u16),
                         vreinterpretq_s16_u16(q2u16));
      q8s16 = vaddq_s16(vreinterpretq_s16_u16(q8u16),
                        vreinterpretq_s16_u16(q3u16));
      q1s16 =
          vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q2u16));
      q0s16 =
          vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q3u16));
      q11s16 =
          vaddq_s16(vreinterpretq_s16_u16(q8u16), vreinterpretq_s16_u16(q2u16));
      q8s16 =
          vaddq_s16(vreinterpretq_s16_u16(q8u16), vreinterpretq_s16_u16(q3u16));
      d2u8 = vqmovun_s16(q1s16);
      d3u8 = vqmovun_s16(q0s16);
      d22u8 = vqmovun_s16(q11s16);
@@ -698,14 +694,14 @@ void vpx_tm_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,

      q0u16 = vdupq_lane_u16(d20u16, 2);
      q8u16 = vdupq_lane_u16(d20u16, 3);
      q1s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
                        vreinterpretq_s16_u16(q2u16));
      q0s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
                        vreinterpretq_s16_u16(q3u16));
      q11s16 = vaddq_s16(vreinterpretq_s16_u16(q8u16),
                         vreinterpretq_s16_u16(q2u16));
      q8s16 = vaddq_s16(vreinterpretq_s16_u16(q8u16),
                        vreinterpretq_s16_u16(q3u16));
      q1s16 =
          vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q2u16));
      q0s16 =
          vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q3u16));
      q11s16 =
          vaddq_s16(vreinterpretq_s16_u16(q8u16), vreinterpretq_s16_u16(q2u16));
      q8s16 =
          vaddq_s16(vreinterpretq_s16_u16(q8u16), vreinterpretq_s16_u16(q3u16));
      d2u8 = vqmovun_s16(q1s16);
      d3u8 = vqmovun_s16(q0s16);
      d22u8 = vqmovun_s16(q11s16);
@@ -742,10 +738,10 @@ void vpx_tm_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
    d6u16 = vget_low_u16(q3u16);
    for (j = 0; j < 2; j++, d6u16 = vget_high_u16(q3u16)) {
      q0u16 = vdupq_lane_u16(d6u16, 0);
      q12s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
                         vreinterpretq_s16_u16(q8u16));
      q13s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
                         vreinterpretq_s16_u16(q9u16));
      q12s16 =
          vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q8u16));
      q13s16 =
          vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q9u16));
      q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
                         vreinterpretq_s16_u16(q10u16));
      q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
@@ -761,10 +757,10 @@ void vpx_tm_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
      dst += stride;

      q0u16 = vdupq_lane_u16(d6u16, 1);
      q12s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
                         vreinterpretq_s16_u16(q8u16));
      q13s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
                         vreinterpretq_s16_u16(q9u16));
      q12s16 =
          vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q8u16));
      q13s16 =
          vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q9u16));
      q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
                         vreinterpretq_s16_u16(q10u16));
      q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
@@ -780,10 +776,10 @@ void vpx_tm_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
      dst += stride;

      q0u16 = vdupq_lane_u16(d6u16, 2);
      q12s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
                         vreinterpretq_s16_u16(q8u16));
      q13s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
                         vreinterpretq_s16_u16(q9u16));
      q12s16 =
          vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q8u16));
      q13s16 =
          vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q9u16));
      q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
                         vreinterpretq_s16_u16(q10u16));
      q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
@@ -799,10 +795,10 @@ void vpx_tm_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
      dst += stride;

      q0u16 = vdupq_lane_u16(d6u16, 3);
      q12s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
                         vreinterpretq_s16_u16(q8u16));
      q13s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
                         vreinterpretq_s16_u16(q9u16));
      q12s16 =
          vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q8u16));
      q13s16 =
          vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q9u16));
      q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
                         vreinterpretq_s16_u16(q10u16));
      q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),

@@ -14,8 +14,7 @@
#include "./vpx_config.h"
#include "vpx/vpx_integer.h"

static INLINE void loop_filter_neon_16(
        uint8x16_t qblimit,  // blimit
static INLINE void loop_filter_neon_16(uint8x16_t qblimit,  // blimit
                                       uint8x16_t qlimit,   // limit
                                       uint8x16_t qthresh,  // thresh
                                       uint8x16_t q3,       // p3
@@ -78,8 +77,7 @@ static INLINE void loop_filter_neon_16(

  q9 = vcgeq_u8(qblimit, q9);

    q1s8 = vqsubq_s8(vreinterpretq_s8_u8(q5),
                    vreinterpretq_s8_u8(q8));
  q1s8 = vqsubq_s8(vreinterpretq_s8_u8(q5), vreinterpretq_s8_u8(q8));

  q14u8 = vorrq_u8(q13u8, q14u8);

@@ -124,13 +122,10 @@ static INLINE void loop_filter_neon_16(
  return;
}

void vpx_lpf_horizontal_4_dual_neon(uint8_t *s, int p /* pitch */,
                                    const uint8_t *blimit0,
                                    const uint8_t *limit0,
                                    const uint8_t *thresh0,
                                    const uint8_t *blimit1,
                                    const uint8_t *limit1,
                                    const uint8_t *thresh1) {
void vpx_lpf_horizontal_4_dual_neon(
    uint8_t *s, int p /* pitch */, const uint8_t *blimit0,
    const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1,
    const uint8_t *limit1, const uint8_t *thresh1) {
  uint8x8_t dblimit0, dlimit0, dthresh0, dblimit1, dlimit1, dthresh1;
  uint8x16_t qblimit, qlimit, qthresh;
  uint8x16_t q3u8, q4u8, q5u8, q6u8, q7u8, q8u8, q9u8, q10u8;
@@ -163,9 +158,8 @@ void vpx_lpf_horizontal_4_dual_neon(uint8_t *s, int p /* pitch */,
  s += p;
  q10u8 = vld1q_u8(s);

    loop_filter_neon_16(qblimit, qlimit, qthresh,
                        q3u8, q4u8, q5u8, q6u8, q7u8, q8u8, q9u8, q10u8,
                        &q5u8, &q6u8, &q7u8, &q8u8);
  loop_filter_neon_16(qblimit, qlimit, qthresh, q3u8, q4u8, q5u8, q6u8, q7u8,
                      q8u8, q9u8, q10u8, &q5u8, &q6u8, &q7u8, &q8u8);

  s -= (p * 5);
  vst1q_u8(s, q5u8);

@@ -12,8 +12,7 @@

#include "./vpx_dsp_rtcd.h"

static INLINE void loop_filter_neon(
        uint8x8_t dblimit,    // flimit
static INLINE void loop_filter_neon(uint8x8_t dblimit,   // flimit
                                    uint8x8_t dlimit,    // limit
                                    uint8x8_t dthresh,   // thresh
                                    uint8x8_t d3u8,      // p3
@@ -66,13 +65,11 @@ static INLINE void loop_filter_neon(

  d19u8 = vdup_n_u8(3);

    d28s8 = vsub_s8(vreinterpret_s8_u8(d7u8),
                    vreinterpret_s8_u8(d6u8));
  d28s8 = vsub_s8(vreinterpret_s8_u8(d7u8), vreinterpret_s8_u8(d6u8));

  d17u8 = vcge_u8(dblimit, d17u8);

    d27s8 = vqsub_s8(vreinterpret_s8_u8(d5u8),
                     vreinterpret_s8_u8(d16u8));
  d27s8 = vqsub_s8(vreinterpret_s8_u8(d5u8), vreinterpret_s8_u8(d16u8));

  d22u8 = vorr_u8(d21u8, d22u8);

@@ -110,12 +107,8 @@ static INLINE void loop_filter_neon(
  return;
}

void vpx_lpf_horizontal_4_neon(
        uint8_t *src,
        int pitch,
        const uint8_t *blimit,
        const uint8_t *limit,
        const uint8_t *thresh) {
void vpx_lpf_horizontal_4_neon(uint8_t *src, int pitch, const uint8_t *blimit,
                               const uint8_t *limit, const uint8_t *thresh) {
  int i;
  uint8_t *s, *psrc;
  uint8x8_t dblimit, dlimit, dthresh;
@@ -145,9 +138,8 @@ void vpx_lpf_horizontal_4_neon(
    s += pitch;
    d18u8 = vld1_u8(s);

        loop_filter_neon(dblimit, dlimit, dthresh,
                         d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8,
                         &d4u8, &d5u8, &d6u8, &d7u8);
    loop_filter_neon(dblimit, dlimit, dthresh, d3u8, d4u8, d5u8, d6u8, d7u8,
                     d16u8, d17u8, d18u8, &d4u8, &d5u8, &d6u8, &d7u8);

    s -= (pitch * 5);
    vst1_u8(s, d4u8);
@@ -161,12 +153,8 @@ void vpx_lpf_horizontal_4_neon(
  return;
}

void vpx_lpf_vertical_4_neon(
        uint8_t *src,
        int pitch,
        const uint8_t *blimit,
        const uint8_t *limit,
        const uint8_t *thresh) {
void vpx_lpf_vertical_4_neon(uint8_t *src, int pitch, const uint8_t *blimit,
                             const uint8_t *limit, const uint8_t *thresh) {
  int i, pitch8;
  uint8_t *s;
  uint8x8_t dblimit, dlimit, dthresh;
@@ -200,14 +188,10 @@ void vpx_lpf_vertical_4_neon(
    s += pitch;
    d18u8 = vld1_u8(s);

        d2tmp0 = vtrn_u32(vreinterpret_u32_u8(d3u8),
                      vreinterpret_u32_u8(d7u8));
        d2tmp1 = vtrn_u32(vreinterpret_u32_u8(d4u8),
                      vreinterpret_u32_u8(d16u8));
        d2tmp2 = vtrn_u32(vreinterpret_u32_u8(d5u8),
                      vreinterpret_u32_u8(d17u8));
        d2tmp3 = vtrn_u32(vreinterpret_u32_u8(d6u8),
                      vreinterpret_u32_u8(d18u8));
    d2tmp0 = vtrn_u32(vreinterpret_u32_u8(d3u8), vreinterpret_u32_u8(d7u8));
    d2tmp1 = vtrn_u32(vreinterpret_u32_u8(d4u8), vreinterpret_u32_u8(d16u8));
    d2tmp2 = vtrn_u32(vreinterpret_u32_u8(d5u8), vreinterpret_u32_u8(d17u8));
    d2tmp3 = vtrn_u32(vreinterpret_u32_u8(d6u8), vreinterpret_u32_u8(d18u8));

    d2tmp4 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[0]),
                      vreinterpret_u16_u32(d2tmp2.val[0]));
@@ -236,9 +220,8 @@ void vpx_lpf_vertical_4_neon(
    d17u8 = d2tmp11.val[0];
    d18u8 = d2tmp11.val[1];

        loop_filter_neon(dblimit, dlimit, dthresh,
                         d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8,
                         &d4u8, &d5u8, &d6u8, &d7u8);
    loop_filter_neon(dblimit, dlimit, dthresh, d3u8, d4u8, d5u8, d6u8, d7u8,
                     d16u8, d17u8, d18u8, &d4u8, &d5u8, &d6u8, &d7u8);

    d4Result.val[0] = d4u8;
    d4Result.val[1] = d5u8;

@@ -12,8 +12,7 @@

#include "./vpx_dsp_rtcd.h"

static INLINE void mbloop_filter_neon(
        uint8x8_t dblimit,   // mblimit
static INLINE void mbloop_filter_neon(uint8x8_t dblimit,   // mblimit
                                      uint8x8_t dlimit,    // limit
                                      uint8x8_t dthresh,   // thresh
                                      uint8x8_t d3u8,      // p2
@@ -64,10 +63,8 @@ static INLINE void mbloop_filter_neon(
  d23u8 = vabd_u8(d5u8, d16u8);
  d24u8 = vqadd_u8(d24u8, d24u8);


  d19u8 = vcge_u8(dlimit, d19u8);


  d25u8 = vmax_u8(d25u8, d26u8);
  d26u8 = vmax_u8(d27u8, d28u8);

@@ -96,8 +93,7 @@ static INLINE void mbloop_filter_neon(

  d23u8 = vorr_u8(d21u8, d23u8);

    q10u16 = vcombine_u16(vreinterpret_u16_u8(d20u8),
                          vreinterpret_u16_u8(d21u8));
  q10u16 = vcombine_u16(vreinterpret_u16_u8(d20u8), vreinterpret_u16_u8(d21u8));

  d30u8 = vshrn_n_u16(q10u16, 4);
  flat = vget_lane_u32(vreinterpret_u32_u8(d30u8), 0);
@@ -263,12 +259,8 @@ static INLINE void mbloop_filter_neon(
  return;
}

void vpx_lpf_horizontal_8_neon(
        uint8_t *src,
        int pitch,
        const uint8_t *blimit,
        const uint8_t *limit,
        const uint8_t *thresh) {
void vpx_lpf_horizontal_8_neon(uint8_t *src, int pitch, const uint8_t *blimit,
                               const uint8_t *limit, const uint8_t *thresh) {
  int i;
  uint8_t *s, *psrc;
  uint8x8_t dblimit, dlimit, dthresh;
@@ -299,9 +291,9 @@ void vpx_lpf_horizontal_8_neon(
    s += pitch;
    d18u8 = vld1_u8(s);

        mbloop_filter_neon(dblimit, dlimit, dthresh,
                           d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8,
                           &d0u8, &d1u8, &d2u8, &d3u8, &d4u8, &d5u8);
    mbloop_filter_neon(dblimit, dlimit, dthresh, d3u8, d4u8, d5u8, d6u8, d7u8,
                       d16u8, d17u8, d18u8, &d0u8, &d1u8, &d2u8, &d3u8, &d4u8,
                       &d5u8);

    s -= (pitch * 6);
    vst1_u8(s, d0u8);
@@ -319,12 +311,8 @@ void vpx_lpf_horizontal_8_neon(
  return;
}

void vpx_lpf_vertical_8_neon(
        uint8_t *src,
        int pitch,
        const uint8_t *blimit,
        const uint8_t *limit,
        const uint8_t *thresh) {
void vpx_lpf_vertical_8_neon(uint8_t *src, int pitch, const uint8_t *blimit,
                             const uint8_t *limit, const uint8_t *thresh) {
  int i;
  uint8_t *s;
  uint8x8_t dblimit, dlimit, dthresh;
@@ -359,14 +347,10 @@ void vpx_lpf_vertical_8_neon(
    s += pitch;
    d18u8 = vld1_u8(s);

        d2tmp0 = vtrn_u32(vreinterpret_u32_u8(d3u8),
                          vreinterpret_u32_u8(d7u8));
        d2tmp1 = vtrn_u32(vreinterpret_u32_u8(d4u8),
                          vreinterpret_u32_u8(d16u8));
        d2tmp2 = vtrn_u32(vreinterpret_u32_u8(d5u8),
                          vreinterpret_u32_u8(d17u8));
        d2tmp3 = vtrn_u32(vreinterpret_u32_u8(d6u8),
                          vreinterpret_u32_u8(d18u8));
    d2tmp0 = vtrn_u32(vreinterpret_u32_u8(d3u8), vreinterpret_u32_u8(d7u8));
    d2tmp1 = vtrn_u32(vreinterpret_u32_u8(d4u8), vreinterpret_u32_u8(d16u8));
    d2tmp2 = vtrn_u32(vreinterpret_u32_u8(d5u8), vreinterpret_u32_u8(d17u8));
    d2tmp3 = vtrn_u32(vreinterpret_u32_u8(d6u8), vreinterpret_u32_u8(d18u8));

    d2tmp4 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[0]),
                      vreinterpret_u16_u32(d2tmp2.val[0]));
@@ -395,9 +379,9 @@ void vpx_lpf_vertical_8_neon(
    d17u8 = d2tmp11.val[0];
    d18u8 = d2tmp11.val[1];

        mbloop_filter_neon(dblimit, dlimit, dthresh,
                           d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8,
                           &d0u8, &d1u8, &d2u8, &d3u8, &d4u8, &d5u8);
    mbloop_filter_neon(dblimit, dlimit, dthresh, d3u8, d4u8, d5u8, d6u8, d7u8,
                       d16u8, d17u8, d18u8, &d0u8, &d1u8, &d2u8, &d3u8, &d4u8,
                       &d5u8);

    d4Result.val[0] = d0u8;
    d4Result.val[1] = d1u8;

@@ -14,42 +14,32 @@
#include "./vpx_config.h"
#include "vpx/vpx_integer.h"

void vpx_lpf_vertical_4_dual_neon(uint8_t *s, int p,
                                  const uint8_t *blimit0,
                                  const uint8_t *limit0,
                                  const uint8_t *thresh0,
                                  const uint8_t *blimit1,
                                  const uint8_t *limit1,
void vpx_lpf_vertical_4_dual_neon(uint8_t *s, int p, const uint8_t *blimit0,
                                  const uint8_t *limit0, const uint8_t *thresh0,
                                  const uint8_t *blimit1, const uint8_t *limit1,
                                  const uint8_t *thresh1) {
  vpx_lpf_vertical_4_neon(s, p, blimit0, limit0, thresh0);
  vpx_lpf_vertical_4_neon(s + 8 * p, p, blimit1, limit1, thresh1);
}

#if HAVE_NEON_ASM
void vpx_lpf_horizontal_8_dual_neon(uint8_t *s, int p /* pitch */,
                                    const uint8_t *blimit0,
                                    const uint8_t *limit0,
                                    const uint8_t *thresh0,
                                    const uint8_t *blimit1,
                                    const uint8_t *limit1,
                                    const uint8_t *thresh1) {
void vpx_lpf_horizontal_8_dual_neon(
    uint8_t *s, int p /* pitch */, const uint8_t *blimit0,
    const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1,
    const uint8_t *limit1, const uint8_t *thresh1) {
  vpx_lpf_horizontal_8_neon(s, p, blimit0, limit0, thresh0);
  vpx_lpf_horizontal_8_neon(s + 8, p, blimit1, limit1, thresh1);
}

void vpx_lpf_vertical_8_dual_neon(uint8_t *s, int p,
                                  const uint8_t *blimit0,
                                  const uint8_t *limit0,
                                  const uint8_t *thresh0,
                                  const uint8_t *blimit1,
                                  const uint8_t *limit1,
void vpx_lpf_vertical_8_dual_neon(uint8_t *s, int p, const uint8_t *blimit0,
                                  const uint8_t *limit0, const uint8_t *thresh0,
                                  const uint8_t *blimit1, const uint8_t *limit1,
                                  const uint8_t *thresh1) {
  vpx_lpf_vertical_8_neon(s, p, blimit0, limit0, thresh0);
  vpx_lpf_vertical_8_neon(s + 8 * p, p, blimit1, limit1, thresh1);
}

void vpx_lpf_vertical_16_dual_neon(uint8_t *s, int p,
                                   const uint8_t *blimit,
void vpx_lpf_vertical_16_dual_neon(uint8_t *s, int p, const uint8_t *blimit,
                                   const uint8_t *limit,
                                   const uint8_t *thresh) {
  vpx_lpf_vertical_16_neon(s, p, blimit, limit, thresh);

@@ -16,10 +16,10 @@

static INLINE unsigned int horizontal_long_add_16x8(const uint16x8_t vec_lo,
                                                    const uint16x8_t vec_hi) {
  const uint32x4_t vec_l_lo = vaddl_u16(vget_low_u16(vec_lo),
                                        vget_high_u16(vec_lo));
  const uint32x4_t vec_l_hi = vaddl_u16(vget_low_u16(vec_hi),
                                        vget_high_u16(vec_hi));
  const uint32x4_t vec_l_lo =
      vaddl_u16(vget_low_u16(vec_lo), vget_high_u16(vec_lo));
  const uint32x4_t vec_l_hi =
      vaddl_u16(vget_low_u16(vec_hi), vget_high_u16(vec_hi));
  const uint32x4_t a = vaddq_u32(vec_l_lo, vec_l_hi);
  const uint64x2_t b = vpaddlq_u32(a);
  const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)),
@@ -33,8 +33,7 @@ static INLINE unsigned int horizontal_long_add_16x8(const uint16x8_t vec_lo,
static void sad_neon_64(const uint8x16_t vec_src_00,
                        const uint8x16_t vec_src_16,
                        const uint8x16_t vec_src_32,
                        const uint8x16_t vec_src_48,
                        const uint8_t *ref,
                        const uint8x16_t vec_src_48, const uint8_t *ref,
                        uint16x8_t *vec_sum_ref_lo,
                        uint16x8_t *vec_sum_ref_hi) {
  const uint8x16_t vec_ref_00 = vld1q_u8(ref);
@@ -63,8 +62,7 @@ static void sad_neon_64(const uint8x16_t vec_src_00,
// Calculate the absolute difference of 32 bytes from vec_src_00, vec_src_16,
// and ref. Accumulate partial sums in vec_sum_ref_lo and vec_sum_ref_hi.
static void sad_neon_32(const uint8x16_t vec_src_00,
                        const uint8x16_t vec_src_16,
                        const uint8_t *ref,
                        const uint8x16_t vec_src_16, const uint8_t *ref,
                        uint16x8_t *vec_sum_ref_lo,
                        uint16x8_t *vec_sum_ref_hi) {
  const uint8x16_t vec_ref_00 = vld1q_u8(ref);
@@ -148,14 +146,14 @@ void vpx_sad32x32x4d_neon(const uint8_t *src, int src_stride,
    const uint8x16_t vec_src_00 = vld1q_u8(src);
    const uint8x16_t vec_src_16 = vld1q_u8(src + 16);

    sad_neon_32(vec_src_00, vec_src_16, ref0,
                &vec_sum_ref0_lo, &vec_sum_ref0_hi);
    sad_neon_32(vec_src_00, vec_src_16, ref1,
                &vec_sum_ref1_lo, &vec_sum_ref1_hi);
    sad_neon_32(vec_src_00, vec_src_16, ref2,
                &vec_sum_ref2_lo, &vec_sum_ref2_hi);
    sad_neon_32(vec_src_00, vec_src_16, ref3,
                &vec_sum_ref3_lo, &vec_sum_ref3_hi);
    sad_neon_32(vec_src_00, vec_src_16, ref0, &vec_sum_ref0_lo,
                &vec_sum_ref0_hi);
    sad_neon_32(vec_src_00, vec_src_16, ref1, &vec_sum_ref1_lo,
                &vec_sum_ref1_hi);
    sad_neon_32(vec_src_00, vec_src_16, ref2, &vec_sum_ref2_lo,
                &vec_sum_ref2_hi);
    sad_neon_32(vec_src_00, vec_src_16, ref3, &vec_sum_ref3_lo,
                &vec_sum_ref3_hi);

    src += src_stride;
    ref0 += ref_stride;
@@ -195,20 +193,20 @@ void vpx_sad16x16x4d_neon(const uint8_t *src, int src_stride,
    const uint8x16_t vec_ref2 = vld1q_u8(ref2);
    const uint8x16_t vec_ref3 = vld1q_u8(ref3);

    vec_sum_ref0_lo = vabal_u8(vec_sum_ref0_lo, vget_low_u8(vec_src),
                               vget_low_u8(vec_ref0));
    vec_sum_ref0_lo =
        vabal_u8(vec_sum_ref0_lo, vget_low_u8(vec_src), vget_low_u8(vec_ref0));
    vec_sum_ref0_hi = vabal_u8(vec_sum_ref0_hi, vget_high_u8(vec_src),
                               vget_high_u8(vec_ref0));
    vec_sum_ref1_lo = vabal_u8(vec_sum_ref1_lo, vget_low_u8(vec_src),
                               vget_low_u8(vec_ref1));
    vec_sum_ref1_lo =
        vabal_u8(vec_sum_ref1_lo, vget_low_u8(vec_src), vget_low_u8(vec_ref1));
    vec_sum_ref1_hi = vabal_u8(vec_sum_ref1_hi, vget_high_u8(vec_src),
                               vget_high_u8(vec_ref1));
    vec_sum_ref2_lo = vabal_u8(vec_sum_ref2_lo, vget_low_u8(vec_src),
                               vget_low_u8(vec_ref2));
    vec_sum_ref2_lo =
        vabal_u8(vec_sum_ref2_lo, vget_low_u8(vec_src), vget_low_u8(vec_ref2));
    vec_sum_ref2_hi = vabal_u8(vec_sum_ref2_hi, vget_high_u8(vec_src),
                               vget_high_u8(vec_ref2));
    vec_sum_ref3_lo = vabal_u8(vec_sum_ref3_lo, vget_low_u8(vec_src),
                               vget_low_u8(vec_ref3));
    vec_sum_ref3_lo =
        vabal_u8(vec_sum_ref3_lo, vget_low_u8(vec_src), vget_low_u8(vec_ref3));
    vec_sum_ref3_hi = vabal_u8(vec_sum_ref3_hi, vget_high_u8(vec_src),
                               vget_high_u8(vec_ref3));


@@ -14,11 +14,8 @@

#include "vpx/vpx_integer.h"

unsigned int vpx_sad8x16_neon(
        unsigned char *src_ptr,
        int src_stride,
        unsigned char *ref_ptr,
        int ref_stride) {
unsigned int vpx_sad8x16_neon(unsigned char *src_ptr, int src_stride,
                              unsigned char *ref_ptr, int ref_stride) {
  uint8x8_t d0, d8;
  uint16x8_t q12;
  uint32x4_t q1;
@@ -48,11 +45,8 @@ unsigned int vpx_sad8x16_neon(
  return vget_lane_u32(d5, 0);
}

unsigned int vpx_sad4x4_neon(
        unsigned char *src_ptr,
        int src_stride,
        unsigned char *ref_ptr,
        int ref_stride) {
unsigned int vpx_sad4x4_neon(unsigned char *src_ptr, int src_stride,
                             unsigned char *ref_ptr, int ref_stride) {
  uint8x8_t d0, d8;
  uint16x8_t q12;
  uint32x2_t d1;
@@ -79,11 +73,8 @@ unsigned int vpx_sad4x4_neon(
  return vget_lane_u32(vreinterpret_u32_u64(d3), 0);
}

unsigned int vpx_sad16x8_neon(
        unsigned char *src_ptr,
        int src_stride,
        unsigned char *ref_ptr,
        int ref_stride) {
unsigned int vpx_sad16x8_neon(unsigned char *src_ptr, int src_stride,
                              unsigned char *ref_ptr, int ref_stride) {
  uint8x16_t q0, q4;
  uint16x8_t q12, q13;
  uint32x4_t q1;
@@ -118,10 +109,10 @@ unsigned int vpx_sad16x8_neon(

static INLINE unsigned int horizontal_long_add_16x8(const uint16x8_t vec_lo,
                                                    const uint16x8_t vec_hi) {
  const uint32x4_t vec_l_lo = vaddl_u16(vget_low_u16(vec_lo),
                                        vget_high_u16(vec_lo));
  const uint32x4_t vec_l_hi = vaddl_u16(vget_low_u16(vec_hi),
                                        vget_high_u16(vec_hi));
  const uint32x4_t vec_l_lo =
      vaddl_u16(vget_low_u16(vec_lo), vget_high_u16(vec_lo));
  const uint32x4_t vec_l_hi =
      vaddl_u16(vget_low_u16(vec_hi), vget_high_u16(vec_hi));
  const uint32x4_t a = vaddq_u32(vec_l_lo, vec_l_hi);
  const uint64x2_t b = vpaddlq_u32(a);
  const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)),
@@ -208,10 +199,10 @@ unsigned int vpx_sad16x16_neon(const uint8_t *src, int src_stride,
    const uint8x16_t vec_ref = vld1q_u8(ref);
    src += src_stride;
    ref += ref_stride;
    vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src),
                            vget_low_u8(vec_ref));
    vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src),
                            vget_high_u8(vec_ref));
    vec_accum_lo =
        vabal_u8(vec_accum_lo, vget_low_u8(vec_src), vget_low_u8(vec_ref));
    vec_accum_hi =
        vabal_u8(vec_accum_hi, vget_high_u8(vec_src), vget_high_u8(vec_ref));
  }
  return horizontal_add_16x8(vaddq_u16(vec_accum_lo, vec_accum_hi));
}

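horizontal_long_add_16x8() above widens the uint16 lanes before summing so the
total cannot overflow; a scalar sketch of the same reduction (the helper name
is illustrative, not from the source):

#include <stdint.h>

// Scalar model of the NEON reduction: vaddl_u16 widens each half to 32 bits,
// vpaddlq_u32/vadd_u32 then fold the lanes down to a single total.
static uint32_t horizontal_long_add_16x8_scalar(const uint16_t lo[8],
                                                const uint16_t hi[8]) {
  uint32_t total = 0;
  int i;
  for (i = 0; i < 8; ++i) total += (uint32_t)lo[i] + (uint32_t)hi[i];
  return total;
}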
@@ -14,38 +14,22 @@
#include "vpx_ports/mem.h"

#if HAVE_MEDIA
static const int16_t bilinear_filters_media[8][2] = {
  { 128,   0 },
  { 112,  16 },
  {  96,  32 },
  {  80,  48 },
  {  64,  64 },
  {  48,  80 },
  {  32,  96 },
  {  16, 112 }
};
static const int16_t bilinear_filters_media[8][2] = { { 128, 0 }, { 112, 16 },
                                                      { 96, 32 }, { 80, 48 },
                                                      { 64, 64 }, { 48, 80 },
                                                      { 32, 96 }, { 16, 112 } };

extern void vpx_filter_block2d_bil_first_pass_media(const uint8_t *src_ptr,
                                                    uint16_t *dst_ptr,
                                                    uint32_t src_pitch,
                                                    uint32_t height,
                                                    uint32_t width,
                                                    const int16_t *filter);
extern void vpx_filter_block2d_bil_first_pass_media(
    const uint8_t *src_ptr, uint16_t *dst_ptr, uint32_t src_pitch,
    uint32_t height, uint32_t width, const int16_t *filter);

extern void vpx_filter_block2d_bil_second_pass_media(const uint16_t *src_ptr,
                                                     uint8_t *dst_ptr,
                                                     int32_t src_pitch,
                                                     uint32_t height,
                                                     uint32_t width,
                                                     const int16_t *filter);
extern void vpx_filter_block2d_bil_second_pass_media(
    const uint16_t *src_ptr, uint8_t *dst_ptr, int32_t src_pitch,
    uint32_t height, uint32_t width, const int16_t *filter);


unsigned int vpx_sub_pixel_variance8x8_media(const uint8_t *src_ptr,
                                             int src_pixels_per_line,
                                             int xoffset, int yoffset,
                                             const uint8_t *dst_ptr,
                                             int dst_pixels_per_line,
                                             unsigned int *sse) {
unsigned int vpx_sub_pixel_variance8x8_media(
    const uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset,
    const uint8_t *dst_ptr, int dst_pixels_per_line, unsigned int *sse) {
  uint16_t first_pass[10 * 8];
  uint8_t second_pass[8 * 8];
  const int16_t *HFilter, *VFilter;
@@ -54,51 +38,42 @@ unsigned int vpx_sub_pixel_variance8x8_media(const uint8_t *src_ptr,
  VFilter = bilinear_filters_media[yoffset];

  vpx_filter_block2d_bil_first_pass_media(src_ptr, first_pass,
                                          src_pixels_per_line,
                                          9, 8, HFilter);
  vpx_filter_block2d_bil_second_pass_media(first_pass, second_pass,
                                           8, 8, 8, VFilter);
                                          src_pixels_per_line, 9, 8, HFilter);
  vpx_filter_block2d_bil_second_pass_media(first_pass, second_pass, 8, 8, 8,
                                           VFilter);

  return vpx_variance8x8_media(second_pass, 8, dst_ptr,
                               dst_pixels_per_line, sse);
  return vpx_variance8x8_media(second_pass, 8, dst_ptr, dst_pixels_per_line,
                               sse);
}

unsigned int vpx_sub_pixel_variance16x16_media(const uint8_t *src_ptr,
                                               int src_pixels_per_line,
                                               int xoffset,
                                               int yoffset,
                                               const uint8_t *dst_ptr,
                                               int dst_pixels_per_line,
                                               unsigned int *sse) {
unsigned int vpx_sub_pixel_variance16x16_media(
    const uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset,
    const uint8_t *dst_ptr, int dst_pixels_per_line, unsigned int *sse) {
  uint16_t first_pass[36 * 16];
  uint8_t second_pass[20 * 16];
  const int16_t *HFilter, *VFilter;
  unsigned int var;

  if (xoffset == 4 && yoffset == 0) {
    var = vpx_variance_halfpixvar16x16_h_media(src_ptr, src_pixels_per_line,
                                               dst_ptr, dst_pixels_per_line,
                                               sse);
    var = vpx_variance_halfpixvar16x16_h_media(
        src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, sse);
  } else if (xoffset == 0 && yoffset == 4) {
    var = vpx_variance_halfpixvar16x16_v_media(src_ptr, src_pixels_per_line,
                                               dst_ptr, dst_pixels_per_line,
                                               sse);
    var = vpx_variance_halfpixvar16x16_v_media(
        src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, sse);
  } else if (xoffset == 4 && yoffset == 4) {
    var = vpx_variance_halfpixvar16x16_hv_media(src_ptr, src_pixels_per_line,
                                                dst_ptr, dst_pixels_per_line,
                                                sse);
    var = vpx_variance_halfpixvar16x16_hv_media(
        src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, sse);
  } else {
    HFilter = bilinear_filters_media[xoffset];
    VFilter = bilinear_filters_media[yoffset];

    vpx_filter_block2d_bil_first_pass_media(src_ptr, first_pass,
                                            src_pixels_per_line,
                                            17, 16, HFilter);
    vpx_filter_block2d_bil_second_pass_media(first_pass, second_pass,
                                             16, 16, 16, VFilter);
    vpx_filter_block2d_bil_first_pass_media(
        src_ptr, first_pass, src_pixels_per_line, 17, 16, HFilter);
    vpx_filter_block2d_bil_second_pass_media(first_pass, second_pass, 16, 16,
                                             16, VFilter);

    var = vpx_variance16x16_media(second_pass, 16, dst_ptr,
                                  dst_pixels_per_line, sse);
    var = vpx_variance16x16_media(second_pass, 16, dst_ptr, dst_pixels_per_line,
                                  sse);
  }
  return var;
}

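Each bilinear_filters_media row is a pair of taps summing to 128, selected by
the 1/8-pel offset. A scalar sketch of the first-pass filtering those taps
drive follows; this is an assumption about what the assembly routine does (the
standard libvpx bilinear step with a rounding constant of 64 and a shift of 7),
not a transcription of it.

#include <stdint.h>

// Hypothetical scalar model: out = (s[0]*f[0] + s[1]*f[1] + 64) >> 7, with
// f[0] + f[1] == 128.  Assumes the caller provides width + 1 valid pixels per
// row, as the two-tap horizontal filter reads one pixel past each output.
static void bil_first_pass_sketch(const uint8_t *src, uint16_t *dst,
                                  uint32_t src_pitch, uint32_t height,
                                  uint32_t width, const int16_t *filter) {
  uint32_t r, c;
  for (r = 0; r < height; ++r) {
    for (c = 0; c < width; ++c)
      dst[c] = (uint16_t)((src[c] * filter[0] + src[c + 1] * filter[1] + 64) >> 7);
    src += src_pitch;
    dst += width;
  }
}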
@@ -18,14 +18,8 @@
#include "vpx_dsp/variance.h"

static const uint8_t bilinear_filters[8][2] = {
  { 128,   0, },
  { 112,  16, },
  {  96,  32, },
  {  80,  48, },
  {  64,  64, },
  {  48,  80, },
  {  32,  96, },
  {  16, 112, },
  { 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 },
  { 64, 64 }, { 48, 80 },  { 32, 96 }, { 16, 112 },
};

static void var_filter_block2d_bil_w8(const uint8_t *src_ptr,
@@ -79,74 +73,61 @@ static void var_filter_block2d_bil_w16(const uint8_t *src_ptr,
  }
}

unsigned int vpx_sub_pixel_variance8x8_neon(const uint8_t *src,
                                            int src_stride,
                                            int xoffset,
                                            int yoffset,
                                            const uint8_t *dst,
                                            int dst_stride,
unsigned int vpx_sub_pixel_variance8x8_neon(const uint8_t *src, int src_stride,
                                            int xoffset, int yoffset,
                                            const uint8_t *dst, int dst_stride,
                                            unsigned int *sse) {
  DECLARE_ALIGNED(16, uint8_t, temp2[8 * 8]);
  DECLARE_ALIGNED(16, uint8_t, fdata3[9 * 8]);

  var_filter_block2d_bil_w8(src, fdata3, src_stride, 1,
                            9, 8,
  var_filter_block2d_bil_w8(src, fdata3, src_stride, 1, 9, 8,
                            bilinear_filters[xoffset]);
  var_filter_block2d_bil_w8(fdata3, temp2, 8, 8, 8,
                            8, bilinear_filters[yoffset]);
  var_filter_block2d_bil_w8(fdata3, temp2, 8, 8, 8, 8,
                            bilinear_filters[yoffset]);
  return vpx_variance8x8_neon(temp2, 8, dst, dst_stride, sse);
}

unsigned int vpx_sub_pixel_variance16x16_neon(const uint8_t *src,
                                              int src_stride,
                                              int xoffset,
                                              int yoffset,
                                              const uint8_t *dst,
                                              int src_stride, int xoffset,
                                              int yoffset, const uint8_t *dst,
                                              int dst_stride,
                                              unsigned int *sse) {
  DECLARE_ALIGNED(16, uint8_t, temp2[16 * 16]);
  DECLARE_ALIGNED(16, uint8_t, fdata3[17 * 16]);

  var_filter_block2d_bil_w16(src, fdata3, src_stride, 1,
                             17, 16,
  var_filter_block2d_bil_w16(src, fdata3, src_stride, 1, 17, 16,
                             bilinear_filters[xoffset]);
  var_filter_block2d_bil_w16(fdata3, temp2, 16, 16, 16,
                             16, bilinear_filters[yoffset]);
  var_filter_block2d_bil_w16(fdata3, temp2, 16, 16, 16, 16,
                             bilinear_filters[yoffset]);
  return vpx_variance16x16_neon(temp2, 16, dst, dst_stride, sse);
}

unsigned int vpx_sub_pixel_variance32x32_neon(const uint8_t *src,
                                              int src_stride,
                                              int xoffset,
                                              int yoffset,
                                              const uint8_t *dst,
                                              int src_stride, int xoffset,
                                              int yoffset, const uint8_t *dst,
                                              int dst_stride,
                                              unsigned int *sse) {
  DECLARE_ALIGNED(16, uint8_t, temp2[32 * 32]);
  DECLARE_ALIGNED(16, uint8_t, fdata3[33 * 32]);

  var_filter_block2d_bil_w16(src, fdata3, src_stride, 1,
                             33, 32,
  var_filter_block2d_bil_w16(src, fdata3, src_stride, 1, 33, 32,
                             bilinear_filters[xoffset]);
  var_filter_block2d_bil_w16(fdata3, temp2, 32, 32, 32,
                             32, bilinear_filters[yoffset]);
  var_filter_block2d_bil_w16(fdata3, temp2, 32, 32, 32, 32,
                             bilinear_filters[yoffset]);
  return vpx_variance32x32_neon(temp2, 32, dst, dst_stride, sse);
}

unsigned int vpx_sub_pixel_variance64x64_neon(const uint8_t *src,
                                              int src_stride,
                                              int xoffset,
                                              int yoffset,
                                              const uint8_t *dst,
                                              int src_stride, int xoffset,
                                              int yoffset, const uint8_t *dst,
                                              int dst_stride,
                                              unsigned int *sse) {
  DECLARE_ALIGNED(16, uint8_t, temp2[64 * 64]);
  DECLARE_ALIGNED(16, uint8_t, fdata3[65 * 64]);

  var_filter_block2d_bil_w16(src, fdata3, src_stride, 1,
                             65, 64,
  var_filter_block2d_bil_w16(src, fdata3, src_stride, 1, 65, 64,
                             bilinear_filters[xoffset]);
  var_filter_block2d_bil_w16(fdata3, temp2, 64, 64, 64,
                             64, bilinear_filters[yoffset]);
  var_filter_block2d_bil_w16(fdata3, temp2, 64, 64, 64, 64,
                             bilinear_filters[yoffset]);
  return vpx_variance64x64_neon(temp2, 64, dst, dst_stride, sse);
}

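All four NEON wrappers above share one shape: an (h + 1)-row horizontal pass
into fdata3, an h-row vertical pass into temp2, then a plain variance call.
The buffer sizes fall straight out of that; a worked check for the 8x8 case
(constant names are illustrative):

// The vertical bilinear filter consumes one extra source row, so the first
// pass must produce (8 + 1) rows of 8 pixels.
enum {
  kSubpelW = 8,
  kSubpelH = 8,
  kFirstPassElems = (kSubpelH + 1) * kSubpelW,  // 72, matches fdata3[9 * 8]
  kSecondPassElems = kSubpelH * kSubpelW        // 64, matches temp2[8 * 8]
};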
@@ -13,10 +13,10 @@
#include "./vpx_config.h"
#include "vpx/vpx_integer.h"

void vpx_subtract_block_neon(int rows, int cols,
                             int16_t *diff, ptrdiff_t diff_stride,
                             const uint8_t *src, ptrdiff_t src_stride,
                             const uint8_t *pred, ptrdiff_t pred_stride) {
void vpx_subtract_block_neon(int rows, int cols, int16_t *diff,
                             ptrdiff_t diff_stride, const uint8_t *src,
                             ptrdiff_t src_stride, const uint8_t *pred,
                             ptrdiff_t pred_stride) {
  int r, c;

  if (cols > 16) {
@@ -26,14 +26,14 @@ void vpx_subtract_block_neon(int rows, int cols,
        const uint8x16_t v_src_16 = vld1q_u8(&src[c + 16]);
        const uint8x16_t v_pred_00 = vld1q_u8(&pred[c + 0]);
        const uint8x16_t v_pred_16 = vld1q_u8(&pred[c + 16]);
        const uint16x8_t v_diff_lo_00 = vsubl_u8(vget_low_u8(v_src_00),
                                                 vget_low_u8(v_pred_00));
        const uint16x8_t v_diff_hi_00 = vsubl_u8(vget_high_u8(v_src_00),
                                                 vget_high_u8(v_pred_00));
        const uint16x8_t v_diff_lo_16 = vsubl_u8(vget_low_u8(v_src_16),
                                                 vget_low_u8(v_pred_16));
        const uint16x8_t v_diff_hi_16 = vsubl_u8(vget_high_u8(v_src_16),
                                                 vget_high_u8(v_pred_16));
        const uint16x8_t v_diff_lo_00 =
            vsubl_u8(vget_low_u8(v_src_00), vget_low_u8(v_pred_00));
        const uint16x8_t v_diff_hi_00 =
            vsubl_u8(vget_high_u8(v_src_00), vget_high_u8(v_pred_00));
        const uint16x8_t v_diff_lo_16 =
            vsubl_u8(vget_low_u8(v_src_16), vget_low_u8(v_pred_16));
        const uint16x8_t v_diff_hi_16 =
            vsubl_u8(vget_high_u8(v_src_16), vget_high_u8(v_pred_16));
        vst1q_s16(&diff[c + 0], vreinterpretq_s16_u16(v_diff_lo_00));
        vst1q_s16(&diff[c + 8], vreinterpretq_s16_u16(v_diff_hi_00));
        vst1q_s16(&diff[c + 16], vreinterpretq_s16_u16(v_diff_lo_16));
@@ -47,10 +47,10 @@ void vpx_subtract_block_neon(int rows, int cols,
    for (r = 0; r < rows; ++r) {
      const uint8x16_t v_src = vld1q_u8(&src[0]);
      const uint8x16_t v_pred = vld1q_u8(&pred[0]);
      const uint16x8_t v_diff_lo = vsubl_u8(vget_low_u8(v_src),
                                            vget_low_u8(v_pred));
      const uint16x8_t v_diff_hi = vsubl_u8(vget_high_u8(v_src),
                                            vget_high_u8(v_pred));
      const uint16x8_t v_diff_lo =
          vsubl_u8(vget_low_u8(v_src), vget_low_u8(v_pred));
      const uint16x8_t v_diff_hi =
          vsubl_u8(vget_high_u8(v_src), vget_high_u8(v_pred));
      vst1q_s16(&diff[0], vreinterpretq_s16_u16(v_diff_lo));
      vst1q_s16(&diff[8], vreinterpretq_s16_u16(v_diff_hi));
      diff += diff_stride;
@@ -69,8 +69,7 @@ void vpx_subtract_block_neon(int rows, int cols,
    }
  } else {
    for (r = 0; r < rows; ++r) {
      for (c = 0; c < cols; ++c)
        diff[c] = src[c] - pred[c];
      for (c = 0; c < cols; ++c) diff[c] = src[c] - pred[c];

      diff += diff_stride;
      pred += pred_stride;

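vsubl_u8 widens before subtracting, so reinterpreting the result as int16_t
recovers the signed pixel difference. A one-lane scalar model (function name
illustrative):

#include <stdint.h>

// The unsigned subtraction wraps mod 2^16 when s < p, e.g. 0 - 1 = 0xFFFF,
// which reads back as -1 once reinterpreted, exactly the int16 diff wanted.
static int16_t subtract_lane(uint8_t s, uint8_t p) {
  uint16_t wide = (uint16_t)s - (uint16_t)p;
  return (int16_t)wide;
}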
@@ -32,9 +32,9 @@ static INLINE int horizontal_add_s32x4(const int32x4_t v_32x4) {
}

// w * h must be less than 2048 or local variable v_sum may overflow.
static void variance_neon_w8(const uint8_t *a, int a_stride,
                             const uint8_t *b, int b_stride,
                             int w, int h, uint32_t *sse, int *sum) {
static void variance_neon_w8(const uint8_t *a, int a_stride, const uint8_t *b,
                             int b_stride, int w, int h, uint32_t *sse,
                             int *sum) {
  int i, j;
  int16x8_t v_sum = vdupq_n_s16(0);
  int32x4_t v_sse_lo = vdupq_n_s32(0);
@@ -47,12 +47,10 @@ static void variance_neon_w8(const uint8_t *a, int a_stride,
      const uint16x8_t v_diff = vsubl_u8(v_a, v_b);
      const int16x8_t sv_diff = vreinterpretq_s16_u16(v_diff);
      v_sum = vaddq_s16(v_sum, sv_diff);
      v_sse_lo = vmlal_s16(v_sse_lo,
                           vget_low_s16(sv_diff),
                           vget_low_s16(sv_diff));
      v_sse_hi = vmlal_s16(v_sse_hi,
                           vget_high_s16(sv_diff),
                           vget_high_s16(sv_diff));
      v_sse_lo =
          vmlal_s16(v_sse_lo, vget_low_s16(sv_diff), vget_low_s16(sv_diff));
      v_sse_hi =
          vmlal_s16(v_sse_hi, vget_high_s16(sv_diff), vget_high_s16(sv_diff));
    }
    a += a_stride;
    b += b_stride;
@@ -62,15 +60,13 @@ static void variance_neon_w8(const uint8_t *a, int a_stride,
  *sse = (unsigned int)horizontal_add_s32x4(vaddq_s32(v_sse_lo, v_sse_hi));
}

void vpx_get8x8var_neon(const uint8_t *a, int a_stride,
                        const uint8_t *b, int b_stride,
                        unsigned int *sse, int *sum) {
void vpx_get8x8var_neon(const uint8_t *a, int a_stride, const uint8_t *b,
                        int b_stride, unsigned int *sse, int *sum) {
  variance_neon_w8(a, a_stride, b, b_stride, 8, 8, sse, sum);
}

void vpx_get16x16var_neon(const uint8_t *a, int a_stride,
                          const uint8_t *b, int b_stride,
                          unsigned int *sse, int *sum) {
void vpx_get16x16var_neon(const uint8_t *a, int a_stride, const uint8_t *b,
                          int b_stride, unsigned int *sse, int *sum) {
  variance_neon_w8(a, a_stride, b, b_stride, 16, 16, sse, sum);
}

@@ -104,9 +100,8 @@ unsigned int vpx_variance32x64_neon(const uint8_t *a, int a_stride,
  int sum1, sum2;
  uint32_t sse1, sse2;
  variance_neon_w8(a, a_stride, b, b_stride, 32, 32, &sse1, &sum1);
  variance_neon_w8(a + (32 * a_stride), a_stride,
                   b + (32 * b_stride), b_stride, 32, 32,
                   &sse2, &sum2);
  variance_neon_w8(a + (32 * a_stride), a_stride, b + (32 * b_stride), b_stride,
                   32, 32, &sse2, &sum2);
  *sse = sse1 + sse2;
  sum1 += sum2;
  return *sse - (((int64_t)sum1 * sum1) >> 11);  // >> 11 = / 32 * 64
@@ -118,9 +113,8 @@ unsigned int vpx_variance64x32_neon(const uint8_t *a, int a_stride,
  int sum1, sum2;
  uint32_t sse1, sse2;
  variance_neon_w8(a, a_stride, b, b_stride, 64, 16, &sse1, &sum1);
  variance_neon_w8(a + (16 * a_stride), a_stride,
                   b + (16 * b_stride), b_stride, 64, 16,
                   &sse2, &sum2);
  variance_neon_w8(a + (16 * a_stride), a_stride, b + (16 * b_stride), b_stride,
                   64, 16, &sse2, &sum2);
  *sse = sse1 + sse2;
  sum1 += sum2;
  return *sse - (((int64_t)sum1 * sum1) >> 11);  // >> 11 = / 32 * 64
@@ -133,32 +127,27 @@ unsigned int vpx_variance64x64_neon(const uint8_t *a, int a_stride,
  uint32_t sse1, sse2;

  variance_neon_w8(a, a_stride, b, b_stride, 64, 16, &sse1, &sum1);
  variance_neon_w8(a + (16 * a_stride), a_stride,
                   b + (16 * b_stride), b_stride, 64, 16,
                   &sse2, &sum2);
  sse1 += sse2;
  sum1 += sum2;

  variance_neon_w8(a + (16 * 2 * a_stride), a_stride,
                   b + (16 * 2 * b_stride), b_stride,
  variance_neon_w8(a + (16 * a_stride), a_stride, b + (16 * b_stride), b_stride,
                   64, 16, &sse2, &sum2);
  sse1 += sse2;
  sum1 += sum2;

  variance_neon_w8(a + (16 * 3 * a_stride), a_stride,
                   b + (16 * 3 * b_stride), b_stride,
                   64, 16, &sse2, &sum2);
  variance_neon_w8(a + (16 * 2 * a_stride), a_stride, b + (16 * 2 * b_stride),
                   b_stride, 64, 16, &sse2, &sum2);
  sse1 += sse2;
  sum1 += sum2;

  variance_neon_w8(a + (16 * 3 * a_stride), a_stride, b + (16 * 3 * b_stride),
                   b_stride, 64, 16, &sse2, &sum2);
  *sse = sse1 + sse2;
  sum1 += sum2;
  return *sse - (((int64_t)sum1 * sum1) >> 12);  // >> 12 = / 64 * 64
}

unsigned int vpx_variance16x8_neon(
        const unsigned char *src_ptr,
unsigned int vpx_variance16x8_neon(const unsigned char *src_ptr,
                                   int source_stride,
                                   const unsigned char *ref_ptr,
        int recon_stride,
        unsigned int *sse) {
                                   int recon_stride, unsigned int *sse) {
  int i;
  int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
  uint32x2_t d0u32, d10u32;
@@ -222,8 +211,7 @@ unsigned int vpx_variance16x8_neon(
  d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64));
  d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));

    q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
                      vreinterpret_s32_s64(d0s64));
  q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64), vreinterpret_s32_s64(d0s64));
  vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);

  d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 7);
@@ -232,12 +220,10 @@ unsigned int vpx_variance16x8_neon(
  return vget_lane_u32(d0u32, 0);
}

unsigned int vpx_variance8x16_neon(
        const unsigned char *src_ptr,
unsigned int vpx_variance8x16_neon(const unsigned char *src_ptr,
                                   int source_stride,
                                   const unsigned char *ref_ptr,
        int recon_stride,
        unsigned int *sse) {
                                   int recon_stride, unsigned int *sse) {
  int i;
  uint8x8_t d0u8, d2u8, d4u8, d6u8;
  int16x4_t d22s16, d23s16, d24s16, d25s16;
@@ -287,8 +273,7 @@ unsigned int vpx_variance8x16_neon(
  d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64));
  d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));

    q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
                      vreinterpret_s32_s64(d0s64));
  q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64), vreinterpret_s32_s64(d0s64));
  vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);

  d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 7);
@@ -297,11 +282,8 @@ unsigned int vpx_variance8x16_neon(
  return vget_lane_u32(d0u32, 0);
}

unsigned int vpx_mse16x16_neon(
        const unsigned char *src_ptr,
        int source_stride,
        const unsigned char *ref_ptr,
        int recon_stride,
unsigned int vpx_mse16x16_neon(const unsigned char *src_ptr, int source_stride,
                               const unsigned char *ref_ptr, int recon_stride,
                               unsigned int *sse) {
  int i;
  int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
@@ -363,8 +345,7 @@ unsigned int vpx_mse16x16_neon(
  return vget_lane_u32(vreinterpret_u32_s64(d0s64), 0);
}

unsigned int vpx_get4x4sse_cs_neon(
        const unsigned char *src_ptr,
unsigned int vpx_get4x4sse_cs_neon(const unsigned char *src_ptr,
                                   int source_stride,
                                   const unsigned char *ref_ptr,
                                   int recon_stride) {

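Every variance kernel above ends with the same identity,
variance = sse - sum^2 / (w * h), with the division done as a shift because
w * h is a power of two. A scalar sketch of that final step (helper name
illustrative):

#include <stdint.h>

// shift is log2(w * h): 11 for the 32x64/64x32 returns above, 12 for 64x64.
// Widening to int64_t before squaring mirrors the (int64_t)sum1 * sum1 casts
// in the return statements.
static uint32_t variance_from_sums(uint32_t sse, int sum, int shift) {
  return sse - (uint32_t)(((int64_t)sum * sum) >> shift);
}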
@@ -16,15 +16,10 @@
#include "vpx/vpx_integer.h"
#include "vpx_ports/mem.h"

static INLINE int32x4_t MULTIPLY_BY_Q0(
    int16x4_t dsrc0,
    int16x4_t dsrc1,
    int16x4_t dsrc2,
    int16x4_t dsrc3,
    int16x4_t dsrc4,
    int16x4_t dsrc5,
    int16x4_t dsrc6,
    int16x4_t dsrc7,
static INLINE int32x4_t MULTIPLY_BY_Q0(int16x4_t dsrc0, int16x4_t dsrc1,
                                       int16x4_t dsrc2, int16x4_t dsrc3,
                                       int16x4_t dsrc4, int16x4_t dsrc5,
                                       int16x4_t dsrc6, int16x4_t dsrc7,
                                       int16x8_t q0s16) {
  int32x4_t qdst;
  int16x4_t d0s16, d1s16;
@@ -43,17 +38,12 @@ static INLINE int32x4_t MULTIPLY_BY_Q0(
  return qdst;
}

void vpx_convolve8_avg_horiz_neon(
    const uint8_t *src,
    ptrdiff_t src_stride,
    uint8_t *dst,
    ptrdiff_t dst_stride,
    const int16_t *filter_x,
    int x_step_q4,
void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
                                  uint8_t *dst, ptrdiff_t dst_stride,
                                  const int16_t *filter_x, int x_step_q4,
                                  const int16_t *filter_y,  // unused
                                  int y_step_q4,            // unused
    int w,
    int h) {
                                  int w, int h) {
  int width;
  const uint8_t *s;
  uint8_t *d;
@@ -90,8 +80,8 @@ void vpx_convolve8_avg_horiz_neon(
    q12u8 = vcombine_u8(d24u8, d25u8);
    q13u8 = vcombine_u8(d26u8, d27u8);

    q0x2u16 = vtrnq_u16(vreinterpretq_u16_u8(q12u8),
                        vreinterpretq_u16_u8(q13u8));
    q0x2u16 =
        vtrnq_u16(vreinterpretq_u16_u8(q12u8), vreinterpretq_u16_u8(q13u8));
    d24u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[0]));
    d25u8 = vreinterpret_u8_u16(vget_high_u16(q0x2u16.val[0]));
    d26u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[1]));
@@ -117,9 +107,7 @@ void vpx_convolve8_avg_horiz_neon(

    d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16));
    d23s16 = vreinterpret_s16_u16(vget_high_u16(q10u16));         // vmov 23 21
    for (width = w;
         width > 0;
         width -= 4, src += 4, dst += 4) {  // loop_horiz
    for (width = w; width > 0; width -= 4, src += 4, dst += 4) {  // loop_horiz
      s = src;
      d28u32 = vld1_dup_u32((const uint32_t *)s);
      s += src_stride;
@@ -131,10 +119,10 @@ void vpx_convolve8_avg_horiz_neon(

      __builtin_prefetch(src + 64);

      d0x2u16 = vtrn_u16(vreinterpret_u16_u32(d28u32),
                         vreinterpret_u16_u32(d31u32));
      d1x2u16 = vtrn_u16(vreinterpret_u16_u32(d29u32),
                         vreinterpret_u16_u32(d30u32));
      d0x2u16 =
          vtrn_u16(vreinterpret_u16_u32(d28u32), vreinterpret_u16_u32(d31u32));
      d1x2u16 =
          vtrn_u16(vreinterpret_u16_u32(d29u32), vreinterpret_u16_u32(d30u32));
      d0x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[0]),   // d28
                       vreinterpret_u8_u16(d1x2u16.val[0]));  // d29
      d1x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[1]),   // d31
@@ -144,8 +132,8 @@ void vpx_convolve8_avg_horiz_neon(

      q14u8 = vcombine_u8(d0x2u8.val[0], d0x2u8.val[1]);
      q15u8 = vcombine_u8(d1x2u8.val[1], d1x2u8.val[0]);
      q0x2u32 = vtrnq_u32(vreinterpretq_u32_u8(q14u8),
                          vreinterpretq_u32_u8(q15u8));
      q0x2u32 =
          vtrnq_u32(vreinterpretq_u32_u8(q14u8), vreinterpretq_u32_u8(q15u8));

      d28u8 = vreinterpret_u8_u32(vget_low_u32(q0x2u32.val[0]));
      d29u8 = vreinterpret_u8_u32(vget_high_u32(q0x2u32.val[0]));
@@ -173,14 +161,14 @@ void vpx_convolve8_avg_horiz_neon(
      d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
      d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));

      q1s32  = MULTIPLY_BY_Q0(d16s16, d17s16, d20s16, d22s16,
                              d18s16, d19s16, d23s16, d24s16, q0s16);
      q2s32  = MULTIPLY_BY_Q0(d17s16, d20s16, d22s16, d18s16,
                              d19s16, d23s16, d24s16, d26s16, q0s16);
      q14s32 = MULTIPLY_BY_Q0(d20s16, d22s16, d18s16, d19s16,
                              d23s16, d24s16, d26s16, d27s16, q0s16);
      q15s32 = MULTIPLY_BY_Q0(d22s16, d18s16, d19s16, d23s16,
                              d24s16, d26s16, d27s16, d25s16, q0s16);
      q1s32 = MULTIPLY_BY_Q0(d16s16, d17s16, d20s16, d22s16, d18s16, d19s16,
                             d23s16, d24s16, q0s16);
      q2s32 = MULTIPLY_BY_Q0(d17s16, d20s16, d22s16, d18s16, d19s16, d23s16,
                             d24s16, d26s16, q0s16);
      q14s32 = MULTIPLY_BY_Q0(d20s16, d22s16, d18s16, d19s16, d23s16, d24s16,
                              d26s16, d27s16, q0s16);
      q15s32 = MULTIPLY_BY_Q0(d22s16, d18s16, d19s16, d23s16, d24s16, d26s16,
                              d27s16, d25s16, q0s16);

      __builtin_prefetch(src + 64 + src_stride * 3);

@@ -195,8 +183,7 @@ void vpx_convolve8_avg_horiz_neon(
      d2u8 = vqmovn_u16(q1u16);
      d3u8 = vqmovn_u16(q2u16);

      d0x2u16 = vtrn_u16(vreinterpret_u16_u8(d2u8),
                         vreinterpret_u16_u8(d3u8));
      d0x2u16 = vtrn_u16(vreinterpret_u16_u8(d2u8), vreinterpret_u16_u8(d3u8));
      d0x2u32 = vtrn_u32(vreinterpret_u32_u16(d0x2u16.val[0]),
                         vreinterpret_u32_u16(d0x2u16.val[1]));
      d0x2u8 = vtrn_u8(vreinterpret_u8_u32(d0x2u32.val[0]),
@@ -231,16 +218,11 @@ void vpx_convolve8_avg_horiz_neon(
  return;
}

void vpx_convolve8_avg_vert_neon(
    const uint8_t *src,
    ptrdiff_t src_stride,
    uint8_t *dst,
    ptrdiff_t dst_stride,
void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
                                 uint8_t *dst, ptrdiff_t dst_stride,
                                 const int16_t *filter_x,  // unused
                                 int x_step_q4,            // unused
    const int16_t *filter_y,
    int y_step_q4,
    int w,
                                 const int16_t *filter_y, int y_step_q4, int w,
                                 int h) {
  int height;
  const uint8_t *s;
@@ -319,20 +301,20 @@ void vpx_convolve8_avg_vert_neon(

      __builtin_prefetch(s);
      __builtin_prefetch(s + src_stride);
      q1s32  = MULTIPLY_BY_Q0(d16s16, d17s16, d18s16, d19s16,
                              d20s16, d21s16, d22s16, d24s16, q0s16);
      q1s32 = MULTIPLY_BY_Q0(d16s16, d17s16, d18s16, d19s16, d20s16, d21s16,
                             d22s16, d24s16, q0s16);
      __builtin_prefetch(s + src_stride * 2);
      __builtin_prefetch(s + src_stride * 3);
      q2s32  = MULTIPLY_BY_Q0(d17s16, d18s16, d19s16, d20s16,
                              d21s16, d22s16, d24s16, d26s16, q0s16);
      q2s32 = MULTIPLY_BY_Q0(d17s16, d18s16, d19s16, d20s16, d21s16, d22s16,
                             d24s16, d26s16, q0s16);
      __builtin_prefetch(d);
      __builtin_prefetch(d + dst_stride);
      q14s32 = MULTIPLY_BY_Q0(d18s16, d19s16, d20s16, d21s16,
                              d22s16, d24s16, d26s16, d27s16, q0s16);
      q14s32 = MULTIPLY_BY_Q0(d18s16, d19s16, d20s16, d21s16, d22s16, d24s16,
                              d26s16, d27s16, q0s16);
      __builtin_prefetch(d + dst_stride * 2);
      __builtin_prefetch(d + dst_stride * 3);
      q15s32 = MULTIPLY_BY_Q0(d19s16, d20s16, d21s16, d22s16,
                              d24s16, d26s16, d27s16, d25s16, q0s16);
      q15s32 = MULTIPLY_BY_Q0(d19s16, d20s16, d21s16, d22s16, d24s16, d26s16,
                              d27s16, d25s16, q0s16);

      d2u16 = vqrshrun_n_s32(q1s32, 7);
      d3u16 = vqrshrun_n_s32(q2s32, 7);

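MULTIPLY_BY_Q0 evaluates four outputs of the 8-tap filter per call; per output
lane the arithmetic reduces to a dot product. A scalar sketch of one lane
(function name illustrative; q0s16 holds the eight taps loaded from filter_x
or filter_y):

#include <stdint.h>

// One output sample of the 8-tap convolution, accumulated in 32 bits; the
// vqrshrun_n_s32(..., 7) calls later round, shift and saturate this back to
// pixel range.
static int32_t convolve8_dot(const int16_t src[8], const int16_t taps[8]) {
  int32_t acc = 0;
  int k;
  for (k = 0; k < 8; ++k) acc += (int32_t)src[k] * taps[k];
  return acc;
}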
@@ -16,15 +16,10 @@
#include "vpx/vpx_integer.h"
#include "vpx_ports/mem.h"

static INLINE int32x4_t MULTIPLY_BY_Q0(
    int16x4_t dsrc0,
    int16x4_t dsrc1,
    int16x4_t dsrc2,
    int16x4_t dsrc3,
    int16x4_t dsrc4,
    int16x4_t dsrc5,
    int16x4_t dsrc6,
    int16x4_t dsrc7,
static INLINE int32x4_t MULTIPLY_BY_Q0(int16x4_t dsrc0, int16x4_t dsrc1,
                                       int16x4_t dsrc2, int16x4_t dsrc3,
                                       int16x4_t dsrc4, int16x4_t dsrc5,
                                       int16x4_t dsrc6, int16x4_t dsrc7,
                                       int16x8_t q0s16) {
  int32x4_t qdst;
  int16x4_t d0s16, d1s16;
@@ -43,17 +38,12 @@ static INLINE int32x4_t MULTIPLY_BY_Q0(
  return qdst;
}

void vpx_convolve8_horiz_neon(
    const uint8_t *src,
    ptrdiff_t src_stride,
    uint8_t *dst,
    ptrdiff_t dst_stride,
    const int16_t *filter_x,
    int x_step_q4,
void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
                              uint8_t *dst, ptrdiff_t dst_stride,
                              const int16_t *filter_x, int x_step_q4,
                              const int16_t *filter_y,  // unused
                              int y_step_q4,            // unused
    int w,
    int h) {
                              int w, int h) {
  int width;
  const uint8_t *s, *psrc;
  uint8_t *d, *pdst;
@@ -77,8 +67,7 @@ void vpx_convolve8_horiz_neon(
  q0s16 = vld1q_s16(filter_x);

  src -= 3;  // adjust for taps
  for (; h > 0; h -= 4,
    src += src_stride * 4,
  for (; h > 0; h -= 4, src += src_stride * 4,
                dst += dst_stride * 4) {  // loop_horiz_v
    s = src;
    d24u8 = vld1_u8(s);
@@ -92,8 +81,8 @@ void vpx_convolve8_horiz_neon(
    q12u8 = vcombine_u8(d24u8, d25u8);
    q13u8 = vcombine_u8(d26u8, d27u8);

    q0x2u16 = vtrnq_u16(vreinterpretq_u16_u8(q12u8),
                        vreinterpretq_u16_u8(q13u8));
    q0x2u16 =
        vtrnq_u16(vreinterpretq_u16_u8(q12u8), vreinterpretq_u16_u8(q13u8));
    d24u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[0]));
    d25u8 = vreinterpret_u8_u16(vget_high_u16(q0x2u16.val[0]));
    d26u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[1]));
@@ -119,8 +108,7 @@ void vpx_convolve8_horiz_neon(

    d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16));
    d23s16 = vreinterpret_s16_u16(vget_high_u16(q10u16));  // vmov 23 21
    for (width = w, psrc = src + 7, pdst = dst;
         width > 0;
    for (width = w, psrc = src + 7, pdst = dst; width > 0;
         width -= 4, psrc += 4, pdst += 4) {  // loop_horiz
      s = psrc;
      d28u32 = vld1_dup_u32((const uint32_t *)s);
@@ -133,10 +121,10 @@ void vpx_convolve8_horiz_neon(

      __builtin_prefetch(psrc + 64);

      d0x2u16 = vtrn_u16(vreinterpret_u16_u32(d28u32),
                         vreinterpret_u16_u32(d31u32));
      d1x2u16 = vtrn_u16(vreinterpret_u16_u32(d29u32),
                         vreinterpret_u16_u32(d30u32));
      d0x2u16 =
          vtrn_u16(vreinterpret_u16_u32(d28u32), vreinterpret_u16_u32(d31u32));
      d1x2u16 =
          vtrn_u16(vreinterpret_u16_u32(d29u32), vreinterpret_u16_u32(d30u32));
      d0x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[0]),   // d28
                       vreinterpret_u8_u16(d1x2u16.val[0]));  // d29
      d1x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[1]),   // d31
@@ -146,8 +134,8 @@ void vpx_convolve8_horiz_neon(

      q14u8 = vcombine_u8(d0x2u8.val[0], d0x2u8.val[1]);
      q15u8 = vcombine_u8(d1x2u8.val[1], d1x2u8.val[0]);
      q0x2u32 = vtrnq_u32(vreinterpretq_u32_u8(q14u8),
                          vreinterpretq_u32_u8(q15u8));
      q0x2u32 =
          vtrnq_u32(vreinterpretq_u32_u8(q14u8), vreinterpretq_u32_u8(q15u8));

      d28u8 = vreinterpret_u8_u32(vget_low_u32(q0x2u32.val[0]));
      d29u8 = vreinterpret_u8_u32(vget_high_u32(q0x2u32.val[0]));
@@ -166,14 +154,14 @@ void vpx_convolve8_horiz_neon(
      d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
      d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));

      q1s32  = MULTIPLY_BY_Q0(d16s16, d17s16, d20s16, d22s16,
                              d18s16, d19s16, d23s16, d24s16, q0s16);
      q2s32  = MULTIPLY_BY_Q0(d17s16, d20s16, d22s16, d18s16,
                              d19s16, d23s16, d24s16, d26s16, q0s16);
      q14s32 = MULTIPLY_BY_Q0(d20s16, d22s16, d18s16, d19s16,
                              d23s16, d24s16, d26s16, d27s16, q0s16);
      q15s32 = MULTIPLY_BY_Q0(d22s16, d18s16, d19s16, d23s16,
                              d24s16, d26s16, d27s16, d25s16, q0s16);
      q1s32 = MULTIPLY_BY_Q0(d16s16, d17s16, d20s16, d22s16, d18s16, d19s16,
                             d23s16, d24s16, q0s16);
      q2s32 = MULTIPLY_BY_Q0(d17s16, d20s16, d22s16, d18s16, d19s16, d23s16,
                             d24s16, d26s16, q0s16);
      q14s32 = MULTIPLY_BY_Q0(d20s16, d22s16, d18s16, d19s16, d23s16, d24s16,
                              d26s16, d27s16, q0s16);
      q15s32 = MULTIPLY_BY_Q0(d22s16, d18s16, d19s16, d23s16, d24s16, d26s16,
                              d27s16, d25s16, q0s16);

      __builtin_prefetch(psrc + 60 + src_stride * 3);

@@ -188,8 +176,7 @@ void vpx_convolve8_horiz_neon(
      d2u8 = vqmovn_u16(q1u16);
      d3u8 = vqmovn_u16(q2u16);

      d0x2u16 = vtrn_u16(vreinterpret_u16_u8(d2u8),
                         vreinterpret_u16_u8(d3u8));
      d0x2u16 = vtrn_u16(vreinterpret_u16_u8(d2u8), vreinterpret_u16_u8(d3u8));
      d0x2u32 = vtrn_u32(vreinterpret_u32_u16(d0x2u16.val[0]),
                         vreinterpret_u32_u16(d0x2u16.val[1]));
      d0x2u8 = vtrn_u8(vreinterpret_u8_u32(d0x2u32.val[0]),
@@ -217,16 +204,11 @@ void vpx_convolve8_horiz_neon(
  return;
}

void vpx_convolve8_vert_neon(
    const uint8_t *src,
    ptrdiff_t src_stride,
    uint8_t *dst,
    ptrdiff_t dst_stride,
void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
                             uint8_t *dst, ptrdiff_t dst_stride,
                             const int16_t *filter_x,  // unused
                             int x_step_q4,            // unused
    const int16_t *filter_y,
    int y_step_q4,
    int w,
                             const int16_t *filter_y, int y_step_q4, int w,
                             int h) {
  int height;
  const uint8_t *s;
@@ -294,20 +276,20 @@ void vpx_convolve8_vert_neon(

      __builtin_prefetch(d);
      __builtin_prefetch(d + dst_stride);
      q1s32  = MULTIPLY_BY_Q0(d16s16, d17s16, d18s16, d19s16,
                              d20s16, d21s16, d22s16, d24s16, q0s16);
      q1s32 = MULTIPLY_BY_Q0(d16s16, d17s16, d18s16, d19s16, d20s16, d21s16,
                             d22s16, d24s16, q0s16);
      __builtin_prefetch(d + dst_stride * 2);
      __builtin_prefetch(d + dst_stride * 3);
      q2s32  = MULTIPLY_BY_Q0(d17s16, d18s16, d19s16, d20s16,
                              d21s16, d22s16, d24s16, d26s16, q0s16);
      q2s32 = MULTIPLY_BY_Q0(d17s16, d18s16, d19s16, d20s16, d21s16, d22s16,
                             d24s16, d26s16, q0s16);
      __builtin_prefetch(s);
      __builtin_prefetch(s + src_stride);
      q14s32 = MULTIPLY_BY_Q0(d18s16, d19s16, d20s16, d21s16,
                              d22s16, d24s16, d26s16, d27s16, q0s16);
      q14s32 = MULTIPLY_BY_Q0(d18s16, d19s16, d20s16, d21s16, d22s16, d24s16,
                              d26s16, d27s16, q0s16);
      __builtin_prefetch(s + src_stride * 2);
      __builtin_prefetch(s + src_stride * 3);
      q15s32 = MULTIPLY_BY_Q0(d19s16, d20s16, d21s16, d22s16,
                              d24s16, d26s16, d27s16, d25s16, q0s16);
      q15s32 = MULTIPLY_BY_Q0(d19s16, d20s16, d21s16, d22s16, d24s16, d26s16,
                              d27s16, d25s16, q0s16);

      d2u16 = vqrshrun_n_s32(q1s32, 7);
      d3u16 = vqrshrun_n_s32(q2s32, 7);

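The vqrshrun_n_s32(q, 7) calls closing both loops above convert the 32-bit
filter sums back toward pixel range: add the rounding constant, shift by the
filter precision, saturate to unsigned. A scalar model of one lane (name
illustrative):

#include <stdint.h>

// Equivalent of vqrshrun_n_s32(x, 7): round with 64 == 1 << (7 - 1), shift,
// then clamp into uint16_t; vqmovn_u16 afterwards narrows to the 8-bit pixel.
static uint16_t round_shift_saturate(int32_t x) {
  int32_t v = (x + 64) >> 7;
  if (v < 0) return 0;
  if (v > 65535) return 65535;
  return (uint16_t)v;
}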
@@ -13,23 +13,21 @@
#include "./vpx_dsp_rtcd.h"
#include "vpx/vpx_integer.h"

void vpx_convolve_avg_neon(
    const uint8_t *src,    // r0
void vpx_convolve_avg_neon(const uint8_t *src,    // r0
                           ptrdiff_t src_stride,  // r1
                           uint8_t *dst,          // r2
                           ptrdiff_t dst_stride,  // r3
    const int16_t *filter_x,
    int filter_x_stride,
    const int16_t *filter_y,
    int filter_y_stride,
    int w,
                           const int16_t *filter_x, int filter_x_stride,
                           const int16_t *filter_y, int filter_y_stride, int w,
                           int h) {
  uint8_t *d;
  uint8x8_t d0u8, d1u8, d2u8, d3u8;
  uint32x2_t d0u32, d2u32;
  uint8x16_t q0u8, q1u8, q2u8, q3u8, q8u8, q9u8, q10u8, q11u8;
  (void)filter_x;  (void)filter_x_stride;
  (void)filter_y;  (void)filter_y_stride;
  (void)filter_x;
  (void)filter_x_stride;
  (void)filter_y;
  (void)filter_y_stride;

  d = dst;
  if (w > 32) {  // avg64
@@ -133,8 +131,7 @@ void vpx_convolve_avg_neon(
      d2u32 = vld1_lane_u32((const uint32_t *)d, d2u32, 1);
      d += dst_stride;

      d0u8 = vrhadd_u8(vreinterpret_u8_u32(d0u32),
                       vreinterpret_u8_u32(d2u32));
      d0u8 = vrhadd_u8(vreinterpret_u8_u32(d0u32), vreinterpret_u8_u32(d2u32));

      d0u32 = vreinterpret_u32_u8(d0u8);
      vst1_lane_u32((uint32_t *)dst, d0u32, 0);

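The averaging step above relies on vrhadd_u8, a rounding halving add. A
one-lane scalar model (name illustrative):

#include <stdint.h>

// Equivalent of one vrhadd_u8 lane: (a + b + 1) >> 1, computed in 16 bits so
// the +1 carry cannot overflow.
static uint8_t rounding_average(uint8_t a, uint8_t b) {
  return (uint8_t)(((uint16_t)a + b + 1) >> 1);
}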
@@ -13,21 +13,19 @@
#include "./vpx_dsp_rtcd.h"
#include "vpx/vpx_integer.h"

void vpx_convolve_copy_neon(
    const uint8_t *src,    // r0
void vpx_convolve_copy_neon(const uint8_t *src,    // r0
                            ptrdiff_t src_stride,  // r1
                            uint8_t *dst,          // r2
                            ptrdiff_t dst_stride,  // r3
    const int16_t *filter_x,
    int filter_x_stride,
    const int16_t *filter_y,
    int filter_y_stride,
    int w,
                            const int16_t *filter_x, int filter_x_stride,
                            const int16_t *filter_y, int filter_y_stride, int w,
                            int h) {
  uint8x8_t d0u8, d2u8;
  uint8x16_t q0u8, q1u8, q2u8, q3u8;
  (void)filter_x;  (void)filter_x_stride;
  (void)filter_y;  (void)filter_y_stride;
  (void)filter_x;
  (void)filter_x_stride;
  (void)filter_y;
  (void)filter_y_stride;

  if (w > 32) {  // copy64
    for (; h > 0; h--) {

@@ -14,10 +14,9 @@
#include "vpx_dsp/vpx_dsp_common.h"
#include "vpx_ports/mem.h"

void vpx_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride,
                        uint8_t *dst, ptrdiff_t dst_stride,
                        const int16_t *filter_x, int x_step_q4,
                        const int16_t *filter_y, int y_step_q4,
void vpx_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
                        ptrdiff_t dst_stride, const int16_t *filter_x,
                        int x_step_q4, const int16_t *filter_y, int y_step_q4,
                        int w, int h) {
  /* Given our constraints: w <= 64, h <= 64, taps == 8 we can reduce the
   * maximum buffer size to 64 * 64 + 7 (+ 1 to make it divisible by 4).
@@ -35,23 +34,20 @@ void vpx_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride,
   * the temp buffer which has lots of extra room and is subsequently discarded
   * this is safe if somewhat less than ideal.
   */
  vpx_convolve8_horiz_neon(src - src_stride * 3, src_stride,
                           temp, 64,
                           filter_x, x_step_q4, filter_y, y_step_q4,
                           w, intermediate_height);
  vpx_convolve8_horiz_neon(src - src_stride * 3, src_stride, temp, 64, filter_x,
                           x_step_q4, filter_y, y_step_q4, w,
                           intermediate_height);

  /* Step into the temp buffer 3 lines to get the actual frame data */
  vpx_convolve8_vert_neon(temp + 64 * 3, 64,
                          dst, dst_stride,
                          filter_x, x_step_q4, filter_y, y_step_q4,
                          w, h);
  vpx_convolve8_vert_neon(temp + 64 * 3, 64, dst, dst_stride, filter_x,
                          x_step_q4, filter_y, y_step_q4, w, h);
}

void vpx_convolve8_avg_neon(const uint8_t *src, ptrdiff_t src_stride,
                            uint8_t *dst, ptrdiff_t dst_stride,
                            const int16_t *filter_x, int x_step_q4,
                            const int16_t *filter_y, int y_step_q4,
                            int w, int h) {
                            const int16_t *filter_y, int y_step_q4, int w,
                            int h) {
  DECLARE_ALIGNED(8, uint8_t, temp[64 * 72]);
  int intermediate_height = h + 7;

@@ -61,12 +57,9 @@ void vpx_convolve8_avg_neon(const uint8_t *src, ptrdiff_t src_stride,
  /* This implementation has the same issues as above. In addition, we only want
   * to average the values after both passes.
   */
  vpx_convolve8_horiz_neon(src - src_stride * 3, src_stride,
                           temp, 64,
                           filter_x, x_step_q4, filter_y, y_step_q4,
                           w, intermediate_height);
  vpx_convolve8_avg_vert_neon(temp + 64 * 3,
                              64, dst, dst_stride,
                              filter_x, x_step_q4, filter_y, y_step_q4,
                              w, h);
  vpx_convolve8_horiz_neon(src - src_stride * 3, src_stride, temp, 64, filter_x,
                           x_step_q4, filter_y, y_step_q4, w,
                           intermediate_height);
  vpx_convolve8_avg_vert_neon(temp + 64 * 3, 64, dst, dst_stride, filter_x,
                              x_step_q4, filter_y, y_step_q4, w, h);
}

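For reference, the two-pass pattern in the functions above can be read as one unit: the horizontal pass filters h + 7 rows (seven extra rows of context for the 8-tap vertical filter) into a 64-wide temp buffer, and the vertical pass then starts three rows into that buffer. A minimal sketch under those assumptions (w, h <= 64, matching the DECLARE_ALIGNED in the code above):

/* Sketch of the two-pass 8-tap convolution; not part of the commit. */
DECLARE_ALIGNED(8, uint8_t, temp[64 * 72]);  /* 64 cols x 72 rows of scratch */
const int intermediate_height = h + 7;       /* 7 extra rows feed the 8 taps */
/* Pass 1: horizontal filter, starting 3 rows above the block so the
 * vertical taps have context; rows are packed at a fixed stride of 64. */
vpx_convolve8_horiz_neon(src - src_stride * 3, src_stride, temp, 64, filter_x,
                         x_step_q4, filter_y, y_step_q4, w, intermediate_height);
/* Pass 2: vertical filter, skipping the 3 context rows at the top. */
vpx_convolve8_vert_neon(temp + 64 * 3, 64, dst, dst_stride, filter_x,
                        x_step_q4, filter_y, y_step_q4, w, h);
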
@@ -16,7 +16,8 @@ unsigned int vpx_avg_8x8_c(const uint8_t *s, int p) {
  int i, j;
  int sum = 0;
  for (i = 0; i < 8; ++i, s += p)
    for (j = 0; j < 8; sum += s[j], ++j) {}
    for (j = 0; j < 8; sum += s[j], ++j) {
    }

  return (sum + 32) >> 6;
}
@@ -25,7 +26,8 @@ unsigned int vpx_avg_4x4_c(const uint8_t *s, int p) {
  int i, j;
  int sum = 0;
  for (i = 0; i < 4; ++i, s += p)
    for (j = 0; j < 4; sum += s[j], ++j) {}
    for (j = 0; j < 4; sum += s[j], ++j) {
    }

  return (sum + 8) >> 4;
}
@@ -92,8 +94,8 @@ void vpx_hadamard_16x16_c(const int16_t *src_diff, int src_stride,
  int idx;
  for (idx = 0; idx < 4; ++idx) {
    // src_diff: 9 bit, dynamic range [-255, 255]
    const int16_t *src_ptr = src_diff + (idx >> 1) * 8 * src_stride
                                + (idx & 0x01) * 8;
    const int16_t *src_ptr =
        src_diff + (idx >> 1) * 8 * src_stride + (idx & 0x01) * 8;
    vpx_hadamard_8x8_c(src_ptr, src_stride, coeff + idx * 64);
  }

@@ -123,8 +125,7 @@ void vpx_hadamard_16x16_c(const int16_t *src_diff, int src_stride,
int vpx_satd_c(const int16_t *coeff, int length) {
  int i;
  int satd = 0;
  for (i = 0; i < length; ++i)
    satd += abs(coeff[i]);
  for (i = 0; i < length; ++i) satd += abs(coeff[i]);

  // satd: 26 bits, dynamic range [-32640 * 1024, 32640 * 1024]
  return satd;
@@ -140,8 +141,7 @@ void vpx_int_pro_row_c(int16_t hbuf[16], const uint8_t *ref,
    int i;
    hbuf[idx] = 0;
    // hbuf[idx]: 14 bit, dynamic range [0, 16320].
    for (i = 0; i < height; ++i)
      hbuf[idx] += ref[i * ref_stride];
    for (i = 0; i < height; ++i) hbuf[idx] += ref[i * ref_stride];
    // hbuf[idx]: 9 bit, dynamic range [0, 510].
    hbuf[idx] /= norm_factor;
    ++ref;
@@ -153,16 +153,14 @@ int16_t vpx_int_pro_col_c(const uint8_t *ref, const int width) {
  int idx;
  int16_t sum = 0;
  // sum: 14 bit, dynamic range [0, 16320]
  for (idx = 0; idx < width; ++idx)
    sum += ref[idx];
  for (idx = 0; idx < width; ++idx) sum += ref[idx];
  return sum;
}

// ref: [0 - 510]
// src: [0 - 510]
// bwl: {2, 3, 4}
int vpx_vector_var_c(const int16_t *ref, const int16_t *src,
                     const int bwl) {
int vpx_vector_var_c(const int16_t *ref, const int16_t *src, const int bwl) {
  int i;
  int width = 4 << bwl;
  int sse = 0, mean = 0, var;
@@ -198,7 +196,8 @@ unsigned int vpx_highbd_avg_8x8_c(const uint8_t *s8, int p) {
  int sum = 0;
  const uint16_t *s = CONVERT_TO_SHORTPTR(s8);
  for (i = 0; i < 8; ++i, s += p)
    for (j = 0; j < 8; sum += s[j], ++j) {}
    for (j = 0; j < 8; sum += s[j], ++j) {
    }

  return (sum + 32) >> 6;
}
@@ -208,7 +207,8 @@ unsigned int vpx_highbd_avg_4x4_c(const uint8_t *s8, int p) {
  int sum = 0;
  const uint16_t *s = CONVERT_TO_SHORTPTR(s8);
  for (i = 0; i < 4; ++i, s += p)
    for (j = 0; j < 4; sum += s[j], ++j) {
    }

  return (sum + 8) >> 4;
}
@@ -229,5 +229,3 @@ void vpx_highbd_minmax_8x8_c(const uint8_t *s8, int p, const uint8_t *d8,
  }
}
#endif  // CONFIG_VP9_HIGHBITDEPTH


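One detail worth spelling out from the averaging functions above: the shift-with-bias pattern is a rounded integer mean. An 8x8 block has 64 pixels, so (sum + 32) >> 6 is sum / 64 rounded to nearest, and the 4x4 case uses (sum + 8) >> 4 for sum / 16. A quick sanity check with made-up values:

/* Illustrative only: 64 pixels of 255 and 16 pixels of 7. */
assert(((64 * 255 + 32) >> 6) == 255);  /* (16320 + 32) >> 6 == 255 */
assert(((16 * 7 + 8) >> 4) == 7);       /* (112 + 8) >> 4 == 7 */
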
@@ -18,11 +18,8 @@
#include "vpx_mem/vpx_mem.h"
#include "vpx_util/endian_inl.h"

int vpx_reader_init(vpx_reader *r,
                    const uint8_t *buffer,
                    size_t size,
                    vpx_decrypt_cb decrypt_cb,
                    void *decrypt_state) {
int vpx_reader_init(vpx_reader *r, const uint8_t *buffer, size_t size,
                    vpx_decrypt_cb decrypt_cb, void *decrypt_state) {
  if (size && !buffer) {
    return 1;
  } else {

@@ -45,11 +45,8 @@ typedef struct {
  uint8_t clear_buffer[sizeof(BD_VALUE) + 1];
} vpx_reader;

int vpx_reader_init(vpx_reader *r,
                    const uint8_t *buffer,
                    size_t size,
                    vpx_decrypt_cb decrypt_cb,
                    void *decrypt_state);
int vpx_reader_init(vpx_reader *r, const uint8_t *buffer, size_t size,
                    vpx_decrypt_cb decrypt_cb, void *decrypt_state);

void vpx_reader_fill(vpx_reader *r);

@@ -81,8 +78,7 @@ static INLINE int vpx_read(vpx_reader *r, int prob) {
  unsigned int range;
  unsigned int split = (r->range * prob + (256 - prob)) >> CHAR_BIT;

  if (r->count < 0)
    vpx_reader_fill(r);
  if (r->count < 0) vpx_reader_fill(r);

  value = r->value;
  count = r->count;
@@ -117,8 +113,7 @@ static INLINE int vpx_read_bit(vpx_reader *r) {
static INLINE int vpx_read_literal(vpx_reader *r, int bits) {
  int literal = 0, bit;

  for (bit = bits - 1; bit >= 0; bit--)
    literal |= vpx_read_bit(r) << bit;
  for (bit = bits - 1; bit >= 0; bit--) literal |= vpx_read_bit(r) << bit;

  return literal;
}
@@ -127,8 +122,7 @@ static INLINE int vpx_read_tree(vpx_reader *r, const vpx_tree_index *tree,
                                const vpx_prob *probs) {
  vpx_tree_index i = 0;

  while ((i = tree[i + vpx_read(r, probs[i >> 1])]) > 0)
    continue;
  while ((i = tree[i + vpx_read(r, probs[i >> 1])]) > 0) continue;

  return -i;
}

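To make the tree-walking loop in vpx_read_tree() above concrete: positive entries in the vpx_tree_index array are indices of the next node pair, non-positive entries are negated token values, and probs[i >> 1] supplies the probability for the node starting at index i. A hypothetical three-symbol tree (the tree, tokens, and probabilities here are illustrative, not from libvpx) would look like:

/* Hypothetical example of the tree layout vpx_read_tree() expects. */
enum { TOKEN_A = 0, TOKEN_B = 1, TOKEN_C = 2 };
static const vpx_tree_index example_tree[4] = {
  -TOKEN_A, 2,        /* index 0: bit 0 -> token A, bit 1 -> node at index 2 */
  -TOKEN_B, -TOKEN_C  /* index 2: bit 0 -> token B, bit 1 -> token C */
};
static const vpx_prob example_probs[2] = { 128, 128 };
/* const int token = vpx_read_tree(r, example_tree, example_probs); */
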
@@ -30,19 +30,16 @@ int vpx_rb_read_bit(struct vpx_read_bit_buffer *rb) {

int vpx_rb_read_literal(struct vpx_read_bit_buffer *rb, int bits) {
  int value = 0, bit;
  for (bit = bits - 1; bit >= 0; bit--)
    value |= vpx_rb_read_bit(rb) << bit;
  for (bit = bits - 1; bit >= 0; bit--) value |= vpx_rb_read_bit(rb) << bit;
  return value;
}

int vpx_rb_read_signed_literal(struct vpx_read_bit_buffer *rb,
                               int bits) {
int vpx_rb_read_signed_literal(struct vpx_read_bit_buffer *rb, int bits) {
  const int value = vpx_rb_read_literal(rb, bits);
  return vpx_rb_read_bit(rb) ? -value : value;
}

int vpx_rb_read_inv_signed_literal(struct vpx_read_bit_buffer *rb,
                                   int bits) {
int vpx_rb_read_inv_signed_literal(struct vpx_read_bit_buffer *rb, int bits) {
#if CONFIG_MISC_FIXES
  const int nbits = sizeof(unsigned) * 8 - bits - 1;
  const unsigned value = (unsigned)vpx_rb_read_literal(rb, bits + 1) << nbits;

@@ -24,11 +24,8 @@ void vpx_start_encode(vpx_writer *br, uint8_t *source) {
void vpx_stop_encode(vpx_writer *br) {
  int i;

  for (i = 0; i < 32; i++)
    vpx_write_bit(br, 0);
  for (i = 0; i < 32; i++) vpx_write_bit(br, 0);

  // Ensure there's no ambiguous collision with any index marker bytes
  if ((br->buffer[br->pos - 1] & 0xe0) == 0xc0)
    br->buffer[br->pos++] = 0;
  if ((br->buffer[br->pos - 1] & 0xe0) == 0xc0) br->buffer[br->pos++] = 0;
}


@@ -85,8 +85,7 @@ static INLINE void vpx_write_bit(vpx_writer *w, int bit) {
static INLINE void vpx_write_literal(vpx_writer *w, int data, int bits) {
  int bit;

  for (bit = bits - 1; bit >= 0; bit--)
    vpx_write_bit(w, 1 & (data >> bit));
  for (bit = bits - 1; bit >= 0; bit--) vpx_write_bit(w, 1 & (data >> bit));
}

#define vpx_write_prob(w, v) vpx_write_literal((w), (v), 8)

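The literal writer above emits bits most-significant first; for example, vpx_write_literal(w, 5, 3) produces the bit sequence 1, 0, 1:

/* Worked example (illustrative): data = 5 (binary 101), bits = 3.
 *   bit = 2: 1 & (5 >> 2) = 1
 *   bit = 1: 1 & (5 >> 1) = 0
 *   bit = 0: 1 & (5 >> 0) = 1
 */
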
@@ -33,12 +33,11 @@ void vpx_wb_write_bit(struct vpx_write_bit_buffer *wb, int bit) {

void vpx_wb_write_literal(struct vpx_write_bit_buffer *wb, int data, int bits) {
  int bit;
  for (bit = bits - 1; bit >= 0; bit--)
    vpx_wb_write_bit(wb, (data >> bit) & 1);
  for (bit = bits - 1; bit >= 0; bit--) vpx_wb_write_bit(wb, (data >> bit) & 1);
}

void vpx_wb_write_inv_signed_literal(struct vpx_write_bit_buffer *wb,
                                     int data, int bits) {
void vpx_wb_write_inv_signed_literal(struct vpx_write_bit_buffer *wb, int data,
                                     int bits) {
#if CONFIG_MISC_FIXES
  vpx_wb_write_literal(wb, data, bits + 1);
#else

@@ -10,26 +10,32 @@
#include <stdlib.h>
#include "vpx/vpx_integer.h"

const int16_t vpx_rv[] = {8, 5, 2, 2, 8, 12, 4, 9, 8, 3, 0, 3, 9, 0, 0, 0, 8, 3,
    14, 4, 10, 1, 11, 14, 1, 14, 9, 6, 12, 11, 8, 6, 10, 0, 0, 8, 9, 0, 3, 14,
    8, 11, 13, 4, 2, 9, 0, 3, 9, 6, 1, 2, 3, 14, 13, 1, 8, 2, 9, 7, 3, 3, 1, 13,
    13, 6, 6, 5, 2, 7, 11, 9, 11, 8, 7, 3, 2, 0, 13, 13, 14, 4, 12, 5, 12, 10,
    8, 10, 13, 10, 4, 14, 4, 10, 0, 8, 11, 1, 13, 7, 7, 14, 6, 14, 13, 2, 13, 5,
    4, 4, 0, 10, 0, 5, 13, 2, 12, 7, 11, 13, 8, 0, 4, 10, 7, 2, 7, 2, 2, 5, 3,
    4, 7, 3, 3, 14, 14, 5, 9, 13, 3, 14, 3, 6, 3, 0, 11, 8, 13, 1, 13, 1, 12, 0,
    10, 9, 7, 6, 2, 8, 5, 2, 13, 7, 1, 13, 14, 7, 6, 7, 9, 6, 10, 11, 7, 8, 7,
    5, 14, 8, 4, 4, 0, 8, 7, 10, 0, 8, 14, 11, 3, 12, 5, 7, 14, 3, 14, 5, 2, 6,
    11, 12, 12, 8, 0, 11, 13, 1, 2, 0, 5, 10, 14, 7, 8, 0, 4, 11, 0, 8, 0, 3,
    10, 5, 8, 0, 11, 6, 7, 8, 10, 7, 13, 9, 2, 5, 1, 5, 10, 2, 4, 3, 5, 6, 10,
    8, 9, 4, 11, 14, 0, 10, 0, 5, 13, 2, 12, 7, 11, 13, 8, 0, 4, 10, 7, 2, 7, 2,
    2, 5, 3, 4, 7, 3, 3, 14, 14, 5, 9, 13, 3, 14, 3, 6, 3, 0, 11, 8, 13, 1, 13,
    1, 12, 0, 10, 9, 7, 6, 2, 8, 5, 2, 13, 7, 1, 13, 14, 7, 6, 7, 9, 6, 10, 11,
    7, 8, 7, 5, 14, 8, 4, 4, 0, 8, 7, 10, 0, 8, 14, 11, 3, 12, 5, 7, 14, 3, 14,
    5, 2, 6, 11, 12, 12, 8, 0, 11, 13, 1, 2, 0, 5, 10, 14, 7, 8, 0, 4, 11, 0, 8,
    0, 3, 10, 5, 8, 0, 11, 6, 7, 8, 10, 7, 13, 9, 2, 5, 1, 5, 10, 2, 4, 3, 5, 6,
    10, 8, 9, 4, 11, 14, 3, 8, 3, 7, 8, 5, 11, 4, 12, 3, 11, 9, 14, 8, 14, 13,
    4, 3, 1, 2, 14, 6, 5, 4, 4, 11, 4, 6, 2, 1, 5, 8, 8, 12, 13, 5, 14, 10, 12,
    13, 0, 9, 5, 5, 11, 10, 13, 9, 10, 13, };
const int16_t vpx_rv[] = {
  8,  5,  2,  2,  8,  12, 4,  9,  8,  3,  0,  3,  9,  0,  0,  0,  8,  3,  14,
  4,  10, 1,  11, 14, 1,  14, 9,  6,  12, 11, 8,  6,  10, 0,  0,  8,  9,  0,
  3,  14, 8,  11, 13, 4,  2,  9,  0,  3,  9,  6,  1,  2,  3,  14, 13, 1,  8,
  2,  9,  7,  3,  3,  1,  13, 13, 6,  6,  5,  2,  7,  11, 9,  11, 8,  7,  3,
  2,  0,  13, 13, 14, 4,  12, 5,  12, 10, 8,  10, 13, 10, 4,  14, 4,  10, 0,
  8,  11, 1,  13, 7,  7,  14, 6,  14, 13, 2,  13, 5,  4,  4,  0,  10, 0,  5,
  13, 2,  12, 7,  11, 13, 8,  0,  4,  10, 7,  2,  7,  2,  2,  5,  3,  4,  7,
  3,  3,  14, 14, 5,  9,  13, 3,  14, 3,  6,  3,  0,  11, 8,  13, 1,  13, 1,
  12, 0,  10, 9,  7,  6,  2,  8,  5,  2,  13, 7,  1,  13, 14, 7,  6,  7,  9,
  6,  10, 11, 7,  8,  7,  5,  14, 8,  4,  4,  0,  8,  7,  10, 0,  8,  14, 11,
  3,  12, 5,  7,  14, 3,  14, 5,  2,  6,  11, 12, 12, 8,  0,  11, 13, 1,  2,
  0,  5,  10, 14, 7,  8,  0,  4,  11, 0,  8,  0,  3,  10, 5,  8,  0,  11, 6,
  7,  8,  10, 7,  13, 9,  2,  5,  1,  5,  10, 2,  4,  3,  5,  6,  10, 8,  9,
  4,  11, 14, 0,  10, 0,  5,  13, 2,  12, 7,  11, 13, 8,  0,  4,  10, 7,  2,
  7,  2,  2,  5,  3,  4,  7,  3,  3,  14, 14, 5,  9,  13, 3,  14, 3,  6,  3,
  0,  11, 8,  13, 1,  13, 1,  12, 0,  10, 9,  7,  6,  2,  8,  5,  2,  13, 7,
  1,  13, 14, 7,  6,  7,  9,  6,  10, 11, 7,  8,  7,  5,  14, 8,  4,  4,  0,
  8,  7,  10, 0,  8,  14, 11, 3,  12, 5,  7,  14, 3,  14, 5,  2,  6,  11, 12,
  12, 8,  0,  11, 13, 1,  2,  0,  5,  10, 14, 7,  8,  0,  4,  11, 0,  8,  0,
  3,  10, 5,  8,  0,  11, 6,  7,  8,  10, 7,  13, 9,  2,  5,  1,  5,  10, 2,
  4,  3,  5,  6,  10, 8,  9,  4,  11, 14, 3,  8,  3,  7,  8,  5,  11, 4,  12,
  3,  11, 9,  14, 8,  14, 13, 4,  3,  1,  2,  14, 6,  5,  4,  4,  11, 4,  6,
  2,  1,  5,  8,  8,  12, 13, 5,  14, 10, 12, 13, 0,  9,  5,  5,  11, 10, 13,
  9,  10, 13,
};

void vpx_post_proc_down_and_across_mb_row_c(unsigned char *src_ptr,
                                            unsigned char *dst_ptr,
@@ -55,8 +61,8 @@ void vpx_post_proc_down_and_across_mb_row_c(unsigned char *src_ptr,

      v = p_src[col];

      if ((abs(v - p_above2) < f[col]) && (abs(v - p_above1) < f[col])
          && (abs(v - p_below1) < f[col]) && (abs(v - p_below2) < f[col])) {
      if ((abs(v - p_above2) < f[col]) && (abs(v - p_above1) < f[col]) &&
          (abs(v - p_below1) < f[col]) && (abs(v - p_below2) < f[col])) {
        unsigned char k1, k2, k3;
        k1 = (p_above2 + p_above1 + 1) >> 1;
        k2 = (p_below2 + p_below1 + 1) >> 1;
@@ -77,10 +83,10 @@ void vpx_post_proc_down_and_across_mb_row_c(unsigned char *src_ptr,
    for (col = 0; col < cols; col++) {
      v = p_src[col];

      if ((abs(v - p_src[col - 2]) < f[col])
          && (abs(v - p_src[col - 1]) < f[col])
          && (abs(v - p_src[col + 1]) < f[col])
          && (abs(v - p_src[col + 2]) < f[col])) {
      if ((abs(v - p_src[col - 2]) < f[col]) &&
          (abs(v - p_src[col - 1]) < f[col]) &&
          (abs(v - p_src[col + 1]) < f[col]) &&
          (abs(v - p_src[col + 2]) < f[col])) {
        unsigned char k1, k2, k3;
        k1 = (p_src[col - 2] + p_src[col - 1] + 1) >> 1;
        k2 = (p_src[col + 2] + p_src[col + 1] + 1) >> 1;
@@ -90,8 +96,7 @@ void vpx_post_proc_down_and_across_mb_row_c(unsigned char *src_ptr,

      d[col & 3] = v;

      if (col >= 2)
        p_dst[col - 2] = d[(col - 2) & 3];
      if (col >= 2) p_dst[col - 2] = d[(col - 2) & 3];
    }

    /* handle the last two pixels */
@@ -115,14 +120,12 @@ void vpx_mbpost_proc_across_ip_c(unsigned char *src, int pitch, int rows,
    int sumsq = 0;
    int sum = 0;

    for (i = -8; i < 0; i++)
      s[i] = s[0];
    for (i = -8; i < 0; i++) s[i] = s[0];

    /* 17 avoids valgrind warning - we buffer values in c in d
     * and only write them when we've read 8 ahead...
     */
    for (i = 0; i < 17; i++)
      s[i + cols] = s[cols - 1];
    for (i = 0; i < 17; i++) s[i + cols] = s[cols - 1];

    for (i = -8; i <= 6; i++) {
      sumsq += s[i] * s[i];
@@ -162,14 +165,12 @@ void vpx_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int cols,
    unsigned char d[16];
    const int16_t *rv2 = rv3 + ((c * 17) & 127);

    for (i = -8; i < 0; i++)
      s[i * pitch] = s[0];
    for (i = -8; i < 0; i++) s[i * pitch] = s[0];

    /* 17 avoids valgrind warning - we buffer values in c in d
     * and only write them when we've read 8 ahead...
     */
    for (i = 0; i < 17; i++)
      s[(i + rows) * pitch] = s[(rows - 1) * pitch];
    for (i = 0; i < 17; i++) s[(i + rows) * pitch] = s[(rows - 1) * pitch];

    for (i = -8; i <= 6; i++) {
      sumsq += s[i * pitch] * s[i * pitch];
@@ -184,16 +185,14 @@ void vpx_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int cols,
      if (sumsq * 15 - sum * sum < flimit) {
        d[r & 15] = (rv2[r & 127] + sum + s[0]) >> 4;
      }
      if (r >= 8)
        s[-8 * pitch] = d[(r - 8) & 15];
      if (r >= 8) s[-8 * pitch] = d[(r - 8) & 15];
      s += pitch;
    }
  }
}

#if CONFIG_POSTPROC
static void vpx_de_mblock(YV12_BUFFER_CONFIG *post,
    int q) {
static void vpx_de_mblock(YV12_BUFFER_CONFIG *post, int q) {
  vpx_mbpost_proc_across_ip(post->y_buffer, post->y_stride, post->y_height,
                            post->y_width, q2mbl(q));
  vpx_mbpost_proc_down(post->y_buffer, post->y_stride, post->y_height,

@@ -55,8 +55,8 @@ static void fs_ctx_init(fs_ctx *_ctx, int _w, int _h, int _nlevels) {
  int l;
  lw = (_w + 1) >> 1;
  lh = (_h + 1) >> 1;
  data_size = _nlevels * sizeof(fs_level)
      + 2 * (lw + 8) * 8 * sizeof(*_ctx->col_buf);
  data_size =
      _nlevels * sizeof(fs_level) + 2 * (lw + 8) * 8 * sizeof(*_ctx->col_buf);
  for (l = 0; l < _nlevels; l++) {
    size_t im_size;
    size_t level_size;
@@ -97,9 +97,7 @@ static void fs_ctx_init(fs_ctx *_ctx, int _w, int _h, int _nlevels) {
  _ctx->col_buf = (unsigned *)data;
}

static void fs_ctx_clear(fs_ctx *_ctx) {
  free(_ctx->level);
}
static void fs_ctx_clear(fs_ctx *_ctx) { free(_ctx->level); }

static void fs_downsample_level(fs_ctx *_ctx, int _l) {
  const uint32_t *src1;
@@ -130,18 +128,18 @@ static void fs_downsample_level(fs_ctx *_ctx, int _l) {
      int i1;
      i0 = 2 * i;
      i1 = FS_MINI(i0 + 1, w2);
      dst1[j * w + i] = src1[j0offs + i0] + src1[j0offs + i1]
          + src1[j1offs + i0] + src1[j1offs + i1];
      dst2[j * w + i] = src2[j0offs + i0] + src2[j0offs + i1]
          + src2[j1offs + i0] + src2[j1offs + i1];
      dst1[j * w + i] = src1[j0offs + i0] + src1[j0offs + i1] +
                        src1[j1offs + i0] + src1[j1offs + i1];
      dst2[j * w + i] = src2[j0offs + i0] + src2[j0offs + i1] +
                        src2[j1offs + i0] + src2[j1offs + i1];
    }
  }
}

static void fs_downsample_level0(fs_ctx *_ctx, const uint8_t *_src1,
                                 int _s1ystride, const uint8_t *_src2,
                                 int _s2ystride, int _w, int _h,
                                 uint32_t bd, uint32_t shift) {
                                 int _s2ystride, int _w, int _h, uint32_t bd,
                                 uint32_t shift) {
  uint32_t *dst1;
  uint32_t *dst2;
  int w;
@@ -163,23 +161,23 @@ static void fs_downsample_level0(fs_ctx *_ctx, const uint8_t *_src1,
      i0 = 2 * i;
      i1 = FS_MINI(i0 + 1, _w);
      if (bd == 8 && shift == 0) {
        dst1[j * w + i] = _src1[j0 * _s1ystride + i0]
            + _src1[j0 * _s1ystride + i1] + _src1[j1 * _s1ystride + i0]
            + _src1[j1 * _s1ystride + i1];
        dst2[j * w + i] = _src2[j0 * _s2ystride + i0]
            + _src2[j0 * _s2ystride + i1] + _src2[j1 * _s2ystride + i0]
            + _src2[j1 * _s2ystride + i1];
        dst1[j * w + i] =
            _src1[j0 * _s1ystride + i0] + _src1[j0 * _s1ystride + i1] +
            _src1[j1 * _s1ystride + i0] + _src1[j1 * _s1ystride + i1];
        dst2[j * w + i] =
            _src2[j0 * _s2ystride + i0] + _src2[j0 * _s2ystride + i1] +
            _src2[j1 * _s2ystride + i0] + _src2[j1 * _s2ystride + i1];
      } else {
        uint16_t *src1s = CONVERT_TO_SHORTPTR(_src1);
        uint16_t *src2s = CONVERT_TO_SHORTPTR(_src2);
        dst1[j * w + i] = (src1s[j0 * _s1ystride + i0] >> shift)
              + (src1s[j0 * _s1ystride + i1] >> shift)
              + (src1s[j1 * _s1ystride + i0] >> shift)
              + (src1s[j1 * _s1ystride + i1] >> shift);
        dst2[j * w + i] = (src2s[j0 * _s2ystride + i0] >> shift)
              + (src2s[j0 * _s2ystride + i1] >> shift)
              + (src2s[j1 * _s2ystride + i0] >> shift)
              + (src2s[j1 * _s2ystride + i1] >> shift);
        dst1[j * w + i] = (src1s[j0 * _s1ystride + i0] >> shift) +
                          (src1s[j0 * _s1ystride + i1] >> shift) +
                          (src1s[j1 * _s1ystride + i0] >> shift) +
                          (src1s[j1 * _s1ystride + i1] >> shift);
        dst2[j * w + i] = (src2s[j0 * _s2ystride + i0] >> shift) +
                          (src2s[j0 * _s2ystride + i1] >> shift) +
                          (src2s[j1 * _s2ystride + i0] >> shift) +
                          (src2s[j1 * _s2ystride + i1] >> shift);
      }
    }
  }
@@ -200,10 +198,8 @@ static void fs_apply_luminance(fs_ctx *_ctx, int _l, int bit_depth) {
  int j;
  double ssim_c1 = SSIM_C1;
#if CONFIG_VP9_HIGHBITDEPTH
  if (bit_depth == 10)
    ssim_c1 = SSIM_C1_10;
  if (bit_depth == 12)
    ssim_c1 = SSIM_C1_12;
  if (bit_depth == 10) ssim_c1 = SSIM_C1_10;
  if (bit_depth == 12) ssim_c1 = SSIM_C1_12;
#else
  assert(bit_depth == 8);
#endif
@@ -213,16 +209,12 @@ static void fs_apply_luminance(fs_ctx *_ctx, int _l, int bit_depth) {
  col_sums_y = col_sums_x + w;
  im1 = _ctx->level[_l].im1;
  im2 = _ctx->level[_l].im2;
  for (i = 0; i < w; i++)
    col_sums_x[i] = 5 * im1[i];
  for (i = 0; i < w; i++)
    col_sums_y[i] = 5 * im2[i];
  for (i = 0; i < w; i++) col_sums_x[i] = 5 * im1[i];
  for (i = 0; i < w; i++) col_sums_y[i] = 5 * im2[i];
  for (j = 1; j < 4; j++) {
    j1offs = FS_MINI(j, h - 1) * w;
    for (i = 0; i < w; i++)
      col_sums_x[i] += im1[j1offs + i];
    for (i = 0; i < w; i++)
      col_sums_y[i] += im2[j1offs + i];
    for (i = 0; i < w; i++) col_sums_x[i] += im1[j1offs + i];
    for (i = 0; i < w; i++) col_sums_y[i] += im2[j1offs + i];
  }
  ssim = _ctx->level[_l].ssim;
  c1 = (double)(ssim_c1 * 4096 * (1 << 4 * _l));
@@ -239,8 +231,8 @@ static void fs_apply_luminance(fs_ctx *_ctx, int _l, int bit_depth) {
      muy += col_sums_y[i1];
    }
    for (i = 0; i < w; i++) {
      ssim[j * w + i] *= (2 * mux * (double) muy + c1)
          / (mux * (double) mux + muy * (double) muy + c1);
      ssim[j * w + i] *= (2 * mux * (double)muy + c1) /
                         (mux * (double)mux + muy * (double)muy + c1);
      if (i + 1 < w) {
        i0 = FS_MAXI(0, i - 4);
        i1 = FS_MINI(i + 4, w - 1);
@@ -250,15 +242,11 @@ static void fs_apply_luminance(fs_ctx *_ctx, int _l, int bit_depth) {
    }
    if (j + 1 < h) {
      j0offs = FS_MAXI(0, j - 4) * w;
      for (i = 0; i < w; i++)
        col_sums_x[i] -= im1[j0offs + i];
      for (i = 0; i < w; i++)
        col_sums_y[i] -= im2[j0offs + i];
      for (i = 0; i < w; i++) col_sums_x[i] -= im1[j0offs + i];
      for (i = 0; i < w; i++) col_sums_y[i] -= im2[j0offs + i];
      j1offs = FS_MINI(j + 4, h - 1) * w;
      for (i = 0; i < w; i++)
        col_sums_x[i] += im1[j1offs + i];
      for (i = 0; i < w; i++)
        col_sums_y[i] += im2[j1offs + i];
      for (i = 0; i < w; i++) col_sums_x[i] += im1[j1offs + i];
      for (i = 0; i < w; i++) col_sums_y[i] += im2[j1offs + i];
    }
  }
}
@@ -272,8 +260,7 @@ static void fs_apply_luminance(fs_ctx *_ctx, int _l, int bit_depth) {
    col_sums_gx2[(_col)] = gx * (double)gx;                    \
    col_sums_gy2[(_col)] = gy * (double)gy;                    \
    col_sums_gxgy[(_col)] = gx * (double)gy;                   \
  } \
  while (0)
  } while (0)

#define FS_COL_ADD(_col, _joffs, _ioffs)                       \
  do {                                                         \
@@ -284,8 +271,7 @@ static void fs_apply_luminance(fs_ctx *_ctx, int _l, int bit_depth) {
    col_sums_gx2[(_col)] += gx * (double)gx;                   \
    col_sums_gy2[(_col)] += gy * (double)gy;                   \
    col_sums_gxgy[(_col)] += gx * (double)gy;                  \
  } \
  while (0)
  } while (0)

#define FS_COL_SUB(_col, _joffs, _ioffs)                       \
  do {                                                         \
@@ -296,32 +282,28 @@ static void fs_apply_luminance(fs_ctx *_ctx, int _l, int bit_depth) {
    col_sums_gx2[(_col)] -= gx * (double)gx;                   \
    col_sums_gy2[(_col)] -= gy * (double)gy;                   \
    col_sums_gxgy[(_col)] -= gx * (double)gy;                  \
  } \
  while (0)
  } while (0)

#define FS_COL_COPY(_col1, _col2)                    \
  do {                                               \
    col_sums_gx2[(_col1)] = col_sums_gx2[(_col2)];   \
    col_sums_gy2[(_col1)] = col_sums_gy2[(_col2)];   \
    col_sums_gxgy[(_col1)] = col_sums_gxgy[(_col2)]; \
  } \
  while (0)
  } while (0)

#define FS_COL_HALVE(_col1, _col2)                         \
  do {                                                     \
    col_sums_gx2[(_col1)] = col_sums_gx2[(_col2)] * 0.5;   \
    col_sums_gy2[(_col1)] = col_sums_gy2[(_col2)] * 0.5;   \
    col_sums_gxgy[(_col1)] = col_sums_gxgy[(_col2)] * 0.5; \
  } \
  while (0)
  } while (0)

#define FS_COL_DOUBLE(_col1, _col2)                      \
  do {                                                   \
    col_sums_gx2[(_col1)] = col_sums_gx2[(_col2)] * 2;   \
    col_sums_gy2[(_col1)] = col_sums_gy2[(_col2)] * 2;   \
    col_sums_gxgy[(_col1)] = col_sums_gxgy[(_col2)] * 2; \
  } \
  while (0)
  } while (0)

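The reflowed macro tails above are the standard do { ... } while (0) idiom, which makes a multi-statement macro behave as a single C statement. Without it, an unbraced if/else such as the (purely illustrative) snippet below would mis-parse or leave a dangling else:

/* Safe only because the macros end in "} while (0)": */
if (copy_forward)
  FS_COL_COPY(0, 1);
else
  FS_COL_COPY(1, 0);
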
static void fs_calc_structure(fs_ctx *_ctx, int _l, int bit_depth) {
  uint32_t *im1;
@@ -340,10 +322,8 @@ static void fs_calc_structure(fs_ctx *_ctx, int _l, int bit_depth) {
  int j;
  double ssim_c2 = SSIM_C2;
#if CONFIG_VP9_HIGHBITDEPTH
  if (bit_depth == 10)
    ssim_c2 = SSIM_C2_10;
  if (bit_depth == 12)
    ssim_c2 = SSIM_C2_12;
  if (bit_depth == 10) ssim_c2 = SSIM_C2_10;
  if (bit_depth == 12) ssim_c2 = SSIM_C2_12;
#else
  assert(bit_depth == 8);
#endif
@@ -398,14 +378,11 @@ static void fs_calc_structure(fs_ctx *_ctx, int _l, int bit_depth) {
        double mugy2;
        double mugxgy;
        mugx2 = col_sums_gx2[0];
        for (k = 1; k < 8; k++)
          mugx2 += col_sums_gx2[k];
        for (k = 1; k < 8; k++) mugx2 += col_sums_gx2[k];
        mugy2 = col_sums_gy2[0];
        for (k = 1; k < 8; k++)
          mugy2 += col_sums_gy2[k];
        for (k = 1; k < 8; k++) mugy2 += col_sums_gy2[k];
        mugxgy = col_sums_gxgy[0];
        for (k = 1; k < 8; k++)
          mugxgy += col_sums_gxgy[k];
        for (k = 1; k < 8; k++) mugxgy += col_sums_gxgy[k];
        ssim[(j - 4) * w + i] = (2 * mugxgy + c2) / (mugx2 + mugy2 + c2);
        if (i + 1 < w) {
          FS_COL_SET(0, -1, 1);
@@ -440,8 +417,9 @@ static void fs_calc_structure(fs_ctx *_ctx, int _l, int bit_depth) {
 Matlab implementation: {0.0448, 0.2856, 0.3001, 0.2363, 0.1333}.
 We drop the finest scale and renormalize the rest to sum to 1.*/

static const double FS_WEIGHTS[FS_NLEVELS] = {0.2989654541015625,
    0.3141326904296875, 0.2473602294921875, 0.1395416259765625};
static const double FS_WEIGHTS[FS_NLEVELS] = {
  0.2989654541015625, 0.3141326904296875, 0.2473602294921875, 0.1395416259765625
};
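As a check on the table above: dropping the finest-scale weight 0.0448 from the MS-SSIM set and renormalizing gives, for the first entry,

  0.2856 / (0.2856 + 0.3001 + 0.2363 + 0.1333) = 0.2856 / 0.9553 ~= 0.298965

which, stored as a 16-bit fraction (19593 / 65536), is exactly the 0.2989654541015625 in the table. The four stored numerators (19593, 20587, 16211, 9145) sum to 65536, so the weights sum to exactly 1.
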

static double fs_average(fs_ctx *_ctx, int _l) {
  double *ssim;
@@ -455,28 +433,26 @@ static double fs_average(fs_ctx *_ctx, int _l) {
  ssim = _ctx->level[_l].ssim;
  ret = 0;
  for (j = 0; j < h; j++)
    for (i = 0; i < w; i++)
      ret += ssim[j * w + i];
    for (i = 0; i < w; i++) ret += ssim[j * w + i];
  return pow(ret / (w * h), FS_WEIGHTS[_l]);
}

static double convert_ssim_db(double _ssim, double _weight) {
  assert(_weight >= _ssim);
  if ((_weight - _ssim) < 1e-10)
    return MAX_SSIM_DB;
  if ((_weight - _ssim) < 1e-10) return MAX_SSIM_DB;
  return 10 * (log10(_weight) - log10(_weight - _ssim));
}

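convert_ssim_db() above reports, in decibels, how close the weighted SSIM score comes to its maximum; the difference of logs is just a ratio:

  ssim_db = 10 * log10(_weight / (_weight - _ssim))

The value grows without bound as _ssim approaches _weight, which is why the result is clamped to MAX_SSIM_DB once the gap drops below 1e-10.
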
static double calc_ssim(const uint8_t *_src, int _systride,
                        const uint8_t *_dst, int _dystride,
                        int _w, int _h, uint32_t _bd, uint32_t _shift) {
static double calc_ssim(const uint8_t *_src, int _systride, const uint8_t *_dst,
                        int _dystride, int _w, int _h, uint32_t _bd,
                        uint32_t _shift) {
  fs_ctx ctx;
  double ret;
  int l;
  ret = 1;
  fs_ctx_init(&ctx, _w, _h, FS_NLEVELS);
  fs_downsample_level0(&ctx, _src, _systride, _dst, _dystride,
                       _w, _h, _bd, _shift);
  fs_downsample_level0(&ctx, _src, _systride, _dst, _dystride, _w, _h, _bd,
                       _shift);
  for (l = 0; l < FS_NLEVELS - 1; l++) {
    fs_calc_structure(&ctx, l, _bd);
    ret *= fs_average(&ctx, l);
@@ -490,9 +466,9 @@ static double calc_ssim(const uint8_t *_src, int _systride,
}

double vpx_calc_fastssim(const YV12_BUFFER_CONFIG *source,
                         const YV12_BUFFER_CONFIG *dest,
                         double *ssim_y, double *ssim_u, double *ssim_v,
                         uint32_t bd, uint32_t in_bd) {
                         const YV12_BUFFER_CONFIG *dest, double *ssim_y,
                         double *ssim_u, double *ssim_v, uint32_t bd,
                         uint32_t in_bd) {
  double ssimv;
  uint32_t bd_shift = 0;
  vpx_clear_system_state();

@@ -72,8 +72,7 @@ void vpx_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride) {
  {
    int i, j;
    for (i = 0; i < 4; ++i) {
      for (j = 0; j < 4; ++j)
        output[j + i * 4] = (output[j + i * 4] + 1) >> 2;
      for (j = 0; j < 4; ++j) output[j + i * 4] = (output[j + i * 4] + 1) >> 2;
    }
  }
}
@@ -82,8 +81,7 @@ void vpx_fdct4x4_1_c(const int16_t *input, tran_low_t *output, int stride) {
  int r, c;
  tran_low_t sum = 0;
  for (r = 0; r < 4; ++r)
    for (c = 0; c < 4; ++c)
      sum += input[r * stride + c];
    for (c = 0; c < 4; ++c) sum += input[r * stride + c];

  output[0] = sum << 1;
}
@@ -169,8 +167,7 @@ void vpx_fdct8x8_c(const int16_t *input, tran_low_t *final_output, int stride) {

  // Rows
  for (i = 0; i < 8; ++i) {
    for (j = 0; j < 8; ++j)
      final_output[j + i * 8] /= 2;
    for (j = 0; j < 8; ++j) final_output[j + i * 8] /= 2;
  }
}

@@ -178,8 +175,7 @@ void vpx_fdct8x8_1_c(const int16_t *input, tran_low_t *output, int stride) {
  int r, c;
  tran_low_t sum = 0;
  for (r = 0; r < 8; ++r)
    for (c = 0; c < 8; ++c)
      sum += input[r * stride + c];
    for (c = 0; c < 8; ++c) sum += input[r * stride + c];

  output[0] = sum;
}
@@ -368,8 +364,7 @@ void vpx_fdct16x16_1_c(const int16_t *input, tran_low_t *output, int stride) {
  int r, c;
  int sum = 0;
  for (r = 0; r < 16; ++r)
    for (c = 0; c < 16; ++c)
      sum += input[r * stride + c];
    for (c = 0; c < 16; ++c) sum += input[r * stride + c];

  output[0] = (tran_low_t)(sum >> 1);
}
@@ -717,8 +712,7 @@ void vpx_fdct32x32_c(const int16_t *input, tran_low_t *out, int stride) {
  // Columns
  for (i = 0; i < 32; ++i) {
    tran_high_t temp_in[32], temp_out[32];
    for (j = 0; j < 32; ++j)
      temp_in[j] = input[j * stride + i] * 4;
    for (j = 0; j < 32; ++j) temp_in[j] = input[j * stride + i] * 4;
    vpx_fdct32(temp_in, temp_out, 0);
    for (j = 0; j < 32; ++j)
      output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
@@ -727,8 +721,7 @@ void vpx_fdct32x32_c(const int16_t *input, tran_low_t *out, int stride) {
  // Rows
  for (i = 0; i < 32; ++i) {
    tran_high_t temp_in[32], temp_out[32];
    for (j = 0; j < 32; ++j)
      temp_in[j] = output[j + i * 32];
    for (j = 0; j < 32; ++j) temp_in[j] = output[j + i * 32];
    vpx_fdct32(temp_in, temp_out, 0);
    for (j = 0; j < 32; ++j)
      out[j + i * 32] =
@@ -746,8 +739,7 @@ void vpx_fdct32x32_rd_c(const int16_t *input, tran_low_t *out, int stride) {
  // Columns
  for (i = 0; i < 32; ++i) {
    tran_high_t temp_in[32], temp_out[32];
    for (j = 0; j < 32; ++j)
      temp_in[j] = input[j * stride + i] * 4;
    for (j = 0; j < 32; ++j) temp_in[j] = input[j * stride + i] * 4;
    vpx_fdct32(temp_in, temp_out, 0);
    for (j = 0; j < 32; ++j)
      // TODO(cd): see quality impact of only doing
@@ -759,11 +751,9 @@ void vpx_fdct32x32_rd_c(const int16_t *input, tran_low_t *out, int stride) {
  // Rows
  for (i = 0; i < 32; ++i) {
    tran_high_t temp_in[32], temp_out[32];
    for (j = 0; j < 32; ++j)
      temp_in[j] = output[j + i * 32];
    for (j = 0; j < 32; ++j) temp_in[j] = output[j + i * 32];
    vpx_fdct32(temp_in, temp_out, 1);
    for (j = 0; j < 32; ++j)
      out[j + i * 32] = (tran_low_t)temp_out[j];
    for (j = 0; j < 32; ++j) out[j + i * 32] = (tran_low_t)temp_out[j];
  }
}

@@ -771,8 +761,7 @@ void vpx_fdct32x32_1_c(const int16_t *input, tran_low_t *output, int stride) {
  int r, c;
  int sum = 0;
  for (r = 0; r < 32; ++r)
    for (c = 0; c < 32; ++c)
      sum += input[r * stride + c];
    for (c = 0; c < 32; ++c) sum += input[r * stride + c];

  output[0] = (tran_low_t)(sum >> 3);
}

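A note on the *_1_c variants in the hunks above: each computes only the DC coefficient as a scaled pixel sum, and the scale tracks the block size in the code shown: sum << 1 for 4x4, sum for 8x8, sum >> 1 for 16x16, and sum >> 3 for 32x32, mirroring (up to internal rounding) the DC term of the corresponding full transform. For a hypothetical flat 8x8 residual block of value v:

/* Illustrative: all 64 inputs equal v, so vpx_fdct8x8_1_c yields */
output[0] = 64 * v;  /* the raw block sum; no extra scaling in the 8x8 case */
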
@@ -23,8 +23,7 @@ static INLINE void d207_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
  int r, c;
  (void)above;
  // first column
  for (r = 0; r < bs - 1; ++r)
    dst[r * stride] = AVG2(left[r], left[r + 1]);
  for (r = 0; r < bs - 1; ++r) dst[r * stride] = AVG2(left[r], left[r + 1]);
  dst[(bs - 1) * stride] = left[bs - 1];
  dst++;

@@ -36,8 +35,7 @@ static INLINE void d207_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
  dst++;

  // rest of last row
  for (c = 0; c < bs - 2; ++c)
    dst[(bs - 1) * stride + c] = left[bs - 1];
  for (c = 0; c < bs - 2; ++c) dst[(bs - 1) * stride + c] = left[bs - 1];

  for (r = bs - 2; r >= 0; --r)
    for (c = 0; c < bs - 2; ++c)
@@ -133,14 +131,12 @@ static INLINE void d117_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
  int r, c;

  // first row
  for (c = 0; c < bs; c++)
    dst[c] = AVG2(above[c - 1], above[c]);
  for (c = 0; c < bs; c++) dst[c] = AVG2(above[c - 1], above[c]);
  dst += stride;

  // second row
  dst[0] = AVG3(left[0], above[-1], above[0]);
  for (c = 1; c < bs; c++)
    dst[c] = AVG3(above[c - 2], above[c - 1], above[c]);
  for (c = 1; c < bs; c++) dst[c] = AVG3(above[c - 2], above[c - 1], above[c]);
  dst += stride;

  // the rest of first col
@@ -150,8 +146,7 @@ static INLINE void d117_predictor(uint8_t *dst, ptrdiff_t stride, int bs,

  // the rest of the block
  for (r = 2; r < bs; ++r) {
    for (c = 1; c < bs; c++)
      dst[c] = dst[-2 * stride + c - 1];
    for (c = 1; c < bs; c++) dst[c] = dst[-2 * stride + c - 1];
    dst += stride;
  }
}
@@ -188,8 +183,7 @@ static INLINE void d153_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
                                  const uint8_t *above, const uint8_t *left) {
  int r, c;
  dst[0] = AVG2(above[-1], left[0]);
  for (r = 1; r < bs; r++)
    dst[r * stride] = AVG2(left[r - 1], left[r]);
  for (r = 1; r < bs; r++) dst[r * stride] = AVG2(left[r - 1], left[r]);
  dst++;

  dst[0] = AVG3(left[0], above[-1], above[0]);
@@ -203,8 +197,7 @@ static INLINE void d153_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
  dst += stride;

  for (r = 1; r < bs; ++r) {
    for (c = 0; c < bs - 2; c++)
      dst[c] = dst[-stride + c - 2];
    for (c = 0; c < bs - 2; c++) dst[c] = dst[-stride + c - 2];
    dst += stride;
  }
}
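Two conventions run through all of the predictors in this file. AVG2/AVG3 are the rounded 2- and 3-tap averages defined near the top of intrapred.c (reproduced below for reference; consult the source if in doubt), and the DC predictors that follow round their mean the same way: expected_dc = (sum + (bs >> 1)) / bs is sum / bs rounded to nearest.

/* As defined near the top of intrapred.c: */
#define AVG2(a, b) (((a) + (b) + 1) >> 1)
#define AVG3(a, b, c) (((a) + 2 * (b) + (c) + 2) >> 2)
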
@@ -261,8 +254,7 @@ static INLINE void dc_left_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
  int i, r, expected_dc, sum = 0;
  (void)above;

  for (i = 0; i < bs; i++)
    sum += left[i];
  for (i = 0; i < bs; i++) sum += left[i];
  expected_dc = (sum + (bs >> 1)) / bs;

  for (r = 0; r < bs; r++) {
@@ -276,8 +268,7 @@ static INLINE void dc_top_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
  int i, r, expected_dc, sum = 0;
  (void)left;

  for (i = 0; i < bs; i++)
    sum += above[i];
  for (i = 0; i < bs; i++) sum += above[i];
  expected_dc = (sum + (bs >> 1)) / bs;

  for (r = 0; r < bs; r++) {
@@ -350,8 +341,7 @@ void vpx_d207_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride,
  DST(1, 0) = AVG3(I, J, K);
  DST(3, 0) = DST(1, 1) = AVG3(J, K, L);
  DST(3, 1) = DST(1, 2) = AVG3(K, L, L);
  DST(3, 2) = DST(2, 2) =
      DST(0, 3) = DST(1, 3) = DST(2, 3) = DST(3, 3) = L;
  DST(3, 2) = DST(2, 2) = DST(0, 3) = DST(1, 3) = DST(2, 3) = DST(3, 3) = L;
}

void vpx_d63_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride,
@@ -535,8 +525,7 @@ static INLINE void highbd_d207_predictor(uint16_t *dst, ptrdiff_t stride,
  dst++;

  // Rest of last row.
  for (c = 0; c < bs - 2; ++c)
    dst[(bs - 1) * stride + c] = left[bs - 1];
  for (c = 0; c < bs - 2; ++c) dst[(bs - 1) * stride + c] = left[bs - 1];

  for (r = bs - 2; r >= 0; --r) {
    for (c = 0; c < bs - 2; ++c)
@@ -563,8 +552,8 @@ static INLINE void highbd_d207e_predictor(uint16_t *dst, ptrdiff_t stride,
}
#endif  // CONFIG_MISC_FIXES

static INLINE void highbd_d63_predictor(uint16_t *dst, ptrdiff_t stride,
                                        int bs, const uint16_t *above,
static INLINE void highbd_d63_predictor(uint16_t *dst, ptrdiff_t stride, int bs,
                                        const uint16_t *above,
                                        const uint16_t *left, int bd) {
  int r, c;
  (void)left;
@@ -589,8 +578,8 @@ static INLINE void highbd_d45_predictor(uint16_t *dst, ptrdiff_t stride, int bs,
  (void)bd;
  for (r = 0; r < bs; ++r) {
    for (c = 0; c < bs; ++c) {
      dst[c] = r + c + 2 < bs * 2 ? AVG3(above[r + c], above[r + c + 1],
                                         above[r + c + 2])
      dst[c] = r + c + 2 < bs * 2
                   ? AVG3(above[r + c], above[r + c + 1], above[r + c + 2])
                   : above[bs * 2 - 1];
    }
    dst += stride;
@@ -621,14 +610,12 @@ static INLINE void highbd_d117_predictor(uint16_t *dst, ptrdiff_t stride,
  (void)bd;

  // first row
  for (c = 0; c < bs; c++)
    dst[c] = AVG2(above[c - 1], above[c]);
  for (c = 0; c < bs; c++) dst[c] = AVG2(above[c - 1], above[c]);
  dst += stride;

  // second row
  dst[0] = AVG3(left[0], above[-1], above[0]);
  for (c = 1; c < bs; c++)
    dst[c] = AVG3(above[c - 2], above[c - 1], above[c]);
  for (c = 1; c < bs; c++) dst[c] = AVG3(above[c - 2], above[c - 1], above[c]);
  dst += stride;

  // the rest of first col
@@ -638,8 +625,7 @@ static INLINE void highbd_d117_predictor(uint16_t *dst, ptrdiff_t stride,

  // the rest of the block
  for (r = 2; r < bs; ++r) {
    for (c = 1; c < bs; c++)
      dst[c] = dst[-2 * stride + c - 1];
    for (c = 1; c < bs; c++) dst[c] = dst[-2 * stride + c - 1];
    dst += stride;
  }
}
@@ -650,8 +636,7 @@ static INLINE void highbd_d135_predictor(uint16_t *dst, ptrdiff_t stride,
  int r, c;
  (void)bd;
  dst[0] = AVG3(left[0], above[-1], above[0]);
  for (c = 1; c < bs; c++)
    dst[c] = AVG3(above[c - 2], above[c - 1], above[c]);
  for (c = 1; c < bs; c++) dst[c] = AVG3(above[c - 2], above[c - 1], above[c]);

  dst[stride] = AVG3(above[-1], left[0], left[1]);
  for (r = 2; r < bs; ++r)
@@ -659,8 +644,7 @@ static INLINE void highbd_d135_predictor(uint16_t *dst, ptrdiff_t stride,

  dst += stride;
  for (r = 1; r < bs; ++r) {
    for (c = 1; c < bs; c++)
      dst[c] = dst[-stride + c - 1];
    for (c = 1; c < bs; c++) dst[c] = dst[-stride + c - 1];
    dst += stride;
  }
}
@@ -671,8 +655,7 @@ static INLINE void highbd_d153_predictor(uint16_t *dst, ptrdiff_t stride,
  int r, c;
  (void)bd;
  dst[0] = AVG2(above[-1], left[0]);
  for (r = 1; r < bs; r++)
    dst[r * stride] = AVG2(left[r - 1], left[r]);
  for (r = 1; r < bs; r++) dst[r * stride] = AVG2(left[r - 1], left[r]);
  dst++;

  dst[0] = AVG3(left[0], above[-1], above[0]);
@@ -686,14 +669,13 @@ static INLINE void highbd_d153_predictor(uint16_t *dst, ptrdiff_t stride,
  dst += stride;

  for (r = 1; r < bs; ++r) {
    for (c = 0; c < bs - 2; c++)
      dst[c] = dst[-stride + c - 2];
    for (c = 0; c < bs - 2; c++) dst[c] = dst[-stride + c - 2];
    dst += stride;
  }
}

static INLINE void highbd_v_predictor(uint16_t *dst, ptrdiff_t stride,
                                      int bs, const uint16_t *above,
static INLINE void highbd_v_predictor(uint16_t *dst, ptrdiff_t stride, int bs,
                                      const uint16_t *above,
                                      const uint16_t *left, int bd) {
  int r;
  (void)left;
@@ -704,8 +686,8 @@ static INLINE void highbd_v_predictor(uint16_t *dst, ptrdiff_t stride,
  }
}

static INLINE void highbd_h_predictor(uint16_t *dst, ptrdiff_t stride,
                                      int bs, const uint16_t *above,
static INLINE void highbd_h_predictor(uint16_t *dst, ptrdiff_t stride, int bs,
                                      const uint16_t *above,
                                      const uint16_t *left, int bd) {
  int r;
  (void)above;
@@ -716,8 +698,8 @@ static INLINE void highbd_h_predictor(uint16_t *dst, ptrdiff_t stride,
  }
}

static INLINE void highbd_tm_predictor(uint16_t *dst, ptrdiff_t stride,
                                       int bs, const uint16_t *above,
static INLINE void highbd_tm_predictor(uint16_t *dst, ptrdiff_t stride, int bs,
                                       const uint16_t *above,
                                       const uint16_t *left, int bd) {
  int r, c;
  int ytop_left = above[-1];
@@ -750,8 +732,7 @@ static INLINE void highbd_dc_left_predictor(uint16_t *dst, ptrdiff_t stride,
  (void)above;
  (void)bd;

  for (i = 0; i < bs; i++)
    sum += left[i];
  for (i = 0; i < bs; i++) sum += left[i];
  expected_dc = (sum + (bs >> 1)) / bs;

  for (r = 0; r < bs; r++) {
@@ -767,8 +748,7 @@ static INLINE void highbd_dc_top_predictor(uint16_t *dst, ptrdiff_t stride,
  (void)left;
  (void)bd;

  for (i = 0; i < bs; i++)
    sum += above[i];
  for (i = 0; i < bs; i++) sum += above[i];
  expected_dc = (sum + (bs >> 1)) / bs;

  for (r = 0; r < bs; r++) {
@@ -777,8 +757,8 @@ static INLINE void highbd_dc_top_predictor(uint16_t *dst, ptrdiff_t stride,
  }
}

static INLINE void highbd_dc_predictor(uint16_t *dst, ptrdiff_t stride,
                                       int bs, const uint16_t *above,
static INLINE void highbd_dc_predictor(uint16_t *dst, ptrdiff_t stride, int bs,
                                       const uint16_t *above,
                                       const uint16_t *left, int bd) {
  int i, r, expected_dc, sum = 0;
  const int count = 2 * bs;
@@ -802,9 +782,8 @@ static INLINE void highbd_dc_predictor(uint16_t *dst, ptrdiff_t stride,
// can be unified and accessed as a pointer array. Note that the boundary
// above and left are not necessarily used all the time.
#define intra_pred_sized(type, size)                        \
  void vpx_##type##_predictor_##size##x##size##_c(uint8_t *dst, \
                                                  ptrdiff_t stride, \
                                                  const uint8_t *above, \
  void vpx_##type##_predictor_##size##x##size##_c(          \
      uint8_t *dst, ptrdiff_t stride, const uint8_t *above, \
      const uint8_t *left) {                                \
    type##_predictor(dst, stride, size, above, left);       \
  }
@@ -817,6 +796,7 @@ static INLINE void highbd_dc_predictor(uint16_t *dst, ptrdiff_t stride,
    highbd_##type##_predictor(dst, stride, size, above, left, bd); \
  }

/* clang-format off */
#define intra_pred_allsizes(type) \
  intra_pred_sized(type, 4) \
  intra_pred_sized(type, 8) \
@@ -867,4 +847,5 @@ intra_pred_allsizes(dc_128)
intra_pred_allsizes(dc_left)
intra_pred_allsizes(dc_top)
intra_pred_allsizes(dc)
/* clang-format on */
#undef intra_pred_allsizes

@@ -127,8 +127,7 @@ void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) {

  // Columns
  for (i = 0; i < 4; ++i) {
    for (j = 0; j < 4; ++j)
      temp_in[j] = out[j * 4 + i];
    for (j = 0; j < 4; ++j) temp_in[j] = out[j * 4 + i];
    idct4_c(temp_in, temp_out);
    for (j = 0; j < 4; ++j) {
      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
@@ -223,8 +222,7 @@ void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride) {

  // Then transform columns
  for (i = 0; i < 8; ++i) {
    for (j = 0; j < 8; ++j)
      temp_in[j] = out[j * 8 + i];
    for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];
    idct8_c(temp_in, temp_out);
    for (j = 0; j < 8; ++j) {
      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
@@ -240,8 +238,7 @@ void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
  a1 = ROUND_POWER_OF_TWO(out, 5);
  for (j = 0; j < 8; ++j) {
    for (i = 0; i < 8; ++i)
      dest[i] = clip_pixel_add(dest[i], a1);
    for (i = 0; i < 8; ++i) dest[i] = clip_pixel_add(dest[i], a1);
    dest += stride;
  }
}
@@ -296,8 +293,8 @@ void iadst8_c(const tran_low_t *input, tran_low_t *output) {
  tran_high_t x7 = input[6];

  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {
    output[0] = output[1] = output[2] = output[3] = output[4]
              = output[5] = output[6] = output[7] = 0;
    output[0] = output[1] = output[2] = output[3] = output[4] = output[5] =
        output[6] = output[7] = 0;
    return;
  }

@@ -376,8 +373,7 @@ void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride) {

  // Then transform columns
  for (i = 0; i < 8; ++i) {
    for (j = 0; j < 8; ++j)
      temp_in[j] = out[j * 8 + i];
    for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];
    idct8_c(temp_in, temp_out);
    for (j = 0; j < 8; ++j) {
      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
@@ -567,8 +563,7 @@ void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest,

  // Then transform columns
  for (i = 0; i < 16; ++i) {
    for (j = 0; j < 16; ++j)
      temp_in[j] = out[j * 16 + i];
    for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
    idct16_c(temp_in, temp_out);
    for (j = 0; j < 16; ++j) {
      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
@@ -598,12 +593,11 @@ void iadst16_c(const tran_low_t *input, tran_low_t *output) {
  tran_high_t x14 = input[1];
  tran_high_t x15 = input[14];

  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8
           | x9 | x10 | x11 | x12 | x13 | x14 | x15)) {
    output[0] = output[1] = output[2] = output[3] = output[4]
              = output[5] = output[6] = output[7] = output[8]
              = output[9] = output[10] = output[11] = output[12]
              = output[13] = output[14] = output[15] = 0;
  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 | x9 | x10 | x11 | x12 |
        x13 | x14 | x15)) {
    output[0] = output[1] = output[2] = output[3] = output[4] = output[5] =
        output[6] = output[7] = output[8] = output[9] = output[10] =
            output[11] = output[12] = output[13] = output[14] = output[15] = 0;
    return;
  }

@@ -766,8 +760,7 @@ void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest,

  // Then transform columns
  for (i = 0; i < 16; ++i) {
    for (j = 0; j < 16; ++j)
      temp_in[j] = out[j*16 + i];
    for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
    idct16_c(temp_in, temp_out);
    for (j = 0; j < 16; ++j) {
      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
@@ -783,8 +776,7 @@ void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
  a1 = ROUND_POWER_OF_TWO(out, 6);
  for (j = 0; j < 16; ++j) {
    for (i = 0; i < 16; ++i)
      dest[i] = clip_pixel_add(dest[i], a1);
    for (i = 0; i < 16; ++i) dest[i] = clip_pixel_add(dest[i], a1);
    dest += stride;
  }
}
@@ -1166,8 +1158,7 @@ void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest,
  // Rows
  for (i = 0; i < 32; ++i) {
    int16_t zero_coeff[16];
    for (j = 0; j < 16; ++j)
      zero_coeff[j] = input[2 * j] | input[2 * j + 1];
    for (j = 0; j < 16; ++j) zero_coeff[j] = input[2 * j] | input[2 * j + 1];
    for (j = 0; j < 8; ++j)
      zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
    for (j = 0; j < 4; ++j)
@@ -1185,8 +1176,7 @@ void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest,

  // Columns
  for (i = 0; i < 32; ++i) {
    for (j = 0; j < 32; ++j)
      temp_in[j] = out[j * 32 + i];
    for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i];
    idct32_c(temp_in, temp_out);
    for (j = 0; j < 32; ++j) {
      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
@@ -1212,8 +1202,7 @@ void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest,

  // Columns
  for (i = 0; i < 32; ++i) {
    for (j = 0; j < 32; ++j)
      temp_in[j] = out[j * 32 + i];
    for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i];
    idct32_c(temp_in, temp_out);
    for (j = 0; j < 32; ++j) {
      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
@@ -1239,8 +1228,7 @@ void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest,

  // Columns
  for (i = 0; i < 32; ++i) {
    for (j = 0; j < 32; ++j)
      temp_in[j] = out[j * 32 + i];
    for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i];
    idct32_c(temp_in, temp_out);
    for (j = 0; j < 32; ++j) {
      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
@@ -1258,8 +1246,7 @@ void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
  a1 = ROUND_POWER_OF_TWO(out, 6);

  for (j = 0; j < 32; ++j) {
    for (i = 0; i < 32; ++i)
      dest[i] = clip_pixel_add(dest[i], a1);
    for (i = 0; i < 32; ++i) dest[i] = clip_pixel_add(dest[i], a1);
    dest += stride;
  }
}
@@ -1309,14 +1296,14 @@ void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
    c1 = e1 - c1;
    a1 -= b1;
    d1 += c1;
    dest[stride * 0] = highbd_clip_pixel_add(dest[stride * 0],
                                             HIGHBD_WRAPLOW(a1, bd), bd);
    dest[stride * 1] = highbd_clip_pixel_add(dest[stride * 1],
                                             HIGHBD_WRAPLOW(b1, bd), bd);
    dest[stride * 2] = highbd_clip_pixel_add(dest[stride * 2],
                                             HIGHBD_WRAPLOW(c1, bd), bd);
    dest[stride * 3] = highbd_clip_pixel_add(dest[stride * 3],
                                             HIGHBD_WRAPLOW(d1, bd), bd);
    dest[stride * 0] =
        highbd_clip_pixel_add(dest[stride * 0], HIGHBD_WRAPLOW(a1, bd), bd);
    dest[stride * 1] =
        highbd_clip_pixel_add(dest[stride * 1], HIGHBD_WRAPLOW(b1, bd), bd);
    dest[stride * 2] =
        highbd_clip_pixel_add(dest[stride * 2], HIGHBD_WRAPLOW(c1, bd), bd);
    dest[stride * 3] =
        highbd_clip_pixel_add(dest[stride * 3], HIGHBD_WRAPLOW(d1, bd), bd);

    ip++;
    dest++;
@@ -1343,14 +1330,14 @@ void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest8,
  for (i = 0; i < 4; i++) {
    e1 = ip[0] >> 1;
    a1 = ip[0] - e1;
    dest[dest_stride * 0] = highbd_clip_pixel_add(
        dest[dest_stride * 0], a1, bd);
    dest[dest_stride * 1] = highbd_clip_pixel_add(
        dest[dest_stride * 1], e1, bd);
    dest[dest_stride * 2] = highbd_clip_pixel_add(
        dest[dest_stride * 2], e1, bd);
    dest[dest_stride * 3] = highbd_clip_pixel_add(
        dest[dest_stride * 3], e1, bd);
    dest[dest_stride * 0] =
        highbd_clip_pixel_add(dest[dest_stride * 0], a1, bd);
    dest[dest_stride * 1] =
        highbd_clip_pixel_add(dest[dest_stride * 1], e1, bd);
    dest[dest_stride * 2] =
        highbd_clip_pixel_add(dest[dest_stride * 2], e1, bd);
    dest[dest_stride * 3] =
        highbd_clip_pixel_add(dest[dest_stride * 3], e1, bd);
    ip++;
    dest++;
  }
@@ -1394,8 +1381,7 @@ void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,

  // Columns
  for (i = 0; i < 4; ++i) {
    for (j = 0; j < 4; ++j)
      temp_in[j] = out[j * 4 + i];
    for (j = 0; j < 4; ++j) temp_in[j] = out[j * 4 + i];
    vpx_highbd_idct4_c(temp_in, temp_out, bd);
    for (j = 0; j < 4; ++j) {
      dest[j * stride + i] = highbd_clip_pixel_add(
@@ -1408,8 +1394,8 @@ void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest8,
                                int dest_stride, int bd) {
  int i;
  tran_high_t a1;
  tran_low_t out = HIGHBD_WRAPLOW(
      highbd_dct_const_round_shift(input[0] * cospi_16_64), bd);
  tran_low_t out =
      HIGHBD_WRAPLOW(highbd_dct_const_round_shift(input[0] * cospi_16_64), bd);
  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);

  out = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64), bd);
@@ -1486,8 +1472,7 @@ void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest8,

  // Then transform columns.
  for (i = 0; i < 8; ++i) {
    for (j = 0; j < 8; ++j)
      temp_in[j] = out[j * 8 + i];
    for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];
    vpx_highbd_idct8_c(temp_in, temp_out, bd);
    for (j = 0; j < 8; ++j) {
      dest[j * stride + i] = highbd_clip_pixel_add(
@@ -1500,14 +1485,13 @@ void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest8,
                                int stride, int bd) {
  int i, j;
  tran_high_t a1;
  tran_low_t out = HIGHBD_WRAPLOW(
      highbd_dct_const_round_shift(input[0] * cospi_16_64), bd);
  tran_low_t out =
      HIGHBD_WRAPLOW(highbd_dct_const_round_shift(input[0] * cospi_16_64), bd);
  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
  out = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64), bd);
  a1 = ROUND_POWER_OF_TWO(out, 5);
  for (j = 0; j < 8; ++j) {
    for (i = 0; i < 8; ++i)
      dest[i] = highbd_clip_pixel_add(dest[i], a1, bd);
    for (i = 0; i < 8; ++i) dest[i] = highbd_clip_pixel_add(dest[i], a1, bd);
    dest += stride;
  }
}
@@ -1644,8 +1628,7 @@ void vpx_highbd_idct8x8_10_add_c(const tran_low_t *input, uint8_t *dest8,
  }
  // Then transform columns.
  for (i = 0; i < 8; ++i) {
    for (j = 0; j < 8; ++j)
      temp_in[j] = out[j * 8 + i];
    for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];
    vpx_highbd_idct8_c(temp_in, temp_out, bd);
    for (j = 0; j < 8; ++j) {
      dest[j * stride + i] = highbd_clip_pixel_add(
@@ -1837,8 +1820,7 @@ void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest8,

  // Then transform columns.
  for (i = 0; i < 16; ++i) {
    for (j = 0; j < 16; ++j)
      temp_in[j] = out[j * 16 + i];
    for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
    vpx_highbd_idct16_c(temp_in, temp_out, bd);
    for (j = 0; j < 16; ++j) {
      dest[j * stride + i] = highbd_clip_pixel_add(
@@ -1869,8 +1851,8 @@ void vpx_highbd_iadst16_c(const tran_low_t *input, tran_low_t *output, int bd) {
  tran_low_t x15 = input[14];
  (void)bd;

  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8
           | x9 | x10 | x11 | x12 | x13 | x14 | x15)) {
  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 | x9 | x10 | x11 | x12 |
        x13 | x14 | x15)) {
    memset(output, 0, 16 * sizeof(*output));
    return;
  }
@@ -2035,8 +2017,7 @@ void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest8,

  // Then transform columns.
  for (i = 0; i < 16; ++i) {
    for (j = 0; j < 16; ++j)
      temp_in[j] = out[j*16 + i];
    for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
    vpx_highbd_idct16_c(temp_in, temp_out, bd);
    for (j = 0; j < 16; ++j) {
      dest[j * stride + i] = highbd_clip_pixel_add(
@@ -2049,21 +2030,20 @@ void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest8,
                                  int stride, int bd) {
  int i, j;
  tran_high_t a1;
  tran_low_t out = HIGHBD_WRAPLOW(
      highbd_dct_const_round_shift(input[0] * cospi_16_64), bd);
  tran_low_t out =
      HIGHBD_WRAPLOW(highbd_dct_const_round_shift(input[0] * cospi_16_64), bd);
  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);

  out = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64), bd);
  a1 = ROUND_POWER_OF_TWO(out, 6);
  for (j = 0; j < 16; ++j) {
    for (i = 0; i < 16; ++i)
      dest[i] = highbd_clip_pixel_add(dest[i], a1, bd);
    for (i = 0; i < 16; ++i) dest[i] = highbd_clip_pixel_add(dest[i], a1, bd);
    dest += stride;
  }
}

static void highbd_idct32_c(const tran_low_t *input,
                            tran_low_t *output, int bd) {
static void highbd_idct32_c(const tran_low_t *input, tran_low_t *output,
                            int bd) {
  tran_low_t step1[32], step2[32];
  tran_high_t temp1, temp2;
  (void)bd;
@@ -2442,8 +2422,7 @@ void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest8,
  // Rows
  for (i = 0; i < 32; ++i) {
    tran_low_t zero_coeff[16];
    for (j = 0; j < 16; ++j)
      zero_coeff[j] = input[2 * j] | input[2 * j + 1];
    for (j = 0; j < 16; ++j) zero_coeff[j] = input[2 * j] | input[2 * j + 1];
    for (j = 0; j < 8; ++j)
      zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
    for (j = 0; j < 4; ++j)
@@ -2461,8 +2440,7 @@ void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest8,

  // Columns
  for (i = 0; i < 32; ++i) {
    for (j = 0; j < 32; ++j)
      temp_in[j] = out[j * 32 + i];
    for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i];
    highbd_idct32_c(temp_in, temp_out, bd);
    for (j = 0; j < 32; ++j) {
      dest[j * stride + i] = highbd_clip_pixel_add(
@@ -2488,8 +2466,7 @@ void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest8,
  }
  // Columns
  for (i = 0; i < 32; ++i) {
    for (j = 0; j < 32; ++j)
      temp_in[j] = out[j * 32 + i];
    for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i];
    highbd_idct32_c(temp_in, temp_out, bd);
    for (j = 0; j < 32; ++j) {
      dest[j * stride + i] = highbd_clip_pixel_add(
@@ -2504,14 +2481,13 @@ void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest8,
  int a1;
  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);

  tran_low_t out = HIGHBD_WRAPLOW(
      highbd_dct_const_round_shift(input[0] * cospi_16_64), bd);
  tran_low_t out =
      HIGHBD_WRAPLOW(highbd_dct_const_round_shift(input[0] * cospi_16_64), bd);
  out = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64), bd);
  a1 = ROUND_POWER_OF_TWO(out, 6);

  for (j = 0; j < 32; ++j) {
    for (i = 0; i < 32; ++i)
      dest[i] = highbd_clip_pixel_add(dest[i], a1, bd);
    for (i = 0; i < 32; ++i) dest[i] = highbd_clip_pixel_add(dest[i], a1, bd);
    dest += stride;
  }
}

@@ -41,8 +41,7 @@ static INLINE tran_high_t dct_const_round_shift(tran_high_t input) {
}

#if CONFIG_VP9_HIGHBITDEPTH
static INLINE tran_high_t highbd_check_range(tran_high_t input,
                                             int bd) {
static INLINE tran_high_t highbd_check_range(tran_high_t input, int bd) {
#if CONFIG_COEFFICIENT_RANGE_CHECKING
  // For valid highbitdepth VP9 streams, intermediate stage coefficients will
  // stay within the ranges:
@@ -93,8 +92,7 @@ static INLINE tran_high_t highbd_dct_const_round_shift(tran_high_t input) {

#define WRAPLOW(x) ((int32_t)check_range(x))
#if CONFIG_VP9_HIGHBITDEPTH
#define HIGHBD_WRAPLOW(x, bd) \
    ((int32_t)highbd_check_range((x), bd))
#define HIGHBD_WRAPLOW(x, bd) ((int32_t)highbd_check_range((x), bd))
#endif  // CONFIG_VP9_HIGHBITDEPTH
#endif  // CONFIG_EMULATE_HARDWARE

@@ -22,23 +22,18 @@ static INLINE int8_t signed_char_clamp(int t) {
#if CONFIG_VP9_HIGHBITDEPTH
static INLINE int16_t signed_char_clamp_high(int t, int bd) {
  switch (bd) {
    case 10:
      return (int16_t)clamp(t, -128*4, 128*4-1);
    case 12:
      return (int16_t)clamp(t, -128*16, 128*16-1);
    case 10: return (int16_t)clamp(t, -128 * 4, 128 * 4 - 1);
    case 12: return (int16_t)clamp(t, -128 * 16, 128 * 16 - 1);
    case 8:
    default:
      return (int16_t)clamp(t, -128, 128-1);
    default: return (int16_t)clamp(t, -128, 128 - 1);
  }
}
#endif

// should we apply any filter at all: 11111111 yes, 00000000 no
static INLINE int8_t filter_mask(uint8_t limit, uint8_t blimit,
                                 uint8_t p3, uint8_t p2,
                                 uint8_t p1, uint8_t p0,
                                 uint8_t q0, uint8_t q1,
                                 uint8_t q2, uint8_t q3) {
static INLINE int8_t filter_mask(uint8_t limit, uint8_t blimit, uint8_t p3,
                                 uint8_t p2, uint8_t p1, uint8_t p0, uint8_t q0,
                                 uint8_t q1, uint8_t q2, uint8_t q3) {
  int8_t mask = 0;
  mask |= (abs(p3 - p2) > limit) * -1;
  mask |= (abs(p2 - p1) > limit) * -1;
@@ -50,10 +45,8 @@ static INLINE int8_t filter_mask(uint8_t limit, uint8_t blimit,
  return ~mask;
}

static INLINE int8_t flat_mask4(uint8_t thresh,
                                uint8_t p3, uint8_t p2,
                                uint8_t p1, uint8_t p0,
                                uint8_t q0, uint8_t q1,
static INLINE int8_t flat_mask4(uint8_t thresh, uint8_t p3, uint8_t p2,
                                uint8_t p1, uint8_t p0, uint8_t q0, uint8_t q1,
                                uint8_t q2, uint8_t q3) {
  int8_t mask = 0;
  mask |= (abs(p1 - p0) > thresh) * -1;
@@ -65,12 +58,10 @@ static INLINE int8_t flat_mask4(uint8_t thresh,
  return ~mask;
}

static INLINE int8_t flat_mask5(uint8_t thresh,
                                uint8_t p4, uint8_t p3,
                                uint8_t p2, uint8_t p1,
                                uint8_t p0, uint8_t q0,
                                uint8_t q1, uint8_t q2,
                                uint8_t q3, uint8_t q4) {
static INLINE int8_t flat_mask5(uint8_t thresh, uint8_t p4, uint8_t p3,
                                uint8_t p2, uint8_t p1, uint8_t p0, uint8_t q0,
                                uint8_t q1, uint8_t q2, uint8_t q3,
                                uint8_t q4) {
  int8_t mask = ~flat_mask4(thresh, p3, p2, p1, p0, q0, q1, q2, q3);
  mask |= (abs(p4 - p0) > thresh) * -1;
  mask |= (abs(q4 - q0) > thresh) * -1;
@@ -128,8 +119,8 @@ void vpx_lpf_horizontal_4_c(uint8_t *s, int p /* pitch */,
  for (i = 0; i < 8; ++i) {
    const uint8_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p];
    const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p];
    const int8_t mask = filter_mask(*limit, *blimit,
                                    p3, p2, p1, p0, q0, q1, q2, q3);
    const int8_t mask =
        filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3);
    filter4(mask, *thresh, s - 2 * p, s - 1 * p, s, s + 1 * p);
    ++s;
  }
@@ -152,8 +143,8 @@ void vpx_lpf_vertical_4_c(uint8_t *s, int pitch, const uint8_t *blimit,
  for (i = 0; i < 8; ++i) {
    const uint8_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1];
    const uint8_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3];
    const int8_t mask = filter_mask(*limit, *blimit,
                                    p3, p2, p1, p0, q0, q1, q2, q3);
    const int8_t mask =
        filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3);
    filter4(mask, *thresh, s - 2, s - 1, s, s + 1);
    s += pitch;
  }
@@ -168,9 +159,8 @@ void vpx_lpf_vertical_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0,
}

static INLINE void filter8(int8_t mask, uint8_t thresh, uint8_t flat,
                           uint8_t *op3, uint8_t *op2,
                           uint8_t *op1, uint8_t *op0,
                           uint8_t *oq0, uint8_t *oq1,
                           uint8_t *op3, uint8_t *op2, uint8_t *op1,
                           uint8_t *op0, uint8_t *oq0, uint8_t *oq1,
                           uint8_t *oq2, uint8_t *oq3) {
  if (flat && mask) {
    const uint8_t p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0;
@@ -198,11 +188,11 @@ void vpx_lpf_horizontal_8_c(uint8_t *s, int p, const uint8_t *blimit,
    const uint8_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p];
    const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p];

    const int8_t mask = filter_mask(*limit, *blimit,
                                    p3, p2, p1, p0, q0, q1, q2, q3);
    const int8_t mask =
        filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3);
    const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3);
    filter8(mask, *thresh, flat, s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p,
                                 s,         s + 1 * p, s + 2 * p, s + 3 * p);
    filter8(mask, *thresh, flat, s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p, s,
            s + 1 * p, s + 2 * p, s + 3 * p);
    ++s;
  }
}
@@ -222,11 +212,11 @@ void vpx_lpf_vertical_8_c(uint8_t *s, int pitch, const uint8_t *blimit,
  for (i = 0; i < 8; ++i) {
    const uint8_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1];
    const uint8_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3];
    const int8_t mask = filter_mask(*limit, *blimit,
                                    p3, p2, p1, p0, q0, q1, q2, q3);
    const int8_t mask =
        filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3);
    const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3);
    filter8(mask, *thresh, flat, s - 4, s - 3, s - 2, s - 1,
                                 s,     s + 1, s + 2, s + 3);
    filter8(mask, *thresh, flat, s - 4, s - 3, s - 2, s - 1, s, s + 1, s + 2,
            s + 3);
    s += pitch;
  }
}
@@ -239,52 +229,55 @@ void vpx_lpf_vertical_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0,
  vpx_lpf_vertical_8_c(s + 8 * pitch, pitch, blimit1, limit1, thresh1);
}

static INLINE void filter16(int8_t mask, uint8_t thresh,
                            uint8_t flat, uint8_t flat2,
                            uint8_t *op7, uint8_t *op6,
                            uint8_t *op5, uint8_t *op4,
                            uint8_t *op3, uint8_t *op2,
                            uint8_t *op1, uint8_t *op0,
                            uint8_t *oq0, uint8_t *oq1,
                            uint8_t *oq2, uint8_t *oq3,
                            uint8_t *oq4, uint8_t *oq5,
static INLINE void filter16(int8_t mask, uint8_t thresh, uint8_t flat,
                            uint8_t flat2, uint8_t *op7, uint8_t *op6,
                            uint8_t *op5, uint8_t *op4, uint8_t *op3,
                            uint8_t *op2, uint8_t *op1, uint8_t *op0,
                            uint8_t *oq0, uint8_t *oq1, uint8_t *oq2,
                            uint8_t *oq3, uint8_t *oq4, uint8_t *oq5,
                            uint8_t *oq6, uint8_t *oq7) {
  if (flat2 && flat && mask) {
    const uint8_t p7 = *op7, p6 = *op6, p5 = *op5, p4 = *op4,
                  p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0;
    const uint8_t p7 = *op7, p6 = *op6, p5 = *op5, p4 = *op4, p3 = *op3,
                  p2 = *op2, p1 = *op1, p0 = *op0;

    const uint8_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3,
                  q4 = *oq4, q5 = *oq5, q6 = *oq6, q7 = *oq7;
    const uint8_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3, q4 = *oq4,
                  q5 = *oq5, q6 = *oq6, q7 = *oq7;

    // 15-tap filter [1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1]
    *op6 = ROUND_POWER_OF_TWO(p7 * 7 + p6 * 2 + p5 + p4 + p3 + p2 + p1 + p0 +
                              q0, 4);
    *op5 = ROUND_POWER_OF_TWO(p7 * 6 + p6 + p5 * 2 + p4 + p3 + p2 + p1 + p0 +
                              q0 + q1, 4);
    *op4 = ROUND_POWER_OF_TWO(p7 * 5 + p6 + p5 + p4 * 2 + p3 + p2 + p1 + p0 +
                              q0 + q1 + q2, 4);
    *op3 = ROUND_POWER_OF_TWO(p7 * 4 + p6 + p5 + p4 + p3 * 2 + p2 + p1 + p0 +
                              q0 + q1 + q2 + q3, 4);
    *op2 = ROUND_POWER_OF_TWO(p7 * 3 + p6 + p5 + p4 + p3 + p2 * 2 + p1 + p0 +
                              q0 + q1 + q2 + q3 + q4, 4);
    *op6 = ROUND_POWER_OF_TWO(
        p7 * 7 + p6 * 2 + p5 + p4 + p3 + p2 + p1 + p0 + q0, 4);
    *op5 = ROUND_POWER_OF_TWO(
        p7 * 6 + p6 + p5 * 2 + p4 + p3 + p2 + p1 + p0 + q0 + q1, 4);
    *op4 = ROUND_POWER_OF_TWO(
        p7 * 5 + p6 + p5 + p4 * 2 + p3 + p2 + p1 + p0 + q0 + q1 + q2, 4);
    *op3 = ROUND_POWER_OF_TWO(
        p7 * 4 + p6 + p5 + p4 + p3 * 2 + p2 + p1 + p0 + q0 + q1 + q2 + q3, 4);
    *op2 = ROUND_POWER_OF_TWO(
        p7 * 3 + p6 + p5 + p4 + p3 + p2 * 2 + p1 + p0 + q0 + q1 + q2 + q3 + q4,
        4);
    *op1 = ROUND_POWER_OF_TWO(p7 * 2 + p6 + p5 + p4 + p3 + p2 + p1 * 2 + p0 +
                              q0 + q1 + q2 + q3 + q4 + q5, 4);
    *op0 = ROUND_POWER_OF_TWO(p7 + p6 + p5 + p4 + p3 + p2 + p1 + p0 * 2 +
                              q0 + q1 + q2 + q3 + q4 + q5 + q6, 4);
    *oq0 = ROUND_POWER_OF_TWO(p6 + p5 + p4 + p3 + p2 + p1 + p0 +
                              q0 * 2 + q1 + q2 + q3 + q4 + q5 + q6 + q7, 4);
    *oq1 = ROUND_POWER_OF_TWO(p5 + p4 + p3 + p2 + p1 + p0 +
                              q0 + q1 * 2 + q2 + q3 + q4 + q5 + q6 + q7 * 2, 4);
    *oq2 = ROUND_POWER_OF_TWO(p4 + p3 + p2 + p1 + p0 +
                              q0 + q1 + q2 * 2 + q3 + q4 + q5 + q6 + q7 * 3, 4);
    *oq3 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + p0 +
                              q0 + q1 + q2 + q3 * 2 + q4 + q5 + q6 + q7 * 4, 4);
    *oq4 = ROUND_POWER_OF_TWO(p2 + p1 + p0 +
                              q0 + q1 + q2 + q3 + q4 * 2 + q5 + q6 + q7 * 5, 4);
    *oq5 = ROUND_POWER_OF_TWO(p1 + p0 +
                              q0 + q1 + q2 + q3 + q4 + q5 * 2 + q6 + q7 * 6, 4);
    *oq6 = ROUND_POWER_OF_TWO(p0 +
                              q0 + q1 + q2 + q3 + q4 + q5 + q6 * 2 + q7 * 7, 4);
                                  q0 + q1 + q2 + q3 + q4 + q5,
                              4);
    *op0 = ROUND_POWER_OF_TWO(p7 + p6 + p5 + p4 + p3 + p2 + p1 + p0 * 2 + q0 +
                                  q1 + q2 + q3 + q4 + q5 + q6,
                              4);
    *oq0 = ROUND_POWER_OF_TWO(p6 + p5 + p4 + p3 + p2 + p1 + p0 + q0 * 2 + q1 +
                                  q2 + q3 + q4 + q5 + q6 + q7,
                              4);
    *oq1 = ROUND_POWER_OF_TWO(p5 + p4 + p3 + p2 + p1 + p0 + q0 + q1 * 2 + q2 +
                                  q3 + q4 + q5 + q6 + q7 * 2,
                              4);
    *oq2 = ROUND_POWER_OF_TWO(
        p4 + p3 + p2 + p1 + p0 + q0 + q1 + q2 * 2 + q3 + q4 + q5 + q6 + q7 * 3,
        4);
    *oq3 = ROUND_POWER_OF_TWO(
        p3 + p2 + p1 + p0 + q0 + q1 + q2 + q3 * 2 + q4 + q5 + q6 + q7 * 4, 4);
    *oq4 = ROUND_POWER_OF_TWO(
        p2 + p1 + p0 + q0 + q1 + q2 + q3 + q4 * 2 + q5 + q6 + q7 * 5, 4);
    *oq5 = ROUND_POWER_OF_TWO(
        p1 + p0 + q0 + q1 + q2 + q3 + q4 + q5 * 2 + q6 + q7 * 6, 4);
    *oq6 = ROUND_POWER_OF_TWO(
        p0 + q0 + q1 + q2 + q3 + q4 + q5 + q6 * 2 + q7 * 7, 4);
  } else {
    filter8(mask, thresh, flat, op3, op2, op1, op0, oq0, oq1, oq2, oq3);
  }
@@ -300,18 +293,17 @@ static void mb_lpf_horizontal_edge_w(uint8_t *s, int p, const uint8_t *blimit,
  for (i = 0; i < 8 * count; ++i) {
    const uint8_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p];
    const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p];
    const int8_t mask = filter_mask(*limit, *blimit,
                                    p3, p2, p1, p0, q0, q1, q2, q3);
    const int8_t mask =
        filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3);
    const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3);
    const int8_t flat2 = flat_mask5(1,
                             s[-8 * p], s[-7 * p], s[-6 * p], s[-5 * p], p0,
                             q0, s[4 * p], s[5 * p], s[6 * p], s[7 * p]);
    const int8_t flat2 =
        flat_mask5(1, s[-8 * p], s[-7 * p], s[-6 * p], s[-5 * p], p0, q0,
                   s[4 * p], s[5 * p], s[6 * p], s[7 * p]);

    filter16(mask, *thresh, flat, flat2,
             s - 8 * p, s - 7 * p, s - 6 * p, s - 5 * p,
             s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p,
             s,         s + 1 * p, s + 2 * p, s + 3 * p,
             s + 4 * p, s + 5 * p, s + 6 * p, s + 7 * p);
    filter16(mask, *thresh, flat, flat2, s - 8 * p, s - 7 * p, s - 6 * p,
             s - 5 * p, s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p, s,
             s + 1 * p, s + 2 * p, s + 3 * p, s + 4 * p, s + 5 * p, s + 6 * p,
             s + 7 * p);
    ++s;
  }
}
@@ -326,25 +318,23 @@ void vpx_lpf_horizontal_edge_16_c(uint8_t *s, int p, const uint8_t *blimit,
  mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 2);
}

static void mb_lpf_vertical_edge_w(uint8_t *s, int p,
                                   const uint8_t *blimit,
                                   const uint8_t *limit,
                                   const uint8_t *thresh,
static void mb_lpf_vertical_edge_w(uint8_t *s, int p, const uint8_t *blimit,
                                   const uint8_t *limit, const uint8_t *thresh,
                                   int count) {
  int i;

  for (i = 0; i < count; ++i) {
    const uint8_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1];
    const uint8_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3];
    const int8_t mask = filter_mask(*limit, *blimit,
                                    p3, p2, p1, p0, q0, q1, q2, q3);
    const int8_t mask =
        filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3);
    const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3);
    const int8_t flat2 = flat_mask5(1, s[-8], s[-7], s[-6], s[-5], p0,
                                    q0, s[4], s[5], s[6], s[7]);
    const int8_t flat2 = flat_mask5(1, s[-8], s[-7], s[-6], s[-5], p0, q0, s[4],
                                    s[5], s[6], s[7]);

    filter16(mask, *thresh, flat, flat2,
             s - 8, s - 7, s - 6, s - 5, s - 4, s - 3, s - 2, s - 1,
             s,     s + 1, s + 2, s + 3, s + 4, s + 5, s + 6, s + 7);
    filter16(mask, *thresh, flat, flat2, s - 8, s - 7, s - 6, s - 5, s - 4,
             s - 3, s - 2, s - 1, s, s + 1, s + 2, s + 3, s + 4, s + 5, s + 6,
             s + 7);
    s += p;
  }
}
@@ -362,9 +352,8 @@ void vpx_lpf_vertical_16_dual_c(uint8_t *s, int p, const uint8_t *blimit,
#if CONFIG_VP9_HIGHBITDEPTH
// Should we apply any filter at all: 11111111 yes, 00000000 no ?
static INLINE int8_t highbd_filter_mask(uint8_t limit, uint8_t blimit,
                                        uint16_t p3, uint16_t p2,
                                        uint16_t p1, uint16_t p0,
                                        uint16_t q0, uint16_t q1,
                                        uint16_t p3, uint16_t p2, uint16_t p1,
                                        uint16_t p0, uint16_t q0, uint16_t q1,
                                        uint16_t q2, uint16_t q3, int bd) {
  int8_t mask = 0;
  int16_t limit16 = (uint16_t)limit << (bd - 8);
@@ -379,11 +368,10 @@ static INLINE int8_t highbd_filter_mask(uint8_t limit, uint8_t blimit,
  return ~mask;
}

static INLINE int8_t highbd_flat_mask4(uint8_t thresh,
                                       uint16_t p3, uint16_t p2,
                                       uint16_t p1, uint16_t p0,
                                       uint16_t q0, uint16_t q1,
                                       uint16_t q2, uint16_t q3, int bd) {
static INLINE int8_t highbd_flat_mask4(uint8_t thresh, uint16_t p3, uint16_t p2,
                                       uint16_t p1, uint16_t p0, uint16_t q0,
                                       uint16_t q1, uint16_t q2, uint16_t q3,
                                       int bd) {
  int8_t mask = 0;
  int16_t thresh16 = (uint16_t)thresh << (bd - 8);
  mask |= (abs(p1 - p0) > thresh16) * -1;
@@ -395,11 +383,9 @@ static INLINE int8_t highbd_flat_mask4(uint8_t thresh,
  return ~mask;
}

static INLINE int8_t highbd_flat_mask5(uint8_t thresh,
                                       uint16_t p4, uint16_t p3,
                                       uint16_t p2, uint16_t p1,
                                       uint16_t p0, uint16_t q0,
                                       uint16_t q1, uint16_t q2,
static INLINE int8_t highbd_flat_mask5(uint8_t thresh, uint16_t p4, uint16_t p3,
                                       uint16_t p2, uint16_t p1, uint16_t p0,
                                       uint16_t q0, uint16_t q1, uint16_t q2,
                                       uint16_t q3, uint16_t q4, int bd) {
  int8_t mask = ~highbd_flat_mask4(thresh, p3, p2, p1, p0, q0, q1, q2, q3, bd);
  int16_t thresh16 = (uint16_t)thresh << (bd - 8);
@@ -470,21 +456,17 @@ void vpx_highbd_lpf_horizontal_4_c(uint16_t *s, int p /* pitch */,
    const uint16_t q1 = s[1 * p];
    const uint16_t q2 = s[2 * p];
    const uint16_t q3 = s[3 * p];
    const int8_t mask = highbd_filter_mask(*limit, *blimit,
                                           p3, p2, p1, p0, q0, q1, q2, q3, bd);
    const int8_t mask =
        highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd);
    highbd_filter4(mask, *thresh, s - 2 * p, s - 1 * p, s, s + 1 * p, bd);
    ++s;
  }
}

void vpx_highbd_lpf_horizontal_4_dual_c(uint16_t *s, int p,
                                        const uint8_t *blimit0,
                                        const uint8_t *limit0,
                                        const uint8_t *thresh0,
                                        const uint8_t *blimit1,
                                        const uint8_t *limit1,
                                        const uint8_t *thresh1,
                                        int bd) {
void vpx_highbd_lpf_horizontal_4_dual_c(
    uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
    const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
    const uint8_t *thresh1, int bd) {
  vpx_highbd_lpf_horizontal_4_c(s, p, blimit0, limit0, thresh0, bd);
  vpx_highbd_lpf_horizontal_4_c(s + 8, p, blimit1, limit1, thresh1, bd);
}
@@ -499,30 +481,25 @@ void vpx_highbd_lpf_vertical_4_c(uint16_t *s, int pitch, const uint8_t *blimit,
  for (i = 0; i < 8; ++i) {
    const uint16_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1];
    const uint16_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3];
    const int8_t mask = highbd_filter_mask(*limit, *blimit,
                                           p3, p2, p1, p0, q0, q1, q2, q3, bd);
    const int8_t mask =
        highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd);
    highbd_filter4(mask, *thresh, s - 2, s - 1, s, s + 1, bd);
    s += pitch;
  }
}

void vpx_highbd_lpf_vertical_4_dual_c(uint16_t *s, int pitch,
                                      const uint8_t *blimit0,
                                      const uint8_t *limit0,
                                      const uint8_t *thresh0,
                                      const uint8_t *blimit1,
                                      const uint8_t *limit1,
                                      const uint8_t *thresh1,
                                      int bd) {
void vpx_highbd_lpf_vertical_4_dual_c(
    uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
    const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
    const uint8_t *thresh1, int bd) {
  vpx_highbd_lpf_vertical_4_c(s, pitch, blimit0, limit0, thresh0, bd);
  vpx_highbd_lpf_vertical_4_c(s + 8 * pitch, pitch, blimit1, limit1,
                              thresh1, bd);
  vpx_highbd_lpf_vertical_4_c(s + 8 * pitch, pitch, blimit1, limit1, thresh1,
                              bd);
}

static INLINE void highbd_filter8(int8_t mask, uint8_t thresh, uint8_t flat,
                                  uint16_t *op3, uint16_t *op2,
                                  uint16_t *op1, uint16_t *op0,
                                  uint16_t *oq0, uint16_t *oq1,
                                  uint16_t *op3, uint16_t *op2, uint16_t *op1,
                                  uint16_t *op0, uint16_t *oq0, uint16_t *oq1,
                                  uint16_t *oq2, uint16_t *oq3, int bd) {
  if (flat && mask) {
    const uint16_t p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0;
@@ -551,25 +528,20 @@ void vpx_highbd_lpf_horizontal_8_c(uint16_t *s, int p, const uint8_t *blimit,
    const uint16_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p];
    const uint16_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p];

    const int8_t mask = highbd_filter_mask(*limit, *blimit,
                                         p3, p2, p1, p0, q0, q1, q2, q3, bd);
    const int8_t flat = highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3,
                                          bd);
    highbd_filter8(mask, *thresh, flat,
                 s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p,
                 s, s + 1 * p, s + 2 * p, s + 3 * p, bd);
    const int8_t mask =
        highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd);
    const int8_t flat =
        highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, bd);
    highbd_filter8(mask, *thresh, flat, s - 4 * p, s - 3 * p, s - 2 * p,
                   s - 1 * p, s, s + 1 * p, s + 2 * p, s + 3 * p, bd);
    ++s;
  }
}

void vpx_highbd_lpf_horizontal_8_dual_c(uint16_t *s, int p,
                                        const uint8_t *blimit0,
                                        const uint8_t *limit0,
                                        const uint8_t *thresh0,
                                        const uint8_t *blimit1,
                                        const uint8_t *limit1,
                                        const uint8_t *thresh1,
                                        int bd) {
void vpx_highbd_lpf_horizontal_8_dual_c(
    uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
    const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
    const uint8_t *thresh1, int bd) {
  vpx_highbd_lpf_horizontal_8_c(s, p, blimit0, limit0, thresh0, bd);
  vpx_highbd_lpf_horizontal_8_c(s + 8, p, blimit1, limit1, thresh1, bd);
}
@@ -582,40 +554,31 @@ void vpx_highbd_lpf_vertical_8_c(uint16_t *s, int pitch, const uint8_t *blimit,
  for (i = 0; i < 8; ++i) {
    const uint16_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1];
    const uint16_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3];
    const int8_t mask = highbd_filter_mask(*limit, *blimit,
                                           p3, p2, p1, p0, q0, q1, q2, q3, bd);
    const int8_t flat = highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3,
                                          bd);
    highbd_filter8(mask, *thresh, flat,
                 s - 4, s - 3, s - 2, s - 1,
                 s, s + 1, s + 2, s + 3,
                 bd);
    const int8_t mask =
        highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd);
    const int8_t flat =
        highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, bd);
    highbd_filter8(mask, *thresh, flat, s - 4, s - 3, s - 2, s - 1, s, s + 1,
                   s + 2, s + 3, bd);
    s += pitch;
  }
}

void vpx_highbd_lpf_vertical_8_dual_c(uint16_t *s, int pitch,
                                      const uint8_t *blimit0,
                                      const uint8_t *limit0,
                                      const uint8_t *thresh0,
                                      const uint8_t *blimit1,
                                      const uint8_t *limit1,
                                      const uint8_t *thresh1,
                                      int bd) {
void vpx_highbd_lpf_vertical_8_dual_c(
    uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
    const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
    const uint8_t *thresh1, int bd) {
  vpx_highbd_lpf_vertical_8_c(s, pitch, blimit0, limit0, thresh0, bd);
  vpx_highbd_lpf_vertical_8_c(s + 8 * pitch, pitch, blimit1, limit1,
                              thresh1, bd);
  vpx_highbd_lpf_vertical_8_c(s + 8 * pitch, pitch, blimit1, limit1, thresh1,
                              bd);
}

static INLINE void highbd_filter16(int8_t mask, uint8_t thresh,
                                   uint8_t flat, uint8_t flat2,
                                   uint16_t *op7, uint16_t *op6,
                                   uint16_t *op5, uint16_t *op4,
                                   uint16_t *op3, uint16_t *op2,
                                   uint16_t *op1, uint16_t *op0,
                                   uint16_t *oq0, uint16_t *oq1,
                                   uint16_t *oq2, uint16_t *oq3,
                                   uint16_t *oq4, uint16_t *oq5,
static INLINE void highbd_filter16(int8_t mask, uint8_t thresh, uint8_t flat,
                                   uint8_t flat2, uint16_t *op7, uint16_t *op6,
                                   uint16_t *op5, uint16_t *op4, uint16_t *op3,
                                   uint16_t *op2, uint16_t *op1, uint16_t *op0,
                                   uint16_t *oq0, uint16_t *oq1, uint16_t *oq2,
                                   uint16_t *oq3, uint16_t *oq4, uint16_t *oq5,
                                   uint16_t *oq6, uint16_t *oq7, int bd) {
  if (flat2 && flat && mask) {
    const uint16_t p7 = *op7;
@@ -636,34 +599,40 @@ static INLINE void highbd_filter16(int8_t mask, uint8_t thresh,
    const uint16_t q7 = *oq7;

    // 15-tap filter [1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1]
    *op6 = ROUND_POWER_OF_TWO(p7 * 7 + p6 * 2 + p5 + p4 + p3 + p2 + p1 + p0 +
                              q0, 4);
    *op5 = ROUND_POWER_OF_TWO(p7 * 6 + p6 + p5 * 2 + p4 + p3 + p2 + p1 + p0 +
                              q0 + q1, 4);
    *op4 = ROUND_POWER_OF_TWO(p7 * 5 + p6 + p5 + p4 * 2 + p3 + p2 + p1 + p0 +
                              q0 + q1 + q2, 4);
    *op3 = ROUND_POWER_OF_TWO(p7 * 4 + p6 + p5 + p4 + p3 * 2 + p2 + p1 + p0 +
                              q0 + q1 + q2 + q3, 4);
    *op2 = ROUND_POWER_OF_TWO(p7 * 3 + p6 + p5 + p4 + p3 + p2 * 2 + p1 + p0 +
                              q0 + q1 + q2 + q3 + q4, 4);
    *op6 = ROUND_POWER_OF_TWO(
        p7 * 7 + p6 * 2 + p5 + p4 + p3 + p2 + p1 + p0 + q0, 4);
    *op5 = ROUND_POWER_OF_TWO(
        p7 * 6 + p6 + p5 * 2 + p4 + p3 + p2 + p1 + p0 + q0 + q1, 4);
    *op4 = ROUND_POWER_OF_TWO(
        p7 * 5 + p6 + p5 + p4 * 2 + p3 + p2 + p1 + p0 + q0 + q1 + q2, 4);
    *op3 = ROUND_POWER_OF_TWO(
        p7 * 4 + p6 + p5 + p4 + p3 * 2 + p2 + p1 + p0 + q0 + q1 + q2 + q3, 4);
    *op2 = ROUND_POWER_OF_TWO(
        p7 * 3 + p6 + p5 + p4 + p3 + p2 * 2 + p1 + p0 + q0 + q1 + q2 + q3 + q4,
        4);
    *op1 = ROUND_POWER_OF_TWO(p7 * 2 + p6 + p5 + p4 + p3 + p2 + p1 * 2 + p0 +
                              q0 + q1 + q2 + q3 + q4 + q5, 4);
    *op0 = ROUND_POWER_OF_TWO(p7 + p6 + p5 + p4 + p3 + p2 + p1 + p0 * 2 +
                              q0 + q1 + q2 + q3 + q4 + q5 + q6, 4);
    *oq0 = ROUND_POWER_OF_TWO(p6 + p5 + p4 + p3 + p2 + p1 + p0 +
                              q0 * 2 + q1 + q2 + q3 + q4 + q5 + q6 + q7, 4);
    *oq1 = ROUND_POWER_OF_TWO(p5 + p4 + p3 + p2 + p1 + p0 +
                              q0 + q1 * 2 + q2 + q3 + q4 + q5 + q6 + q7 * 2, 4);
    *oq2 = ROUND_POWER_OF_TWO(p4 + p3 + p2 + p1 + p0 +
                              q0 + q1 + q2 * 2 + q3 + q4 + q5 + q6 + q7 * 3, 4);
    *oq3 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + p0 +
                              q0 + q1 + q2 + q3 * 2 + q4 + q5 + q6 + q7 * 4, 4);
    *oq4 = ROUND_POWER_OF_TWO(p2 + p1 + p0 +
                              q0 + q1 + q2 + q3 + q4 * 2 + q5 + q6 + q7 * 5, 4);
    *oq5 = ROUND_POWER_OF_TWO(p1 + p0 +
                              q0 + q1 + q2 + q3 + q4 + q5 * 2 + q6 + q7 * 6, 4);
    *oq6 = ROUND_POWER_OF_TWO(p0 +
                              q0 + q1 + q2 + q3 + q4 + q5 + q6 * 2 + q7 * 7, 4);
                                  q0 + q1 + q2 + q3 + q4 + q5,
 | 
			
		||||
                              4);
 | 
			
		||||
    *op0 = ROUND_POWER_OF_TWO(p7 + p6 + p5 + p4 + p3 + p2 + p1 + p0 * 2 + q0 +
 | 
			
		||||
                                  q1 + q2 + q3 + q4 + q5 + q6,
 | 
			
		||||
                              4);
 | 
			
		||||
    *oq0 = ROUND_POWER_OF_TWO(p6 + p5 + p4 + p3 + p2 + p1 + p0 + q0 * 2 + q1 +
 | 
			
		||||
                                  q2 + q3 + q4 + q5 + q6 + q7,
 | 
			
		||||
                              4);
 | 
			
		||||
    *oq1 = ROUND_POWER_OF_TWO(p5 + p4 + p3 + p2 + p1 + p0 + q0 + q1 * 2 + q2 +
 | 
			
		||||
                                  q3 + q4 + q5 + q6 + q7 * 2,
 | 
			
		||||
                              4);
 | 
			
		||||
    *oq2 = ROUND_POWER_OF_TWO(
 | 
			
		||||
        p4 + p3 + p2 + p1 + p0 + q0 + q1 + q2 * 2 + q3 + q4 + q5 + q6 + q7 * 3,
 | 
			
		||||
        4);
 | 
			
		||||
    *oq3 = ROUND_POWER_OF_TWO(
 | 
			
		||||
        p3 + p2 + p1 + p0 + q0 + q1 + q2 + q3 * 2 + q4 + q5 + q6 + q7 * 4, 4);
 | 
			
		||||
    *oq4 = ROUND_POWER_OF_TWO(
 | 
			
		||||
        p2 + p1 + p0 + q0 + q1 + q2 + q3 + q4 * 2 + q5 + q6 + q7 * 5, 4);
 | 
			
		||||
    *oq5 = ROUND_POWER_OF_TWO(
 | 
			
		||||
        p1 + p0 + q0 + q1 + q2 + q3 + q4 + q5 * 2 + q6 + q7 * 6, 4);
 | 
			
		||||
    *oq6 = ROUND_POWER_OF_TWO(
 | 
			
		||||
        p0 + q0 + q1 + q2 + q3 + q4 + q5 + q6 * 2 + q7 * 7, 4);
 | 
			
		||||
  } else {
 | 
			
		||||
    highbd_filter8(mask, thresh, flat, op3, op2, op1, op0, oq0, oq1, oq2, oq3,
 | 
			
		||||
                   bd);
 | 
			
		||||
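For reference, every tap sum in the 15-tap path above is normalized with ROUND_POWER_OF_TWO(value, 4): the kernel weights sum to 16, so the result is a rounded divide by 16. A minimal sketch, assuming the macro matches its definition in vpx_dsp/vpx_dsp_common.h:

  /* Rounded right shift: add half the divisor before shifting. */
  #define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n)-1))) >> (n))

  /* Example: ROUND_POWER_OF_TWO(40, 4) == (40 + 8) >> 4 == 3,
     i.e. 40 / 16 rounded to the nearest integer. */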
@@ -673,8 +642,8 @@ static INLINE void highbd_filter16(int8_t mask, uint8_t thresh,
static void highbd_mb_lpf_horizontal_edge_w(uint16_t *s, int p,
                                            const uint8_t *blimit,
                                            const uint8_t *limit,
                                            const uint8_t *thresh,
                                            int count, int bd) {
                                            const uint8_t *thresh, int count,
                                            int bd) {
  int i;

  // loop filter designed to work using chars so that we can make maximum use
@@ -688,20 +657,18 @@ static void highbd_mb_lpf_horizontal_edge_w(uint16_t *s, int p,
    const uint16_t q1 = s[1 * p];
    const uint16_t q2 = s[2 * p];
    const uint16_t q3 = s[3 * p];
    const int8_t mask = highbd_filter_mask(*limit, *blimit,
                                           p3, p2, p1, p0, q0, q1, q2, q3, bd);
    const int8_t flat = highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3,
                                          bd);
    const int8_t flat2 = highbd_flat_mask5(
        1, s[-8 * p], s[-7 * p], s[-6 * p], s[-5 * p], p0,
        q0, s[4 * p], s[5 * p], s[6 * p], s[7 * p], bd);
    const int8_t mask =
        highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd);
    const int8_t flat =
        highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, bd);
    const int8_t flat2 =
        highbd_flat_mask5(1, s[-8 * p], s[-7 * p], s[-6 * p], s[-5 * p], p0, q0,
                          s[4 * p], s[5 * p], s[6 * p], s[7 * p], bd);

    highbd_filter16(mask, *thresh, flat, flat2,
                    s - 8 * p, s - 7 * p, s - 6 * p, s - 5 * p,
                    s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p,
                    s, s + 1 * p, s + 2 * p, s + 3 * p,
                    s + 4 * p, s + 5 * p, s + 6 * p, s + 7 * p,
                    bd);
    highbd_filter16(mask, *thresh, flat, flat2, s - 8 * p, s - 7 * p, s - 6 * p,
                    s - 5 * p, s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p, s,
                    s + 1 * p, s + 2 * p, s + 3 * p, s + 4 * p, s + 5 * p,
                    s + 6 * p, s + 7 * p, bd);
    ++s;
  }
}
@@ -723,8 +690,8 @@ void vpx_highbd_lpf_horizontal_edge_16_c(uint16_t *s, int p,
static void highbd_mb_lpf_vertical_edge_w(uint16_t *s, int p,
                                          const uint8_t *blimit,
                                          const uint8_t *limit,
                                          const uint8_t *thresh,
                                          int count, int bd) {
                                          const uint8_t *thresh, int count,
                                          int bd) {
  int i;

  for (i = 0; i < count; ++i) {
@@ -736,17 +703,16 @@ static void highbd_mb_lpf_vertical_edge_w(uint16_t *s, int p,
    const uint16_t q1 = s[1];
    const uint16_t q2 = s[2];
    const uint16_t q3 = s[3];
    const int8_t mask = highbd_filter_mask(*limit, *blimit,
                                           p3, p2, p1, p0, q0, q1, q2, q3, bd);
    const int8_t flat = highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3,
                                          bd);
    const int8_t mask =
        highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd);
    const int8_t flat =
        highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, bd);
    const int8_t flat2 = highbd_flat_mask5(1, s[-8], s[-7], s[-6], s[-5], p0,
                                           q0, s[4], s[5], s[6], s[7], bd);

    highbd_filter16(mask, *thresh, flat, flat2,
                    s - 8, s - 7, s - 6, s - 5, s - 4, s - 3, s - 2, s - 1,
                    s, s + 1, s + 2, s + 3, s + 4, s + 5, s + 6, s + 7,
                    bd);
    highbd_filter16(mask, *thresh, flat, flat2, s - 8, s - 7, s - 6, s - 5,
                    s - 4, s - 3, s - 2, s - 1, s, s + 1, s + 2, s + 3, s + 4,
                    s + 5, s + 6, s + 7, bd);
    s += p;
  }
}
@@ -760,8 +726,7 @@ void vpx_highbd_lpf_vertical_16_c(uint16_t *s, int p, const uint8_t *blimit,
void vpx_highbd_lpf_vertical_16_dual_c(uint16_t *s, int p,
                                       const uint8_t *blimit,
                                       const uint8_t *limit,
                                       const uint8_t *thresh,
                                       int bd) {
                                       const uint8_t *thresh, int bd) {
  highbd_mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 16, bd);
}
#endif  // CONFIG_VP9_HIGHBITDEPTH

@@ -12,8 +12,8 @@
#include "./macros_msa.h"

void vpx_plane_add_noise_msa(uint8_t *start_ptr, const int8_t *noise,
                             int blackclamp, int whiteclamp,
                             int width, int height, int32_t pitch) {
                             int blackclamp, int whiteclamp, int width,
                             int height, int32_t pitch) {
  uint32_t i, j;

  for (i = 0; i < height / 2; ++i) {

@@ -24,37 +24,21 @@ extern "C" {
extern uint8_t *vpx_ff_cropTbl;  // From "vpx_dsp/mips/intrapred4_dspr2.c"

static INLINE void prefetch_load(const unsigned char *src) {
  __asm__ __volatile__ (
      "pref   0,  0(%[src])   \n\t"
      :
      : [src] "r" (src)
  );
  __asm__ __volatile__("pref   0,  0(%[src])   \n\t" : : [src] "r"(src));
}

/* prefetch data for store */
static INLINE void prefetch_store(unsigned char *dst) {
  __asm__ __volatile__ (
      "pref   1,  0(%[dst])   \n\t"
      :
      : [dst] "r" (dst)
  );
  __asm__ __volatile__("pref   1,  0(%[dst])   \n\t" : : [dst] "r"(dst));
}

static INLINE void prefetch_load_streamed(const unsigned char *src) {
  __asm__ __volatile__ (
      "pref   4,  0(%[src])   \n\t"
      :
      : [src] "r" (src)
  );
  __asm__ __volatile__("pref   4,  0(%[src])   \n\t" : : [src] "r"(src));
}

/* prefetch data for store */
static INLINE void prefetch_store_streamed(unsigned char *dst) {
  __asm__ __volatile__ (
      "pref   5,  0(%[dst])   \n\t"
      :
      : [dst] "r" (dst)
  );
  __asm__ __volatile__("pref   5,  0(%[dst])   \n\t" : : [dst] "r"(dst));
}
#endif  // #if HAVE_DSPR2
#ifdef __cplusplus

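The four prefetch helpers above differ only in the pref hint field. As a rough summary, assuming the standard MIPS32 hint encodings: 0 = load, 1 = store, 4 = load_streamed, 5 = store_streamed, where the streamed variants mark data as touched once with no expected reuse. A sketch of how a caller might walk a row with them (the 32-byte cache-line step is an assumption, not taken from this file):

  /* Sketch: prefetch an entire pixel row ahead of the filter loop. */
  static INLINE void prefetch_row(const unsigned char *row, int width) {
    int i;
    for (i = 0; i < width; i += 32) { /* one pref per assumed cache line */
      __asm__ __volatile__("pref   0,  0(%[p])   \n\t" : : [p] "r"(row + i));
    }
  }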
@@ -18,12 +18,9 @@
#include "vpx_ports/mem.h"

#if HAVE_DSPR2
static void convolve_bi_avg_vert_4_dspr2(const uint8_t *src,
                                         int32_t src_stride,
                                         uint8_t *dst,
                                         int32_t dst_stride,
                                         const int16_t *filter_y,
                                         int32_t w,
static void convolve_bi_avg_vert_4_dspr2(const uint8_t *src, int32_t src_stride,
                                         uint8_t *dst, int32_t dst_stride,
                                         const int16_t *filter_y, int32_t w,
                                         int32_t h) {
  int32_t x, y;
  const uint8_t *src_ptr;
@@ -105,16 +102,13 @@ static void convolve_bi_avg_vert_4_dspr2(const uint8_t *src,
          "sb               %[store1],    2(%[dst_ptr])                   \n\t"
          "sb               %[store2],    3(%[dst_ptr])                   \n\t"

          : [load1] "=&r" (load1), [load2] "=&r" (load2),
            [p1] "=&r" (p1), [p2] "=&r" (p2),
            [scratch1] "=&r" (scratch1), [scratch2] "=&r" (scratch2),
            [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
            [store1] "=&r" (store1), [store2] "=&r" (store2),
            [src_ptr] "+r" (src_ptr)
          : [load1] "=&r"(load1), [load2] "=&r"(load2), [p1] "=&r"(p1),
            [p2] "=&r"(p2), [scratch1] "=&r"(scratch1),
            [scratch2] "=&r"(scratch2), [Temp1] "=&r"(Temp1),
            [Temp2] "=&r"(Temp2), [store1] "=&r"(store1),
            [store2] "=&r"(store2), [src_ptr] "+r"(src_ptr)
          : [filter45] "r"(filter45), [vector4a] "r"(vector4a),
            [src_stride] "r" (src_stride), [cm] "r" (cm),
            [dst_ptr] "r" (dst_ptr)
      );
            [src_stride] "r"(src_stride), [cm] "r"(cm), [dst_ptr] "r"(dst_ptr));
    }

    /* Next row... */
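In the operand lists above (and throughout these dspr2 files), "=&r" is an early-clobber write-only register output, "+r" is a read-write register operand, and plain "r" is a register input; clang-format only re-wraps the lists, the constraint strings themselves are untouched. A minimal self-contained example in the same shape, using only the MIPS addu instruction:

  #include <stdint.h>

  /* Sketch: sum = a + b via extended asm, mirroring the constraint
     style of the convolve kernels above. */
  static uint32_t add_u32(uint32_t a, uint32_t b) {
    uint32_t sum;
    __asm__ __volatile__("addu    %[sum],    %[a],    %[b]    \n\t"
                         : [sum] "=&r"(sum)
                         : [a] "r"(a), [b] "r"(b));
    return sum;
  }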
@@ -124,11 +118,9 @@ static void convolve_bi_avg_vert_4_dspr2(const uint8_t *src,
}

static void convolve_bi_avg_vert_64_dspr2(const uint8_t *src,
                                          int32_t src_stride,
                                          uint8_t *dst,
                                          int32_t src_stride, uint8_t *dst,
                                          int32_t dst_stride,
                                          const int16_t *filter_y,
                                          int32_t h) {
                                          const int16_t *filter_y, int32_t h) {
  int32_t x, y;
  const uint8_t *src_ptr;
  uint8_t *dst_ptr;
@@ -140,7 +132,7 @@ static void convolve_bi_avg_vert_64_dspr2(const uint8_t *src,
  uint32_t store1, store2;
  int32_t Temp1, Temp2;
  const int16_t *filter = &filter_y[3];
  uint32_t filter45;;
  uint32_t filter45;

  filter45 = ((const int32_t *)filter)[0];

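The filter45 idiom above reads filter[3] and filter[4], the two non-zero taps of a bilinear filter expressed in vpx's 8-tap array form, as one 32-bit word so that both 16-bit coefficients can feed a single DSP multiply. A sketch of the equivalent packing, assuming little-endian layout:

  #include <stdint.h>

  /* taps[3] ends up in the low halfword, taps[4] in the high one. */
  static uint32_t pack_filter45(const int16_t *taps /* 8-tap array */) {
    return (uint16_t)taps[3] | ((uint32_t)(uint16_t)taps[4] << 16);
  }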
@@ -210,16 +202,13 @@ static void convolve_bi_avg_vert_64_dspr2(const uint8_t *src,
          "sb               %[store1],    2(%[dst_ptr])                   \n\t"
          "sb               %[store2],    3(%[dst_ptr])                   \n\t"

          : [load1] "=&r" (load1), [load2] "=&r" (load2),
            [p1] "=&r" (p1), [p2] "=&r" (p2),
            [scratch1] "=&r" (scratch1), [scratch2] "=&r" (scratch2),
            [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
            [store1] "=&r" (store1), [store2] "=&r" (store2),
            [src_ptr] "+r" (src_ptr)
          : [load1] "=&r"(load1), [load2] "=&r"(load2), [p1] "=&r"(p1),
            [p2] "=&r"(p2), [scratch1] "=&r"(scratch1),
            [scratch2] "=&r"(scratch2), [Temp1] "=&r"(Temp1),
            [Temp2] "=&r"(Temp2), [store1] "=&r"(store1),
            [store2] "=&r"(store2), [src_ptr] "+r"(src_ptr)
          : [filter45] "r"(filter45), [vector4a] "r"(vector4a),
            [src_stride] "r" (src_stride), [cm] "r" (cm),
            [dst_ptr] "r" (dst_ptr)
      );
            [src_stride] "r"(src_stride), [cm] "r"(cm), [dst_ptr] "r"(dst_ptr));
    }

    /* Next row... */
@@ -231,18 +220,16 @@ static void convolve_bi_avg_vert_64_dspr2(const uint8_t *src,
void vpx_convolve2_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                                  uint8_t *dst, ptrdiff_t dst_stride,
                                  const int16_t *filter_x, int x_step_q4,
                                  const int16_t *filter_y, int y_step_q4,
                                  int w, int h) {
                                  const int16_t *filter_y, int y_step_q4, int w,
                                  int h) {
  uint32_t pos = 38;

  assert(y_step_q4 == 16);

  /* bit positon for extract from acc */
  __asm__ __volatile__ (
    "wrdsp      %[pos],     1           \n\t"
  __asm__ __volatile__("wrdsp      %[pos],     1           \n\t"
                       :
    : [pos] "r" (pos)
  );
                       : [pos] "r"(pos));
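Here pos = 38 is written into the DSP control register: the second operand of wrdsp is a field mask, and, if memory serves on the DSP ASE encoding, mask bit 0 selects the pos field that the extp-family instructions later use when extracting filtered samples from the 64-bit accumulator. The wrapper shape, as a sketch:

  /* Sketch: set DSPControl.pos so subsequent extp-style extracts
     pull from the given accumulator bit position. */
  static INLINE void set_extract_pos(uint32_t pos) {
    __asm__ __volatile__("wrdsp      %[pos],     1           \n\t"
                         :
                         : [pos] "r"(pos));
  }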

  prefetch_store(dst);

@@ -251,22 +238,17 @@ void vpx_convolve2_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
    case 8:
    case 16:
    case 32:
      convolve_bi_avg_vert_4_dspr2(src, src_stride,
                                   dst, dst_stride,
                                   filter_y, w, h);
      convolve_bi_avg_vert_4_dspr2(src, src_stride, dst, dst_stride, filter_y,
                                   w, h);
      break;
    case 64:
      prefetch_store(dst + 32);
      convolve_bi_avg_vert_64_dspr2(src, src_stride,
                                    dst, dst_stride,
                                    filter_y, h);
      convolve_bi_avg_vert_64_dspr2(src, src_stride, dst, dst_stride, filter_y,
                                    h);
      break;
    default:
      vpx_convolve8_avg_vert_c(src, src_stride,
                               dst, dst_stride,
                               filter_x, x_step_q4,
                               filter_y, y_step_q4,
                               w, h);
      vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter_x,
                               x_step_q4, filter_y, y_step_q4, w, h);
      break;
  }
}

@@ -19,11 +19,9 @@

#if HAVE_DSPR2
static void convolve_bi_avg_horiz_4_dspr2(const uint8_t *src,
                                          int32_t src_stride,
                                          uint8_t *dst,
                                          int32_t src_stride, uint8_t *dst,
                                          int32_t dst_stride,
                                          const int16_t *filter_x0,
                                          int32_t h) {
                                          const int16_t *filter_x0, int32_t h) {
  int32_t y;
  uint8_t *cm = vpx_ff_cropTbl;
  int32_t Temp1, Temp2, Temp3, Temp4;
@@ -98,14 +96,12 @@ static void convolve_bi_avg_horiz_4_dspr2(const uint8_t *src,
        "addqh_r.w        %[p2],          %[p2],          %[p3]          \n\t" /* average odd 2 */
        "sb               %[p2],          3(%[dst])                      \n\t" /* store odd 2 */

        : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
          [tn1] "=&r" (tn1), [tn2] "=&r" (tn2),
          [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3),
          [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
          [Temp3] "=&r" (Temp3), [Temp4] "=&r" (Temp4)
        : [filter45] "r" (filter45), [vector4a] "r" (vector4a),
          [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
    );
        : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn1] "=&r"(tn1),
          [tn2] "=&r"(tn2), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3),
          [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
          [Temp4] "=&r"(Temp4)
        : [filter45] "r"(filter45), [vector4a] "r"(vector4a), [cm] "r"(cm),
          [dst] "r"(dst), [src] "r"(src));

    /* Next row... */
    src += src_stride;
@@ -114,11 +110,9 @@ static void convolve_bi_avg_horiz_4_dspr2(const uint8_t *src,
}

static void convolve_bi_avg_horiz_8_dspr2(const uint8_t *src,
                                         int32_t src_stride,
                                         uint8_t *dst,
                                          int32_t src_stride, uint8_t *dst,
                                          int32_t dst_stride,
                                         const int16_t *filter_x0,
                                         int32_t h) {
                                          const int16_t *filter_x0, int32_t h) {
  int32_t y;
  uint8_t *cm = vpx_ff_cropTbl;
  uint32_t vector4a = 64;
@@ -127,7 +121,7 @@ static void convolve_bi_avg_horiz_8_dspr2(const uint8_t *src,
  uint32_t p1, p2, p3, p4, n1;
  uint32_t st0, st1;
  const int16_t *filter = &filter_x0[3];
  uint32_t filter45;;
  uint32_t filter45;

  filter45 = ((const int32_t *)filter)[0];

@@ -246,15 +240,12 @@ static void convolve_bi_avg_horiz_8_dspr2(const uint8_t *src,
        "sb               %[tp4],         5(%[dst])                      \n\t"
        "sb               %[tp1],         7(%[dst])                      \n\t"

        : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
          [tp3] "=&r" (tp3), [tp4] "=&r" (tp4),
          [st0] "=&r" (st0), [st1] "=&r" (st1),
          [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
          [n1] "=&r" (n1),
        : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3),
          [tp4] "=&r"(tp4), [st0] "=&r"(st0), [st1] "=&r"(st1), [p1] "=&r"(p1),
          [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4), [n1] "=&r"(n1),
          [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3)
        : [filter45] "r" (filter45), [vector4a] "r" (vector4a),
          [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
    );
        : [filter45] "r"(filter45), [vector4a] "r"(vector4a), [cm] "r"(cm),
          [dst] "r"(dst), [src] "r"(src));

    /* Next row... */
    src += src_stride;
@@ -263,11 +254,9 @@ static void convolve_bi_avg_horiz_8_dspr2(const uint8_t *src,
}

static void convolve_bi_avg_horiz_16_dspr2(const uint8_t *src_ptr,
                                          int32_t src_stride,
                                          uint8_t *dst_ptr,
                                           int32_t src_stride, uint8_t *dst_ptr,
                                           int32_t dst_stride,
                                          const int16_t *filter_x0,
                                          int32_t h,
                                           const int16_t *filter_x0, int32_t h,
                                           int32_t count) {
  int32_t y, c;
  const uint8_t *src;
@@ -279,7 +268,7 @@ static void convolve_bi_avg_horiz_16_dspr2(const uint8_t *src_ptr,
  uint32_t p1, p2, p3, p4, p5;
  uint32_t st1, st2, st3;
  const int16_t *filter = &filter_x0[3];
  uint32_t filter45;;
  uint32_t filter45;

  filter45 = ((const int32_t *)filter)[0];

@@ -493,14 +482,13 @@ static void convolve_bi_avg_horiz_16_dspr2(const uint8_t *src_ptr,
          "sb               %[qload3],    13(%[dst])                   \n\t" /* store odd 7 to dst */
          "sb               %[qload1],    15(%[dst])                   \n\t" /* store odd 8 to dst */

          : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2),
            [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3),
            [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
            [qload3] "=&r" (qload3), [p5] "=&r" (p5),
            [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3)
          : [filter45] "r" (filter45), [vector_64] "r" (vector_64),
            [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
      );
          : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), [st1] "=&r"(st1),
            [st2] "=&r"(st2), [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2),
            [p3] "=&r"(p3), [p4] "=&r"(p4), [qload3] "=&r"(qload3),
            [p5] "=&r"(p5), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2),
            [Temp3] "=&r"(Temp3)
          : [filter45] "r"(filter45), [vector_64] "r"(vector_64), [cm] "r"(cm),
            [dst] "r"(dst), [src] "r"(src));

      src += 16;
      dst += 16;
@@ -513,8 +501,7 @@ static void convolve_bi_avg_horiz_16_dspr2(const uint8_t *src_ptr,
}

static void convolve_bi_avg_horiz_64_dspr2(const uint8_t *src_ptr,
                                          int32_t src_stride,
                                          uint8_t *dst_ptr,
                                           int32_t src_stride, uint8_t *dst_ptr,
                                           int32_t dst_stride,
                                           const int16_t *filter_x0,
                                           int32_t h) {
@@ -528,7 +515,7 @@ static void convolve_bi_avg_horiz_64_dspr2(const uint8_t *src_ptr,
  uint32_t p1, p2, p3, p4, p5;
  uint32_t st1, st2, st3;
  const int16_t *filter = &filter_x0[3];
  uint32_t filter45;;
  uint32_t filter45;

  filter45 = ((const int32_t *)filter)[0];

@@ -744,14 +731,13 @@ static void convolve_bi_avg_horiz_64_dspr2(const uint8_t *src_ptr,
          "sb               %[qload3],    13(%[dst])                   \n\t" /* store odd 7 to dst */
          "sb               %[qload1],    15(%[dst])                   \n\t" /* store odd 8 to dst */

          : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2),
            [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3),
            [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
            [qload3] "=&r" (qload3), [p5] "=&r" (p5),
            [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3)
          : [filter45] "r" (filter45), [vector_64] "r" (vector_64),
            [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
      );
          : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), [st1] "=&r"(st1),
            [st2] "=&r"(st2), [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2),
            [p3] "=&r"(p3), [p4] "=&r"(p4), [qload3] "=&r"(qload3),
            [p5] "=&r"(p5), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2),
            [Temp3] "=&r"(Temp3)
          : [filter45] "r"(filter45), [vector_64] "r"(vector_64), [cm] "r"(cm),
            [dst] "r"(dst), [src] "r"(src));

      src += 16;
      dst += 16;
@@ -773,11 +759,9 @@ void vpx_convolve2_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
  assert(x_step_q4 == 16);

  /* bit positon for extract from acc */
  __asm__ __volatile__ (
    "wrdsp      %[pos],     1           \n\t"
  __asm__ __volatile__("wrdsp      %[pos],     1           \n\t"
                       :
    : [pos] "r" (pos)
  );
                       : [pos] "r"(pos));

  /* prefetch data to cache memory */
  prefetch_load(src);
@@ -786,39 +770,31 @@ void vpx_convolve2_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,

  switch (w) {
    case 4:
      convolve_bi_avg_horiz_4_dspr2(src, src_stride,
                                   dst, dst_stride,
                                   filter_x, h);
      convolve_bi_avg_horiz_4_dspr2(src, src_stride, dst, dst_stride, filter_x,
                                    h);
      break;
    case 8:
      convolve_bi_avg_horiz_8_dspr2(src, src_stride,
                                   dst, dst_stride,
                                   filter_x, h);
      convolve_bi_avg_horiz_8_dspr2(src, src_stride, dst, dst_stride, filter_x,
                                    h);
      break;
    case 16:
      convolve_bi_avg_horiz_16_dspr2(src, src_stride,
                                    dst, dst_stride,
                                    filter_x, h, 1);
      convolve_bi_avg_horiz_16_dspr2(src, src_stride, dst, dst_stride, filter_x,
                                     h, 1);
      break;
    case 32:
      convolve_bi_avg_horiz_16_dspr2(src, src_stride,
                                    dst, dst_stride,
                                    filter_x, h, 2);
      convolve_bi_avg_horiz_16_dspr2(src, src_stride, dst, dst_stride, filter_x,
                                     h, 2);
      break;
    case 64:
      prefetch_load(src + 64);
      prefetch_store(dst + 32);

      convolve_bi_avg_horiz_64_dspr2(src, src_stride,
                                    dst, dst_stride,
                                    filter_x, h);
      convolve_bi_avg_horiz_64_dspr2(src, src_stride, dst, dst_stride, filter_x,
                                     h);
      break;
    default:
      vpx_convolve8_avg_horiz_c(src, src_stride,
                                dst, dst_stride,
                                filter_x, x_step_q4,
                                filter_y, y_step_q4,
                                w, h);
      vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter_x,
                                x_step_q4, filter_y, y_step_q4, w, h);
      break;
  }
}

File diff suppressed because it is too large

							@@ -18,12 +18,9 @@
 | 
			
		||||
#include "vpx_ports/mem.h"
 | 
			
		||||
 | 
			
		||||
#if HAVE_DSPR2
 | 
			
		||||
static void convolve_bi_horiz_4_dspr2(const uint8_t *src,
 | 
			
		||||
                                      int32_t src_stride,
 | 
			
		||||
                                      uint8_t *dst,
 | 
			
		||||
                                      int32_t dst_stride,
 | 
			
		||||
                                      const int16_t *filter_x0,
 | 
			
		||||
                                      int32_t h) {
 | 
			
		||||
static void convolve_bi_horiz_4_dspr2(const uint8_t *src, int32_t src_stride,
 | 
			
		||||
                                      uint8_t *dst, int32_t dst_stride,
 | 
			
		||||
                                      const int16_t *filter_x0, int32_t h) {
 | 
			
		||||
  int32_t y;
 | 
			
		||||
  uint8_t *cm = vpx_ff_cropTbl;
 | 
			
		||||
  int32_t Temp1, Temp2, Temp3, Temp4;
 | 
			
		||||
@@ -31,7 +28,7 @@ static void convolve_bi_horiz_4_dspr2(const uint8_t *src,
 | 
			
		||||
  uint32_t tp1, tp2;
 | 
			
		||||
  uint32_t p1, p2;
 | 
			
		||||
  const int16_t *filter = &filter_x0[3];
 | 
			
		||||
  uint32_t filter45;;
 | 
			
		||||
  uint32_t filter45;
 | 
			
		||||
 | 
			
		||||
  filter45 = ((const int32_t *)filter)[0];
 | 
			
		||||
 | 
			
		||||
@@ -86,13 +83,11 @@ static void convolve_bi_horiz_4_dspr2(const uint8_t *src,
 | 
			
		||||
        "sb               %[tp2],      2(%[dst])                      \n\t"
 | 
			
		||||
        "sb               %[p2],       3(%[dst])                      \n\t"
 | 
			
		||||
 | 
			
		||||
        : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
 | 
			
		||||
          [p1] "=&r" (p1), [p2] "=&r" (p2),
 | 
			
		||||
          [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
 | 
			
		||||
          [Temp3] "=&r" (Temp3), [Temp4] "=&r" (Temp4)
 | 
			
		||||
        : [filter45] "r" (filter45), [vector4a] "r" (vector4a),
 | 
			
		||||
          [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
 | 
			
		||||
    );
 | 
			
		||||
        : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [p1] "=&r"(p1), [p2] "=&r"(p2),
 | 
			
		||||
          [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
 | 
			
		||||
          [Temp4] "=&r"(Temp4)
 | 
			
		||||
        : [filter45] "r"(filter45), [vector4a] "r"(vector4a), [cm] "r"(cm),
 | 
			
		||||
          [dst] "r"(dst), [src] "r"(src));
 | 
			
		||||
 | 
			
		||||
    /* Next row... */
 | 
			
		||||
    src += src_stride;
 | 
			
		||||
@@ -100,12 +95,9 @@ static void convolve_bi_horiz_4_dspr2(const uint8_t *src,
 | 
			
		||||
  }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
static void convolve_bi_horiz_8_dspr2(const uint8_t *src,
 | 
			
		||||
                                      int32_t src_stride,
 | 
			
		||||
                                      uint8_t *dst,
 | 
			
		||||
                                      int32_t dst_stride,
 | 
			
		||||
                                      const int16_t *filter_x0,
 | 
			
		||||
                                      int32_t h) {
 | 
			
		||||
static void convolve_bi_horiz_8_dspr2(const uint8_t *src, int32_t src_stride,
 | 
			
		||||
                                      uint8_t *dst, int32_t dst_stride,
 | 
			
		||||
                                      const int16_t *filter_x0, int32_t h) {
 | 
			
		||||
  int32_t y;
 | 
			
		||||
  uint8_t *cm = vpx_ff_cropTbl;
 | 
			
		||||
  uint32_t vector4a = 64;
 | 
			
		||||
@@ -114,7 +106,7 @@ static void convolve_bi_horiz_8_dspr2(const uint8_t *src,
 | 
			
		||||
  uint32_t p1, p2, p3, p4;
 | 
			
		||||
  uint32_t st0, st1;
 | 
			
		||||
  const int16_t *filter = &filter_x0[3];
 | 
			
		||||
  uint32_t filter45;;
 | 
			
		||||
  uint32_t filter45;
 | 
			
		||||
 | 
			
		||||
  filter45 = ((const int32_t *)filter)[0];
 | 
			
		||||
 | 
			
		||||
@@ -211,12 +203,11 @@ static void convolve_bi_horiz_8_dspr2(const uint8_t *src,
 | 
			
		||||
        "sb               %[p1],       7(%[dst])                      \n\t"
 | 
			
		||||
 | 
			
		||||
        : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3),
 | 
			
		||||
          [st0] "=&r" (st0), [st1] "=&r" (st1),
 | 
			
		||||
          [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
 | 
			
		||||
          [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3)
 | 
			
		||||
        : [filter45] "r" (filter45), [vector4a] "r" (vector4a),
 | 
			
		||||
          [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
 | 
			
		||||
    );
 | 
			
		||||
          [st0] "=&r"(st0), [st1] "=&r"(st1), [p1] "=&r"(p1), [p2] "=&r"(p2),
 | 
			
		||||
          [p3] "=&r"(p3), [p4] "=&r"(p4), [Temp1] "=&r"(Temp1),
 | 
			
		||||
          [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3)
 | 
			
		||||
        : [filter45] "r"(filter45), [vector4a] "r"(vector4a), [cm] "r"(cm),
 | 
			
		||||
          [dst] "r"(dst), [src] "r"(src));
 | 
			
		||||
 | 
			
		||||
    /* Next row... */
 | 
			
		||||
    src += src_stride;
 | 
			
		||||
@@ -225,11 +216,9 @@ static void convolve_bi_horiz_8_dspr2(const uint8_t *src,
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
static void convolve_bi_horiz_16_dspr2(const uint8_t *src_ptr,
 | 
			
		||||
                                       int32_t src_stride,
 | 
			
		||||
                                       uint8_t *dst_ptr,
 | 
			
		||||
                                       int32_t src_stride, uint8_t *dst_ptr,
 | 
			
		||||
                                       int32_t dst_stride,
 | 
			
		||||
                                       const int16_t *filter_x0,
 | 
			
		||||
                                       int32_t h,
 | 
			
		||||
                                       const int16_t *filter_x0, int32_t h,
 | 
			
		||||
                                       int32_t count) {
 | 
			
		||||
  int32_t y, c;
 | 
			
		||||
  const uint8_t *src;
 | 
			
		||||
@@ -241,7 +230,7 @@ static void convolve_bi_horiz_16_dspr2(const uint8_t *src_ptr,
 | 
			
		||||
  uint32_t p1, p2, p3, p4, p5;
 | 
			
		||||
  uint32_t st1, st2, st3;
 | 
			
		||||
  const int16_t *filter = &filter_x0[3];
 | 
			
		||||
  uint32_t filter45;;
 | 
			
		||||
  uint32_t filter45;
 | 
			
		||||
 | 
			
		||||
  filter45 = ((const int32_t *)filter)[0];
 | 
			
		||||
 | 
			
		||||
@@ -413,14 +402,13 @@ static void convolve_bi_horiz_16_dspr2(const uint8_t *src_ptr,
 | 
			
		||||
          "sb               %[st3],       13(%[dst])                   \n\t" /* odd 7 */
 | 
			
		||||
          "sb               %[st1],       15(%[dst])                   \n\t" /* odd 8 */
 | 
			
		||||
 | 
			
		||||
          : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), [qload3] "=&r" (qload3),
 | 
			
		||||
            [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3),
 | 
			
		||||
            [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
 | 
			
		||||
            [p5] "=&r" (p5),
 | 
			
		||||
            [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3)
 | 
			
		||||
          : [filter45] "r" (filter45), [vector_64] "r" (vector_64),
 | 
			
		||||
            [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
 | 
			
		||||
      );
 | 
			
		||||
          : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2),
 | 
			
		||||
            [qload3] "=&r"(qload3), [st1] "=&r"(st1), [st2] "=&r"(st2),
 | 
			
		||||
            [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3),
 | 
			
		||||
            [p4] "=&r"(p4), [p5] "=&r"(p5), [Temp1] "=&r"(Temp1),
 | 
			
		||||
            [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3)
 | 
			
		||||
          : [filter45] "r"(filter45), [vector_64] "r"(vector_64), [cm] "r"(cm),
 | 
			
		||||
            [dst] "r"(dst), [src] "r"(src));
 | 
			
		||||
 | 
			
		||||
      src += 16;
 | 
			
		||||
      dst += 16;
 | 
			
		||||
@@ -433,11 +421,9 @@ static void convolve_bi_horiz_16_dspr2(const uint8_t *src_ptr,
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
static void convolve_bi_horiz_64_dspr2(const uint8_t *src_ptr,
 | 
			
		||||
                                       int32_t src_stride,
 | 
			
		||||
                                       uint8_t *dst_ptr,
 | 
			
		||||
                                       int32_t src_stride, uint8_t *dst_ptr,
 | 
			
		||||
                                       int32_t dst_stride,
 | 
			
		||||
                                       const int16_t *filter_x0,
 | 
			
		||||
                                       int32_t h) {
 | 
			
		||||
                                       const int16_t *filter_x0, int32_t h) {
 | 
			
		||||
  int32_t y, c;
 | 
			
		||||
  const uint8_t *src;
 | 
			
		||||
  uint8_t *dst;
 | 
			
		||||
@@ -448,7 +434,7 @@ static void convolve_bi_horiz_64_dspr2(const uint8_t *src_ptr,
 | 
			
		||||
  uint32_t p1, p2, p3, p4, p5;
 | 
			
		||||
  uint32_t st1, st2, st3;
 | 
			
		||||
  const int16_t *filter = &filter_x0[3];
 | 
			
		||||
  uint32_t filter45;;
 | 
			
		||||
  uint32_t filter45;
 | 
			
		||||
 | 
			
		||||
  filter45 = ((const int32_t *)filter)[0];
 | 
			
		||||
 | 
			
		||||
@@ -622,14 +608,13 @@ static void convolve_bi_horiz_64_dspr2(const uint8_t *src_ptr,
 | 
			
		||||
          "sb               %[st3],       13(%[dst])                   \n\t" /* odd 7 */
 | 
			
		||||
          "sb               %[st1],       15(%[dst])                   \n\t" /* odd 8 */
 | 
			
		||||
 | 
			
		||||
          : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), [qload3] "=&r" (qload3),
 | 
			
		||||
            [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3),
 | 
			
		||||
            [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
 | 
			
		||||
            [p5] "=&r" (p5),
 | 
			
		||||
            [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3)
 | 
			
		||||
          : [filter45] "r" (filter45), [vector_64] "r" (vector_64),
 | 
			
		||||
            [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
 | 
			
		||||
      );
 | 
			
		||||
          : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2),
 | 
			
		||||
            [qload3] "=&r"(qload3), [st1] "=&r"(st1), [st2] "=&r"(st2),
 | 
			
		||||
            [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3),
 | 
			
		||||
            [p4] "=&r"(p4), [p5] "=&r"(p5), [Temp1] "=&r"(Temp1),
 | 
			
		||||
            [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3)
 | 
			
		||||
          : [filter45] "r"(filter45), [vector_64] "r"(vector_64), [cm] "r"(cm),
 | 
			
		||||
            [dst] "r"(dst), [src] "r"(src));
 | 
			
		||||
 | 
			
		||||
      src += 16;
 | 
			
		||||
      dst += 16;
 | 
			
		||||
@@ -644,8 +629,8 @@ static void convolve_bi_horiz_64_dspr2(const uint8_t *src_ptr,
 | 
			
		||||
void vpx_convolve2_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
 | 
			
		||||
                               uint8_t *dst, ptrdiff_t dst_stride,
 | 
			
		||||
                               const int16_t *filter_x, int x_step_q4,
 | 
			
		||||
                               const int16_t *filter_y, int y_step_q4,
 | 
			
		||||
                               int w, int h) {
 | 
			
		||||
                               const int16_t *filter_y, int y_step_q4, int w,
 | 
			
		||||
                               int h) {
 | 
			
		||||
  uint32_t pos = 38;
 | 
			
		||||
 | 
			
		||||
  assert(x_step_q4 == 16);
 | 
			
		||||
@@ -653,11 +638,9 @@ void vpx_convolve2_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
 | 
			
		||||
  prefetch_load((const uint8_t *)filter_x);
 | 
			
		||||
 | 
			
		||||
  /* bit positon for extract from acc */
 | 
			
		||||
  __asm__ __volatile__ (
 | 
			
		||||
    "wrdsp      %[pos],     1           \n\t"
 | 
			
		||||
  __asm__ __volatile__("wrdsp      %[pos],     1           \n\t"
 | 
			
		||||
                       :
 | 
			
		||||
    : [pos] "r" (pos)
 | 
			
		||||
  );
 | 
			
		||||
                       : [pos] "r"(pos));
 | 
			
		||||
 | 
			
		||||
  /* prefetch data to cache memory */
 | 
			
		||||
  prefetch_load(src);
 | 
			
		||||
@@ -666,39 +649,31 @@ void vpx_convolve2_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
 | 
			
		||||
 | 
			
		||||
  switch (w) {
 | 
			
		||||
    case 4:
 | 
			
		||||
      convolve_bi_horiz_4_dspr2(src, (int32_t)src_stride,
 | 
			
		||||
                                dst, (int32_t)dst_stride,
 | 
			
		||||
                                filter_x, (int32_t)h);
 | 
			
		||||
      convolve_bi_horiz_4_dspr2(src, (int32_t)src_stride, dst,
 | 
			
		||||
                                (int32_t)dst_stride, filter_x, (int32_t)h);
 | 
			
		||||
      break;
 | 
			
		||||
    case 8:
 | 
			
		||||
      convolve_bi_horiz_8_dspr2(src, (int32_t)src_stride,
 | 
			
		||||
                                dst, (int32_t)dst_stride,
 | 
			
		||||
                                filter_x, (int32_t)h);
 | 
			
		||||
      convolve_bi_horiz_8_dspr2(src, (int32_t)src_stride, dst,
 | 
			
		||||
                                (int32_t)dst_stride, filter_x, (int32_t)h);
 | 
			
		||||
      break;
 | 
			
		||||
    case 16:
 | 
			
		||||
      convolve_bi_horiz_16_dspr2(src, (int32_t)src_stride,
 | 
			
		||||
                                 dst, (int32_t)dst_stride,
 | 
			
		||||
                                 filter_x, (int32_t)h, 1);
 | 
			
		||||
      convolve_bi_horiz_16_dspr2(src, (int32_t)src_stride, dst,
 | 
			
		||||
                                 (int32_t)dst_stride, filter_x, (int32_t)h, 1);
 | 
			
		||||
      break;
 | 
			
		||||
    case 32:
 | 
			
		||||
      convolve_bi_horiz_16_dspr2(src, (int32_t)src_stride,
 | 
			
		||||
                                 dst, (int32_t)dst_stride,
 | 
			
		||||
                                 filter_x, (int32_t)h, 2);
 | 
			
		||||
      convolve_bi_horiz_16_dspr2(src, (int32_t)src_stride, dst,
 | 
			
		||||
                                 (int32_t)dst_stride, filter_x, (int32_t)h, 2);
 | 
			
		||||
      break;
 | 
			
		||||
    case 64:
 | 
			
		||||
      prefetch_load(src + 64);
 | 
			
		||||
      prefetch_store(dst + 32);
 | 
			
		||||
 | 
			
		||||
      convolve_bi_horiz_64_dspr2(src, (int32_t)src_stride,
 | 
			
		||||
                                 dst, (int32_t)dst_stride,
 | 
			
		||||
                                 filter_x, (int32_t)h);
 | 
			
		||||
      convolve_bi_horiz_64_dspr2(src, (int32_t)src_stride, dst,
 | 
			
		||||
                                 (int32_t)dst_stride, filter_x, (int32_t)h);
 | 
			
		||||
      break;
 | 
			
		||||
    default:
 | 
			
		||||
      vpx_convolve8_horiz_c(src, src_stride,
 | 
			
		||||
                            dst, dst_stride,
 | 
			
		||||
                            filter_x, x_step_q4,
 | 
			
		||||
                            filter_y, y_step_q4,
 | 
			
		||||
                            w, h);
 | 
			
		||||
      vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter_x,
 | 
			
		||||
                            x_step_q4, filter_y, y_step_q4, w, h);
 | 
			
		||||
      break;
 | 
			
		||||
  }
 | 
			
		||||
}
 | 
			
		||||
 
 | 
			
		||||
@@ -18,12 +18,9 @@
 | 
			
		||||
#include "vpx_ports/mem.h"
 | 
			
		||||
 | 
			
		||||
#if HAVE_DSPR2
 | 
			
		||||
static void convolve_bi_vert_4_dspr2(const uint8_t *src,
 | 
			
		||||
                                     int32_t src_stride,
 | 
			
		||||
                                     uint8_t *dst,
 | 
			
		||||
                                     int32_t dst_stride,
 | 
			
		||||
                                     const int16_t *filter_y,
 | 
			
		||||
                                     int32_t w,
 | 
			
		||||
static void convolve_bi_vert_4_dspr2(const uint8_t *src, int32_t src_stride,
 | 
			
		||||
                                     uint8_t *dst, int32_t dst_stride,
 | 
			
		||||
                                     const int16_t *filter_y, int32_t w,
 | 
			
		||||
                                     int32_t h) {
 | 
			
		||||
  int32_t x, y;
 | 
			
		||||
  const uint8_t *src_ptr;
 | 
			
		||||
@@ -98,16 +95,12 @@ static void convolve_bi_vert_4_dspr2(const uint8_t *src,
 | 
			
		||||
          "sb               %[store1],    2(%[dst_ptr])                   \n\t"
 | 
			
		||||
          "sb               %[store2],    3(%[dst_ptr])                   \n\t"
 | 
			
		||||
 | 
			
		||||
          : [load1] "=&r" (load1), [load2] "=&r" (load2),
 | 
			
		||||
            [p1] "=&r" (p1), [p2] "=&r" (p2),
 | 
			
		||||
            [scratch1] "=&r" (scratch1),
 | 
			
		||||
            [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
 | 
			
		||||
            [store1] "=&r" (store1), [store2] "=&r" (store2),
 | 
			
		||||
            [src_ptr] "+r" (src_ptr)
 | 
			
		||||
          : [load1] "=&r"(load1), [load2] "=&r"(load2), [p1] "=&r"(p1),
 | 
			
		||||
            [p2] "=&r"(p2), [scratch1] "=&r"(scratch1), [Temp1] "=&r"(Temp1),
 | 
			
		||||
            [Temp2] "=&r"(Temp2), [store1] "=&r"(store1),
 | 
			
		||||
            [store2] "=&r"(store2), [src_ptr] "+r"(src_ptr)
 | 
			
		||||
          : [filter45] "r"(filter45), [vector4a] "r"(vector4a),
 | 
			
		||||
            [src_stride] "r" (src_stride),
 | 
			
		||||
            [cm] "r" (cm), [dst_ptr] "r" (dst_ptr)
      );
            [src_stride] "r"(src_stride), [cm] "r"(cm), [dst_ptr] "r"(dst_ptr));
    }

    /* Next row... */
@@ -116,12 +109,9 @@ static void convolve_bi_vert_4_dspr2(const uint8_t *src,
  }
}

static void convolve_bi_vert_64_dspr2(const uint8_t *src,
                                      int32_t src_stride,
                                      uint8_t *dst,
                                      int32_t dst_stride,
                                      const int16_t *filter_y,
                                      int32_t h) {
static void convolve_bi_vert_64_dspr2(const uint8_t *src, int32_t src_stride,
                                      uint8_t *dst, int32_t dst_stride,
                                      const int16_t *filter_y, int32_t h) {
  int32_t x, y;
  const uint8_t *src_ptr;
  uint8_t *dst_ptr;
@@ -195,16 +185,12 @@ static void convolve_bi_vert_64_dspr2(const uint8_t *src,
          "sb               %[store1],    2(%[dst_ptr])                   \n\t"
          "sb               %[store2],    3(%[dst_ptr])                   \n\t"

          : [load1] "=&r" (load1), [load2] "=&r" (load2),
            [p1] "=&r" (p1), [p2] "=&r" (p2),
            [scratch1] "=&r" (scratch1),
            [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
            [store1] "=&r" (store1), [store2] "=&r" (store2),
            [src_ptr] "+r" (src_ptr)
          : [load1] "=&r"(load1), [load2] "=&r"(load2), [p1] "=&r"(p1),
            [p2] "=&r"(p2), [scratch1] "=&r"(scratch1), [Temp1] "=&r"(Temp1),
            [Temp2] "=&r"(Temp2), [store1] "=&r"(store1),
            [store2] "=&r"(store2), [src_ptr] "+r"(src_ptr)
          : [filter45] "r"(filter45), [vector4a] "r"(vector4a),
            [src_stride] "r" (src_stride),
            [cm] "r" (cm), [dst_ptr] "r" (dst_ptr)
      );
            [src_stride] "r"(src_stride), [cm] "r"(cm), [dst_ptr] "r"(dst_ptr));
    }

    /* Next row... */
@@ -216,18 +202,16 @@ static void convolve_bi_vert_64_dspr2(const uint8_t *src,
void vpx_convolve2_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                              uint8_t *dst, ptrdiff_t dst_stride,
                              const int16_t *filter_x, int x_step_q4,
                              const int16_t *filter_y, int y_step_q4,
                              int w, int h) {
                              const int16_t *filter_y, int y_step_q4, int w,
                              int h) {
  uint32_t pos = 38;

  assert(y_step_q4 == 16);

  /* bit positon for extract from acc */
  __asm__ __volatile__ (
    "wrdsp      %[pos],     1           \n\t"
  __asm__ __volatile__("wrdsp      %[pos],     1           \n\t"
                       :
    : [pos] "r" (pos)
  );
                       : [pos] "r"(pos));

  prefetch_store(dst);

@@ -236,22 +220,16 @@ void vpx_convolve2_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
    case 8:
    case 16:
    case 32:
      convolve_bi_vert_4_dspr2(src, src_stride,
                               dst, dst_stride,
                               filter_y, w, h);
      convolve_bi_vert_4_dspr2(src, src_stride, dst, dst_stride, filter_y, w,
                               h);
      break;
    case 64:
      prefetch_store(dst + 32);
      convolve_bi_vert_64_dspr2(src, src_stride,
                                dst, dst_stride,
                                filter_y, h);
      convolve_bi_vert_64_dspr2(src, src_stride, dst, dst_stride, filter_y, h);
      break;
    default:
      vpx_convolve8_vert_c(src, src_stride,
                           dst, dst_stride,
                           filter_x, x_step_q4,
                           filter_y, y_step_q4,
                           w, h);
      vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x,
                           x_step_q4, filter_y, y_step_q4, w, h);
      break;
  }
}

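Every hunk in this commit is the same mechanical change: the formatter (an 80-column, Google-derived clang-format style) packs function parameter lists and extended-asm operand lists onto as few lines as fit, and drops the space between a constraint string and its operand, so "=&r" (load1) becomes "=&r"(load1). A minimal sketch of the resulting extended-asm shape, using a toy MIPS function whose names are illustrative rather than taken from libvpx:

#include <stdint.h>

/* Toy example of the post-clang-format asm layout: instruction string
 * first, then the packed output and input operand lists, with the
 * closing ");" pulled up onto the last operand line.  MIPS targets only;
 * add32() is a hypothetical helper, not part of the commit. */
static int32_t add32(int32_t a, int32_t b) {
  int32_t sum;
  __asm__ __volatile__("addu    %[sum],    %[a],    %[b]    \n\t"
                       : [sum] "=&r"(sum)
                       : [a] "r"(a), [b] "r"(b));
  return sum;
}
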
@@ -18,12 +18,9 @@
#include "vpx_ports/mem.h"

#if HAVE_DSPR2
static void convolve_avg_vert_4_dspr2(const uint8_t *src,
                                      int32_t src_stride,
                                      uint8_t *dst,
                                      int32_t dst_stride,
                                      const int16_t *filter_y,
                                      int32_t w,
static void convolve_avg_vert_4_dspr2(const uint8_t *src, int32_t src_stride,
                                      uint8_t *dst, int32_t dst_stride,
                                      const int16_t *filter_y, int32_t w,
                                      int32_t h) {
  int32_t x, y;
  const uint8_t *src_ptr;
@@ -160,18 +157,16 @@ static void convolve_avg_vert_4_dspr2(const uint8_t *src,
          "sb               %[store1],    2(%[dst_ptr])                   \n\t"
          "sb               %[store2],    3(%[dst_ptr])                   \n\t"

          : [load1] "=&r" (load1), [load2] "=&r" (load2),
            [load3] "=&r" (load3), [load4] "=&r" (load4),
            [p1] "=&r" (p1), [p2] "=&r" (p2), [n1] "=&r" (n1), [n2] "=&r" (n2),
            [scratch1] "=&r" (scratch1), [scratch2] "=&r" (scratch2),
            [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
            [store1] "=&r" (store1), [store2] "=&r" (store2),
            [src_ptr] "+r" (src_ptr)
          : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
            [load4] "=&r"(load4), [p1] "=&r"(p1), [p2] "=&r"(p2),
            [n1] "=&r"(n1), [n2] "=&r"(n2), [scratch1] "=&r"(scratch1),
            [scratch2] "=&r"(scratch2), [Temp1] "=&r"(Temp1),
            [Temp2] "=&r"(Temp2), [store1] "=&r"(store1),
            [store2] "=&r"(store2), [src_ptr] "+r"(src_ptr)
          : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
            [vector3b] "r"(vector3b), [vector4b] "r"(vector4b),
            [vector4a] "r" (vector4a),
            [src_stride] "r" (src_stride), [cm] "r" (cm), [dst_ptr] "r" (dst_ptr)
      );
            [vector4a] "r"(vector4a), [src_stride] "r"(src_stride),
            [cm] "r"(cm), [dst_ptr] "r"(dst_ptr));
    }

    /* Next row... */
@@ -180,12 +175,9 @@ static void convolve_avg_vert_4_dspr2(const uint8_t *src,
  }
}

static void convolve_avg_vert_64_dspr2(const uint8_t *src,
                                       int32_t src_stride,
                                       uint8_t *dst,
                                       int32_t dst_stride,
                                       const int16_t *filter_y,
                                       int32_t h) {
static void convolve_avg_vert_64_dspr2(const uint8_t *src, int32_t src_stride,
                                       uint8_t *dst, int32_t dst_stride,
                                       const int16_t *filter_y, int32_t h) {
  int32_t x, y;
  const uint8_t *src_ptr;
  uint8_t *dst_ptr;
@@ -322,18 +314,16 @@ static void convolve_avg_vert_64_dspr2(const uint8_t *src,
          "sb               %[store1],    2(%[dst_ptr])                   \n\t"
          "sb               %[store2],    3(%[dst_ptr])                   \n\t"

          : [load1] "=&r" (load1), [load2] "=&r" (load2),
            [load3] "=&r" (load3), [load4] "=&r" (load4),
            [p1] "=&r" (p1), [p2] "=&r" (p2), [n1] "=&r" (n1), [n2] "=&r" (n2),
            [scratch1] "=&r" (scratch1), [scratch2] "=&r" (scratch2),
            [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
            [store1] "=&r" (store1), [store2] "=&r" (store2),
            [src_ptr] "+r" (src_ptr)
          : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
            [load4] "=&r"(load4), [p1] "=&r"(p1), [p2] "=&r"(p2),
            [n1] "=&r"(n1), [n2] "=&r"(n2), [scratch1] "=&r"(scratch1),
            [scratch2] "=&r"(scratch2), [Temp1] "=&r"(Temp1),
            [Temp2] "=&r"(Temp2), [store1] "=&r"(store1),
            [store2] "=&r"(store2), [src_ptr] "+r"(src_ptr)
          : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
            [vector3b] "r"(vector3b), [vector4b] "r"(vector4b),
            [vector4a] "r" (vector4a),
            [src_stride] "r" (src_stride), [cm] "r" (cm), [dst_ptr] "r" (dst_ptr)
      );
            [vector4a] "r"(vector4a), [src_stride] "r"(src_stride),
            [cm] "r"(cm), [dst_ptr] "r"(dst_ptr));
    }

    /* Next row... */
@@ -345,26 +335,21 @@ static void convolve_avg_vert_64_dspr2(const uint8_t *src,
void vpx_convolve8_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                                  uint8_t *dst, ptrdiff_t dst_stride,
                                  const int16_t *filter_x, int x_step_q4,
                                  const int16_t *filter_y, int y_step_q4,
                                  int w, int h) {
                                  const int16_t *filter_y, int y_step_q4, int w,
                                  int h) {
  assert(y_step_q4 == 16);
  assert(((const int32_t *)filter_y)[1] != 0x800000);

  if (((const int32_t *)filter_y)[0] == 0) {
    vpx_convolve2_avg_vert_dspr2(src, src_stride,
                                 dst, dst_stride,
                                 filter_x, x_step_q4,
                                 filter_y, y_step_q4,
                                 w, h);
    vpx_convolve2_avg_vert_dspr2(src, src_stride, dst, dst_stride, filter_x,
                                 x_step_q4, filter_y, y_step_q4, w, h);
  } else {
    uint32_t pos = 38;

    /* bit positon for extract from acc */
    __asm__ __volatile__ (
      "wrdsp      %[pos],     1           \n\t"
    __asm__ __volatile__("wrdsp      %[pos],     1           \n\t"
                         :
      : [pos] "r" (pos)
    );
                         : [pos] "r"(pos));

    prefetch_store(dst);

@@ -373,22 +358,17 @@ void vpx_convolve8_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
      case 8:
      case 16:
      case 32:
        convolve_avg_vert_4_dspr2(src, src_stride,
                                  dst, dst_stride,
                                  filter_y, w, h);
        convolve_avg_vert_4_dspr2(src, src_stride, dst, dst_stride, filter_y, w,
                                  h);
        break;
      case 64:
        prefetch_store(dst + 32);
        convolve_avg_vert_64_dspr2(src, src_stride,
                                   dst, dst_stride,
                                   filter_y, h);
        convolve_avg_vert_64_dspr2(src, src_stride, dst, dst_stride, filter_y,
                                   h);
        break;
      default:
        vpx_convolve8_avg_vert_c(src, src_stride,
                                 dst, dst_stride,
                                 filter_x, x_step_q4,
                                 filter_y, y_step_q4,
                                 w, h);
        vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter_x,
                                 x_step_q4, filter_y, y_step_q4, w, h);
        break;
    }
  }
@@ -397,8 +377,8 @@ void vpx_convolve8_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
void vpx_convolve8_avg_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                             uint8_t *dst, ptrdiff_t dst_stride,
                             const int16_t *filter_x, int x_step_q4,
                             const int16_t *filter_y, int y_step_q4,
                             int w, int h) {
                             const int16_t *filter_y, int y_step_q4, int w,
                             int h) {
  /* Fixed size intermediate buffer places limits on parameters. */
  DECLARE_ALIGNED(32, uint8_t, temp[64 * 135]);
  int32_t intermediate_height = ((h * y_step_q4) >> 4) + 7;
@@ -408,27 +388,20 @@ void vpx_convolve8_avg_dspr2(const uint8_t *src, ptrdiff_t src_stride,
  assert(x_step_q4 == 16);
  assert(y_step_q4 == 16);

  if (intermediate_height < h)
    intermediate_height = h;
  if (intermediate_height < h) intermediate_height = h;

  vpx_convolve8_horiz(src - (src_stride * 3), src_stride,
                      temp, 64,
                      filter_x, x_step_q4,
                      filter_y, y_step_q4,
                      w, intermediate_height);
  vpx_convolve8_horiz(src - (src_stride * 3), src_stride, temp, 64, filter_x,
                      x_step_q4, filter_y, y_step_q4, w, intermediate_height);

  vpx_convolve8_avg_vert(temp + 64 * 3, 64,
                         dst, dst_stride,
                         filter_x, x_step_q4,
                         filter_y, y_step_q4,
                         w, h);
  vpx_convolve8_avg_vert(temp + 64 * 3, 64, dst, dst_stride, filter_x,
                         x_step_q4, filter_y, y_step_q4, w, h);
}

void vpx_convolve_avg_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                            uint8_t *dst, ptrdiff_t dst_stride,
                            const int16_t *filter_x, int filter_x_stride,
                            const int16_t *filter_y, int filter_y_stride,
                            int w, int h) {
                            const int16_t *filter_y, int filter_y_stride, int w,
                            int h) {
  int x, y;
  uint32_t tp1, tp2, tn1;
  uint32_t tp3, tp4, tn2;
@@ -452,10 +425,8 @@ void vpx_convolve_avg_dspr2(const uint8_t *src, ptrdiff_t src_stride,
            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t" /* average */
            "sw               %[tn1],         0(%[dst])      \n\t" /* store */

            : [tn1] "=&r" (tn1), [tp1] "=&r" (tp1),
              [tp2] "=&r" (tp2)
            : [src] "r" (src), [dst] "r" (dst)
        );
            : [tn1] "=&r"(tn1), [tp1] "=&r"(tp1), [tp2] "=&r"(tp2)
            : [src] "r"(src), [dst] "r"(dst));

        src += src_stride;
        dst += dst_stride;
@@ -478,11 +449,9 @@ void vpx_convolve_avg_dspr2(const uint8_t *src, ptrdiff_t src_stride,
            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t" /* average */
            "sw               %[tn2],         4(%[dst])      \n\t" /* store */

            : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
              [tp3] "=&r" (tp3), [tp4] "=&r" (tp4),
              [tn1] "=&r" (tn1), [tn2] "=&r" (tn2)
            : [src] "r" (src), [dst] "r" (dst)
        );
            : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3),
              [tp4] "=&r"(tp4), [tn1] "=&r"(tn1), [tn2] "=&r"(tn2)
            : [src] "r"(src), [dst] "r"(dst));

        src += src_stride;
        dst += dst_stride;
@@ -513,11 +482,9 @@ void vpx_convolve_avg_dspr2(const uint8_t *src, ptrdiff_t src_stride,
            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t" /* average */
            "sw               %[tn2],         12(%[dst])     \n\t" /* store */

            : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
              [tp3] "=&r" (tp3), [tp4] "=&r" (tp4),
              [tn1] "=&r" (tn1), [tn2] "=&r" (tn2)
            : [src] "r" (src), [dst] "r" (dst)
        );
            : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3),
              [tp4] "=&r"(tp4), [tn1] "=&r"(tn1), [tn2] "=&r"(tn2)
            : [src] "r"(src), [dst] "r"(dst));

        src += src_stride;
        dst += dst_stride;
@@ -564,11 +531,9 @@ void vpx_convolve_avg_dspr2(const uint8_t *src, ptrdiff_t src_stride,
            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t" /* average */
            "sw               %[tn2],         28(%[dst])     \n\t" /* store */

            : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
              [tp3] "=&r" (tp3), [tp4] "=&r" (tp4),
              [tn1] "=&r" (tn1), [tn2] "=&r" (tn2)
            : [src] "r" (src), [dst] "r" (dst)
        );
            : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3),
              [tp4] "=&r"(tp4), [tn1] "=&r"(tn1), [tn2] "=&r"(tn2)
            : [src] "r"(src), [dst] "r"(dst));

        src += src_stride;
        dst += dst_stride;
@@ -652,11 +617,9 @@ void vpx_convolve_avg_dspr2(const uint8_t *src, ptrdiff_t src_stride,
            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t" /* average */
            "sw               %[tn2],         60(%[dst])     \n\t" /* store */

            : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
              [tp3] "=&r" (tp3), [tp4] "=&r" (tp4),
              [tn1] "=&r" (tn1), [tn2] "=&r" (tn2)
            : [src] "r" (src), [dst] "r" (dst)
        );
            : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3),
              [tp4] "=&r"(tp4), [tn1] "=&r"(tn1), [tn2] "=&r"(tn2)
            : [src] "r"(src), [dst] "r"(dst));

        src += src_stride;
        dst += dst_stride;

@@ -18,12 +18,9 @@
#include "vpx_ports/mem.h"

#if HAVE_DSPR2
static void convolve_avg_horiz_4_dspr2(const uint8_t *src,
                                       int32_t src_stride,
                                       uint8_t *dst,
                                       int32_t dst_stride,
                                       const int16_t *filter_x0,
                                       int32_t h) {
static void convolve_avg_horiz_4_dspr2(const uint8_t *src, int32_t src_stride,
                                       uint8_t *dst, int32_t dst_stride,
                                       const int16_t *filter_x0, int32_t h) {
  int32_t y;
  uint8_t *cm = vpx_ff_cropTbl;
  int32_t vector1b, vector2b, vector3b, vector4b;
@@ -122,17 +119,15 @@ static void convolve_avg_horiz_4_dspr2(const uint8_t *src,
        "addqh_r.w        %[p2],          %[p2],          %[n2]          \n\t" /* average odd 2 */
        "sb               %[p2],          3(%[dst])                      \n\t" /* store odd 2 */

        : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
          [tn1] "=&r" (tn1), [tn2] "=&r" (tn2),
          [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
          [n1] "=&r" (n1), [n2] "=&r" (n2), [n3] "=&r" (n3), [n4] "=&r" (n4),
          [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
        : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn1] "=&r"(tn1),
          [tn2] "=&r"(tn2), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3),
          [p4] "=&r"(p4), [n1] "=&r"(n1), [n2] "=&r"(n2), [n3] "=&r"(n3),
          [n4] "=&r"(n4), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2),
          [Temp3] "=&r"(Temp3), [Temp4] "=&r"(Temp4)
        : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
          [vector3b] "r"(vector3b), [vector4b] "r"(vector4b),
          [vector4a] "r" (vector4a),
          [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
    );
          [vector4a] "r"(vector4a), [cm] "r"(cm), [dst] "r"(dst),
          [src] "r"(src));

    /* Next row... */
    src += src_stride;
@@ -140,12 +135,9 @@ static void convolve_avg_horiz_4_dspr2(const uint8_t *src,
  }
}

static void convolve_avg_horiz_8_dspr2(const uint8_t *src,
                                       int32_t src_stride,
                                       uint8_t *dst,
                                       int32_t dst_stride,
                                       const int16_t *filter_x0,
                                       int32_t h) {
static void convolve_avg_horiz_8_dspr2(const uint8_t *src, int32_t src_stride,
                                       uint8_t *dst, int32_t dst_stride,
                                       const int16_t *filter_x0, int32_t h) {
  int32_t y;
  uint8_t *cm = vpx_ff_cropTbl;
  uint32_t vector4a = 64;
@@ -309,17 +301,15 @@ static void convolve_avg_horiz_8_dspr2(const uint8_t *src,
        "sb               %[tn3],         5(%[dst])                      \n\t"
        "sb               %[tn1],         7(%[dst])                      \n\t"

        : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
          [tn1] "=&r" (tn1), [tn2] "=&r" (tn2), [tn3] "=&r" (tn3),
          [st0] "=&r" (st0), [st1] "=&r" (st1),
          [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
          [n1] "=&r" (n1),
          [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3)
        : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn1] "=&r"(tn1),
          [tn2] "=&r"(tn2), [tn3] "=&r"(tn3), [st0] "=&r"(st0),
          [st1] "=&r"(st1), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3),
          [p4] "=&r"(p4), [n1] "=&r"(n1), [Temp1] "=&r"(Temp1),
          [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3)
        : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
          [vector3b] "r"(vector3b), [vector4b] "r"(vector4b),
          [vector4a] "r" (vector4a),
          [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
    );
          [vector4a] "r"(vector4a), [cm] "r"(cm), [dst] "r"(dst),
          [src] "r"(src));

    /* Next row... */
    src += src_stride;
@@ -328,11 +318,9 @@ static void convolve_avg_horiz_8_dspr2(const uint8_t *src,
}

static void convolve_avg_horiz_16_dspr2(const uint8_t *src_ptr,
                                        int32_t src_stride,
                                        uint8_t *dst_ptr,
                                        int32_t src_stride, uint8_t *dst_ptr,
                                        int32_t dst_stride,
                                        const int16_t *filter_x0,
                                        int32_t h,
                                        const int16_t *filter_x0, int32_t h,
                                        int32_t count) {
  int32_t y, c;
  const uint8_t *src;
@@ -618,16 +606,15 @@ static void convolve_avg_horiz_16_dspr2(const uint8_t *src_ptr,
          "sb               %[qload3],    13(%[dst])                   \n\t" /* store odd 7 to dst */
          "sb               %[qload1],    15(%[dst])                   \n\t" /* store odd 8 to dst */

          : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2),
            [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3),
            [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
            [qload3] "=&r" (qload3), [p5] "=&r" (p5),
            [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3)
          : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), [st1] "=&r"(st1),
            [st2] "=&r"(st2), [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2),
            [p3] "=&r"(p3), [p4] "=&r"(p4), [qload3] "=&r"(qload3),
            [p5] "=&r"(p5), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2),
            [Temp3] "=&r"(Temp3)
          : [filter12] "r"(filter12), [filter34] "r"(filter34),
            [filter56] "r"(filter56), [filter78] "r"(filter78),
            [vector_64] "r" (vector_64),
            [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
      );
            [vector_64] "r"(vector_64), [cm] "r"(cm), [dst] "r"(dst),
            [src] "r"(src));

      src += 16;
      dst += 16;
@@ -640,11 +627,9 @@ static void convolve_avg_horiz_16_dspr2(const uint8_t *src_ptr,
}

static void convolve_avg_horiz_64_dspr2(const uint8_t *src_ptr,
                                        int32_t src_stride,
                                        uint8_t *dst_ptr,
                                        int32_t src_stride, uint8_t *dst_ptr,
                                        int32_t dst_stride,
                                        const int16_t *filter_x0,
                                        int32_t h) {
                                        const int16_t *filter_x0, int32_t h) {
  int32_t y, c;
  const uint8_t *src;
  uint8_t *dst;
@@ -931,16 +916,15 @@ static void convolve_avg_horiz_64_dspr2(const uint8_t *src_ptr,
          "sb               %[qload3],    13(%[dst])                   \n\t" /* store odd 7 to dst */
          "sb               %[qload1],    15(%[dst])                   \n\t" /* store odd 8 to dst */

          : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2),
            [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3),
            [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
            [qload3] "=&r" (qload3), [p5] "=&r" (p5),
            [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3)
          : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), [st1] "=&r"(st1),
            [st2] "=&r"(st2), [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2),
            [p3] "=&r"(p3), [p4] "=&r"(p4), [qload3] "=&r"(qload3),
            [p5] "=&r"(p5), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2),
            [Temp3] "=&r"(Temp3)
          : [filter12] "r"(filter12), [filter34] "r"(filter34),
            [filter56] "r"(filter56), [filter78] "r"(filter78),
            [vector_64] "r" (vector_64),
            [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
      );
            [vector_64] "r"(vector_64), [cm] "r"(cm), [dst] "r"(dst),
            [src] "r"(src));

      src += 16;
      dst += 16;
@@ -961,22 +945,17 @@ void vpx_convolve8_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
  assert(((const int32_t *)filter_x)[1] != 0x800000);

  if (((const int32_t *)filter_x)[0] == 0) {
    vpx_convolve2_avg_horiz_dspr2(src, src_stride,
                                  dst, dst_stride,
                                  filter_x, x_step_q4,
                                  filter_y, y_step_q4,
                                  w, h);
    vpx_convolve2_avg_horiz_dspr2(src, src_stride, dst, dst_stride, filter_x,
                                  x_step_q4, filter_y, y_step_q4, w, h);
  } else {
    uint32_t pos = 38;

    src -= 3;

    /* bit positon for extract from acc */
    __asm__ __volatile__ (
      "wrdsp      %[pos],     1           \n\t"
    __asm__ __volatile__("wrdsp      %[pos],     1           \n\t"
                         :
      : [pos] "r" (pos)
    );
                         : [pos] "r"(pos));

    /* prefetch data to cache memory */
    prefetch_load(src);
@@ -985,39 +964,32 @@ void vpx_convolve8_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,

    switch (w) {
      case 4:
        convolve_avg_horiz_4_dspr2(src, src_stride,
                                   dst, dst_stride,
                                   filter_x, h);
        convolve_avg_horiz_4_dspr2(src, src_stride, dst, dst_stride, filter_x,
                                   h);
        break;
      case 8:
        convolve_avg_horiz_8_dspr2(src, src_stride,
                                   dst, dst_stride,
                                   filter_x, h);
        convolve_avg_horiz_8_dspr2(src, src_stride, dst, dst_stride, filter_x,
                                   h);
        break;
      case 16:
        convolve_avg_horiz_16_dspr2(src, src_stride,
                                    dst, dst_stride,
                                    filter_x, h, 1);
        convolve_avg_horiz_16_dspr2(src, src_stride, dst, dst_stride, filter_x,
                                    h, 1);
        break;
      case 32:
        convolve_avg_horiz_16_dspr2(src, src_stride,
                                    dst, dst_stride,
                                    filter_x, h, 2);
        convolve_avg_horiz_16_dspr2(src, src_stride, dst, dst_stride, filter_x,
                                    h, 2);
        break;
      case 64:
        prefetch_load(src + 64);
        prefetch_store(dst + 32);

        convolve_avg_horiz_64_dspr2(src, src_stride,
                                    dst, dst_stride,
                                    filter_x, h);
        convolve_avg_horiz_64_dspr2(src, src_stride, dst, dst_stride, filter_x,
                                    h);
        break;
      default:
        vpx_convolve8_avg_horiz_c(src + 3, src_stride,
                                  dst, dst_stride,
                                  filter_x, x_step_q4,
                                  filter_y, y_step_q4,
                                  w, h);
        vpx_convolve8_avg_horiz_c(src + 3, src_stride, dst, dst_stride,
                                  filter_x, x_step_q4, filter_y, y_step_q4, w,
                                  h);
        break;
    }
  }

[File diff suppressed because it is too large]

@@ -18,12 +18,9 @@
#include "vpx_ports/mem.h"

#if HAVE_DSPR2
static void convolve_horiz_4_dspr2(const uint8_t *src,
                                   int32_t src_stride,
                                   uint8_t *dst,
                                   int32_t dst_stride,
                                   const int16_t *filter_x0,
                                   int32_t h) {
static void convolve_horiz_4_dspr2(const uint8_t *src, int32_t src_stride,
                                   uint8_t *dst, int32_t dst_stride,
                                   const int16_t *filter_x0, int32_t h) {
  int32_t y;
  uint8_t *cm = vpx_ff_cropTbl;
  int32_t vector1b, vector2b, vector3b, vector4b;
@@ -111,17 +108,15 @@ static void convolve_horiz_4_dspr2(const uint8_t *src,
        "sb               %[tp2],      2(%[dst])                      \n\t"
        "sb               %[n2],       3(%[dst])                      \n\t"

        : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
          [tn1] "=&r" (tn1), [tn2] "=&r" (tn2),
          [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
          [n1] "=&r" (n1), [n2] "=&r" (n2), [n3] "=&r" (n3), [n4] "=&r" (n4),
          [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
        : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn1] "=&r"(tn1),
          [tn2] "=&r"(tn2), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3),
          [p4] "=&r"(p4), [n1] "=&r"(n1), [n2] "=&r"(n2), [n3] "=&r"(n3),
          [n4] "=&r"(n4), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2),
          [Temp3] "=&r"(Temp3), [Temp4] "=&r"(Temp4)
        : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
          [vector3b] "r"(vector3b), [vector4b] "r"(vector4b),
          [vector4a] "r" (vector4a),
          [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
    );
          [vector4a] "r"(vector4a), [cm] "r"(cm), [dst] "r"(dst),
          [src] "r"(src));

    /* Next row... */
    src += src_stride;
@@ -129,12 +124,9 @@ static void convolve_horiz_4_dspr2(const uint8_t *src,
  }
}

static void convolve_horiz_8_dspr2(const uint8_t *src,
                                   int32_t src_stride,
                                   uint8_t *dst,
                                   int32_t dst_stride,
                                   const int16_t *filter_x0,
                                   int32_t h) {
static void convolve_horiz_8_dspr2(const uint8_t *src, int32_t src_stride,
                                   uint8_t *dst, int32_t dst_stride,
                                   const int16_t *filter_x0, int32_t h) {
  int32_t y;
  uint8_t *cm = vpx_ff_cropTbl;
  uint32_t vector4a = 64;
@@ -275,17 +267,15 @@ static void convolve_horiz_8_dspr2(const uint8_t *src,
        "sb               %[p2],       5(%[dst])                      \n\t"
        "sb               %[n1],       7(%[dst])                      \n\t"

        : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
          [tn1] "=&r" (tn1), [tn2] "=&r" (tn2), [tn3] "=&r" (tn3),
          [st0] "=&r" (st0), [st1] "=&r" (st1),
          [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
          [n1] "=&r" (n1),
          [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3)
        : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn1] "=&r"(tn1),
          [tn2] "=&r"(tn2), [tn3] "=&r"(tn3), [st0] "=&r"(st0),
          [st1] "=&r"(st1), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3),
          [p4] "=&r"(p4), [n1] "=&r"(n1), [Temp1] "=&r"(Temp1),
          [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3)
        : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
          [vector3b] "r"(vector3b), [vector4b] "r"(vector4b),
          [vector4a] "r" (vector4a),
          [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
    );
          [vector4a] "r"(vector4a), [cm] "r"(cm), [dst] "r"(dst),
          [src] "r"(src));

    /* Next row... */
    src += src_stride;
@@ -293,12 +283,9 @@ static void convolve_horiz_8_dspr2(const uint8_t *src,
  }
}

static void convolve_horiz_16_dspr2(const uint8_t *src_ptr,
                                    int32_t src_stride,
                                    uint8_t *dst_ptr,
                                    int32_t dst_stride,
                                    const int16_t *filter_x0,
                                    int32_t h,
static void convolve_horiz_16_dspr2(const uint8_t *src_ptr, int32_t src_stride,
                                    uint8_t *dst_ptr, int32_t dst_stride,
                                    const int16_t *filter_x0, int32_t h,
                                    int32_t count) {
  int32_t y, c;
  const uint8_t *src;
@@ -542,17 +529,15 @@ static void convolve_horiz_16_dspr2(const uint8_t *src_ptr,
          "sb               %[st3],       13(%[dst])                   \n\t" /* odd 7 */
          "sb               %[st1],       15(%[dst])                   \n\t" /* odd 8 */

          : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), [qload3] "=&r" (qload3),
            [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3),
            [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
            [p5] "=&r" (p5),
            [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3)
          : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2),
            [qload3] "=&r"(qload3), [st1] "=&r"(st1), [st2] "=&r"(st2),
            [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3),
            [p4] "=&r"(p4), [p5] "=&r"(p5), [Temp1] "=&r"(Temp1),
            [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3)
          : [filter12] "r"(filter12), [filter34] "r"(filter34),
            [filter56] "r"(filter56), [filter78] "r"(filter78),
            [vector_64] "r" (vector_64),
            [cm] "r" (cm), [dst] "r" (dst),
            [src] "r" (src)
      );
            [vector_64] "r"(vector_64), [cm] "r"(cm), [dst] "r"(dst),
            [src] "r"(src));

      src += 16;
      dst += 16;
@@ -564,12 +549,9 @@ static void convolve_horiz_16_dspr2(const uint8_t *src_ptr,
  }
}

static void convolve_horiz_64_dspr2(const uint8_t *src_ptr,
                                    int32_t src_stride,
                                    uint8_t *dst_ptr,
                                    int32_t dst_stride,
                                    const int16_t *filter_x0,
                                    int32_t h) {
static void convolve_horiz_64_dspr2(const uint8_t *src_ptr, int32_t src_stride,
                                    uint8_t *dst_ptr, int32_t dst_stride,
                                    const int16_t *filter_x0, int32_t h) {
  int32_t y, c;
  const uint8_t *src;
  uint8_t *dst;
@@ -814,17 +796,15 @@ static void convolve_horiz_64_dspr2(const uint8_t *src_ptr,
          "sb               %[st3],       13(%[dst])                   \n\t" /* odd 7 */
          "sb               %[st1],       15(%[dst])                   \n\t" /* odd 8 */

          : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), [qload3] "=&r" (qload3),
            [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3),
            [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
            [p5] "=&r" (p5),
            [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3)
          : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2),
            [qload3] "=&r"(qload3), [st1] "=&r"(st1), [st2] "=&r"(st2),
            [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3),
            [p4] "=&r"(p4), [p5] "=&r"(p5), [Temp1] "=&r"(Temp1),
            [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3)
          : [filter12] "r"(filter12), [filter34] "r"(filter34),
            [filter56] "r"(filter56), [filter78] "r"(filter78),
            [vector_64] "r" (vector_64),
            [cm] "r" (cm), [dst] "r" (dst),
            [src] "r" (src)
      );
            [vector_64] "r"(vector_64), [cm] "r"(cm), [dst] "r"(dst),
            [src] "r"(src));

      src += 16;
      dst += 16;
@@ -839,17 +819,14 @@ static void convolve_horiz_64_dspr2(const uint8_t *src_ptr,
void vpx_convolve8_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                               uint8_t *dst, ptrdiff_t dst_stride,
                               const int16_t *filter_x, int x_step_q4,
                               const int16_t *filter_y, int y_step_q4,
                               int w, int h) {
                               const int16_t *filter_y, int y_step_q4, int w,
                               int h) {
  assert(x_step_q4 == 16);
  assert(((const int32_t *)filter_x)[1] != 0x800000);

  if (((const int32_t *)filter_x)[0] == 0) {
    vpx_convolve2_horiz_dspr2(src, src_stride,
                              dst, dst_stride,
                              filter_x, x_step_q4,
                              filter_y, y_step_q4,
                              w, h);
    vpx_convolve2_horiz_dspr2(src, src_stride, dst, dst_stride, filter_x,
                              x_step_q4, filter_y, y_step_q4, w, h);
  } else {
    uint32_t pos = 38;

@@ -857,11 +834,9 @@ void vpx_convolve8_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
    src -= 3;

    /* bit positon for extract from acc */
    __asm__ __volatile__ (
      "wrdsp      %[pos],     1           \n\t"
    __asm__ __volatile__("wrdsp      %[pos],     1           \n\t"
                         :
      : [pos] "r" (pos)
    );
                         : [pos] "r"(pos));

    /* prefetch data to cache memory */
    prefetch_load(src);
@@ -870,39 +845,31 @@ void vpx_convolve8_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,

    switch (w) {
      case 4:
        convolve_horiz_4_dspr2(src, (int32_t)src_stride,
                               dst, (int32_t)dst_stride,
                               filter_x, (int32_t)h);
        convolve_horiz_4_dspr2(src, (int32_t)src_stride, dst,
                               (int32_t)dst_stride, filter_x, (int32_t)h);
        break;
      case 8:
        convolve_horiz_8_dspr2(src, (int32_t)src_stride,
                               dst, (int32_t)dst_stride,
                               filter_x, (int32_t)h);
        convolve_horiz_8_dspr2(src, (int32_t)src_stride, dst,
                               (int32_t)dst_stride, filter_x, (int32_t)h);
        break;
      case 16:
        convolve_horiz_16_dspr2(src, (int32_t)src_stride,
                                dst, (int32_t)dst_stride,
                                filter_x, (int32_t)h, 1);
        convolve_horiz_16_dspr2(src, (int32_t)src_stride, dst,
                                (int32_t)dst_stride, filter_x, (int32_t)h, 1);
        break;
      case 32:
        convolve_horiz_16_dspr2(src, (int32_t)src_stride,
                                dst, (int32_t)dst_stride,
                                filter_x, (int32_t)h, 2);
        convolve_horiz_16_dspr2(src, (int32_t)src_stride, dst,
                                (int32_t)dst_stride, filter_x, (int32_t)h, 2);
        break;
      case 64:
        prefetch_load(src + 64);
        prefetch_store(dst + 32);

        convolve_horiz_64_dspr2(src, (int32_t)src_stride,
                                dst, (int32_t)dst_stride,
                                filter_x, (int32_t)h);
        convolve_horiz_64_dspr2(src, (int32_t)src_stride, dst,
                                (int32_t)dst_stride, filter_x, (int32_t)h);
        break;
      default:
        vpx_convolve8_horiz_c(src + 3, src_stride,
                              dst, dst_stride,
                              filter_x, x_step_q4,
                              filter_y, y_step_q4,
                              w, h);
        vpx_convolve8_horiz_c(src + 3, src_stride, dst, dst_stride, filter_x,
                              x_step_q4, filter_y, y_step_q4, w, h);
        break;
    }
  }

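Note the wrdsp statement that recurs in every dispatch function above and below: per the source's own comment ("bit positon for extract from acc") it programs the bit position, 38, that the DSPr2 accumulator-extract instructions read from the DSP control register, and this commit only reflows its operands. A self-contained sketch of the reformatted statement (the wrapper function is hypothetical; DSPr2 targets only):

#include <stdint.h>

/* Hypothetical wrapper around the statement as it appears post-format.
 * Per the upstream comment, the mask operand 1 selects the DSPControl
 * field holding the extract position; pos = 38 matches the sources. */
static void set_extract_pos(void) {
  uint32_t pos = 38;
  __asm__ __volatile__("wrdsp      %[pos],     1           \n\t"
                       :
                       : [pos] "r"(pos));
}
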
@@ -18,12 +18,9 @@
#include "vpx_ports/mem.h"

#if HAVE_DSPR2
static void convolve_vert_4_dspr2(const uint8_t *src,
                                  int32_t src_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int16_t *filter_y,
                                  int32_t w,
static void convolve_vert_4_dspr2(const uint8_t *src, int32_t src_stride,
                                  uint8_t *dst, int32_t dst_stride,
                                  const int16_t *filter_y, int32_t w,
                                  int32_t h) {
  int32_t x, y;
  const uint8_t *src_ptr;
@@ -152,19 +149,16 @@ static void convolve_vert_4_dspr2(const uint8_t *src,
          "sb               %[store1],    2(%[dst_ptr])                   \n\t"
          "sb               %[store2],    3(%[dst_ptr])                   \n\t"

          : [load1] "=&r" (load1), [load2] "=&r" (load2),
            [load3] "=&r" (load3), [load4] "=&r" (load4),
            [p1] "=&r" (p1), [p2] "=&r" (p2),
            [n1] "=&r" (n1), [n2] "=&r" (n2),
            [scratch1] "=&r" (scratch1), [scratch2] "=&r" (scratch2),
            [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
            [store1] "=&r" (store1), [store2] "=&r" (store2),
            [src_ptr] "+r" (src_ptr)
          : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
            [load4] "=&r"(load4), [p1] "=&r"(p1), [p2] "=&r"(p2),
            [n1] "=&r"(n1), [n2] "=&r"(n2), [scratch1] "=&r"(scratch1),
            [scratch2] "=&r"(scratch2), [Temp1] "=&r"(Temp1),
            [Temp2] "=&r"(Temp2), [store1] "=&r"(store1),
            [store2] "=&r"(store2), [src_ptr] "+r"(src_ptr)
          : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
            [vector3b] "r"(vector3b), [vector4b] "r"(vector4b),
            [vector4a] "r"(vector4a), [src_stride] "r"(src_stride),
            [cm] "r" (cm), [dst_ptr] "r" (dst_ptr)
      );
            [cm] "r"(cm), [dst_ptr] "r"(dst_ptr));
    }

    /* Next row... */
@@ -173,12 +167,9 @@ static void convolve_vert_4_dspr2(const uint8_t *src,
  }
}

static void convolve_vert_64_dspr2(const uint8_t *src,
                                   int32_t src_stride,
                                   uint8_t *dst,
                                   int32_t dst_stride,
                                   const int16_t *filter_y,
                                   int32_t h) {
static void convolve_vert_64_dspr2(const uint8_t *src, int32_t src_stride,
                                   uint8_t *dst, int32_t dst_stride,
                                   const int16_t *filter_y, int32_t h) {
  int32_t x, y;
  const uint8_t *src_ptr;
  uint8_t *dst_ptr;
@@ -307,19 +298,16 @@ static void convolve_vert_64_dspr2(const uint8_t *src,
          "sb               %[store1],    2(%[dst_ptr])                   \n\t"
          "sb               %[store2],    3(%[dst_ptr])                   \n\t"

          : [load1] "=&r" (load1), [load2] "=&r" (load2),
            [load3] "=&r" (load3), [load4] "=&r" (load4),
            [p1] "=&r" (p1), [p2] "=&r" (p2),
            [n1] "=&r" (n1), [n2] "=&r" (n2),
            [scratch1] "=&r" (scratch1), [scratch2] "=&r" (scratch2),
            [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
            [store1] "=&r" (store1), [store2] "=&r" (store2),
            [src_ptr] "+r" (src_ptr)
          : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
            [load4] "=&r"(load4), [p1] "=&r"(p1), [p2] "=&r"(p2),
            [n1] "=&r"(n1), [n2] "=&r"(n2), [scratch1] "=&r"(scratch1),
            [scratch2] "=&r"(scratch2), [Temp1] "=&r"(Temp1),
            [Temp2] "=&r"(Temp2), [store1] "=&r"(store1),
            [store2] "=&r"(store2), [src_ptr] "+r"(src_ptr)
          : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
            [vector3b] "r"(vector3b), [vector4b] "r"(vector4b),
            [vector4a] "r"(vector4a), [src_stride] "r"(src_stride),
            [cm] "r" (cm), [dst_ptr] "r" (dst_ptr)
      );
            [cm] "r"(cm), [dst_ptr] "r"(dst_ptr));
    }

    /* Next row... */
@@ -331,26 +319,21 @@ static void convolve_vert_64_dspr2(const uint8_t *src,
void vpx_convolve8_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                              uint8_t *dst, ptrdiff_t dst_stride,
                              const int16_t *filter_x, int x_step_q4,
                              const int16_t *filter_y, int y_step_q4,
                              int w, int h) {
                              const int16_t *filter_y, int y_step_q4, int w,
                              int h) {
  assert(y_step_q4 == 16);
  assert(((const int32_t *)filter_y)[1] != 0x800000);

  if (((const int32_t *)filter_y)[0] == 0) {
    vpx_convolve2_vert_dspr2(src, src_stride,
                             dst, dst_stride,
                             filter_x, x_step_q4,
                             filter_y, y_step_q4,
                             w, h);
    vpx_convolve2_vert_dspr2(src, src_stride, dst, dst_stride, filter_x,
                             x_step_q4, filter_y, y_step_q4, w, h);
  } else {
    uint32_t pos = 38;

    /* bit positon for extract from acc */
    __asm__ __volatile__ (
      "wrdsp      %[pos],     1           \n\t"
    __asm__ __volatile__("wrdsp      %[pos],     1           \n\t"
                         :
      : [pos] "r" (pos)
    );
                         : [pos] "r"(pos));

    prefetch_store(dst);

@@ -359,22 +342,15 @@ void vpx_convolve8_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
      case 8:
      case 16:
      case 32:
        convolve_vert_4_dspr2(src, src_stride,
                              dst, dst_stride,
                              filter_y, w, h);
        convolve_vert_4_dspr2(src, src_stride, dst, dst_stride, filter_y, w, h);
        break;
      case 64:
        prefetch_store(dst + 32);
        convolve_vert_64_dspr2(src, src_stride,
                               dst, dst_stride,
                               filter_y, h);
        convolve_vert_64_dspr2(src, src_stride, dst, dst_stride, filter_y, h);
        break;
      default:
        vpx_convolve8_vert_c(src, src_stride,
                             dst, dst_stride,
                             filter_x, x_step_q4,
                             filter_y, y_step_q4,
                             w, h);
        vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x,
                             x_step_q4, filter_y, y_step_q4, w, h);
        break;
    }
  }

@@ -25,8 +25,8 @@ extern "C" {
void vpx_convolve2_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                               uint8_t *dst, ptrdiff_t dst_stride,
                               const int16_t *filter_x, int x_step_q4,
                               const int16_t *filter_y, int y_step_q4,
                               int w, int h);
                               const int16_t *filter_y, int y_step_q4, int w,
                               int h);

void vpx_convolve2_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                                   uint8_t *dst, ptrdiff_t dst_stride,
@@ -37,19 +37,18 @@ void vpx_convolve2_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
void vpx_convolve2_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                                  uint8_t *dst, ptrdiff_t dst_stride,
                                  const int16_t *filter_x, int x_step_q4,
                                  const int16_t *filter_y, int y_step_q4,
                                  int w, int h);
                                  const int16_t *filter_y, int y_step_q4, int w,
                                  int h);

void vpx_convolve2_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                         uint8_t *dst, ptrdiff_t dst_stride,
                         const int16_t *filter,
                         int w, int h);
void vpx_convolve2_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
                         ptrdiff_t dst_stride, const int16_t *filter, int w,
                         int h);

void vpx_convolve2_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                              uint8_t *dst, ptrdiff_t dst_stride,
                              const int16_t *filter_x, int x_step_q4,
                              const int16_t *filter_y, int y_step_q4,
                              int w, int h);
                              const int16_t *filter_y, int y_step_q4, int w,
                              int h);

#endif  // #if HAVE_DSPR2
#ifdef __cplusplus

@@ -13,23 +13,22 @@

extern const int16_t vpx_rv[];

#define VPX_TRANSPOSE8x16_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7,  \
                                out0, out1, out2, out3,                  \
                                out4, out5, out6, out7,                  \
                                out8, out9, out10, out11,                \
                                out12, out13, out14, out15)              \
#define VPX_TRANSPOSE8x16_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7, out0,  \
                                out1, out2, out3, out4, out5, out6, out7,      \
                                out8, out9, out10, out11, out12, out13, out14, \
                                out15)                                         \
  {                                                                            \
    v8i16 temp0, temp1, temp2, temp3, temp4;                                   \
    v8i16 temp5, temp6, temp7, temp8, temp9;                                   \
                                                                               \
    ILVR_B4_SH(in1, in0, in3, in2, in5, in4, in7, in6,                   \
               temp0, temp1, temp2, temp3);                              \
    ILVR_B4_SH(in1, in0, in3, in2, in5, in4, in7, in6, temp0, temp1, temp2,    \
               temp3);                                                         \
    ILVR_H2_SH(temp1, temp0, temp3, temp2, temp4, temp5);                      \
    ILVRL_W2_SH(temp5, temp4, temp6, temp7);                                   \
    ILVL_H2_SH(temp1, temp0, temp3, temp2, temp4, temp5);                      \
    ILVRL_W2_SH(temp5, temp4, temp8, temp9);                                   \
    ILVL_B4_SH(in1, in0, in3, in2, in5, in4, in7, in6,                   \
               temp0, temp1, temp2, temp3);                              \
    ILVL_B4_SH(in1, in0, in3, in2, in5, in4, in7, in6, temp0, temp1, temp2,    \
               temp3);                                                         \
    ILVR_H2_SH(temp1, temp0, temp3, temp2, temp4, temp5);                      \
    ILVRL_W2_UB(temp5, temp4, out8, out10);                                    \
    ILVL_H2_SH(temp1, temp0, temp3, temp2, temp4, temp5);                      \
@@ -48,8 +47,8 @@ extern const int16_t vpx_rv[];
    out7 = (v16u8)__msa_ilvl_d((v2i64)out6, (v2i64)out6);                      \
  }

#define VPX_AVER_IF_RETAIN(above2_in, above1_in, src_in,    \
                           below1_in, below2_in, ref, out)  \
#define VPX_AVER_IF_RETAIN(above2_in, above1_in, src_in, below1_in, below2_in, \
                           ref, out)                                           \
  {                                                                            \
    v16u8 temp0, temp1;                                                        \
                                                                               \
@@ -71,8 +70,8 @@ extern const int16_t vpx_rv[];
    out = __msa_bmz_v(out, src_in, temp0);                                     \
  }

#define TRANSPOSE12x16_B(in0, in1, in2, in3, in4, in5, in6, in7,        \
                         in8, in9, in10, in11, in12, in13, in14, in15)  \
#define TRANSPOSE12x16_B(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9,    \
                         in10, in11, in12, in13, in14, in15)                  \
  {                                                                           \
    v8i16 temp0, temp1, temp2, temp3, temp4;                                  \
    v8i16 temp5, temp6, temp7, temp8, temp9;                                  \
@@ -98,10 +97,10 @@ extern const int16_t vpx_rv[];
    ILVR_D2_UB(temp6, temp2, temp7, temp3, in4, in6);                         \
    in5 = (v16u8)__msa_ilvl_d((v2i64)temp6, (v2i64)temp2);                    \
    in7 = (v16u8)__msa_ilvl_d((v2i64)temp7, (v2i64)temp3);                    \
    ILVL_B4_SH(in9, in8, in11, in10, in13, in12, in15, in14,            \
               temp2, temp3, temp4, temp5);                             \
    ILVR_H4_SH(temp9, temp8, temp1, temp0, temp3, temp2, temp5, temp4,  \
               temp6, temp7, temp8, temp9);                             \
    ILVL_B4_SH(in9, in8, in11, in10, in13, in12, in15, in14, temp2, temp3,    \
               temp4, temp5);                                                 \
    ILVR_H4_SH(temp9, temp8, temp1, temp0, temp3, temp2, temp5, temp4, temp6, \
               temp7, temp8, temp9);                                          \
    ILVR_W2_SH(temp7, temp6, temp9, temp8, temp0, temp1);                     \
    in8 = (v16u8)__msa_ilvr_d((v2i64)temp1, (v2i64)temp0);                    \
    in9 = (v16u8)__msa_ilvl_d((v2i64)temp1, (v2i64)temp0);                    \
@@ -110,8 +109,8 @@ extern const int16_t vpx_rv[];
    in11 = (v16u8)__msa_ilvl_d((v2i64)temp3, (v2i64)temp2);                   \
  }

#define VPX_TRANSPOSE12x8_UB_UB(in0, in1, in2, in3, in4, in5,    \
                                in6, in7, in8, in9, in10, in11)  \
#define VPX_TRANSPOSE12x8_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7, in8, \
                                in9, in10, in11)                             \
  {                                                                          \
    v8i16 temp0, temp1, temp2, temp3;                                        \
    v8i16 temp4, temp5, temp6, temp7;                                        \

@@ -27,10 +27,10 @@ static void fdct8x32_1d_column_load_butterfly(const int16_t *input,
  SLLI_4V(in4, in5, in6, in7, 2);
  SLLI_4V(in0_1, in1_1, in2_1, in3_1, 2);
  SLLI_4V(in4_1, in5_1, in6_1, in7_1, 2);
  BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7,
              step0, step1, step2, step3, in4, in5, in6, in7);
  BUTTERFLY_8(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1,
              step0_1, step1_1, step2_1, step3_1, in4_1, in5_1, in6_1, in7_1);
  BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7, step0, step1, step2,
              step3, in4, in5, in6, in7);
  BUTTERFLY_8(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1, step0_1,
              step1_1, step2_1, step3_1, in4_1, in5_1, in6_1, in7_1);
  ST_SH4(step0, step1, step2, step3, temp_buff, 8);
  ST_SH4(in4, in5, in6, in7, temp_buff + (28 * 8), 8);
  ST_SH4(step0_1, step1_1, step2_1, step3_1, temp_buff + (4 * 8), 8);
@@ -45,10 +45,10 @@ static void fdct8x32_1d_column_load_butterfly(const int16_t *input,
  SLLI_4V(in4, in5, in6, in7, 2);
  SLLI_4V(in0_1, in1_1, in2_1, in3_1, 2);
  SLLI_4V(in4_1, in5_1, in6_1, in7_1, 2);
  BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7,
              step0, step1, step2, step3, in4, in5, in6, in7);
  BUTTERFLY_8(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1,
              step0_1, step1_1, step2_1, step3_1, in4_1, in5_1, in6_1, in7_1);
  BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7, step0, step1, step2,
              step3, in4, in5, in6, in7);
  BUTTERFLY_8(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1, step0_1,
              step1_1, step2_1, step3_1, in4_1, in5_1, in6_1, in7_1);
  ST_SH4(step0, step1, step2, step3, temp_buff + (8 * 8), 8);
  ST_SH4(in4, in5, in6, in7, temp_buff + (20 * 8), 8);
  ST_SH4(step0_1, step1_1, step2_1, step3_1, temp_buff + (12 * 8), 8);
@@ -64,12 +64,12 @@ static void fdct8x32_1d_column_even_store(int16_t *input, int16_t *temp) {
  /* fdct even */
  LD_SH4(input, 8, in0, in1, in2, in3);
  LD_SH4(input + 96, 8, in12, in13, in14, in15);
  BUTTERFLY_8(in0, in1, in2, in3, in12, in13, in14, in15,
              vec0, vec1, vec2, vec3, in12, in13, in14, in15);
  BUTTERFLY_8(in0, in1, in2, in3, in12, in13, in14, in15, vec0, vec1, vec2,
              vec3, in12, in13, in14, in15);
  LD_SH4(input + 32, 8, in4, in5, in6, in7);
  LD_SH4(input + 64, 8, in8, in9, in10, in11);
  BUTTERFLY_8(in4, in5, in6, in7, in8, in9, in10, in11,
              vec4, vec5, vec6, vec7, in8, in9, in10, in11);
  BUTTERFLY_8(in4, in5, in6, in7, in8, in9, in10, in11, vec4, vec5, vec6, vec7,
              in8, in9, in10, in11);

  /* Stage 3 */
  ADD4(vec0, vec7, vec1, vec6, vec2, vec5, vec3, vec4, in0, in1, in2, in3);
@@ -258,28 +258,26 @@ static void fdct8x32_1d_row_load_butterfly(int16_t *temp_buff,

  LD_SH8(temp_buff, 32, in0, in1, in2, in3, in4, in5, in6, in7);
  LD_SH8(temp_buff + 24, 32, in8, in9, in10, in11, in12, in13, in14, in15);
  TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
                     in0, in1, in2, in3, in4, in5, in6, in7);
  TRANSPOSE8x8_SH_SH(in8, in9, in10, in11, in12, in13, in14, in15,
                     in8, in9, in10, in11, in12, in13, in14, in15);
  BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7,
               in8, in9, in10, in11, in12, in13, in14, in15,
               step0, step1, step2, step3, step4, step5, step6, step7,
               in8, in9, in10, in11, in12, in13, in14, in15);
  TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
                     in4, in5, in6, in7);
  TRANSPOSE8x8_SH_SH(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9,
                     in10, in11, in12, in13, in14, in15);
  BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11,
               in12, in13, in14, in15, step0, step1, step2, step3, step4, step5,
               step6, step7, in8, in9, in10, in11, in12, in13, in14, in15);
  ST_SH8(step0, step1, step2, step3, step4, step5, step6, step7, output, 8);
  ST_SH8(in8, in9, in10, in11, in12, in13, in14, in15, (output + 24 * 8), 8);

  /* 2nd set */
  LD_SH8(temp_buff + 8, 32, in0, in1, in2, in3, in4, in5, in6, in7);
  LD_SH8(temp_buff + 16, 32, in8, in9, in10, in11, in12, in13, in14, in15);
  TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
                     in0, in1, in2, in3, in4, in5, in6, in7);
  TRANSPOSE8x8_SH_SH(in8, in9, in10, in11, in12, in13, in14, in15,
                     in8, in9, in10, in11, in12, in13, in14, in15);
  BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7,
               in8, in9, in10, in11, in12, in13, in14, in15,
               step0, step1, step2, step3, step4, step5, step6, step7,
               in8, in9, in10, in11, in12, in13, in14, in15);
  TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
                     in4, in5, in6, in7);
  TRANSPOSE8x8_SH_SH(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9,
                     in10, in11, in12, in13, in14, in15);
  BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11,
               in12, in13, in14, in15, step0, step1, step2, step3, step4, step5,
               step6, step7, in8, in9, in10, in11, in12, in13, in14, in15);
  ST_SH8(step0, step1, step2, step3, step4, step5, step6, step7,
         (output + 8 * 8), 8);
  ST_SH8(in8, in9, in10, in11, in12, in13, in14, in15, (output + 16 * 8), 8);
@@ -299,10 +297,9 @@ static void fdct8x32_1d_row_even_4x(int16_t *input, int16_t *interm_ptr,
  LD_SH8(input, 8, in0, in1, in2, in3, in4, in5, in6, in7);
  LD_SH8(input + 64, 8, in8, in9, in10, in11, in12, in13, in14, in15);

  BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7,
               in8, in9, in10, in11, in12, in13, in14, in15,
               vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7,
               in8, in9, in10, in11, in12, in13, in14, in15);
  BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11,
               in12, in13, in14, in15, vec0, vec1, vec2, vec3, vec4, vec5, vec6,
               vec7, in8, in9, in10, in11, in12, in13, in14, in15);
  ST_SH8(vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, interm_ptr, 8);
  ST_SH8(in8, in9, in10, in11, in12, in13, in14, in15, interm_ptr + 64, 8);

@@ -315,19 +312,19 @@ static void fdct8x32_1d_row_even_4x(int16_t *input, int16_t *interm_ptr,
  UNPCK_SH_SW(vec5, vec5_l, vec5_r);
  UNPCK_SH_SW(vec6, vec6_l, vec6_r);
  UNPCK_SH_SW(vec7, vec7_l, vec7_r);
  ADD4(vec0_r, vec7_r, vec1_r, vec6_r, vec2_r, vec5_r, vec3_r, vec4_r,
       tmp0_w, tmp1_w, tmp2_w, tmp3_w);
  ADD4(vec0_r, vec7_r, vec1_r, vec6_r, vec2_r, vec5_r, vec3_r, vec4_r, tmp0_w,
       tmp1_w, tmp2_w, tmp3_w);
  BUTTERFLY_4(tmp0_w, tmp1_w, tmp2_w, tmp3_w, vec4_r, vec6_r, vec7_r, vec5_r);
  ADD4(vec0_l, vec7_l, vec1_l, vec6_l, vec2_l, vec5_l, vec3_l, vec4_l,
       vec0_r, vec1_r, vec2_r, vec3_r);
  ADD4(vec0_l, vec7_l, vec1_l, vec6_l, vec2_l, vec5_l, vec3_l, vec4_l, vec0_r,
       vec1_r, vec2_r, vec3_r);

  tmp3_w = vec0_r + vec3_r;
  vec0_r = vec0_r - vec3_r;
  vec3_r = vec1_r + vec2_r;
  vec1_r = vec1_r - vec2_r;

  DOTP_CONST_PAIR_W(vec4_r, vec6_r, tmp3_w, vec3_r, cospi_16_64,
                    cospi_16_64, vec4_r, tmp3_w, vec6_r, vec3_r);
  DOTP_CONST_PAIR_W(vec4_r, vec6_r, tmp3_w, vec3_r, cospi_16_64, cospi_16_64,
                    vec4_r, tmp3_w, vec6_r, vec3_r);
  FDCT32_POSTPROC_NEG_W(vec4_r);
  FDCT32_POSTPROC_NEG_W(tmp3_w);
  FDCT32_POSTPROC_NEG_W(vec6_r);
@@ -335,8 +332,8 @@ static void fdct8x32_1d_row_even_4x(int16_t *input, int16_t *interm_ptr,
  PCKEV_H2_SH(vec4_r, tmp3_w, vec6_r, vec3_r, vec4, vec5);
  ST_SH2(vec5, vec4, out, 8);

  DOTP_CONST_PAIR_W(vec5_r, vec7_r, vec0_r, vec1_r, cospi_24_64,
                    cospi_8_64, vec4_r, tmp3_w, vec6_r, vec3_r);
  DOTP_CONST_PAIR_W(vec5_r, vec7_r, vec0_r, vec1_r, cospi_24_64, cospi_8_64,
                    vec4_r, tmp3_w, vec6_r, vec3_r);
  FDCT32_POSTPROC_NEG_W(vec4_r);
  FDCT32_POSTPROC_NEG_W(tmp3_w);
  FDCT32_POSTPROC_NEG_W(vec6_r);
@@ -401,10 +398,9 @@ static void fdct8x32_1d_row_even(int16_t *temp, int16_t *out) {
  LD_SH8(temp, 8, in0, in1, in2, in3, in4, in5, in6, in7);
  LD_SH8(temp + 64, 8, in8, in9, in10, in11, in12, in13, in14, in15);

  BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7,
               in8, in9, in10, in11, in12, in13, in14, in15,
               vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7,
               in8, in9, in10, in11, in12, in13, in14, in15);
  BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11,
               in12, in13, in14, in15, vec0, vec1, vec2, vec3, vec4, vec5, vec6,
               vec7, in8, in9, in10, in11, in12, in13, in14, in15);

  /* Stage 3 */
  ADD4(vec0, vec7, vec1, vec6, vec2, vec5, vec3, vec4, in0, in1, in2, in3);
@@ -610,8 +606,8 @@ static void fdct8x32_1d_row_transpose_store(int16_t *temp, int16_t *output) {
  in3 = LD_SH(temp + 192);
  in5 = LD_SH(temp + 216);

  TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
                     in0, in1, in2, in3, in4, in5, in6, in7);
  TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
                     in4, in5, in6, in7);

  /* 2nd set */
  in0_1 = LD_SH(temp + 16);
@@ -637,10 +633,10 @@ static void fdct8x32_1d_row_transpose_store(int16_t *temp, int16_t *output) {
  in6 = LD_SH(temp + 104);
  in7 = LD_SH(temp + 144);

  ST_SH8(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1,
         output + 8, 32);
  TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
                     in0, in1, in2, in3, in4, in5, in6, in7);
  ST_SH8(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1, output + 8,
         32);
  TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
                     in4, in5, in6, in7);
  ST_SH8(in0, in1, in2, in3, in4, in5, in6, in7, output + 16, 32);

  /* 4th set */
@@ -655,12 +651,11 @@ static void fdct8x32_1d_row_transpose_store(int16_t *temp, int16_t *output) {

  TRANSPOSE8x8_SH_SH(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1,
                     in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1);
  ST_SH8(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1,
         output + 24, 32);
  ST_SH8(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1, output + 24,
         32);
}

static void fdct32x8_1d_row(int16_t *temp, int16_t *temp_buf,
                            int16_t *output) {
static void fdct32x8_1d_row(int16_t *temp, int16_t *temp_buf, int16_t *output) {
  fdct8x32_1d_row_load_butterfly(temp, temp_buf);
  fdct8x32_1d_row_even(temp_buf, temp_buf);
  fdct8x32_1d_row_odd(temp_buf + 128, temp, temp_buf + 128);
@@ -706,10 +701,9 @@ static void fdct8x32_1d_row_even_rd(int16_t *temp, int16_t *out) {
  LD_SH8(temp, 8, in0, in1, in2, in3, in4, in5, in6, in7);
  LD_SH8(temp + 64, 8, in8, in9, in10, in11, in12, in13, in14, in15);

  BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7,
               in8, in9, in10, in11, in12, in13, in14, in15,
               vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7,
               in8, in9, in10, in11, in12, in13, in14, in15);
  BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11,
               in12, in13, in14, in15, vec0, vec1, vec2, vec3, vec4, vec5, vec6,
               vec7, in8, in9, in10, in11, in12, in13, in14, in15);
  FDCT_POSTPROC_2V_NEG_H(vec0, vec1);
  FDCT_POSTPROC_2V_NEG_H(vec2, vec3);
  FDCT_POSTPROC_2V_NEG_H(vec4, vec5);

@@ -22,20 +22,20 @@ void fdct8x16_1d_column(const int16_t *input, int16_t *tmp_ptr,
                  -cospi_8_64, -cospi_24_64, cospi_12_64, cospi_20_64 };
  v8i16 coeff1 = { cospi_2_64,  cospi_30_64, cospi_14_64, cospi_18_64,
                   cospi_10_64, cospi_22_64, cospi_6_64,  cospi_26_64 };
  v8i16 coeff2 = { -cospi_2_64, -cospi_10_64, -cospi_18_64, -cospi_26_64,
                   0, 0, 0, 0 };
  v8i16 coeff2 = {
    -cospi_2_64, -cospi_10_64, -cospi_18_64, -cospi_26_64, 0, 0, 0, 0
  };

  LD_SH16(input, src_stride,
          in0, in1, in2, in3, in4, in5, in6, in7,
          in8, in9, in10, in11, in12, in13, in14, in15);
  LD_SH16(input, src_stride, in0, in1, in2, in3, in4, in5, in6, in7, in8, in9,
          in10, in11, in12, in13, in14, in15);
  SLLI_4V(in0, in1, in2, in3, 2);
  SLLI_4V(in4, in5, in6, in7, 2);
  SLLI_4V(in8, in9, in10, in11, 2);
  SLLI_4V(in12, in13, in14, in15, 2);
  ADD4(in0, in15, in1, in14, in2, in13, in3, in12, tmp0, tmp1, tmp2, tmp3);
  ADD4(in4, in11, in5, in10, in6, in9, in7, in8, tmp4, tmp5, tmp6, tmp7);
  FDCT8x16_EVEN(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7,
                tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
  FDCT8x16_EVEN(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp0, tmp1,
                tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
  ST_SH8(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp_ptr, 32);
  SUB4(in0, in15, in1, in14, in2, in13, in3, in12, in15, in14, in13, in12);
  SUB4(in4, in11, in5, in10, in6, in9, in7, in8, in11, in10, in9, in8);
@@ -137,10 +137,10 @@ void fdct16x8_1d_row(int16_t *input, int16_t *output) {

  LD_SH8(input, 16, in0, in1, in2, in3, in4, in5, in6, in7);
  LD_SH8((input + 8), 16, in8, in9, in10, in11, in12, in13, in14, in15);
  TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
                     in0, in1, in2, in3, in4, in5, in6, in7);
  TRANSPOSE8x8_SH_SH(in8, in9, in10, in11, in12, in13, in14, in15,
                     in8, in9, in10, in11, in12, in13, in14, in15);
  TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
                     in4, in5, in6, in7);
  TRANSPOSE8x8_SH_SH(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9,
                     in10, in11, in12, in13, in14, in15);
  ADD4(in0, 1, in1, 1, in2, 1, in3, 1, in0, in1, in2, in3);
  ADD4(in4, 1, in5, 1, in6, 1, in7, 1, in4, in5, in6, in7);
  ADD4(in8, 1, in9, 1, in10, 1, in11, 1, in8, in9, in10, in11);
@@ -150,19 +150,19 @@ void fdct16x8_1d_row(int16_t *input, int16_t *output) {
  SRA_4V(in8, in9, in10, in11, 2);
  SRA_4V(in12, in13, in14, in15, 2);
  BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11,
               in12, in13, in14, in15, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5,
               tmp6, tmp7, in8, in9, in10, in11, in12, in13, in14, in15);
               in12, in13, in14, in15, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6,
               tmp7, in8, in9, in10, in11, in12, in13, in14, in15);
  ST_SH8(in8, in9, in10, in11, in12, in13, in14, in15, input, 16);
  FDCT8x16_EVEN(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7,
                tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
  FDCT8x16_EVEN(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp0, tmp1,
                tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
  LD_SH8(input, 16, in8, in9, in10, in11, in12, in13, in14, in15);
  FDCT8x16_ODD(in8, in9, in10, in11, in12, in13, in14, in15,
                   in0, in1, in2, in3, in4, in5, in6, in7);
  TRANSPOSE8x8_SH_SH(tmp0, in0, tmp1, in1, tmp2, in2, tmp3, in3,
                     tmp0, in0, tmp1, in1, tmp2, in2, tmp3, in3);
  FDCT8x16_ODD(in8, in9, in10, in11, in12, in13, in14, in15, in0, in1, in2, in3,
               in4, in5, in6, in7);
  TRANSPOSE8x8_SH_SH(tmp0, in0, tmp1, in1, tmp2, in2, tmp3, in3, tmp0, in0,
                     tmp1, in1, tmp2, in2, tmp3, in3);
  ST_SH8(tmp0, in0, tmp1, in1, tmp2, in2, tmp3, in3, output, 16);
  TRANSPOSE8x8_SH_SH(tmp4, in4, tmp5, in5, tmp6, in6, tmp7, in7,
                     tmp4, in4, tmp5, in5, tmp6, in6, tmp7, in7);
  TRANSPOSE8x8_SH_SH(tmp4, in4, tmp5, in5, tmp6, in6, tmp7, in7, tmp4, in4,
                     tmp5, in5, tmp6, in6, tmp7, in7);
  ST_SH8(tmp4, in4, tmp5, in5, tmp6, in6, tmp7, in7, output + 8, 16);
}

@@ -203,14 +203,14 @@ void vpx_fdct8x8_msa(const int16_t *input, int16_t *output,
  LD_SH8(input, src_stride, in0, in1, in2, in3, in4, in5, in6, in7);
  SLLI_4V(in0, in1, in2, in3, 2);
  SLLI_4V(in4, in5, in6, in7, 2);
  VP9_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7,
            in0, in1, in2, in3, in4, in5, in6, in7);
  TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
                     in0, in1, in2, in3, in4, in5, in6, in7);
  VP9_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7,
            in0, in1, in2, in3, in4, in5, in6, in7);
  TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
                     in0, in1, in2, in3, in4, in5, in6, in7);
  VP9_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4,
            in5, in6, in7);
  TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
                     in4, in5, in6, in7);
  VP9_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4,
            in5, in6, in7);
  TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
                     in4, in5, in6, in7);
  SRLI_AVE_S_4V_H(in0, in1, in2, in3, in4, in5, in6, in7);
  ST_SH8(in0, in1, in2, in3, in4, in5, in6, in7, output, 8);
}

@@ -14,27 +14,30 @@
#include "vpx_dsp/mips/txfm_macros_msa.h"
#include "vpx_dsp/txfm_common.h"

#define LD_HADD(psrc, stride) ({                                      \
#define LD_HADD(psrc, stride)                                                  \
  ({                                                                           \
    v8i16 in0_m, in1_m, in2_m, in3_m, in4_m, in5_m, in6_m, in7_m;              \
    v4i32 vec_w_m;                                                             \
                                                                               \
    LD_SH4((psrc), stride, in0_m, in1_m, in2_m, in3_m);                        \
    ADD2(in0_m, in1_m, in2_m, in3_m, in0_m, in2_m);                            \
    LD_SH4(((psrc) + 4 * stride), stride, in4_m, in5_m, in6_m, in7_m);         \
  ADD4(in4_m, in5_m, in6_m, in7_m, in0_m, in2_m, in4_m, in6_m,        \
       in4_m, in6_m, in0_m, in4_m);                                   \
    ADD4(in4_m, in5_m, in6_m, in7_m, in0_m, in2_m, in4_m, in6_m, in4_m, in6_m, \
         in0_m, in4_m);                                                        \
    in0_m += in4_m;                                                            \
                                                                               \
    vec_w_m = __msa_hadd_s_w(in0_m, in0_m);                                    \
    HADD_SW_S32(vec_w_m);                                                      \
  })

#define VP9_FDCT4(in0, in1, in2, in3, out0, out1, out2, out3) {     \
#define VP9_FDCT4(in0, in1, in2, in3, out0, out1, out2, out3)                  \
  {                                                                            \
    v8i16 cnst0_m, cnst1_m, cnst2_m, cnst3_m;                                  \
    v8i16 vec0_m, vec1_m, vec2_m, vec3_m;                                      \
    v4i32 vec4_m, vec5_m, vec6_m, vec7_m;                                      \
  v8i16 coeff_m = { cospi_16_64, -cospi_16_64, cospi_8_64,          \
                    cospi_24_64, -cospi_8_64, 0, 0, 0 };            \
    v8i16 coeff_m = {                                                          \
      cospi_16_64, -cospi_16_64, cospi_8_64, cospi_24_64, -cospi_8_64, 0, 0, 0 \
    };                                                                         \
                                                                               \
    BUTTERFLY_4(in0, in1, in2, in3, vec0_m, vec1_m, vec2_m, vec3_m);           \
    ILVR_H2_SH(vec1_m, vec0_m, vec3_m, vec2_m, vec0_m, vec2_m);                \
@@ -52,32 +55,33 @@
    vec6_m = __msa_dotp_s_w(vec2_m, cnst2_m);                                  \
                                                                               \
    SRARI_W4_SW(vec4_m, vec5_m, vec6_m, vec7_m, DCT_CONST_BITS);               \
  PCKEV_H4_SH(vec4_m, vec4_m, vec5_m, vec5_m, vec6_m, vec6_m,       \
              vec7_m, vec7_m, out0, out2, out1, out3);              \
    PCKEV_H4_SH(vec4_m, vec4_m, vec5_m, vec5_m, vec6_m, vec6_m, vec7_m,        \
                vec7_m, out0, out2, out1, out3);                               \
  }

#define SRLI_AVE_S_4V_H(in0, in1, in2, in3, in4, in5, in6, in7) {        \
#define SRLI_AVE_S_4V_H(in0, in1, in2, in3, in4, in5, in6, in7)              \
  {                                                                          \
    v8i16 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m;    \
                                                                             \
    SRLI_H4_SH(in0, in1, in2, in3, vec0_m, vec1_m, vec2_m, vec3_m, 15);      \
    SRLI_H4_SH(in4, in5, in6, in7, vec4_m, vec5_m, vec6_m, vec7_m, 15);      \
  AVE_SH4_SH(vec0_m, in0, vec1_m, in1, vec2_m, in2, vec3_m, in3,         \
             in0, in1, in2, in3);                                        \
  AVE_SH4_SH(vec4_m, in4, vec5_m, in5, vec6_m, in6, vec7_m, in7,         \
             in4, in5, in6, in7);                                        \
    AVE_SH4_SH(vec0_m, in0, vec1_m, in1, vec2_m, in2, vec3_m, in3, in0, in1, \
               in2, in3);                                                    \
    AVE_SH4_SH(vec4_m, in4, vec5_m, in5, vec6_m, in6, vec7_m, in7, in4, in5, \
               in6, in7);                                                    \
  }

#define VP9_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7,            \
                  out0, out1, out2, out3, out4, out5, out6, out7) {  \
#define VP9_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2,  \
                  out3, out4, out5, out6, out7)                              \
  {                                                                          \
    v8i16 s0_m, s1_m, s2_m, s3_m, s4_m, s5_m, s6_m;                          \
    v8i16 s7_m, x0_m, x1_m, x2_m, x3_m;                                      \
  v8i16 coeff_m = { cospi_16_64, -cospi_16_64, cospi_8_64,           \
                    cospi_24_64, cospi_4_64, cospi_28_64,            \
                    cospi_12_64, cospi_20_64 };                      \
    v8i16 coeff_m = { cospi_16_64, -cospi_16_64, cospi_8_64,  cospi_24_64,   \
                      cospi_4_64,  cospi_28_64,  cospi_12_64, cospi_20_64 }; \
                                                                             \
    /* FDCT stage1 */                                                        \
  BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7,                \
              s0_m, s1_m, s2_m, s3_m, s4_m, s5_m, s6_m, s7_m);       \
    BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7, s0_m, s1_m, s2_m,    \
                s3_m, s4_m, s5_m, s6_m, s7_m);                               \
    BUTTERFLY_4(s0_m, s1_m, s2_m, s3_m, x0_m, x1_m, x2_m, x3_m);             \
    ILVL_H2_SH(x1_m, x0_m, x3_m, x2_m, s0_m, s2_m);                          \
    ILVR_H2_SH(x1_m, x0_m, x3_m, x2_m, s1_m, s3_m);                          \
@@ -127,16 +131,17 @@
    out3 = DOT_SHIFT_RIGHT_PCK_H(s6_m, s7_m, x2_m);                          \
  }

#define FDCT8x16_EVEN(in0, in1, in2, in3, in4, in5, in6, in7,                \
                      out0, out1, out2, out3, out4, out5, out6, out7) {      \
#define FDCT8x16_EVEN(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1,    \
                      out2, out3, out4, out5, out6, out7)                    \
  {                                                                          \
    v8i16 s0_m, s1_m, s2_m, s3_m, s4_m, s5_m, s6_m, s7_m;                    \
    v8i16 x0_m, x1_m, x2_m, x3_m;                                            \
    v8i16 coeff_m = { cospi_16_64, -cospi_16_64, cospi_8_64,  cospi_24_64,   \
                      cospi_4_64,  cospi_28_64,  cospi_12_64, cospi_20_64 }; \
                                                                             \
    /* FDCT stage1 */                                                        \
  BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7,                        \
              s0_m, s1_m, s2_m, s3_m, s4_m, s5_m, s6_m, s7_m);               \
    BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7, s0_m, s1_m, s2_m,    \
                s3_m, s4_m, s5_m, s6_m, s7_m);                               \
    BUTTERFLY_4(s0_m, s1_m, s2_m, s3_m, x0_m, x1_m, x2_m, x3_m);             \
    ILVL_H2_SH(x1_m, x0_m, x3_m, x2_m, s0_m, s2_m);                          \
    ILVR_H2_SH(x1_m, x0_m, x3_m, x2_m, s1_m, s3_m);                          \
@@ -186,23 +191,22 @@
    out3 = DOT_SHIFT_RIGHT_PCK_H(s6_m, s7_m, x2_m);                          \
  }

#define FDCT8x16_ODD(input0, input1, input2, input3,               \
                     input4, input5, input6, input7,               \
                     out1, out3, out5, out7,                       \
                     out9, out11, out13, out15) {                  \
#define FDCT8x16_ODD(input0, input1, input2, input3, input4, input5, input6,   \
                     input7, out1, out3, out5, out7, out9, out11, out13,       \
                     out15)                                                    \
  {                                                                            \
    v8i16 stp21_m, stp22_m, stp23_m, stp24_m, stp25_m, stp26_m;                \
    v8i16 stp30_m, stp31_m, stp32_m, stp33_m, stp34_m, stp35_m;                \
    v8i16 stp36_m, stp37_m, vec0_m, vec1_m;                                    \
    v8i16 vec2_m, vec3_m, vec4_m, vec5_m, vec6_m;                              \
    v8i16 cnst0_m, cnst1_m, cnst4_m, cnst5_m;                                  \
  v8i16 coeff_m = { cospi_16_64, -cospi_16_64, cospi_8_64,         \
                    cospi_24_64, -cospi_8_64, -cospi_24_64,        \
                    cospi_12_64, cospi_20_64 };                    \
  v8i16 coeff1_m = { cospi_2_64, cospi_30_64, cospi_14_64,         \
                     cospi_18_64, cospi_10_64, cospi_22_64,        \
                     cospi_6_64, cospi_26_64 };                    \
  v8i16 coeff2_m = { -cospi_2_64, -cospi_10_64, -cospi_18_64,      \
                     -cospi_26_64, 0, 0, 0, 0 };                   \
    v8i16 coeff_m = { cospi_16_64, -cospi_16_64, cospi_8_64,  cospi_24_64,     \
                      -cospi_8_64, -cospi_24_64, cospi_12_64, cospi_20_64 };   \
    v8i16 coeff1_m = { cospi_2_64,  cospi_30_64, cospi_14_64, cospi_18_64,     \
                       cospi_10_64, cospi_22_64, cospi_6_64,  cospi_26_64 };   \
    v8i16 coeff2_m = {                                                         \
      -cospi_2_64, -cospi_10_64, -cospi_18_64, -cospi_26_64, 0, 0, 0, 0        \
    };                                                                         \
                                                                               \
    /* stp 1 */                                                                \
    ILVL_H2_SH(input2, input5, input3, input4, vec2_m, vec4_m);                \
@@ -218,10 +222,10 @@
    stp23_m = DOT_SHIFT_RIGHT_PCK_H(vec4_m, vec5_m, cnst5_m);                  \
                                                                               \
    /* stp2 */                                                                 \
  BUTTERFLY_4(input0, input1, stp22_m, stp23_m,                    \
              stp30_m, stp31_m, stp32_m, stp33_m);                 \
  BUTTERFLY_4(input7, input6, stp25_m, stp24_m,                    \
              stp37_m, stp36_m, stp35_m, stp34_m);                 \
    BUTTERFLY_4(input0, input1, stp22_m, stp23_m, stp30_m, stp31_m, stp32_m,   \
                stp33_m);                                                      \
    BUTTERFLY_4(input7, input6, stp25_m, stp24_m, stp37_m, stp36_m, stp35_m,   \
                stp34_m);                                                      \
                                                                               \
    ILVL_H2_SH(stp36_m, stp31_m, stp35_m, stp32_m, vec2_m, vec4_m);            \
    ILVR_H2_SH(stp36_m, stp31_m, stp35_m, stp32_m, vec3_m, vec5_m);            \
@@ -243,10 +247,10 @@
    stp22_m = DOT_SHIFT_RIGHT_PCK_H(vec4_m, vec5_m, cnst1_m);                  \
                                                                               \
    /* stp4 */                                                                 \
  BUTTERFLY_4(stp30_m, stp37_m, stp26_m, stp21_m,                  \
              vec6_m, vec2_m, vec4_m, vec5_m);                     \
  BUTTERFLY_4(stp33_m, stp34_m, stp25_m, stp22_m,                  \
              stp21_m, stp23_m, stp24_m, stp31_m);                 \
    BUTTERFLY_4(stp30_m, stp37_m, stp26_m, stp21_m, vec6_m, vec2_m, vec4_m,    \
                vec5_m);                                                       \
    BUTTERFLY_4(stp33_m, stp34_m, stp25_m, stp22_m, stp21_m, stp23_m, stp24_m, \
                stp31_m);                                                      \
                                                                               \
    ILVRL_H2_SH(vec2_m, vec6_m, vec1_m, vec0_m);                               \
    SPLATI_H2_SH(coeff1_m, 0, 1, cnst0_m, cnst1_m);                            \
@@ -288,7 +292,8 @@
    out3 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m);                     \
  }

#define FDCT_POSTPROC_2V_NEG_H(vec0, vec1) {      \
#define FDCT_POSTPROC_2V_NEG_H(vec0, vec1) \
  {                                        \
    v8i16 tp0_m, tp1_m;                    \
    v8i16 one_m = __msa_ldi_h(1);          \
                                           \
@@ -304,7 +309,8 @@
    vec1 >>= 2;                            \
  }

#define FDCT32_POSTPROC_NEG_W(vec) {      \
#define FDCT32_POSTPROC_NEG_W(vec)   \
  {                                  \
    v4i32 temp_m;                    \
    v4i32 one_m = __msa_ldi_w(1);    \
                                     \
@@ -315,7 +321,8 @@
    vec >>= 2;                       \
  }

#define FDCT32_POSTPROC_2V_POS_H(vec0, vec1) {      \
#define FDCT32_POSTPROC_2V_POS_H(vec0, vec1)        \
  {                                                 \
    v8i16 tp0_m, tp1_m;                             \
    v8i16 one = __msa_ldi_h(1);                     \
                                                    \
@@ -333,9 +340,9 @@
    vec1 >>= 2;                                     \
  }

#define DOTP_CONST_PAIR_W(reg0_left, reg1_left, reg0_right,      \
                          reg1_right, const0, const1,            \
                          out0, out1, out2, out3) {              \
#define DOTP_CONST_PAIR_W(reg0_left, reg1_left, reg0_right, reg1_right, \
                          const0, const1, out0, out1, out2, out3)       \
  {                                                                     \
    v4i32 s0_m, s1_m, s2_m, s3_m, s4_m, s5_m, s6_m, s7_m;               \
    v2i64 tp0_m, tp1_m, tp2_m, tp3_m;                                   \
    v4i32 k0_m = __msa_fill_w((int32_t)const0);                         \

@@ -20,10 +20,10 @@ void vpx_idct16_1d_rows_msa(const int16_t *input, int16_t *output) {
  input += 8;
  LD_SH8(input, 16, reg8, reg9, reg10, reg11, reg12, reg13, reg14, reg15);

  TRANSPOSE8x8_SH_SH(reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7,
                     reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7);
  TRANSPOSE8x8_SH_SH(reg8, reg9, reg10, reg11, reg12, reg13, reg14, reg15,
                     reg8, reg9, reg10, reg11, reg12, reg13, reg14, reg15);
  TRANSPOSE8x8_SH_SH(reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg0, reg1,
                     reg2, reg3, reg4, reg5, reg6, reg7);
  TRANSPOSE8x8_SH_SH(reg8, reg9, reg10, reg11, reg12, reg13, reg14, reg15, reg8,
                     reg9, reg10, reg11, reg12, reg13, reg14, reg15);
  DOTP_CONST_PAIR(reg2, reg14, cospi_28_64, cospi_4_64, reg2, reg14);
  DOTP_CONST_PAIR(reg10, reg6, cospi_12_64, cospi_20_64, reg10, reg6);
  BUTTERFLY_4(reg2, reg14, reg6, reg10, loc0, loc1, reg14, reg2);
@@ -93,13 +93,13 @@ void vpx_idct16_1d_rows_msa(const int16_t *input, int16_t *output) {
  reg3 = tmp7;

  /* transpose block */
  TRANSPOSE8x8_SH_SH(reg0, reg2, reg4, reg6, reg8, reg10, reg12, reg14,
                     reg0, reg2, reg4, reg6, reg8, reg10, reg12, reg14);
  TRANSPOSE8x8_SH_SH(reg0, reg2, reg4, reg6, reg8, reg10, reg12, reg14, reg0,
                     reg2, reg4, reg6, reg8, reg10, reg12, reg14);
  ST_SH8(reg0, reg2, reg4, reg6, reg8, reg10, reg12, reg14, output, 16);

  /* transpose block */
  TRANSPOSE8x8_SH_SH(reg3, reg13, reg11, reg5, reg7, reg9, reg1, reg15,
                     reg3, reg13, reg11, reg5, reg7, reg9, reg1, reg15);
  TRANSPOSE8x8_SH_SH(reg3, reg13, reg11, reg5, reg7, reg9, reg1, reg15, reg3,
                     reg13, reg11, reg5, reg7, reg9, reg1, reg15);
  ST_SH8(reg3, reg13, reg11, reg5, reg7, reg9, reg1, reg15, (output + 8), 16);
}

@@ -244,8 +244,7 @@ void vpx_idct16x16_10_add_msa(const int16_t *input, uint8_t *dst,
        "sw     $zero,  28(%[out])     \n\t"

        :
        : [out] "r" (out)
    );
        : [out] "r"(out));

    out += 16;
  }
@@ -283,8 +282,8 @@ void vpx_idct16x16_1_add_msa(const int16_t *input, uint8_t *dst,
    ADD4(res4, vec, res5, vec, res6, vec, res7, vec, res4, res5, res6, res7);
    CLIP_SH4_0_255(res0, res1, res2, res3);
    CLIP_SH4_0_255(res4, res5, res6, res7);
    PCKEV_B4_UB(res4, res0, res5, res1, res6, res2, res7, res3,
                tmp0, tmp1, tmp2, tmp3);
    PCKEV_B4_UB(res4, res0, res5, res1, res6, res2, res7, res3, tmp0, tmp1,
                tmp2, tmp3);
    ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
    dst += (4 * dst_stride);
  }
@@ -295,29 +294,28 @@ void vpx_iadst16_1d_rows_msa(const int16_t *input, int16_t *output) {
  v8i16 l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, l10, l11, l12, l13, l14, l15;

  /* load input data */
  LD_SH16(input, 8,
          l0, l8, l1, l9, l2, l10, l3, l11, l4, l12, l5, l13, l6, l14, l7, l15);
  TRANSPOSE8x8_SH_SH(l0, l1, l2, l3, l4, l5, l6, l7,
                     l0, l1, l2, l3, l4, l5, l6, l7);
  TRANSPOSE8x8_SH_SH(l8, l9, l10, l11, l12, l13, l14, l15,
                     l8, l9, l10, l11, l12, l13, l14, l15);
  LD_SH16(input, 8, l0, l8, l1, l9, l2, l10, l3, l11, l4, l12, l5, l13, l6, l14,
          l7, l15);
  TRANSPOSE8x8_SH_SH(l0, l1, l2, l3, l4, l5, l6, l7, l0, l1, l2, l3, l4, l5, l6,
                     l7);
  TRANSPOSE8x8_SH_SH(l8, l9, l10, l11, l12, l13, l14, l15, l8, l9, l10, l11,
                     l12, l13, l14, l15);

  /* ADST in horizontal */
  VP9_IADST8x16_1D(l0, l1, l2, l3, l4, l5, l6, l7,
                   l8, l9, l10, l11, l12, l13, l14, l15,
                   r0, r1, r2, r3, r4, r5, r6, r7,
                   r8, r9, r10, r11, r12, r13, r14, r15);
  VP9_IADST8x16_1D(l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, l10, l11, l12, l13,
                   l14, l15, r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11,
                   r12, r13, r14, r15);

  l1 = -r8;
  l3 = -r4;
  l13 = -r13;
  l15 = -r1;

  TRANSPOSE8x8_SH_SH(r0, l1, r12, l3, r6, r14, r10, r2,
                     l0, l1, l2, l3, l4, l5, l6, l7);
  TRANSPOSE8x8_SH_SH(r0, l1, r12, l3, r6, r14, r10, r2, l0, l1, l2, l3, l4, l5,
                     l6, l7);
  ST_SH8(l0, l1, l2, l3, l4, l5, l6, l7, output, 16);
  TRANSPOSE8x8_SH_SH(r3, r11, r15, r7, r5, l13, r9, l15,
                     l8, l9, l10, l11, l12, l13, l14, l15);
  TRANSPOSE8x8_SH_SH(r3, r11, r15, r7, r5, l13, r9, l15, l8, l9, l10, l11, l12,
                     l13, l14, l15);
  ST_SH8(l8, l9, l10, l11, l12, l13, l14, l15, (output + 8), 16);
}

@@ -17,10 +17,10 @@ static void idct32x8_row_transpose_store(const int16_t *input,
  /* 1st & 2nd 8x8 */
  LD_SH8(input, 32, m0, n0, m1, n1, m2, n2, m3, n3);
  LD_SH8((input + 8), 32, m4, n4, m5, n5, m6, n6, m7, n7);
  TRANSPOSE8x8_SH_SH(m0, n0, m1, n1, m2, n2, m3, n3,
                     m0, n0, m1, n1, m2, n2, m3, n3);
  TRANSPOSE8x8_SH_SH(m4, n4, m5, n5, m6, n6, m7, n7,
                     m4, n4, m5, n5, m6, n6, m7, n7);
  TRANSPOSE8x8_SH_SH(m0, n0, m1, n1, m2, n2, m3, n3, m0, n0, m1, n1, m2, n2, m3,
                     n3);
  TRANSPOSE8x8_SH_SH(m4, n4, m5, n5, m6, n6, m7, n7, m4, n4, m5, n5, m6, n6, m7,
                     n7);
  ST_SH8(m0, n0, m1, n1, m2, n2, m3, n3, (tmp_buf), 8);
  ST_SH4(m4, n4, m5, n5, (tmp_buf + 8 * 8), 8);
  ST_SH4(m6, n6, m7, n7, (tmp_buf + 12 * 8), 8);
@@ -28,10 +28,10 @@ static void idct32x8_row_transpose_store(const int16_t *input,
  /* 3rd & 4th 8x8 */
  LD_SH8((input + 16), 32, m0, n0, m1, n1, m2, n2, m3, n3);
  LD_SH8((input + 24), 32, m4, n4, m5, n5, m6, n6, m7, n7);
  TRANSPOSE8x8_SH_SH(m0, n0, m1, n1, m2, n2, m3, n3,
                     m0, n0, m1, n1, m2, n2, m3, n3);
  TRANSPOSE8x8_SH_SH(m4, n4, m5, n5, m6, n6, m7, n7,
                     m4, n4, m5, n5, m6, n6, m7, n7);
  TRANSPOSE8x8_SH_SH(m0, n0, m1, n1, m2, n2, m3, n3, m0, n0, m1, n1, m2, n2, m3,
                     n3);
  TRANSPOSE8x8_SH_SH(m4, n4, m5, n5, m6, n6, m7, n7, m4, n4, m5, n5, m6, n6, m7,
                     n7);
  ST_SH4(m0, n0, m1, n1, (tmp_buf + 16 * 8), 8);
  ST_SH4(m2, n2, m3, n3, (tmp_buf + 20 * 8), 8);
  ST_SH4(m4, n4, m5, n5, (tmp_buf + 24 * 8), 8);
@@ -186,8 +186,7 @@ static void idct32x8_row_odd_process_store(int16_t *tmp_buf,
  DOTP_CONST_PAIR(reg7, reg0, cospi_3_64, cospi_29_64, reg0, reg7);

  /* 4 Stores */
  SUB4(reg1, reg2, reg6, reg5, reg0, reg3, reg7, reg4,
       vec0, vec1, vec2, vec3);
  SUB4(reg1, reg2, reg6, reg5, reg0, reg3, reg7, reg4, vec0, vec1, vec2, vec3);
  DOTP_CONST_PAIR(vec1, vec0, cospi_12_64, cospi_20_64, loc0, loc1);
  DOTP_CONST_PAIR(vec3, vec2, -cospi_20_64, cospi_12_64, loc2, loc3);

@@ -198,8 +197,7 @@ static void idct32x8_row_odd_process_store(int16_t *tmp_buf,
  ST_SH2(vec0, vec1, (tmp_odd_buf + 10 * 8), 8);

  /* 4 Stores */
  ADD4(reg1, reg2, reg6, reg5, reg0, reg3, reg7, reg4,
       vec1, vec2, vec0, vec3);
  ADD4(reg1, reg2, reg6, reg5, reg0, reg3, reg7, reg4, vec1, vec2, vec0, vec3);
  BUTTERFLY_4(vec0, vec3, vec2, vec1, reg0, reg1, reg3, reg2);
  ST_SH(reg0, (tmp_odd_buf + 13 * 8));
  ST_SH(reg1, (tmp_odd_buf + 14 * 8));
@@ -213,8 +211,7 @@ static void idct32x8_row_odd_process_store(int16_t *tmp_buf,
  LD_SH4(tmp_odd_buf, 8, reg0, reg1, reg2, reg3);
  LD_SH4((tmp_odd_buf + 8 * 8), 8, reg4, reg5, reg6, reg7);

  ADD4(reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7,
       loc0, loc1, loc2, loc3);
  ADD4(reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7, loc0, loc1, loc2, loc3);
  ST_SH4(loc0, loc1, loc2, loc3, tmp_odd_buf, 8);

  SUB2(reg0, reg4, reg1, reg5, vec0, vec1);
@@ -228,8 +225,7 @@ static void idct32x8_row_odd_process_store(int16_t *tmp_buf,
  LD_SH4((tmp_odd_buf + 4 * 8), 8, reg1, reg2, reg0, reg3);
  LD_SH4((tmp_odd_buf + 12 * 8), 8, reg4, reg5, reg6, reg7);

  ADD4(reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7,
       loc0, loc1, loc2, loc3);
  ADD4(reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7, loc0, loc1, loc2, loc3);
  ST_SH4(loc0, loc1, loc2, loc3, (tmp_odd_buf + 4 * 8), 8);

  SUB2(reg0, reg4, reg3, reg7, vec0, vec1);
@@ -242,8 +238,7 @@ static void idct32x8_row_odd_process_store(int16_t *tmp_buf,

static void idct_butterfly_transpose_store(int16_t *tmp_buf,
                                           int16_t *tmp_eve_buf,
                                           int16_t *tmp_odd_buf,
                                           int16_t *dst) {
                                           int16_t *tmp_odd_buf, int16_t *dst) {
  v8i16 vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3;
  v8i16 m0, m1, m2, m3, m4, m5, m6, m7, n0, n1, n2, n3, n4, n5, n6, n7;

@@ -317,26 +312,26 @@ static void idct_butterfly_transpose_store(int16_t *tmp_buf,

  /* Transpose : 16 vectors */
  /* 1st & 2nd 8x8 */
  TRANSPOSE8x8_SH_SH(m0, n0, m1, n1, m2, n2, m3, n3,
                     m0, n0, m1, n1, m2, n2, m3, n3);
  TRANSPOSE8x8_SH_SH(m0, n0, m1, n1, m2, n2, m3, n3, m0, n0, m1, n1, m2, n2, m3,
                     n3);
  ST_SH4(m0, n0, m1, n1, (dst + 0), 32);
 | 
			
		||||
  ST_SH4(m2, n2, m3, n3, (dst + 4 * 32), 32);
 | 
			
		||||
 | 
			
		||||
  TRANSPOSE8x8_SH_SH(m4, n4, m5, n5, m6, n6, m7, n7,
 | 
			
		||||
                     m4, n4, m5, n5, m6, n6, m7, n7);
 | 
			
		||||
  TRANSPOSE8x8_SH_SH(m4, n4, m5, n5, m6, n6, m7, n7, m4, n4, m5, n5, m6, n6, m7,
 | 
			
		||||
                     n7);
 | 
			
		||||
  ST_SH4(m4, n4, m5, n5, (dst + 8), 32);
 | 
			
		||||
  ST_SH4(m6, n6, m7, n7, (dst + 8 + 4 * 32), 32);
 | 
			
		||||
 | 
			
		||||
  /* 3rd & 4th 8x8 */
 | 
			
		||||
  LD_SH8((tmp_buf + 8 * 16), 8, m0, n0, m1, n1, m2, n2, m3, n3);
 | 
			
		||||
  LD_SH8((tmp_buf + 12 * 16), 8, m4, n4, m5, n5, m6, n6, m7, n7);
 | 
			
		||||
  TRANSPOSE8x8_SH_SH(m0, n0, m1, n1, m2, n2, m3, n3,
 | 
			
		||||
                     m0, n0, m1, n1, m2, n2, m3, n3);
 | 
			
		||||
  TRANSPOSE8x8_SH_SH(m0, n0, m1, n1, m2, n2, m3, n3, m0, n0, m1, n1, m2, n2, m3,
 | 
			
		||||
                     n3);
 | 
			
		||||
  ST_SH4(m0, n0, m1, n1, (dst + 16), 32);
 | 
			
		||||
  ST_SH4(m2, n2, m3, n3, (dst + 16 + 4 * 32), 32);
 | 
			
		||||
 | 
			
		||||
  TRANSPOSE8x8_SH_SH(m4, n4, m5, n5, m6, n6, m7, n7,
 | 
			
		||||
                     m4, n4, m5, n5, m6, n6, m7, n7);
 | 
			
		||||
  TRANSPOSE8x8_SH_SH(m4, n4, m5, n5, m6, n6, m7, n7, m4, n4, m5, n5, m6, n6, m7,
 | 
			
		||||
                     n7);
 | 
			
		||||
  ST_SH4(m4, n4, m5, n5, (dst + 24), 32);
 | 
			
		||||
  ST_SH4(m6, n6, m7, n7, (dst + 24 + 4 * 32), 32);
 | 
			
		||||
}
 | 
			
		||||
@@ -349,8 +344,8 @@ static void idct32x8_1d_rows_msa(const int16_t *input, int16_t *output) {
 | 
			
		||||
  idct32x8_row_transpose_store(input, &tmp_buf[0]);
 | 
			
		||||
  idct32x8_row_even_process_store(&tmp_buf[0], &tmp_eve_buf[0]);
 | 
			
		||||
  idct32x8_row_odd_process_store(&tmp_buf[0], &tmp_odd_buf[0]);
 | 
			
		||||
  idct_butterfly_transpose_store(&tmp_buf[0], &tmp_eve_buf[0],
 | 
			
		||||
                                 &tmp_odd_buf[0], output);
 | 
			
		||||
  idct_butterfly_transpose_store(&tmp_buf[0], &tmp_eve_buf[0], &tmp_odd_buf[0],
 | 
			
		||||
                                 output);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
static void idct8x32_column_even_process_store(int16_t *tmp_buf,
 | 
			
		||||
@@ -541,8 +536,7 @@ static void idct8x32_column_odd_process_store(int16_t *tmp_buf,
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
static void idct8x32_column_butterfly_addblk(int16_t *tmp_eve_buf,
 | 
			
		||||
                                             int16_t *tmp_odd_buf,
 | 
			
		||||
                                             uint8_t *dst,
 | 
			
		||||
                                             int16_t *tmp_odd_buf, uint8_t *dst,
 | 
			
		||||
                                             int32_t dst_stride) {
 | 
			
		||||
  v8i16 vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3;
 | 
			
		||||
  v8i16 m0, m1, m2, m3, m4, m5, m6, m7, n0, n1, n2, n3, n4, n5, n6, n7;
 | 
			
		||||
@@ -563,8 +557,8 @@ static void idct8x32_column_butterfly_addblk(int16_t *tmp_eve_buf,
 | 
			
		||||
 | 
			
		||||
  SUB4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m6, m2, m4, m0);
 | 
			
		||||
  SRARI_H4_SH(m0, m2, m4, m6, 6);
 | 
			
		||||
  VP9_ADDBLK_ST8x4_UB((dst + 19 * dst_stride), (4 * dst_stride),
 | 
			
		||||
                      m0, m2, m4, m6);
 | 
			
		||||
  VP9_ADDBLK_ST8x4_UB((dst + 19 * dst_stride), (4 * dst_stride), m0, m2, m4,
 | 
			
		||||
                      m6);
 | 
			
		||||
 | 
			
		||||
  /* Load 8 & Store 8 */
 | 
			
		||||
  vec0 = LD_SH(tmp_odd_buf + 4 * 8);
 | 
			
		||||
@@ -578,13 +572,12 @@ static void idct8x32_column_butterfly_addblk(int16_t *tmp_eve_buf,
 | 
			
		||||
 | 
			
		||||
  ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m1, m5, m3, m7);
 | 
			
		||||
  SRARI_H4_SH(m1, m3, m5, m7, 6);
 | 
			
		||||
  VP9_ADDBLK_ST8x4_UB((dst + 2 * dst_stride), (4 * dst_stride),
 | 
			
		||||
                      m1, m3, m5, m7);
 | 
			
		||||
  VP9_ADDBLK_ST8x4_UB((dst + 2 * dst_stride), (4 * dst_stride), m1, m3, m5, m7);
 | 
			
		||||
 | 
			
		||||
  SUB4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m7, m3, m5, m1);
 | 
			
		||||
  SRARI_H4_SH(m1, m3, m5, m7, 6);
 | 
			
		||||
  VP9_ADDBLK_ST8x4_UB((dst + 17 * dst_stride), (4 * dst_stride),
 | 
			
		||||
                      m1, m3, m5, m7);
 | 
			
		||||
  VP9_ADDBLK_ST8x4_UB((dst + 17 * dst_stride), (4 * dst_stride), m1, m3, m5,
 | 
			
		||||
                      m7);
 | 
			
		||||
 | 
			
		||||
  /* Load 8 & Store 8 */
 | 
			
		||||
  vec0 = LD_SH(tmp_odd_buf + 2 * 8);
 | 
			
		||||
@@ -598,13 +591,12 @@ static void idct8x32_column_butterfly_addblk(int16_t *tmp_eve_buf,
 | 
			
		||||
 | 
			
		||||
  ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n0, n4, n2, n6);
 | 
			
		||||
  SRARI_H4_SH(n0, n2, n4, n6, 6);
 | 
			
		||||
  VP9_ADDBLK_ST8x4_UB((dst + 1 * dst_stride), (4 * dst_stride),
 | 
			
		||||
                      n0, n2, n4, n6);
 | 
			
		||||
  VP9_ADDBLK_ST8x4_UB((dst + 1 * dst_stride), (4 * dst_stride), n0, n2, n4, n6);
 | 
			
		||||
 | 
			
		||||
  SUB4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n6, n2, n4, n0);
 | 
			
		||||
  SRARI_H4_SH(n0, n2, n4, n6, 6);
 | 
			
		||||
  VP9_ADDBLK_ST8x4_UB((dst + 18 * dst_stride), (4 * dst_stride),
 | 
			
		||||
                      n0, n2, n4, n6);
 | 
			
		||||
  VP9_ADDBLK_ST8x4_UB((dst + 18 * dst_stride), (4 * dst_stride), n0, n2, n4,
 | 
			
		||||
                      n6);
 | 
			
		||||
 | 
			
		||||
  /* Load 8 & Store 8 */
 | 
			
		||||
  vec0 = LD_SH(tmp_odd_buf + 5 * 8);
 | 
			
		||||
@@ -618,13 +610,12 @@ static void idct8x32_column_butterfly_addblk(int16_t *tmp_eve_buf,
 | 
			
		||||
 | 
			
		||||
  ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n1, n5, n3, n7);
 | 
			
		||||
  SRARI_H4_SH(n1, n3, n5, n7, 6);
 | 
			
		||||
  VP9_ADDBLK_ST8x4_UB((dst + 3 * dst_stride), (4 * dst_stride),
 | 
			
		||||
                      n1, n3, n5, n7);
 | 
			
		||||
  VP9_ADDBLK_ST8x4_UB((dst + 3 * dst_stride), (4 * dst_stride), n1, n3, n5, n7);
 | 
			
		||||
 | 
			
		||||
  SUB4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n7, n3, n5, n1);
 | 
			
		||||
  SRARI_H4_SH(n1, n3, n5, n7, 6);
 | 
			
		||||
  VP9_ADDBLK_ST8x4_UB((dst + 16 * dst_stride), (4 * dst_stride),
 | 
			
		||||
                      n1, n3, n5, n7);
 | 
			
		||||
  VP9_ADDBLK_ST8x4_UB((dst + 16 * dst_stride), (4 * dst_stride), n1, n3, n5,
 | 
			
		||||
                      n7);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
static void idct8x32_1d_columns_addblk_msa(int16_t *input, uint8_t *dst,
 | 
			
		||||
@@ -634,8 +625,8 @@ static void idct8x32_1d_columns_addblk_msa(int16_t *input, uint8_t *dst,
 | 
			
		||||
 | 
			
		||||
  idct8x32_column_even_process_store(input, &tmp_eve_buf[0]);
 | 
			
		||||
  idct8x32_column_odd_process_store(input, &tmp_odd_buf[0]);
 | 
			
		||||
  idct8x32_column_butterfly_addblk(&tmp_eve_buf[0], &tmp_odd_buf[0],
 | 
			
		||||
                                   dst, dst_stride);
 | 
			
		||||
  idct8x32_column_butterfly_addblk(&tmp_eve_buf[0], &tmp_odd_buf[0], dst,
 | 
			
		||||
                                   dst_stride);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
void vpx_idct32x32_1024_add_msa(const int16_t *input, uint8_t *dst,
 | 
			
		||||
@@ -684,8 +675,7 @@ void vpx_idct32x32_34_add_msa(const int16_t *input, uint8_t *dst,
 | 
			
		||||
        "sw     $zero,     60(%[out_ptr])     \n\t"
 | 
			
		||||
 | 
			
		||||
        :
 | 
			
		||||
        : [out_ptr] "r" (out_ptr)
 | 
			
		||||
    );
 | 
			
		||||
        : [out_ptr] "r"(out_ptr));
 | 
			
		||||
 | 
			
		||||
    out_ptr += 32;
 | 
			
		||||
  }
 | 
			
		||||
@@ -728,8 +718,8 @@ void vpx_idct32x32_1_add_msa(const int16_t *input, uint8_t *dst,
 | 
			
		||||
    ADD4(res4, vec, res5, vec, res6, vec, res7, vec, res4, res5, res6, res7);
 | 
			
		||||
    CLIP_SH4_0_255(res0, res1, res2, res3);
 | 
			
		||||
    CLIP_SH4_0_255(res4, res5, res6, res7);
 | 
			
		||||
    PCKEV_B4_UB(res4, res0, res5, res1, res6, res2, res7, res3,
 | 
			
		||||
                tmp0, tmp1, tmp2, tmp3);
 | 
			
		||||
    PCKEV_B4_UB(res4, res0, res5, res1, res6, res2, res7, res3, tmp0, tmp1,
 | 
			
		||||
                tmp2, tmp3);
 | 
			
		||||
 | 
			
		||||
    ST_UB2(tmp0, tmp1, dst, 16);
 | 
			
		||||
    dst += dst_stride;
 | 
			
		||||
 
 | 
			
		||||
@@ -42,8 +42,8 @@ void vpx_iwht4x4_16_add_msa(const int16_t *input, uint8_t *dst,
 | 
			
		||||
  in0_r -= in3_r;
 | 
			
		||||
  in2_r += in1_r;
 | 
			
		||||
 | 
			
		||||
  PCKEV_H4_SH(in0_r, in0_r, in1_r, in1_r, in2_r, in2_r, in3_r, in3_r,
 | 
			
		||||
              in0, in1, in2, in3);
 | 
			
		||||
  PCKEV_H4_SH(in0_r, in0_r, in1_r, in1_r, in2_r, in2_r, in3_r, in3_r, in0, in1,
 | 
			
		||||
              in2, in3);
 | 
			
		||||
  ADDBLK_ST4x4_UB(in0, in3, in1, in2, dst, dst_stride);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -18,17 +18,17 @@ void vpx_idct8x8_64_add_msa(const int16_t *input, uint8_t *dst,
 | 
			
		||||
  LD_SH8(input, 8, in0, in1, in2, in3, in4, in5, in6, in7);
 | 
			
		||||
 | 
			
		||||
  /* rows transform */
 | 
			
		||||
  TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
 | 
			
		||||
                     in0, in1, in2, in3, in4, in5, in6, in7);
 | 
			
		||||
  TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
 | 
			
		||||
                     in4, in5, in6, in7);
 | 
			
		||||
  /* 1D idct8x8 */
 | 
			
		||||
  VP9_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7,
 | 
			
		||||
                 in0, in1, in2, in3, in4, in5, in6, in7);
 | 
			
		||||
  VP9_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
 | 
			
		||||
                 in4, in5, in6, in7);
 | 
			
		||||
  /* columns transform */
 | 
			
		||||
  TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
 | 
			
		||||
                     in0, in1, in2, in3, in4, in5, in6, in7);
 | 
			
		||||
  TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
 | 
			
		||||
                     in4, in5, in6, in7);
 | 
			
		||||
  /* 1D idct8x8 */
 | 
			
		||||
  VP9_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7,
 | 
			
		||||
                 in0, in1, in2, in3, in4, in5, in6, in7);
 | 
			
		||||
  VP9_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
 | 
			
		||||
                 in4, in5, in6, in7);
 | 
			
		||||
  /* final rounding (add 2^4, divide by 2^5) and shift */
 | 
			
		||||
  SRARI_H4_SH(in0, in1, in2, in3, 5);
 | 
			
		||||
  SRARI_H4_SH(in4, in5, in6, in7, 5);
 | 
			
		||||
@@ -82,12 +82,12 @@ void vpx_idct8x8_12_add_msa(const int16_t *input, uint8_t *dst,
 | 
			
		||||
  PCKEV_H2_SH(zero, tmp0, zero, tmp1, s2, s3);
 | 
			
		||||
 | 
			
		||||
  /* stage4 */
 | 
			
		||||
  BUTTERFLY_8(m0, m1, m2, m3, s4, s2, s3, s7,
 | 
			
		||||
              in0, in1, in2, in3, in4, in5, in6, in7);
 | 
			
		||||
  TRANSPOSE4X8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
 | 
			
		||||
                     in0, in1, in2, in3, in4, in5, in6, in7);
 | 
			
		||||
  VP9_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7,
 | 
			
		||||
                 in0, in1, in2, in3, in4, in5, in6, in7);
 | 
			
		||||
  BUTTERFLY_8(m0, m1, m2, m3, s4, s2, s3, s7, in0, in1, in2, in3, in4, in5, in6,
 | 
			
		||||
              in7);
 | 
			
		||||
  TRANSPOSE4X8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
 | 
			
		||||
                     in4, in5, in6, in7);
 | 
			
		||||
  VP9_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
 | 
			
		||||
                 in4, in5, in6, in7);
 | 
			
		||||
 | 
			
		||||
  /* final rounding (add 2^4, divide by 2^5) and shift */
 | 
			
		||||
  SRARI_H4_SH(in0, in1, in2, in3, 5);
 | 
			
		||||
 
 | 
			
		||||
@@ -146,16 +146,13 @@ void vpx_h_predictor_16x16_dspr2(uint8_t *dst, ptrdiff_t stride,
 | 
			
		||||
      "sw         %[tmp16],     8(%[dst])                    \n\t"
 | 
			
		||||
      "sw         %[tmp16],     12(%[dst])                   \n\t"
 | 
			
		||||
 | 
			
		||||
      : [tmp1] "=&r" (tmp1),   [tmp2] "=&r" (tmp2),
 | 
			
		||||
        [tmp3] "=&r" (tmp3),   [tmp4] "=&r" (tmp4),
 | 
			
		||||
        [tmp5] "=&r" (tmp5),   [tmp7] "=&r" (tmp7),
 | 
			
		||||
        [tmp6] "=&r" (tmp6),   [tmp8] "=&r" (tmp8),
 | 
			
		||||
        [tmp9] "=&r" (tmp9),   [tmp10] "=&r" (tmp10),
 | 
			
		||||
        [tmp11] "=&r" (tmp11), [tmp12] "=&r" (tmp12),
 | 
			
		||||
        [tmp13] "=&r" (tmp13), [tmp14] "=&r" (tmp14),
 | 
			
		||||
        [tmp15] "=&r" (tmp15), [tmp16] "=&r" (tmp16)
 | 
			
		||||
      : [left] "r" (left), [dst] "r" (dst), [stride] "r" (stride)
 | 
			
		||||
  );
 | 
			
		||||
      : [tmp1] "=&r"(tmp1), [tmp2] "=&r"(tmp2), [tmp3] "=&r"(tmp3),
 | 
			
		||||
        [tmp4] "=&r"(tmp4), [tmp5] "=&r"(tmp5), [tmp7] "=&r"(tmp7),
 | 
			
		||||
        [tmp6] "=&r"(tmp6), [tmp8] "=&r"(tmp8), [tmp9] "=&r"(tmp9),
 | 
			
		||||
        [tmp10] "=&r"(tmp10), [tmp11] "=&r"(tmp11), [tmp12] "=&r"(tmp12),
 | 
			
		||||
        [tmp13] "=&r"(tmp13), [tmp14] "=&r"(tmp14), [tmp15] "=&r"(tmp15),
 | 
			
		||||
        [tmp16] "=&r"(tmp16)
 | 
			
		||||
      : [left] "r"(left), [dst] "r"(dst), [stride] "r"(stride));
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
void vpx_dc_predictor_16x16_dspr2(uint8_t *dst, ptrdiff_t stride,
 | 
			
		||||
@@ -316,14 +313,12 @@ void vpx_dc_predictor_16x16_dspr2(uint8_t *dst, ptrdiff_t stride,
 | 
			
		||||
      "sw              %[expected_dc],      8(%[dst])                     \n\t"
 | 
			
		||||
      "sw              %[expected_dc],      12(%[dst])                    \n\t"
 | 
			
		||||
 | 
			
		||||
      : [left1] "=&r" (left1), [above1] "=&r" (above1),
 | 
			
		||||
        [left_l1] "=&r" (left_l1), [above_l1] "=&r" (above_l1),
 | 
			
		||||
        [left_r1] "=&r" (left_r1), [above_r1] "=&r" (above_r1),
 | 
			
		||||
        [above2] "=&r" (above2), [left2] "=&r" (left2),
 | 
			
		||||
        [average] "=&r" (average), [tmp] "=&r" (tmp),
 | 
			
		||||
      : [left1] "=&r"(left1), [above1] "=&r"(above1), [left_l1] "=&r"(left_l1),
 | 
			
		||||
        [above_l1] "=&r"(above_l1), [left_r1] "=&r"(left_r1),
 | 
			
		||||
        [above_r1] "=&r"(above_r1), [above2] "=&r"(above2),
 | 
			
		||||
        [left2] "=&r"(left2), [average] "=&r"(average), [tmp] "=&r"(tmp),
 | 
			
		||||
        [expected_dc] "=&r"(expected_dc)
 | 
			
		||||
      : [above] "r" (above), [left] "r" (left),
 | 
			
		||||
        [dst] "r" (dst), [stride] "r" (stride)
 | 
			
		||||
  );
 | 
			
		||||
      : [above] "r"(above), [left] "r"(left), [dst] "r"(dst),
 | 
			
		||||
        [stride] "r"(stride));
 | 
			
		||||
}
 | 
			
		||||
#endif  // #if HAVE_DSPR2
 | 
			
		||||
 
 | 
			
		||||
@@ -32,10 +32,9 @@ void vpx_h_predictor_4x4_dspr2(uint8_t *dst, ptrdiff_t stride,
 | 
			
		||||
      "add        %[dst],       %[dst],         %[stride]    \n\t"
 | 
			
		||||
      "sw         %[tmp4],      (%[dst])                     \n\t"
 | 
			
		||||
 | 
			
		||||
      : [tmp1] "=&r" (tmp1), [tmp2] "=&r" (tmp2),
 | 
			
		||||
        [tmp3] "=&r" (tmp3), [tmp4] "=&r" (tmp4)
 | 
			
		||||
      : [left] "r" (left), [dst] "r" (dst), [stride] "r" (stride)
 | 
			
		||||
  );
 | 
			
		||||
      : [tmp1] "=&r"(tmp1), [tmp2] "=&r"(tmp2), [tmp3] "=&r"(tmp3),
 | 
			
		||||
        [tmp4] "=&r"(tmp4)
 | 
			
		||||
      : [left] "r"(left), [dst] "r"(dst), [stride] "r"(stride));
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
void vpx_dc_predictor_4x4_dspr2(uint8_t *dst, ptrdiff_t stride,
 | 
			
		||||
@@ -75,9 +74,8 @@ void vpx_dc_predictor_4x4_dspr2(uint8_t *dst, ptrdiff_t stride,
 | 
			
		||||
        [left_l] "=&r"(left_l), [left_r] "=&r"(left_r),
 | 
			
		||||
        [average] "=&r"(average), [tmp] "=&r"(tmp),
 | 
			
		||||
        [expected_dc] "=&r"(expected_dc)
 | 
			
		||||
      : [above] "r" (above), [left] "r" (left),
 | 
			
		||||
        [dst] "r" (dst), [stride] "r" (stride)
 | 
			
		||||
  );
 | 
			
		||||
      : [above] "r"(above), [left] "r"(left), [dst] "r"(dst),
 | 
			
		||||
        [stride] "r"(stride));
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
void vpx_tm_predictor_4x4_dspr2(uint8_t *dst, ptrdiff_t stride,
 | 
			
		||||
@@ -174,7 +172,6 @@ void vpx_tm_predictor_4x4_dspr2(uint8_t *dst, ptrdiff_t stride,
 | 
			
		||||
      "sra             %[res0],        %[res0],           16             \n\t"
 | 
			
		||||
      "lbux            %[res0],        %[res0](%[cm])                    \n\t"
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
      "sra             %[res1],        %[resr],           16             \n\t"
 | 
			
		||||
      "lbux            %[res1],        %[res1](%[cm])                    \n\t"
 | 
			
		||||
      "sb              %[res0],        (%[dst])                          \n\t"
 | 
			
		||||
@@ -183,7 +180,6 @@ void vpx_tm_predictor_4x4_dspr2(uint8_t *dst, ptrdiff_t stride,
 | 
			
		||||
      "sra             %[res0],        %[res0],           16             \n\t"
 | 
			
		||||
      "lbux            %[res0],        %[res0](%[cm])                    \n\t"
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
      "sb              %[res1],        1(%[dst])                         \n\t"
 | 
			
		||||
      "sra             %[res1],        %[resl],           16             \n\t"
 | 
			
		||||
      "lbux            %[res1],        %[res1](%[cm])                    \n\t"
 | 
			
		||||
@@ -218,12 +214,11 @@ void vpx_tm_predictor_4x4_dspr2(uint8_t *dst, ptrdiff_t stride,
 | 
			
		||||
      "sb              %[res0],        2(%[dst])                         \n\t"
 | 
			
		||||
      "sb              %[res1],        3(%[dst])                         \n\t"
 | 
			
		||||
 | 
			
		||||
      : [abovel] "=&r" (abovel), [abover] "=&r" (abover),
 | 
			
		||||
        [left0] "=&r" (left0), [left1] "=&r" (left1), [left2] "=&r" (left2),
 | 
			
		||||
        [res0] "=&r" (res0), [res1] "=&r" (res1), [left3] "=&r" (left3),
 | 
			
		||||
        [resl] "=&r" (resl), [resr] "=&r" (resr), [top_left] "=&r" (top_left)
 | 
			
		||||
      : [above] "r" (above), [left] "r" (left),
 | 
			
		||||
        [dst] "r" (dst), [stride] "r" (stride), [cm] "r" (cm)
 | 
			
		||||
  );
 | 
			
		||||
      : [abovel] "=&r"(abovel), [abover] "=&r"(abover), [left0] "=&r"(left0),
 | 
			
		||||
        [left1] "=&r"(left1), [left2] "=&r"(left2), [res0] "=&r"(res0),
 | 
			
		||||
        [res1] "=&r"(res1), [left3] "=&r"(left3), [resl] "=&r"(resl),
 | 
			
		||||
        [resr] "=&r"(resr), [top_left] "=&r"(top_left)
 | 
			
		||||
      : [above] "r"(above), [left] "r"(left), [dst] "r"(dst),
 | 
			
		||||
        [stride] "r"(stride), [cm] "r"(cm));
 | 
			
		||||
}
 | 
			
		||||
#endif  // #if HAVE_DSPR2
 | 
			
		||||
 
 | 
			
		||||
@@ -58,13 +58,10 @@ void vpx_h_predictor_8x8_dspr2(uint8_t *dst, ptrdiff_t stride,
 | 
			
		||||
      "sw         %[tmp8],      (%[dst])                    \n\t"
 | 
			
		||||
      "sw         %[tmp8],      4(%[dst])                   \n\t"
 | 
			
		||||
 | 
			
		||||
      : [tmp1] "=&r" (tmp1), [tmp2] "=&r" (tmp2),
 | 
			
		||||
        [tmp3] "=&r" (tmp3), [tmp4] "=&r" (tmp4),
 | 
			
		||||
        [tmp5] "=&r" (tmp5), [tmp7] "=&r" (tmp7),
 | 
			
		||||
      : [tmp1] "=&r"(tmp1), [tmp2] "=&r"(tmp2), [tmp3] "=&r"(tmp3),
 | 
			
		||||
        [tmp4] "=&r"(tmp4), [tmp5] "=&r"(tmp5), [tmp7] "=&r"(tmp7),
 | 
			
		||||
        [tmp6] "=&r"(tmp6), [tmp8] "=&r"(tmp8)
 | 
			
		||||
      : [left] "r" (left), [dst] "r" (dst),
 | 
			
		||||
        [stride] "r" (stride)
 | 
			
		||||
  );
 | 
			
		||||
      : [left] "r"(left), [dst] "r"(dst), [stride] "r"(stride));
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
void vpx_dc_predictor_8x8_dspr2(uint8_t *dst, ptrdiff_t stride,
 | 
			
		||||
@@ -146,8 +143,7 @@ void vpx_dc_predictor_8x8_dspr2(uint8_t *dst, ptrdiff_t stride,
 | 
			
		||||
        [average] "=&r"(average), [tmp] "=&r"(tmp),
 | 
			
		||||
        [expected_dc] "=&r"(expected_dc)
 | 
			
		||||
      : [above] "r"(above), [left] "r"(left), [dst] "r"(dst),
 | 
			
		||||
        [stride] "r" (stride)
 | 
			
		||||
  );
 | 
			
		||||
        [stride] "r"(stride));
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
void vpx_tm_predictor_8x8_dspr2(uint8_t *dst, ptrdiff_t stride,
 | 
			
		||||
@@ -598,10 +594,9 @@ void vpx_tm_predictor_8x8_dspr2(uint8_t *dst, ptrdiff_t stride,
 | 
			
		||||
      : [abovel] "=&r"(abovel), [abover] "=&r"(abover),
 | 
			
		||||
        [abovel_1] "=&r"(abovel_1), [abover_1] "=&r"(abover_1),
 | 
			
		||||
        [left0] "=&r"(left0), [res2] "=&r"(res2), [res3] "=&r"(res3),
 | 
			
		||||
        [res0] "=&r" (res0), [res1] "=&r" (res1),
 | 
			
		||||
        [reshw] "=&r" (reshw), [top_left] "=&r" (top_left)
 | 
			
		||||
      : [above] "r" (above), [left] "r" (left),
 | 
			
		||||
        [dst] "r" (dst), [stride] "r" (stride), [cm] "r" (cm)
 | 
			
		||||
  );
 | 
			
		||||
        [res0] "=&r"(res0), [res1] "=&r"(res1), [reshw] "=&r"(reshw),
 | 
			
		||||
        [top_left] "=&r"(top_left)
 | 
			
		||||
      : [above] "r"(above), [left] "r"(left), [dst] "r"(dst),
 | 
			
		||||
        [stride] "r"(stride), [cm] "r"(cm));
 | 
			
		||||
}
 | 
			
		||||
#endif  // #if HAVE_DSPR2
 | 
			
		||||
 
 | 
			
		||||
@@ -11,7 +11,8 @@
 | 
			
		||||
#include "./vpx_dsp_rtcd.h"
 | 
			
		||||
#include "vpx_dsp/mips/macros_msa.h"
 | 
			
		||||
 | 
			
		||||
#define IPRED_SUBS_UH2_UH(in0, in1, out0, out1) {  \
 | 
			
		||||
#define IPRED_SUBS_UH2_UH(in0, in1, out0, out1) \
 | 
			
		||||
  {                                             \
 | 
			
		||||
    out0 = __msa_subs_u_h(out0, in0);           \
 | 
			
		||||
    out1 = __msa_subs_u_h(out1, in1);           \
 | 
			
		||||
  }
 | 
			
		||||
@@ -150,8 +151,8 @@ static void intra_predict_horiz_32x32_msa(const uint8_t *src, uint8_t *dst,
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
static void intra_predict_dc_4x4_msa(const uint8_t *src_top,
 | 
			
		||||
                                     const uint8_t *src_left,
 | 
			
		||||
                                     uint8_t *dst, int32_t dst_stride) {
 | 
			
		||||
                                     const uint8_t *src_left, uint8_t *dst,
 | 
			
		||||
                                     int32_t dst_stride) {
 | 
			
		||||
  uint32_t val0, val1;
 | 
			
		||||
  v16i8 store, src = { 0 };
 | 
			
		||||
  v8u16 sum_h;
 | 
			
		||||
@@ -199,8 +200,8 @@ static void intra_predict_128dc_4x4_msa(uint8_t *dst, int32_t dst_stride) {
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
static void intra_predict_dc_8x8_msa(const uint8_t *src_top,
 | 
			
		||||
                                     const uint8_t *src_left,
 | 
			
		||||
                                     uint8_t *dst, int32_t dst_stride) {
 | 
			
		||||
                                     const uint8_t *src_left, uint8_t *dst,
 | 
			
		||||
                                     int32_t dst_stride) {
 | 
			
		||||
  uint64_t val0, val1;
 | 
			
		||||
  v16i8 store;
 | 
			
		||||
  v16u8 src = { 0 };
 | 
			
		||||
@@ -260,8 +261,8 @@ static void intra_predict_128dc_8x8_msa(uint8_t *dst, int32_t dst_stride) {
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
static void intra_predict_dc_16x16_msa(const uint8_t *src_top,
 | 
			
		||||
                                       const uint8_t *src_left,
 | 
			
		||||
                                       uint8_t *dst, int32_t dst_stride) {
 | 
			
		||||
                                       const uint8_t *src_left, uint8_t *dst,
 | 
			
		||||
                                       int32_t dst_stride) {
 | 
			
		||||
  v16u8 top, left, out;
 | 
			
		||||
  v8u16 sum_h, sum_top, sum_left;
 | 
			
		||||
  v4u32 sum_w;
 | 
			
		||||
@@ -313,8 +314,8 @@ static void intra_predict_128dc_16x16_msa(uint8_t *dst, int32_t dst_stride) {
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
static void intra_predict_dc_32x32_msa(const uint8_t *src_top,
 | 
			
		||||
                                       const uint8_t *src_left,
 | 
			
		||||
                                       uint8_t *dst, int32_t dst_stride) {
 | 
			
		||||
                                       const uint8_t *src_left, uint8_t *dst,
 | 
			
		||||
                                       int32_t dst_stride) {
 | 
			
		||||
  uint32_t row;
 | 
			
		||||
  v16u8 top0, top1, left0, left1, out;
 | 
			
		||||
  v8u16 sum_h, sum_top0, sum_top1, sum_left0, sum_left1;
 | 
			
		||||
@@ -381,8 +382,8 @@ static void intra_predict_128dc_32x32_msa(uint8_t *dst, int32_t dst_stride) {
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
static void intra_predict_tm_4x4_msa(const uint8_t *src_top_ptr,
 | 
			
		||||
                                     const uint8_t *src_left,
 | 
			
		||||
                                     uint8_t *dst, int32_t dst_stride) {
 | 
			
		||||
                                     const uint8_t *src_left, uint8_t *dst,
 | 
			
		||||
                                     int32_t dst_stride) {
 | 
			
		||||
  uint32_t val;
 | 
			
		||||
  uint8_t top_left = src_top_ptr[-1];
 | 
			
		||||
  v16i8 src_left0, src_left1, src_left2, src_left3, tmp0, tmp1, src_top = { 0 };
 | 
			
		||||
@@ -409,8 +410,8 @@ static void intra_predict_tm_4x4_msa(const uint8_t *src_top_ptr,
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
static void intra_predict_tm_8x8_msa(const uint8_t *src_top_ptr,
 | 
			
		||||
                                     const uint8_t *src_left,
 | 
			
		||||
                                     uint8_t *dst, int32_t dst_stride) {
 | 
			
		||||
                                     const uint8_t *src_left, uint8_t *dst,
 | 
			
		||||
                                     int32_t dst_stride) {
 | 
			
		||||
  uint64_t val;
 | 
			
		||||
  uint8_t top_left = src_top_ptr[-1];
 | 
			
		||||
  uint32_t loop_cnt;
 | 
			
		||||
@@ -442,8 +443,8 @@ static void intra_predict_tm_8x8_msa(const uint8_t *src_top_ptr,
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
static void intra_predict_tm_16x16_msa(const uint8_t *src_top_ptr,
 | 
			
		||||
                                       const uint8_t *src_left,
 | 
			
		||||
                                       uint8_t *dst, int32_t dst_stride) {
 | 
			
		||||
                                       const uint8_t *src_left, uint8_t *dst,
 | 
			
		||||
                                       int32_t dst_stride) {
 | 
			
		||||
  uint8_t top_left = src_top_ptr[-1];
 | 
			
		||||
  uint32_t loop_cnt;
 | 
			
		||||
  v16i8 src_top, src_left0, src_left1, src_left2, src_left3;
 | 
			
		||||
@@ -491,8 +492,8 @@ static void intra_predict_tm_16x16_msa(const uint8_t *src_top_ptr,
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
static void intra_predict_tm_32x32_msa(const uint8_t *src_top,
 | 
			
		||||
                                       const uint8_t *src_left,
 | 
			
		||||
                                       uint8_t *dst, int32_t dst_stride) {
 | 
			
		||||
                                       const uint8_t *src_left, uint8_t *dst,
 | 
			
		||||
                                       int32_t dst_stride) {
 | 
			
		||||
  uint8_t top_left = src_top[-1];
 | 
			
		||||
  uint32_t loop_cnt;
 | 
			
		||||
  v16i8 src_top0, src_top1, src_left0, src_left1, src_left2, src_left3;
 | 
			
		||||
 
 | 
			
		||||
@@ -23,31 +23,39 @@ extern "C" {
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
#if HAVE_DSPR2
 | 
			
		||||
#define DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input)                    ({   \
 | 
			
		||||
#define DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input)                         \
 | 
			
		||||
  ({                                                                           \
 | 
			
		||||
                                                                               \
 | 
			
		||||
    int32_t tmp, out;                                                          \
 | 
			
		||||
    int dct_cost_rounding = DCT_CONST_ROUNDING;                                \
 | 
			
		||||
    int in = input;                                                            \
 | 
			
		||||
                                                                               \
 | 
			
		||||
  __asm__ __volatile__ (                                                       \
 | 
			
		||||
      /* out = dct_const_round_shift(input_dc * cospi_16_64); */               \
 | 
			
		||||
      "mtlo     %[dct_cost_rounding],   $ac1                              \n\t"\
 | 
			
		||||
      "mthi     $zero,                  $ac1                              \n\t"\
 | 
			
		||||
      "madd     $ac1,                   %[in],            %[cospi_16_64]  \n\t"\
 | 
			
		||||
      "extp     %[tmp],                 $ac1,             31              \n\t"\
 | 
			
		||||
    __asm__ __volatile__(/* out = dct_const_round_shift(dc *  cospi_16_64); */ \
 | 
			
		||||
                         "mtlo     %[dct_cost_rounding],   $ac1              " \
 | 
			
		||||
                         "                \n\t"                                \
 | 
			
		||||
                         "mthi     $zero,                  $ac1              " \
 | 
			
		||||
                         "                \n\t"                                \
 | 
			
		||||
                         "madd     $ac1,                   %[in],            " \
 | 
			
		||||
                         "%[cospi_16_64]  \n\t"                                \
 | 
			
		||||
                         "extp     %[tmp],                 $ac1,             " \
 | 
			
		||||
                         "31              \n\t"                                \
 | 
			
		||||
                                                                               \
 | 
			
		||||
                         /* out = dct_const_round_shift(out * cospi_16_64); */ \
 | 
			
		||||
      "mtlo     %[dct_cost_rounding],   $ac2                              \n\t"\
 | 
			
		||||
      "mthi     $zero,                  $ac2                              \n\t"\
 | 
			
		||||
      "madd     $ac2,                   %[tmp],           %[cospi_16_64]  \n\t"\
 | 
			
		||||
      "extp     %[out],                 $ac2,             31              \n\t"\
 | 
			
		||||
                         "mtlo     %[dct_cost_rounding],   $ac2              " \
 | 
			
		||||
                         "                \n\t"                                \
 | 
			
		||||
                         "mthi     $zero,                  $ac2              " \
 | 
			
		||||
                         "                \n\t"                                \
 | 
			
		||||
                         "madd     $ac2,                   %[tmp],           " \
 | 
			
		||||
                         "%[cospi_16_64]  \n\t"                                \
 | 
			
		||||
                         "extp     %[out],                 $ac2,             " \
 | 
			
		||||
                         "31              \n\t"                                \
 | 
			
		||||
                                                                               \
 | 
			
		||||
                         : [tmp] "=&r"(tmp), [out] "=r"(out)                   \
 | 
			
		||||
                         : [in] "r"(in),                                       \
 | 
			
		||||
                           [dct_cost_rounding] "r"(dct_cost_rounding),         \
 | 
			
		||||
        [cospi_16_64] "r" (cospi_16_64)                                        \
 | 
			
		||||
   );                                                                          \
 | 
			
		||||
  out;                                                                    })
 | 
			
		||||
                           [cospi_16_64] "r"(cospi_16_64));                    \
 | 
			
		||||
    out;                                                                       \
 | 
			
		||||
  })
 | 
			
		||||
 | 
			
		||||
void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
 | 
			
		||||
                                   int dest_stride);
 | 
			
		||||
@@ -59,10 +67,8 @@ void idct8_rows_dspr2(const int16_t *input, int16_t *output, uint32_t no_rows);
 | 
			
		||||
void idct8_columns_add_blk_dspr2(int16_t *input, uint8_t *dest,
 | 
			
		||||
                                 int dest_stride);
 | 
			
		||||
void iadst8_dspr2(const int16_t *input, int16_t *output);
 | 
			
		||||
void idct16_rows_dspr2(const int16_t *input, int16_t *output,
 | 
			
		||||
                       uint32_t no_rows);
 | 
			
		||||
void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
 | 
			
		||||
                               int dest_stride);
 | 
			
		||||
void idct16_rows_dspr2(const int16_t *input, int16_t *output, uint32_t no_rows);
 | 
			
		||||
void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, int dest_stride);
 | 
			
		||||
void iadst16_dspr2(const int16_t *input, int16_t *output);
 | 
			
		||||
 | 
			
		||||
#endif  // #if HAVE_DSPR2
 | 
			
		||||
 
 | 
			
		||||
@@ -15,14 +15,15 @@
 | 
			
		||||
#include "vpx_dsp/mips/txfm_macros_msa.h"
 | 
			
		||||
#include "vpx_dsp/txfm_common.h"
 | 
			
		||||
 | 
			
		||||
#define VP9_ADST8(in0, in1, in2, in3, in4, in5, in6, in7,               \
 | 
			
		||||
                  out0, out1, out2, out3, out4, out5, out6, out7) {     \
 | 
			
		||||
#define VP9_ADST8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2,  \
 | 
			
		||||
                  out3, out4, out5, out6, out7)                              \
 | 
			
		||||
  {                                                                          \
 | 
			
		||||
    v8i16 cnst0_m, cnst1_m, cnst2_m, cnst3_m, cnst4_m;                       \
 | 
			
		||||
    v8i16 vec0_m, vec1_m, vec2_m, vec3_m, s0_m, s1_m;                        \
 | 
			
		||||
    v8i16 coeff0_m = { cospi_2_64,  cospi_6_64,  cospi_10_64, cospi_14_64,   \
 | 
			
		||||
                       cospi_18_64, cospi_22_64, cospi_26_64, cospi_30_64 }; \
 | 
			
		||||
  v8i16 coeff1_m = { cospi_8_64, -cospi_8_64, cospi_16_64,              \
 | 
			
		||||
    -cospi_16_64, cospi_24_64, -cospi_24_64, 0, 0 };                    \
 | 
			
		||||
    v8i16 coeff1_m = { cospi_8_64,  -cospi_8_64,  cospi_16_64, -cospi_16_64, \
 | 
			
		||||
                       cospi_24_64, -cospi_24_64, 0,           0 };          \
 | 
			
		||||
                                                                             \
 | 
			
		||||
    SPLATI_H2_SH(coeff0_m, 0, 7, cnst0_m, cnst1_m);                          \
 | 
			
		||||
    cnst2_m = -cnst0_m;                                                      \
 | 
			
		||||
@@ -33,9 +34,8 @@
 | 
			
		||||
                                                                             \
 | 
			
		||||
    ILVRL_H2_SH(in0, in7, vec1_m, vec0_m);                                   \
 | 
			
		||||
    ILVRL_H2_SH(in4, in3, vec3_m, vec2_m);                                   \
 | 
			
		||||
  DOT_ADD_SUB_SRARI_PCK(vec0_m, vec1_m, vec2_m, vec3_m, cnst0_m,        \
 | 
			
		||||
                        cnst1_m, cnst2_m, cnst3_m, in7, in0,            \
 | 
			
		||||
                        in4, in3);                                      \
 | 
			
		||||
    DOT_ADD_SUB_SRARI_PCK(vec0_m, vec1_m, vec2_m, vec3_m, cnst0_m, cnst1_m,  \
 | 
			
		||||
                          cnst2_m, cnst3_m, in7, in0, in4, in3);             \
 | 
			
		||||
                                                                             \
 | 
			
		||||
    SPLATI_H2_SH(coeff0_m, 2, 5, cnst0_m, cnst1_m);                          \
 | 
			
		||||
    cnst2_m = -cnst0_m;                                                      \
 | 
			
		||||
@@ -47,15 +47,13 @@
 | 
			
		||||
    ILVRL_H2_SH(in2, in5, vec1_m, vec0_m);                                   \
 | 
			
		||||
    ILVRL_H2_SH(in6, in1, vec3_m, vec2_m);                                   \
 | 
			
		||||
                                                                             \
 | 
			
		||||
  DOT_ADD_SUB_SRARI_PCK(vec0_m, vec1_m, vec2_m, vec3_m, cnst0_m,        \
 | 
			
		||||
                        cnst1_m, cnst2_m, cnst3_m, in5, in2,            \
 | 
			
		||||
                        in6, in1);                                      \
 | 
			
		||||
    DOT_ADD_SUB_SRARI_PCK(vec0_m, vec1_m, vec2_m, vec3_m, cnst0_m, cnst1_m,  \
 | 
			
		||||
                          cnst2_m, cnst3_m, in5, in2, in6, in1);             \
 | 
			
		||||
    BUTTERFLY_4(in7, in0, in2, in5, s1_m, s0_m, in2, in5);                   \
 | 
			
		||||
    out7 = -s0_m;                                                            \
 | 
			
		||||
    out0 = s1_m;                                                             \
 | 
			
		||||
                                                                             \
 | 
			
		||||
  SPLATI_H4_SH(coeff1_m, 0, 4, 1, 5,                                    \
 | 
			
		||||
               cnst0_m, cnst1_m, cnst2_m, cnst3_m);                     \
 | 
			
		||||
    SPLATI_H4_SH(coeff1_m, 0, 4, 1, 5, cnst0_m, cnst1_m, cnst2_m, cnst3_m);  \
 | 
			
		||||
                                                                             \
 | 
			
		||||
    ILVEV_H2_SH(cnst3_m, cnst0_m, cnst1_m, cnst2_m, cnst3_m, cnst2_m);       \
 | 
			
		||||
    cnst0_m = __msa_ilvev_h(cnst1_m, cnst0_m);                               \
 | 
			
		||||
@@ -63,9 +61,8 @@
 | 
			
		||||
                                                                             \
 | 
			
		||||
    ILVRL_H2_SH(in4, in3, vec1_m, vec0_m);                                   \
 | 
			
		||||
    ILVRL_H2_SH(in6, in1, vec3_m, vec2_m);                                   \
 | 
			
		||||
  DOT_ADD_SUB_SRARI_PCK(vec0_m, vec1_m, vec2_m, vec3_m, cnst0_m,        \
 | 
			
		||||
                        cnst2_m, cnst3_m, cnst1_m, out1, out6,          \
 | 
			
		||||
                        s0_m, s1_m);                                    \
 | 
			
		||||
    DOT_ADD_SUB_SRARI_PCK(vec0_m, vec1_m, vec2_m, vec3_m, cnst0_m, cnst2_m,  \
 | 
			
		||||
                          cnst3_m, cnst1_m, out1, out6, s0_m, s1_m);         \
 | 
			
		||||
                                                                             \
 | 
			
		||||
    SPLATI_H2_SH(coeff1_m, 2, 3, cnst0_m, cnst1_m);                          \
 | 
			
		||||
    cnst1_m = __msa_ilvev_h(cnst1_m, cnst0_m);                               \
 | 
			
		||||
@@ -82,7 +79,8 @@
 | 
			
		||||
    out5 = -out5;                                                            \
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
#define VP9_SET_COSPI_PAIR(c0_h, c1_h) ({  \
 | 
			
		||||
#define VP9_SET_COSPI_PAIR(c0_h, c1_h)  \
 | 
			
		||||
  ({                                    \
 | 
			
		||||
    v8i16 out0_m, r0_m, r1_m;           \
 | 
			
		||||
                                        \
 | 
			
		||||
    r0_m = __msa_fill_h(c0_h);          \
 | 
			
		||||
@@ -92,7 +90,8 @@
 | 
			
		||||
    out0_m;                             \
 | 
			
		||||
  })
 | 
			
		||||
 | 
			
		||||
#define VP9_ADDBLK_ST8x4_UB(dst, dst_stride, in0, in1, in2, in3) {  \
 | 
			
		||||
#define VP9_ADDBLK_ST8x4_UB(dst, dst_stride, in0, in1, in2, in3)               \
 | 
			
		||||
  {                                                                            \
 | 
			
		||||
    uint8_t *dst_m = (uint8_t *)(dst);                                         \
 | 
			
		||||
    v16u8 dst0_m, dst1_m, dst2_m, dst3_m;                                      \
 | 
			
		||||
    v16i8 tmp0_m, tmp1_m;                                                      \
 | 
			
		||||
@@ -100,16 +99,17 @@
 | 
			
		||||
    v8i16 res0_m, res1_m, res2_m, res3_m;                                      \
 | 
			
		||||
                                                                               \
 | 
			
		||||
    LD_UB4(dst_m, dst_stride, dst0_m, dst1_m, dst2_m, dst3_m);                 \
 | 
			
		||||
  ILVR_B4_SH(zero_m, dst0_m, zero_m, dst1_m, zero_m, dst2_m,        \
 | 
			
		||||
             zero_m, dst3_m, res0_m, res1_m, res2_m, res3_m);       \
 | 
			
		||||
  ADD4(res0_m, in0, res1_m, in1, res2_m, in2, res3_m, in3,          \
 | 
			
		||||
    ILVR_B4_SH(zero_m, dst0_m, zero_m, dst1_m, zero_m, dst2_m, zero_m, dst3_m, \
 | 
			
		||||
               res0_m, res1_m, res2_m, res3_m);                                \
 | 
			
		||||
    ADD4(res0_m, in0, res1_m, in1, res2_m, in2, res3_m, in3, res0_m, res1_m,   \
 | 
			
		||||
         res2_m, res3_m);                                                      \
 | 
			
		||||
    CLIP_SH4_0_255(res0_m, res1_m, res2_m, res3_m);                            \
 | 
			
		||||
    PCKEV_B2_SB(res1_m, res0_m, res3_m, res2_m, tmp0_m, tmp1_m);               \
 | 
			
		||||
    ST8x4_UB(tmp0_m, tmp1_m, dst_m, dst_stride);                               \
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
#define VP9_IDCT4x4(in0, in1, in2, in3, out0, out1, out2, out3) {   \
 | 
			
		||||
#define VP9_IDCT4x4(in0, in1, in2, in3, out0, out1, out2, out3)             \
 | 
			
		||||
  {                                                                         \
 | 
			
		||||
    v8i16 c0_m, c1_m, c2_m, c3_m;                                           \
 | 
			
		||||
    v8i16 step0_m, step1_m;                                                 \
 | 
			
		||||
    v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                   \
 | 
			
		||||
@@ -127,20 +127,19 @@
 | 
			
		||||
                                                                            \
 | 
			
		||||
    PCKEV_H2_SW(tmp1_m, tmp0_m, tmp3_m, tmp2_m, tmp0_m, tmp2_m);            \
 | 
			
		||||
    SLDI_B2_0_SW(tmp0_m, tmp2_m, tmp1_m, tmp3_m, 8);                        \
 | 
			
		||||
  BUTTERFLY_4((v8i16)tmp0_m, (v8i16)tmp1_m,                         \
 | 
			
		||||
              (v8i16)tmp2_m, (v8i16)tmp3_m,                         \
 | 
			
		||||
    BUTTERFLY_4((v8i16)tmp0_m, (v8i16)tmp1_m, (v8i16)tmp2_m, (v8i16)tmp3_m, \
 | 
			
		||||
                out0, out1, out2, out3);                                    \
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
#define VP9_IADST4x4(in0, in1, in2, in3, out0, out1, out2, out3) {  \
 | 
			
		||||
#define VP9_IADST4x4(in0, in1, in2, in3, out0, out1, out2, out3)       \
 | 
			
		||||
  {                                                                    \
 | 
			
		||||
    v8i16 res0_m, res1_m, c0_m, c1_m;                                  \
 | 
			
		||||
    v8i16 k1_m, k2_m, k3_m, k4_m;                                      \
 | 
			
		||||
    v8i16 zero_m = { 0 };                                              \
 | 
			
		||||
    v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                              \
 | 
			
		||||
    v4i32 int0_m, int1_m, int2_m, int3_m;                              \
 | 
			
		||||
  v8i16 mask_m = { sinpi_1_9, sinpi_2_9, sinpi_3_9,                 \
 | 
			
		||||
    sinpi_4_9, -sinpi_1_9, -sinpi_2_9, -sinpi_3_9,                  \
 | 
			
		||||
    -sinpi_4_9 };                                                   \
 | 
			
		||||
    v8i16 mask_m = { sinpi_1_9,  sinpi_2_9,  sinpi_3_9,  sinpi_4_9,    \
 | 
			
		||||
                     -sinpi_1_9, -sinpi_2_9, -sinpi_3_9, -sinpi_4_9 }; \
 | 
			
		||||
                                                                       \
 | 
			
		||||
    SPLATI_H4_SH(mask_m, 3, 0, 1, 2, c0_m, c1_m, k1_m, k2_m);          \
 | 
			
		||||
    ILVEV_H2_SH(c0_m, c1_m, k1_m, k2_m, c0_m, c1_m);                   \
 | 
			
		||||
@@ -182,7 +181,8 @@
 | 
			
		||||
    PCKEV_H2_SH(int2_m, int2_m, int3_m, int3_m, out2, out3);           \
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
#define VP9_SET_CONST_PAIR(mask_h, idx1_h, idx2_h) ({  \
 | 
			
		||||
#define VP9_SET_CONST_PAIR(mask_h, idx1_h, idx2_h)    \
 | 
			
		||||
  ({                                                  \
 | 
			
		||||
    v8i16 c0_m, c1_m;                                 \
 | 
			
		||||
                                                      \
 | 
			
		||||
    SPLATI_H2_SH(mask_h, idx1_h, idx2_h, c0_m, c1_m); \
 | 
			
		||||
@@ -192,26 +192,28 @@
 | 
			
		||||
  })
 | 
			
		||||
 | 
			
		||||
/* multiply and add macro */
 | 
			
		||||
#define VP9_MADD(inp0, inp1, inp2, inp3, cst0, cst1, cst2, cst3,        \
 | 
			
		||||
                 out0, out1, out2, out3) {                              \
 | 
			
		||||
#define VP9_MADD(inp0, inp1, inp2, inp3, cst0, cst1, cst2, cst3, out0, out1,  \
 | 
			
		||||
                 out2, out3)                                                  \
 | 
			
		||||
  {                                                                           \
 | 
			
		||||
    v8i16 madd_s0_m, madd_s1_m, madd_s2_m, madd_s3_m;                         \
 | 
			
		||||
    v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                     \
 | 
			
		||||
                                                                              \
 | 
			
		||||
    ILVRL_H2_SH(inp1, inp0, madd_s1_m, madd_s0_m);                            \
 | 
			
		||||
    ILVRL_H2_SH(inp3, inp2, madd_s3_m, madd_s2_m);                            \
 | 
			
		||||
  DOTP_SH4_SW(madd_s1_m, madd_s0_m, madd_s1_m, madd_s0_m,               \
 | 
			
		||||
              cst0, cst0, cst1, cst1, tmp0_m, tmp1_m, tmp2_m, tmp3_m);  \
 | 
			
		||||
    DOTP_SH4_SW(madd_s1_m, madd_s0_m, madd_s1_m, madd_s0_m, cst0, cst0, cst1, \
 | 
			
		||||
                cst1, tmp0_m, tmp1_m, tmp2_m, tmp3_m);                        \
 | 
			
		||||
    SRARI_W4_SW(tmp0_m, tmp1_m, tmp2_m, tmp3_m, DCT_CONST_BITS);              \
 | 
			
		||||
    PCKEV_H2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out0, out1);                  \
 | 
			
		||||
  DOTP_SH4_SW(madd_s3_m, madd_s2_m, madd_s3_m, madd_s2_m,               \
 | 
			
		||||
              cst2, cst2, cst3, cst3, tmp0_m, tmp1_m, tmp2_m, tmp3_m);  \
 | 
			
		||||
    DOTP_SH4_SW(madd_s3_m, madd_s2_m, madd_s3_m, madd_s2_m, cst2, cst2, cst3, \
 | 
			
		||||
                cst3, tmp0_m, tmp1_m, tmp2_m, tmp3_m);                        \
 | 
			
		||||
    SRARI_W4_SW(tmp0_m, tmp1_m, tmp2_m, tmp3_m, DCT_CONST_BITS);              \
 | 
			
		||||
    PCKEV_H2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out2, out3);                  \
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
/* idct 8x8 macro */
 | 
			
		||||
#define VP9_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7,               \
 | 
			
		||||
                       out0, out1, out2, out3, out4, out5, out6, out7) {     \
 | 
			
		||||
#define VP9_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1,    \
 | 
			
		||||
                       out2, out3, out4, out5, out6, out7)                    \
 | 
			
		||||
  {                                                                           \
 | 
			
		||||
    v8i16 tp0_m, tp1_m, tp2_m, tp3_m, tp4_m, tp5_m, tp6_m, tp7_m;             \
 | 
			
		||||
    v8i16 k0_m, k1_m, k2_m, k3_m, res0_m, res1_m, res2_m, res3_m;             \
 | 
			
		||||
    v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                     \
 | 
			
		||||
@@ -236,59 +238,60 @@
 | 
			
		||||
    tp7_m = in7 + in5;                                                        \
 | 
			
		||||
    k2_m = VP9_SET_COSPI_PAIR(cospi_24_64, -cospi_8_64);                      \
 | 
			
		||||
    k3_m = VP9_SET_COSPI_PAIR(cospi_8_64, cospi_24_64);                       \
 | 
			
		||||
  VP9_MADD(in0, in4, in2, in6, k1_m, k0_m, k2_m, k3_m,                       \
 | 
			
		||||
           in0, in4, in2, in6);                                              \
 | 
			
		||||
    VP9_MADD(in0, in4, in2, in6, k1_m, k0_m, k2_m, k3_m, in0, in4, in2, in6); \
 | 
			
		||||
    BUTTERFLY_4(in0, in4, in2, in6, tp0_m, tp1_m, tp2_m, tp3_m);              \
 | 
			
		||||
  BUTTERFLY_8(tp0_m, tp1_m, tp2_m, tp3_m, tp4_m, tp5_m, tp6_m, tp7_m,        \
 | 
			
		||||
              out0, out1, out2, out3, out4, out5, out6, out7);               \
 | 
			
		||||
    BUTTERFLY_8(tp0_m, tp1_m, tp2_m, tp3_m, tp4_m, tp5_m, tp6_m, tp7_m, out0, \
 | 
			
		||||
                out1, out2, out3, out4, out5, out6, out7);                    \
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
#define VP9_IADST8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7,            \
 | 
			
		||||
                        out0, out1, out2, out3, out4, out5, out6, out7) {  \
 | 
			
		||||
#define VP9_IADST8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1,   \
 | 
			
		||||
                        out2, out3, out4, out5, out6, out7)                   \
 | 
			
		||||
  {                                                                           \
 | 
			
		||||
    v4i32 r0_m, r1_m, r2_m, r3_m, r4_m, r5_m, r6_m, r7_m;                     \
 | 
			
		||||
    v4i32 m0_m, m1_m, m2_m, m3_m, t0_m, t1_m;                                 \
 | 
			
		||||
    v8i16 res0_m, res1_m, res2_m, res3_m, k0_m, k1_m, in_s0, in_s1;           \
 | 
			
		||||
  v8i16 mask1_m = { cospi_2_64, cospi_30_64, -cospi_2_64,                  \
 | 
			
		||||
    cospi_10_64, cospi_22_64, -cospi_10_64, cospi_18_64, cospi_14_64 };    \
 | 
			
		||||
  v8i16 mask2_m = { cospi_14_64, -cospi_18_64, cospi_26_64,                \
 | 
			
		||||
    cospi_6_64, -cospi_26_64, cospi_8_64, cospi_24_64, -cospi_8_64 };      \
 | 
			
		||||
  v8i16 mask3_m = { -cospi_24_64, cospi_8_64, cospi_16_64,                 \
 | 
			
		||||
    -cospi_16_64, 0, 0, 0, 0 };                                            \
 | 
			
		||||
    v8i16 mask1_m = { cospi_2_64,  cospi_30_64,  -cospi_2_64, cospi_10_64,    \
                      cospi_22_64, -cospi_10_64, cospi_18_64, cospi_14_64 };  \
    v8i16 mask2_m = { cospi_14_64,  -cospi_18_64, cospi_26_64, cospi_6_64,    \
                      -cospi_26_64, cospi_8_64,   cospi_24_64, -cospi_8_64 }; \
    v8i16 mask3_m = {                                                         \
      -cospi_24_64, cospi_8_64, cospi_16_64, -cospi_16_64, 0, 0, 0, 0         \
    };                                                                        \
                                                                              \
    k0_m = VP9_SET_CONST_PAIR(mask1_m, 0, 1);                                 \
    k1_m = VP9_SET_CONST_PAIR(mask1_m, 1, 2);                                 \
    ILVRL_H2_SH(in1, in0, in_s1, in_s0);                                      \
  DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m,          \
              r0_m, r1_m, r2_m, r3_m);                                     \
    DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, r0_m,     \
                r1_m, r2_m, r3_m);                                            \
    k0_m = VP9_SET_CONST_PAIR(mask1_m, 6, 7);                                 \
    k1_m = VP9_SET_CONST_PAIR(mask2_m, 0, 1);                                 \
    ILVRL_H2_SH(in5, in4, in_s1, in_s0);                                      \
  DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m,          \
              r4_m, r5_m, r6_m, r7_m);                                     \
  ADD4(r0_m, r4_m, r1_m, r5_m, r2_m, r6_m, r3_m, r7_m,                     \
       m0_m, m1_m, m2_m, m3_m);                                            \
    DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, r4_m,     \
                r5_m, r6_m, r7_m);                                            \
    ADD4(r0_m, r4_m, r1_m, r5_m, r2_m, r6_m, r3_m, r7_m, m0_m, m1_m, m2_m,    \
         m3_m);                                                               \
    SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS);                      \
    PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, res0_m, res1_m);                      \
  SUB4(r0_m, r4_m, r1_m, r5_m, r2_m, r6_m, r3_m, r7_m,                     \
       m0_m, m1_m, m2_m, m3_m);                                            \
    SUB4(r0_m, r4_m, r1_m, r5_m, r2_m, r6_m, r3_m, r7_m, m0_m, m1_m, m2_m,    \
         m3_m);                                                               \
    SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS);                      \
    PCKEV_H2_SW(m1_m, m0_m, m3_m, m2_m, t0_m, t1_m);                          \
    k0_m = VP9_SET_CONST_PAIR(mask1_m, 3, 4);                                 \
    k1_m = VP9_SET_CONST_PAIR(mask1_m, 4, 5);                                 \
    ILVRL_H2_SH(in3, in2, in_s1, in_s0);                                      \
  DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m,          \
              r0_m, r1_m, r2_m, r3_m);                                     \
    DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, r0_m,     \
                r1_m, r2_m, r3_m);                                            \
    k0_m = VP9_SET_CONST_PAIR(mask2_m, 2, 3);                                 \
    k1_m = VP9_SET_CONST_PAIR(mask2_m, 3, 4);                                 \
    ILVRL_H2_SH(in7, in6, in_s1, in_s0);                                      \
  DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m,          \
              r4_m, r5_m, r6_m, r7_m);                                     \
  ADD4(r0_m, r4_m, r1_m, r5_m, r2_m, r6_m, r3_m, r7_m,                     \
       m0_m, m1_m, m2_m, m3_m);                                            \
    DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, r4_m,     \
                r5_m, r6_m, r7_m);                                            \
    ADD4(r0_m, r4_m, r1_m, r5_m, r2_m, r6_m, r3_m, r7_m, m0_m, m1_m, m2_m,    \
         m3_m);                                                               \
    SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS);                      \
    PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, res2_m, res3_m);                      \
  SUB4(r0_m, r4_m, r1_m, r5_m, r2_m, r6_m, r3_m, r7_m,                     \
       m0_m, m1_m, m2_m, m3_m);                                            \
    SUB4(r0_m, r4_m, r1_m, r5_m, r2_m, r6_m, r3_m, r7_m, m0_m, m1_m, m2_m,    \
         m3_m);                                                               \
    SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS);                      \
    PCKEV_H2_SW(m1_m, m0_m, m3_m, m2_m, r2_m, r3_m);                          \
    ILVRL_H2_SW(r3_m, r2_m, m2_m, m3_m);                                      \
@@ -296,29 +299,29 @@
    k0_m = VP9_SET_CONST_PAIR(mask2_m, 5, 6);                                 \
    k1_m = VP9_SET_CONST_PAIR(mask2_m, 6, 7);                                 \
    ILVRL_H2_SH(t1_m, t0_m, in_s1, in_s0);                                    \
  DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m,          \
              r0_m, r1_m, r2_m, r3_m);                                     \
    DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, r0_m,     \
                r1_m, r2_m, r3_m);                                            \
    k1_m = VP9_SET_CONST_PAIR(mask3_m, 0, 1);                                 \
  DOTP_SH4_SW(m2_m, m3_m, m2_m, m3_m, k0_m, k0_m, k1_m, k1_m,              \
              r4_m, r5_m, r6_m, r7_m);                                     \
  ADD4(r0_m, r6_m, r1_m, r7_m, r2_m, r4_m, r3_m, r5_m,                     \
       m0_m, m1_m, m2_m, m3_m);                                            \
    DOTP_SH4_SW(m2_m, m3_m, m2_m, m3_m, k0_m, k0_m, k1_m, k1_m, r4_m, r5_m,   \
                r6_m, r7_m);                                                  \
    ADD4(r0_m, r6_m, r1_m, r7_m, r2_m, r4_m, r3_m, r5_m, m0_m, m1_m, m2_m,    \
         m3_m);                                                               \
    SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS);                      \
    PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, in1, out6);                           \
  SUB4(r0_m, r6_m, r1_m, r7_m, r2_m, r4_m, r3_m, r5_m,                     \
       m0_m, m1_m, m2_m, m3_m);                                            \
    SUB4(r0_m, r6_m, r1_m, r7_m, r2_m, r4_m, r3_m, r5_m, m0_m, m1_m, m2_m,    \
         m3_m);                                                               \
    SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS);                      \
    PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, in2, in5);                            \
    k0_m = VP9_SET_CONST_PAIR(mask3_m, 2, 2);                                 \
    k1_m = VP9_SET_CONST_PAIR(mask3_m, 2, 3);                                 \
    ILVRL_H2_SH(in4, in3, in_s1, in_s0);                                      \
  DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m,          \
              m0_m, m1_m, m2_m, m3_m);                                     \
    DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, m0_m,     \
                m1_m, m2_m, m3_m);                                            \
    SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS);                      \
    PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, in3, out4);                           \
    ILVRL_H2_SW(in5, in2, m2_m, m3_m);                                        \
  DOTP_SH4_SW(m2_m, m3_m, m2_m, m3_m, k0_m, k0_m, k1_m, k1_m,              \
              m0_m, m1_m, m2_m, m3_m);                                     \
    DOTP_SH4_SW(m2_m, m3_m, m2_m, m3_m, k0_m, k0_m, k1_m, k1_m, m0_m, m1_m,   \
                m2_m, m3_m);                                                  \
    SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS);                      \
    PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, out2, in5);                           \
                                                                              \
@@ -328,11 +331,11 @@
    out7 = -in7;                                                              \
  }

#define VP9_IADST8x16_1D(r0, r1, r2, r3, r4, r5, r6, r7, r8,        \
                         r9, r10, r11, r12, r13, r14, r15,          \
                         out0, out1, out2, out3, out4, out5,        \
                         out6, out7, out8, out9, out10, out11,      \
                         out12, out13, out14, out15) {              \
#define VP9_IADST8x16_1D(r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11,     \
                         r12, r13, r14, r15, out0, out1, out2, out3, out4,     \
                         out5, out6, out7, out8, out9, out10, out11, out12,    \
                         out13, out14, out15)                                  \
  {                                                                            \
    v8i16 g0_m, g1_m, g2_m, g3_m, g4_m, g5_m, g6_m, g7_m;                      \
    v8i16 g8_m, g9_m, g10_m, g11_m, g12_m, g13_m, g14_m, g15_m;                \
    v8i16 h0_m, h1_m, h2_m, h3_m, h4_m, h5_m, h6_m, h7_m;                      \
@@ -344,51 +347,49 @@
    k1_m = VP9_SET_COSPI_PAIR(cospi_31_64, -cospi_1_64);                       \
    k2_m = VP9_SET_COSPI_PAIR(cospi_17_64, cospi_15_64);                       \
    k3_m = VP9_SET_COSPI_PAIR(cospi_15_64, -cospi_17_64);                      \
  MADD_BF(r15, r0, r7, r8, k0_m, k1_m, k2_m, k3_m,                  \
          g0_m, g1_m, g2_m, g3_m);                                  \
    MADD_BF(r15, r0, r7, r8, k0_m, k1_m, k2_m, k3_m, g0_m, g1_m, g2_m, g3_m);  \
    k0_m = VP9_SET_COSPI_PAIR(cospi_5_64, cospi_27_64);                        \
    k1_m = VP9_SET_COSPI_PAIR(cospi_27_64, -cospi_5_64);                       \
    k2_m = VP9_SET_COSPI_PAIR(cospi_21_64, cospi_11_64);                       \
    k3_m = VP9_SET_COSPI_PAIR(cospi_11_64, -cospi_21_64);                      \
  MADD_BF(r13, r2, r5, r10, k0_m, k1_m, k2_m, k3_m,                 \
          g4_m, g5_m, g6_m, g7_m);                                  \
    MADD_BF(r13, r2, r5, r10, k0_m, k1_m, k2_m, k3_m, g4_m, g5_m, g6_m, g7_m); \
    k0_m = VP9_SET_COSPI_PAIR(cospi_9_64, cospi_23_64);                        \
    k1_m = VP9_SET_COSPI_PAIR(cospi_23_64, -cospi_9_64);                       \
    k2_m = VP9_SET_COSPI_PAIR(cospi_25_64, cospi_7_64);                        \
    k3_m = VP9_SET_COSPI_PAIR(cospi_7_64, -cospi_25_64);                       \
  MADD_BF(r11, r4, r3, r12, k0_m, k1_m, k2_m, k3_m,                 \
          g8_m, g9_m, g10_m, g11_m);                                \
    MADD_BF(r11, r4, r3, r12, k0_m, k1_m, k2_m, k3_m, g8_m, g9_m, g10_m,       \
            g11_m);                                                            \
    k0_m = VP9_SET_COSPI_PAIR(cospi_13_64, cospi_19_64);                       \
    k1_m = VP9_SET_COSPI_PAIR(cospi_19_64, -cospi_13_64);                      \
    k2_m = VP9_SET_COSPI_PAIR(cospi_29_64, cospi_3_64);                        \
    k3_m = VP9_SET_COSPI_PAIR(cospi_3_64, -cospi_29_64);                       \
  MADD_BF(r9, r6, r1, r14, k0_m, k1_m, k2_m, k3_m,                  \
          g12_m, g13_m, g14_m, g15_m);                              \
    MADD_BF(r9, r6, r1, r14, k0_m, k1_m, k2_m, k3_m, g12_m, g13_m, g14_m,      \
            g15_m);                                                            \
                                                                               \
    /* stage 2 */                                                              \
    k0_m = VP9_SET_COSPI_PAIR(cospi_4_64, cospi_28_64);                        \
    k1_m = VP9_SET_COSPI_PAIR(cospi_28_64, -cospi_4_64);                       \
    k2_m = VP9_SET_COSPI_PAIR(-cospi_28_64, cospi_4_64);                       \
  MADD_BF(g1_m, g3_m, g9_m, g11_m, k0_m, k1_m, k2_m, k0_m,          \
          h0_m, h1_m, h2_m, h3_m);                                  \
    MADD_BF(g1_m, g3_m, g9_m, g11_m, k0_m, k1_m, k2_m, k0_m, h0_m, h1_m, h2_m, \
            h3_m);                                                             \
    k0_m = VP9_SET_COSPI_PAIR(cospi_12_64, cospi_20_64);                       \
    k1_m = VP9_SET_COSPI_PAIR(-cospi_20_64, cospi_12_64);                      \
    k2_m = VP9_SET_COSPI_PAIR(cospi_20_64, -cospi_12_64);                      \
  MADD_BF(g7_m, g5_m, g15_m, g13_m, k0_m, k1_m, k2_m, k0_m,         \
          h4_m, h5_m, h6_m, h7_m);                                  \
    MADD_BF(g7_m, g5_m, g15_m, g13_m, k0_m, k1_m, k2_m, k0_m, h4_m, h5_m,      \
            h6_m, h7_m);                                                       \
    BUTTERFLY_4(h0_m, h2_m, h6_m, h4_m, out8, out9, out11, out10);             \
  BUTTERFLY_8(g0_m, g2_m, g4_m, g6_m, g14_m, g12_m, g10_m, g8_m,    \
              h8_m, h9_m, h10_m, h11_m, h6_m, h4_m, h2_m, h0_m);    \
    BUTTERFLY_8(g0_m, g2_m, g4_m, g6_m, g14_m, g12_m, g10_m, g8_m, h8_m, h9_m, \
                h10_m, h11_m, h6_m, h4_m, h2_m, h0_m);                         \
                                                                               \
    /* stage 3 */                                                              \
    BUTTERFLY_4(h8_m, h9_m, h11_m, h10_m, out0, out1, h11_m, h10_m);           \
    k0_m = VP9_SET_COSPI_PAIR(cospi_8_64, cospi_24_64);                        \
    k1_m = VP9_SET_COSPI_PAIR(cospi_24_64, -cospi_8_64);                       \
    k2_m = VP9_SET_COSPI_PAIR(-cospi_24_64, cospi_8_64);                       \
  MADD_BF(h0_m, h2_m, h4_m, h6_m, k0_m, k1_m, k2_m, k0_m,           \
          out4, out6, out5, out7);                                  \
  MADD_BF(h1_m, h3_m, h5_m, h7_m, k0_m, k1_m, k2_m, k0_m,           \
          out12, out14, out13, out15);                              \
    MADD_BF(h0_m, h2_m, h4_m, h6_m, k0_m, k1_m, k2_m, k0_m, out4, out6, out5,  \
            out7);                                                             \
    MADD_BF(h1_m, h3_m, h5_m, h7_m, k0_m, k1_m, k2_m, k0_m, out12, out14,      \
            out13, out15);                                                     \
                                                                               \
    /* stage 4 */                                                              \
    k0_m = VP9_SET_COSPI_PAIR(cospi_16_64, cospi_16_64);                       \
 
@@ -64,17 +64,16 @@ void idct16_rows_dspr2(const int16_t *input, int16_t *output,
        "sub      %[step1_2],           %[step2_1],     %[step2_2]      \n\t"
        "sub      %[step1_3],           %[step2_0],     %[step2_3]      \n\t"

        : [load1] "=&r" (load1), [load2] "=&r" (load2),
          [load3] "=&r" (load3), [load4] "=&r" (load4),
          [result1] "=&r" (result1), [result2] "=&r" (result2),
          [step2_0] "=&r" (step2_0), [step2_1] "=&r" (step2_1),
          [step2_2] "=&r" (step2_2), [step2_3] "=&r" (step2_3),
          [step1_0] "=r" (step1_0), [step1_1] "=r" (step1_1),
          [step1_2] "=r" (step1_2), [step1_3] "=r" (step1_3)
        : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
          [load4] "=&r"(load4), [result1] "=&r"(result1),
          [result2] "=&r"(result2), [step2_0] "=&r"(step2_0),
          [step2_1] "=&r"(step2_1), [step2_2] "=&r"(step2_2),
          [step2_3] "=&r"(step2_3), [step1_0] "=r"(step1_0),
          [step1_1] "=r"(step1_1), [step1_2] "=r"(step1_2),
          [step1_3] "=r"(step1_3)
        : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
          [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64),
          [cospi_16_64] "r" (cospi_16_64)
    );
          [cospi_16_64] "r"(cospi_16_64));

    __asm__ __volatile__(
        "lh       %[load5],             2(%[input])                     \n\t"
@@ -126,17 +125,16 @@ void idct16_rows_dspr2(const int16_t *input, int16_t *output,
        "add      %[step2_8],           %[result1],     %[result2]      \n\t"
        "add      %[step2_15],          %[result4],     %[result3]      \n\t"

        : [load5] "=&r" (load5), [load6] "=&r" (load6),
          [load7] "=&r" (load7), [load8] "=&r" (load8),
          [result1] "=&r" (result1), [result2] "=&r" (result2),
          [result3] "=&r" (result3), [result4] "=&r" (result4),
          [step2_8] "=r" (step2_8), [step2_15] "=r" (step2_15),
          [step2_9] "=r" (step2_9), [step2_14] "=r" (step2_14)
        : [load5] "=&r"(load5), [load6] "=&r"(load6), [load7] "=&r"(load7),
          [load8] "=&r"(load8), [result1] "=&r"(result1),
          [result2] "=&r"(result2), [result3] "=&r"(result3),
          [result4] "=&r"(result4), [step2_8] "=r"(step2_8),
          [step2_15] "=r"(step2_15), [step2_9] "=r"(step2_9),
          [step2_14] "=r"(step2_14)
        : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
          [cospi_30_64] "r"(cospi_30_64), [cospi_2_64] "r"(cospi_2_64),
          [cospi_14_64] "r"(cospi_14_64), [cospi_18_64] "r"(cospi_18_64),
          [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)
    );
          [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64));

    __asm__ __volatile__(
        "lh       %[load1],             10(%[input])                    \n\t"
@@ -188,17 +186,16 @@ void idct16_rows_dspr2(const int16_t *input, int16_t *output,
        "add      %[step2_11],          %[result1],     %[result2]      \n\t"
        "add      %[step2_12],          %[result4],     %[result3]      \n\t"

        : [load1] "=&r" (load1), [load2] "=&r" (load2),
          [load3] "=&r" (load3), [load4] "=&r" (load4),
          [result1] "=&r" (result1), [result2] "=&r" (result2),
          [result3] "=&r" (result3), [result4] "=&r" (result4),
          [step2_10] "=r" (step2_10), [step2_11] "=r" (step2_11),
          [step2_12] "=r" (step2_12), [step2_13] "=r" (step2_13)
        : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
          [load4] "=&r"(load4), [result1] "=&r"(result1),
          [result2] "=&r"(result2), [result3] "=&r"(result3),
          [result4] "=&r"(result4), [step2_10] "=r"(step2_10),
          [step2_11] "=r"(step2_11), [step2_12] "=r"(step2_12),
          [step2_13] "=r"(step2_13)
        : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
          [cospi_22_64] "r"(cospi_22_64), [cospi_10_64] "r"(cospi_10_64),
          [cospi_6_64] "r"(cospi_6_64), [cospi_26_64] "r"(cospi_26_64),
          [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)
    );
          [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64));

    __asm__ __volatile__(
        "lh       %[load5],             4(%[input])                     \n\t"
@@ -253,17 +250,16 @@ void idct16_rows_dspr2(const int16_t *input, int16_t *output,
        "add      %[step1_4],           %[result1],     %[result2]      \n\t"
        "add      %[step1_7],           %[result4],     %[result3]      \n\t"

        : [load5] "=&r" (load5), [load6] "=&r" (load6),
          [load7] "=&r" (load7), [load8] "=&r" (load8),
          [result1] "=&r" (result1), [result2] "=&r" (result2),
          [result3] "=&r" (result3), [result4] "=&r" (result4),
          [step1_4] "=r" (step1_4), [step1_5] "=r" (step1_5),
          [step1_6] "=r" (step1_6), [step1_7] "=r" (step1_7)
        : [load5] "=&r"(load5), [load6] "=&r"(load6), [load7] "=&r"(load7),
          [load8] "=&r"(load8), [result1] "=&r"(result1),
          [result2] "=&r"(result2), [result3] "=&r"(result3),
          [result4] "=&r"(result4), [step1_4] "=r"(step1_4),
          [step1_5] "=r"(step1_5), [step1_6] "=r"(step1_6),
          [step1_7] "=r"(step1_7)
        : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
          [cospi_20_64] "r"(cospi_20_64), [cospi_12_64] "r"(cospi_12_64),
          [cospi_4_64] "r"(cospi_4_64), [cospi_28_64] "r"(cospi_28_64),
          [cospi_16_64] "r" (cospi_16_64)
    );
          [cospi_16_64] "r"(cospi_16_64));

    __asm__ __volatile__(
        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
@@ -305,16 +301,14 @@ void idct16_rows_dspr2(const int16_t *input, int16_t *output,
        "extp     %[step1_11],          $ac2,           31              \n\t"
        "extp     %[step1_12],          $ac3,           31              \n\t"

        : [load5] "=&r" (load5), [load6] "=&r" (load6),
          [step1_10] "=r" (step1_10), [step1_11] "=r" (step1_11),
          [step1_12] "=r" (step1_12), [step1_13] "=r" (step1_13)
        : [const_2_power_13] "r" (const_2_power_13),
          [step2_14] "r" (step2_14), [step2_13] "r" (step2_13),
          [step2_9] "r" (step2_9), [step2_10] "r" (step2_10),
          [step2_15] "r" (step2_15), [step2_12] "r" (step2_12),
          [step2_8] "r" (step2_8), [step2_11] "r" (step2_11),
          [cospi_16_64] "r" (cospi_16_64)
    );
        : [load5] "=&r"(load5), [load6] "=&r"(load6), [step1_10] "=r"(step1_10),
          [step1_11] "=r"(step1_11), [step1_12] "=r"(step1_12),
          [step1_13] "=r"(step1_13)
        : [const_2_power_13] "r"(const_2_power_13), [step2_14] "r"(step2_14),
          [step2_13] "r"(step2_13), [step2_9] "r"(step2_9),
          [step2_10] "r"(step2_10), [step2_15] "r"(step2_15),
          [step2_12] "r"(step2_12), [step2_8] "r"(step2_8),
          [step2_11] "r"(step2_11), [cospi_16_64] "r"(cospi_16_64));

    __asm__ __volatile__(
        "add      %[load5],             %[step1_0],     %[step1_7]      \n\t"
@@ -351,14 +345,12 @@ void idct16_rows_dspr2(const int16_t *input, int16_t *output,
        "sh       %[load6],             480(%[output])                  \n\t"

        : [load5] "=&r"(load5), [load6] "=&r"(load6)
        : [output] "r" (output),
          [step1_0] "r" (step1_0), [step1_1] "r" (step1_1),
        : [output] "r"(output), [step1_0] "r"(step1_0), [step1_1] "r"(step1_1),
          [step1_6] "r"(step1_6), [step1_7] "r"(step1_7),
          [step2_8] "r"(step2_8), [step2_9] "r"(step2_9),
          [step2_10] "r"(step2_10), [step2_11] "r"(step2_11),
          [step2_12] "r"(step2_12), [step2_13] "r"(step2_13),
          [step2_14] "r" (step2_14), [step2_15] "r" (step2_15)
    );
          [step2_14] "r"(step2_14), [step2_15] "r"(step2_15));

    __asm__ __volatile__(
        "add      %[load5],             %[step1_2],     %[step1_5]      \n\t"
@@ -387,20 +379,17 @@ void idct16_rows_dspr2(const int16_t *input, int16_t *output,
        "sh       %[load6],             416(%[output])                  \n\t"

        : [load5] "=&r"(load5), [load6] "=&r"(load6)
        : [output] "r" (output),
          [step1_2] "r" (step1_2), [step1_3] "r" (step1_3),
        : [output] "r"(output), [step1_2] "r"(step1_2), [step1_3] "r"(step1_3),
          [step1_4] "r"(step1_4), [step1_5] "r"(step1_5),
          [step1_10] "r"(step1_10), [step1_11] "r"(step1_11),
          [step1_12] "r" (step1_12), [step1_13] "r" (step1_13)
    );
          [step1_12] "r"(step1_12), [step1_13] "r"(step1_13));

    input += 16;
    output += 1;
  }
}

void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
                               int dest_stride) {
void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, int dest_stride) {
  int i;
  int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7;
  int step1_8, step1_9, step1_10, step1_11;
@@ -460,17 +449,16 @@ void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
        "sub      %[step1_2],           %[step2_1],     %[step2_2]      \n\t"
        "sub      %[step1_3],           %[step2_0],     %[step2_3]      \n\t"

        : [load1] "=&r" (load1), [load2] "=&r" (load2),
          [load3] "=&r" (load3), [load4] "=&r" (load4),
          [result1] "=&r" (result1), [result2] "=&r" (result2),
          [step2_0] "=&r" (step2_0), [step2_1] "=&r" (step2_1),
          [step2_2] "=&r" (step2_2), [step2_3] "=&r" (step2_3),
          [step1_0] "=r" (step1_0), [step1_1] "=r" (step1_1),
          [step1_2] "=r" (step1_2), [step1_3] "=r" (step1_3)
        : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
          [load4] "=&r"(load4), [result1] "=&r"(result1),
          [result2] "=&r"(result2), [step2_0] "=&r"(step2_0),
          [step2_1] "=&r"(step2_1), [step2_2] "=&r"(step2_2),
          [step2_3] "=&r"(step2_3), [step1_0] "=r"(step1_0),
          [step1_1] "=r"(step1_1), [step1_2] "=r"(step1_2),
          [step1_3] "=r"(step1_3)
        : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
          [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64),
          [cospi_16_64] "r" (cospi_16_64)
    );
          [cospi_16_64] "r"(cospi_16_64));

    __asm__ __volatile__(
        "lh       %[load5],             2(%[input])                     \n\t"
@@ -522,17 +510,16 @@ void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
        "add      %[step2_8],           %[result1],     %[result2]      \n\t"
        "add      %[step2_15],          %[result4],     %[result3]      \n\t"

        : [load5] "=&r" (load5), [load6] "=&r" (load6),
          [load7] "=&r" (load7), [load8] "=&r" (load8),
          [result1] "=&r" (result1), [result2] "=&r" (result2),
          [result3] "=&r" (result3), [result4] "=&r" (result4),
          [step2_8] "=r" (step2_8), [step2_15] "=r" (step2_15),
          [step2_9] "=r" (step2_9), [step2_14] "=r" (step2_14)
        : [load5] "=&r"(load5), [load6] "=&r"(load6), [load7] "=&r"(load7),
          [load8] "=&r"(load8), [result1] "=&r"(result1),
          [result2] "=&r"(result2), [result3] "=&r"(result3),
          [result4] "=&r"(result4), [step2_8] "=r"(step2_8),
          [step2_15] "=r"(step2_15), [step2_9] "=r"(step2_9),
          [step2_14] "=r"(step2_14)
        : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
          [cospi_30_64] "r"(cospi_30_64), [cospi_2_64] "r"(cospi_2_64),
          [cospi_14_64] "r"(cospi_14_64), [cospi_18_64] "r"(cospi_18_64),
          [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)
    );
          [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64));

    __asm__ __volatile__(
        "lh       %[load1],             10(%[input])                    \n\t"
@@ -584,17 +571,16 @@ void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
        "add      %[step2_11],          %[result1],     %[result2]      \n\t"
        "add      %[step2_12],          %[result4],     %[result3]      \n\t"

        : [load1] "=&r" (load1), [load2] "=&r" (load2),
          [load3] "=&r" (load3), [load4] "=&r" (load4),
          [result1] "=&r" (result1), [result2] "=&r" (result2),
          [result3] "=&r" (result3), [result4] "=&r" (result4),
          [step2_10] "=r" (step2_10), [step2_11] "=r" (step2_11),
          [step2_12] "=r" (step2_12), [step2_13] "=r" (step2_13)
        : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
          [load4] "=&r"(load4), [result1] "=&r"(result1),
          [result2] "=&r"(result2), [result3] "=&r"(result3),
          [result4] "=&r"(result4), [step2_10] "=r"(step2_10),
          [step2_11] "=r"(step2_11), [step2_12] "=r"(step2_12),
          [step2_13] "=r"(step2_13)
        : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
          [cospi_22_64] "r"(cospi_22_64), [cospi_10_64] "r"(cospi_10_64),
          [cospi_6_64] "r"(cospi_6_64), [cospi_26_64] "r"(cospi_26_64),
          [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)
    );
          [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64));

    __asm__ __volatile__(
        "lh       %[load5],             4(%[input])                   \n\t"
@@ -650,17 +636,16 @@ void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
        "add      %[step1_4],           %[result1],     %[result2]      \n\t"
        "add      %[step1_7],           %[result4],     %[result3]      \n\t"

        : [load5] "=&r" (load5), [load6] "=&r" (load6),
          [load7] "=&r" (load7), [load8] "=&r" (load8),
          [result1] "=&r" (result1), [result2] "=&r" (result2),
          [result3] "=&r" (result3), [result4] "=&r" (result4),
          [step1_4] "=r" (step1_4), [step1_5] "=r" (step1_5),
          [step1_6] "=r" (step1_6), [step1_7] "=r" (step1_7)
        : [load5] "=&r"(load5), [load6] "=&r"(load6), [load7] "=&r"(load7),
          [load8] "=&r"(load8), [result1] "=&r"(result1),
          [result2] "=&r"(result2), [result3] "=&r"(result3),
          [result4] "=&r"(result4), [step1_4] "=r"(step1_4),
          [step1_5] "=r"(step1_5), [step1_6] "=r"(step1_6),
          [step1_7] "=r"(step1_7)
        : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
          [cospi_20_64] "r"(cospi_20_64), [cospi_12_64] "r"(cospi_12_64),
          [cospi_4_64] "r"(cospi_4_64), [cospi_28_64] "r"(cospi_28_64),
          [cospi_16_64] "r" (cospi_16_64)
    );
          [cospi_16_64] "r"(cospi_16_64));

    __asm__ __volatile__(
        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
@@ -702,16 +687,14 @@ void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
        "extp     %[step1_11],          $ac2,           31              \n\t"
        "extp     %[step1_12],          $ac3,           31              \n\t"

        : [load5] "=&r" (load5), [load6] "=&r" (load6),
          [step1_10] "=r" (step1_10), [step1_11] "=r" (step1_11),
          [step1_12] "=r" (step1_12), [step1_13] "=r" (step1_13)
        : [const_2_power_13] "r" (const_2_power_13),
          [step2_14] "r" (step2_14), [step2_13] "r" (step2_13),
          [step2_9] "r" (step2_9), [step2_10] "r" (step2_10),
          [step2_15] "r" (step2_15), [step2_12] "r" (step2_12),
          [step2_8] "r" (step2_8), [step2_11] "r" (step2_11),
          [cospi_16_64] "r" (cospi_16_64)
    );
        : [load5] "=&r"(load5), [load6] "=&r"(load6), [step1_10] "=r"(step1_10),
          [step1_11] "=r"(step1_11), [step1_12] "=r"(step1_12),
          [step1_13] "=r"(step1_13)
        : [const_2_power_13] "r"(const_2_power_13), [step2_14] "r"(step2_14),
          [step2_13] "r"(step2_13), [step2_9] "r"(step2_9),
          [step2_10] "r"(step2_10), [step2_15] "r"(step2_15),
          [step2_12] "r"(step2_12), [step2_8] "r"(step2_8),
          [step2_11] "r"(step2_11), [cospi_16_64] "r"(cospi_16_64));

    step1_8 = step2_8 + step2_11;
    step1_9 = step2_9 + step2_10;
@@ -872,16 +855,14 @@ void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,

        : [load5] "=&r"(load5), [load6] "=&r"(load6), [load7] "=&r"(load7),
          [load8] "=&r"(load8), [dest_pix] "+r"(dest_pix)
        : [cm] "r" (cm), [dest_stride] "r" (dest_stride),
          [step1_0] "r" (step1_0), [step1_1] "r" (step1_1),
          [step1_2] "r" (step1_2), [step1_3] "r" (step1_3),
          [step1_4] "r" (step1_4), [step1_5] "r" (step1_5),
          [step1_6] "r" (step1_6), [step1_7] "r" (step1_7),
          [step1_8] "r" (step1_8), [step1_9] "r" (step1_9),
        :
        [cm] "r"(cm), [dest_stride] "r"(dest_stride), [step1_0] "r"(step1_0),
        [step1_1] "r"(step1_1), [step1_2] "r"(step1_2), [step1_3] "r"(step1_3),
        [step1_4] "r"(step1_4), [step1_5] "r"(step1_5), [step1_6] "r"(step1_6),
        [step1_7] "r"(step1_7), [step1_8] "r"(step1_8), [step1_9] "r"(step1_9),
        [step1_10] "r"(step1_10), [step1_11] "r"(step1_11),
        [step1_12] "r"(step1_12), [step1_13] "r"(step1_13),
          [step1_14] "r" (step1_14), [step1_15] "r" (step1_15)
    );
        [step1_14] "r"(step1_14), [step1_15] "r"(step1_15));

    input += 16;
  }
@@ -893,11 +874,7 @@ void vpx_idct16x16_256_add_dspr2(const int16_t *input, uint8_t *dest,
  uint32_t pos = 45;

  /* bit positon for extract from acc */
  __asm__ __volatile__ (
    "wrdsp    %[pos],    1    \n\t"
    :
    : [pos] "r" (pos)
  );
  __asm__ __volatile__("wrdsp    %[pos],    1    \n\t" : : [pos] "r"(pos));

  // First transform rows
  idct16_rows_dspr2(input, out, 16);
@@ -914,11 +891,7 @@ void vpx_idct16x16_10_add_dspr2(const int16_t *input, uint8_t *dest,
  uint32_t pos = 45;

  /* bit positon for extract from acc */
  __asm__ __volatile__ (
    "wrdsp    %[pos],    1    \n\t"
    :
    : [pos] "r" (pos)
  );
  __asm__ __volatile__("wrdsp    %[pos],    1    \n\t" : : [pos] "r"(pos));

  // First transform rows. Since all non-zero dct coefficients are in
  // upper-left 4x4 area, we only need to calculate first 4 rows here.
@@ -945,8 +918,7 @@ void vpx_idct16x16_10_add_dspr2(const int16_t *input, uint8_t *dest,
        "sw     $zero,  480(%[outptr])     \n\t"

        :
        : [outptr] "r" (outptr)
    );
        : [outptr] "r"(outptr));

    outptr += 2;
  }
@@ -966,12 +938,10 @@ void vpx_idct16x16_1_add_dspr2(const int16_t *input, uint8_t *dest,
  int32_t vector_1, vector_2, vector_3, vector_4;

  /* bit positon for extract from acc */
  __asm__ __volatile__ (
    "wrdsp      %[pos],     1           \n\t"
  __asm__ __volatile__("wrdsp      %[pos],     1           \n\t"

                       :
    : [pos] "r" (pos)
  );
                       : [pos] "r"(pos));

  out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input[0]);
  __asm__ __volatile__(
@@ -979,8 +949,7 @@ void vpx_idct16x16_1_add_dspr2(const int16_t *input, uint8_t *dest,
      "sra      %[a1],      %[out],     6       \n\t"

      : [out] "+r"(out), [a1] "=r"(a1)
      :
  );
      :);

  if (a1 < 0) {
    /* use quad-byte
@@ -990,8 +959,7 @@ void vpx_idct16x16_1_add_dspr2(const int16_t *input, uint8_t *dest,
        "replv.qb   %[vector_a1],   %[absa1]    \n\t"

        : [absa1] "=r"(absa1), [vector_a1] "=r"(vector_a1)
        : [a1] "r" (a1)
    );
        : [a1] "r"(a1));

    for (r = 16; r--;) {
      __asm__ __volatile__(
@@ -1013,18 +981,15 @@ void vpx_idct16x16_1_add_dspr2(const int16_t *input, uint8_t *dest,
            [vector_1] "=&r"(vector_1), [vector_2] "=&r"(vector_2),
            [vector_3] "=&r"(vector_3), [vector_4] "=&r"(vector_4),
            [dest] "+&r"(dest)
          : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1)
      );
          : [dest_stride] "r"(dest_stride), [vector_a1] "r"(vector_a1));
    }
  } else {
    /* use quad-byte
     * input and output memory are four byte aligned */
    __asm__ __volatile__ (
        "replv.qb   %[vector_a1],   %[a1]   \n\t"
    __asm__ __volatile__("replv.qb   %[vector_a1],   %[a1]   \n\t"

                         : [vector_a1] "=r"(vector_a1)
        : [a1] "r" (a1)
    );
                         : [a1] "r"(a1));

    for (r = 16; r--;) {
      __asm__ __volatile__(
@@ -1046,8 +1011,7 @@ void vpx_idct16x16_1_add_dspr2(const int16_t *input, uint8_t *dest,
            [vector_1] "=&r"(vector_1), [vector_2] "=&r"(vector_2),
            [vector_3] "=&r"(vector_3), [vector_4] "=&r"(vector_4),
            [dest] "+&r"(dest)
          : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1)
      );
          : [dest_stride] "r"(dest_stride), [vector_a1] "r"(vector_a1));
    }
  }
}
@@ -1072,12 +1036,11 @@ void iadst16_dspr2(const int16_t *input, int16_t *output) {
  int x14 = input[1];
  int x15 = input[14];

  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8
           | x9 | x10 | x11 | x12 | x13 | x14 | x15)) {
    output[0] = output[1] = output[2] = output[3] = output[4]
              = output[5] = output[6] = output[7] = output[8]
              = output[9] = output[10] = output[11] = output[12]
              = output[13] = output[14] = output[15] = 0;
  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 | x9 | x10 | x11 | x12 |
        x13 | x14 | x15)) {
    output[0] = output[1] = output[2] = output[3] = output[4] = output[5] =
        output[6] = output[7] = output[8] = output[9] = output[10] =
            output[11] = output[12] = output[13] = output[14] = output[15] = 0;
    return;
  }

@@ -1223,5 +1186,4 @@ void iadst16_dspr2(const int16_t *input, int16_t *output) {
  output[15] = -x1;
}


#endif  // HAVE_DSPR2
 
@@ -103,14 +103,13 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
 | 
			
		||||
 | 
			
		||||
        : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
 | 
			
		||||
          [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
 | 
			
		||||
          [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),
 | 
			
		||||
          [step1_16] "=r" (step1_16), [step1_17] "=r" (step1_17),
 | 
			
		||||
          [step1_30] "=r" (step1_30), [step1_31] "=r" (step1_31)
 | 
			
		||||
          [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_16] "=r"(step1_16),
 | 
			
		||||
          [step1_17] "=r"(step1_17), [step1_30] "=r"(step1_30),
 | 
			
		||||
          [step1_31] "=r"(step1_31)
 | 
			
		||||
        : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
 | 
			
		||||
          [cospi_31_64] "r"(cospi_31_64), [cospi_1_64] "r"(cospi_1_64),
 | 
			
		||||
          [cospi_4_64] "r"(cospi_4_64), [cospi_17_64] "r"(cospi_17_64),
 | 
			
		||||
          [cospi_15_64] "r" (cospi_15_64), [cospi_28_64] "r" (cospi_28_64)
 | 
			
		||||
    );
 | 
			
		||||
          [cospi_15_64] "r"(cospi_15_64), [cospi_28_64] "r"(cospi_28_64));
 | 
			
		||||
 | 
			
		||||
    __asm__ __volatile__(
 | 
			
		||||
        "lh       %[load1],             18(%[input])                    \n\t"
 | 
			
		||||
@@ -164,14 +163,13 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
 | 
			
		||||
 | 
			
		||||
        : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
 | 
			
		||||
          [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
 | 
			
		||||
          [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),
 | 
			
		||||
          [step1_18] "=r" (step1_18), [step1_19] "=r" (step1_19),
 | 
			
		||||
          [step1_28] "=r" (step1_28), [step1_29] "=r" (step1_29)
 | 
			
		||||
          [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_18] "=r"(step1_18),
 | 
			
		||||
          [step1_19] "=r"(step1_19), [step1_28] "=r"(step1_28),
 | 
			
		||||
          [step1_29] "=r"(step1_29)
 | 
			
		||||
        : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
 | 
			
		||||
          [cospi_23_64] "r"(cospi_23_64), [cospi_9_64] "r"(cospi_9_64),
 | 
			
		||||
          [cospi_4_64] "r"(cospi_4_64), [cospi_7_64] "r"(cospi_7_64),
 | 
			
		||||
          [cospi_25_64] "r" (cospi_25_64), [cospi_28_64] "r" (cospi_28_64)
 | 
			
		||||
    );
 | 
			
		||||
          [cospi_25_64] "r"(cospi_25_64), [cospi_28_64] "r"(cospi_28_64));
 | 
			
		||||
 | 
			
		||||
    __asm__ __volatile__(
 | 
			
		||||
        "lh       %[load1],             10(%[input])                    \n\t"
 | 
			
		||||
@@ -225,14 +223,13 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
 | 
			
		||||
 | 
			
		||||
        : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
 | 
			
		||||
          [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
 | 
			
		||||
          [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),
 | 
			
		||||
          [step1_20] "=r" (step1_20), [step1_21] "=r" (step1_21),
 | 
			
		||||
          [step1_26] "=r" (step1_26), [step1_27] "=r" (step1_27)
 | 
			
		||||
          [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_20] "=r"(step1_20),
 | 
			
		||||
          [step1_21] "=r"(step1_21), [step1_26] "=r"(step1_26),
 | 
			
		||||
          [step1_27] "=r"(step1_27)
 | 
			
		||||
        : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
 | 
			
		||||
          [cospi_27_64] "r"(cospi_27_64), [cospi_5_64] "r"(cospi_5_64),
 | 
			
		||||
          [cospi_11_64] "r"(cospi_11_64), [cospi_21_64] "r"(cospi_21_64),
 | 
			
		||||
          [cospi_12_64] "r" (cospi_12_64), [cospi_20_64] "r" (cospi_20_64)
 | 
			
		||||
    );
 | 
			
		||||
          [cospi_12_64] "r"(cospi_12_64), [cospi_20_64] "r"(cospi_20_64));
 | 
			
		||||
 | 
			
		||||
    __asm__ __volatile__(
 | 
			
		||||
        "lh       %[load1],             26(%[input])                    \n\t"
 | 
			
		||||
@@ -282,14 +279,13 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
 | 
			
		||||
 | 
			
		||||
        : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
 | 
			
		||||
          [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
 | 
			
		||||
          [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),
 | 
			
		||||
          [step1_22] "=r" (step1_22), [step1_23] "=r" (step1_23),
 | 
			
		||||
          [step1_24] "=r" (step1_24), [step1_25] "=r" (step1_25)
 | 
			
		||||
          [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_22] "=r"(step1_22),
 | 
			
		||||
          [step1_23] "=r"(step1_23), [step1_24] "=r"(step1_24),
 | 
			
		||||
          [step1_25] "=r"(step1_25)
 | 
			
		||||
        : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
 | 
			
		||||
          [cospi_19_64] "r"(cospi_19_64), [cospi_13_64] "r"(cospi_13_64),
 | 
			
		||||
          [cospi_3_64] "r"(cospi_3_64), [cospi_29_64] "r"(cospi_29_64),
 | 
			
		||||
          [cospi_12_64] "r" (cospi_12_64), [cospi_20_64] "r" (cospi_20_64)
 | 
			
		||||
    );
 | 
			
		||||
          [cospi_12_64] "r"(cospi_12_64), [cospi_20_64] "r"(cospi_20_64));
 | 
			
		||||
 | 
			
		||||
    __asm__ __volatile__(
 | 
			
		||||
        "lh       %[load1],              4(%[input])                    \n\t"
 | 
			
		||||
@@ -339,14 +335,13 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
 | 
			
		||||
 | 
			
		||||
        : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
 | 
			
		||||
          [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
 | 
			
		||||
          [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),
 | 
			
		||||
          [step2_8] "=r" (step2_8), [step2_9] "=r" (step2_9),
 | 
			
		||||
          [step2_14] "=r" (step2_14), [step2_15] "=r" (step2_15)
 | 
			
		||||
          [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step2_8] "=r"(step2_8),
 | 
			
		||||
          [step2_9] "=r"(step2_9), [step2_14] "=r"(step2_14),
 | 
			
		||||
          [step2_15] "=r"(step2_15)
 | 
			
		||||
        : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
 | 
			
		||||
          [cospi_30_64] "r"(cospi_30_64), [cospi_2_64] "r"(cospi_2_64),
 | 
			
		||||
          [cospi_14_64] "r"(cospi_14_64), [cospi_18_64] "r"(cospi_18_64),
 | 
			
		||||
          [cospi_8_64] "r" (cospi_8_64), [cospi_24_64] "r" (cospi_24_64)
 | 
			
		||||
    );
 | 
			
		||||
          [cospi_8_64] "r"(cospi_8_64), [cospi_24_64] "r"(cospi_24_64));
 | 
			
		||||
 | 
			
		||||
    __asm__ __volatile__(
 | 
			
		||||
        "lh       %[load1],             20(%[input])                    \n\t"
 | 
			
		||||
@@ -396,14 +391,13 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
 | 
			
		||||
 | 
			
		||||
        : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
 | 
			
		||||
          [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
 | 
			
		||||
          [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),
 | 
			
		||||
          [step2_10] "=r" (step2_10), [step2_11] "=r" (step2_11),
 | 
			
		||||
          [step2_12] "=r" (step2_12), [step2_13] "=r" (step2_13)
 | 
			
		||||
          [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step2_10] "=r"(step2_10),
 | 
			
		||||
          [step2_11] "=r"(step2_11), [step2_12] "=r"(step2_12),
 | 
			
		||||
          [step2_13] "=r"(step2_13)
 | 
			
		||||
        : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
 | 
			
		||||
          [cospi_22_64] "r"(cospi_22_64), [cospi_10_64] "r"(cospi_10_64),
 | 
			
		||||
          [cospi_6_64] "r"(cospi_6_64), [cospi_26_64] "r"(cospi_26_64),
 | 
			
		||||
          [cospi_8_64] "r" (cospi_8_64), [cospi_24_64] "r" (cospi_24_64)
 | 
			
		||||
    );
 | 
			
		||||
          [cospi_8_64] "r"(cospi_8_64), [cospi_24_64] "r"(cospi_24_64));
 | 
			
		||||
 | 
			
		||||
    __asm__ __volatile__(
 | 
			
		||||
        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
 | 
			
		||||
@@ -440,17 +434,16 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
 | 
			
		||||
        "extp     %[step3_11],          $ac2,           31              \n\t"
 | 
			
		||||
        "extp     %[step3_12],          $ac3,           31              \n\t"
 | 
			
		||||
 | 
			
		||||
        : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),
 | 
			
		||||
          [step3_8] "=r" (step3_8), [step3_9] "=r" (step3_9),
 | 
			
		||||
          [step3_10] "=r" (step3_10), [step3_11] "=r" (step3_11),
 | 
			
		||||
          [step3_12] "=r" (step3_12), [step3_13] "=r" (step3_13),
 | 
			
		||||
          [step3_14] "=r" (step3_14), [step3_15] "=r" (step3_15)
 | 
			
		||||
        : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [step3_8] "=r"(step3_8),
 | 
			
		||||
          [step3_9] "=r"(step3_9), [step3_10] "=r"(step3_10),
 | 
			
		||||
          [step3_11] "=r"(step3_11), [step3_12] "=r"(step3_12),
 | 
			
		||||
          [step3_13] "=r"(step3_13), [step3_14] "=r"(step3_14),
 | 
			
		||||
          [step3_15] "=r"(step3_15)
 | 
			
		||||
        : [const_2_power_13] "r"(const_2_power_13), [step2_8] "r"(step2_8),
 | 
			
		||||
          [step2_9] "r"(step2_9), [step2_10] "r"(step2_10),
 | 
			
		||||
          [step2_11] "r"(step2_11), [step2_12] "r"(step2_12),
 | 
			
		||||
          [step2_13] "r"(step2_13), [step2_14] "r"(step2_14),
 | 
			
		||||
          [step2_15] "r" (step2_15), [cospi_16_64] "r" (cospi_16_64)
 | 
			
		||||
    );
 | 
			
		||||
          [step2_15] "r"(step2_15), [cospi_16_64] "r"(cospi_16_64));
 | 
			
		||||
 | 
			
		||||
    step2_18 = step1_17 - step1_18;
 | 
			
		||||
    step2_29 = step1_30 - step1_29;
 | 
			
		||||
@@ -463,10 +456,9 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
 | 
			
		||||
        "extp     %[step3_18],          $ac0,           31              \n\t"
 | 
			
		||||
 | 
			
		||||
        : [step3_18] "=r"(step3_18)
 | 
			
		||||
        : [const_2_power_13] "r" (const_2_power_13),
 | 
			
		||||
          [step2_18] "r" (step2_18), [step2_29] "r" (step2_29),
 | 
			
		||||
          [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)
 | 
			
		||||
    );
 | 
			
		||||
        : [const_2_power_13] "r"(const_2_power_13), [step2_18] "r"(step2_18),
 | 
			
		||||
          [step2_29] "r"(step2_29), [cospi_24_64] "r"(cospi_24_64),
 | 
			
		||||
          [cospi_8_64] "r"(cospi_8_64));
 | 
			
		||||
 | 
			
		||||
    temp21 = step2_18 * cospi_24_64 + step2_29 * cospi_8_64;
 | 
			
		||||
    step3_29 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
 | 
			
		||||
@@ -482,10 +474,9 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
 | 
			
		||||
        "extp     %[step3_19],          $ac0,           31              \n\t"
 | 
			
		||||
 | 
			
		||||
        : [step3_19] "=r"(step3_19)
 | 
			
		||||
        : [const_2_power_13] "r" (const_2_power_13),
 | 
			
		||||
          [step2_19] "r" (step2_19), [step2_28] "r" (step2_28),
 | 
			
		||||
          [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)
 | 
			
		||||
    );
 | 
			
		||||
        : [const_2_power_13] "r"(const_2_power_13), [step2_19] "r"(step2_19),
 | 
			
		||||
          [step2_28] "r"(step2_28), [cospi_24_64] "r"(cospi_24_64),
 | 
			
		||||
          [cospi_8_64] "r"(cospi_8_64));
 | 
			
		||||
 | 
			
		||||
    temp21 = step2_19 * cospi_24_64 + step2_28 * cospi_8_64;
 | 
			
		||||
    step3_28 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
 | 
			
		||||
@@ -506,10 +497,9 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
 | 
			
		||||
        "extp     %[step3_20],          $ac0,           31              \n\t"
 | 
			
		||||
 | 
			
		||||
        : [step3_20] "=r"(step3_20)
 | 
			
		||||
        : [const_2_power_13] "r" (const_2_power_13),
 | 
			
		||||
          [step2_20] "r" (step2_20), [step2_27] "r" (step2_27),
 | 
			
		||||
          [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)
 | 
			
		||||
    );
 | 
			
		||||
        : [const_2_power_13] "r"(const_2_power_13), [step2_20] "r"(step2_20),
 | 
			
		||||
          [step2_27] "r"(step2_27), [cospi_24_64] "r"(cospi_24_64),
 | 
			
		||||
          [cospi_8_64] "r"(cospi_8_64));
 | 
			
		||||
 | 
			
		||||
    temp21 = -step2_20 * cospi_8_64 + step2_27 * cospi_24_64;
 | 
			
		||||
    step3_27 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
 | 
			
		||||
@@ -525,10 +515,9 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
 | 
			
		||||
        "extp     %[step3_21],          $ac1,           31              \n\t"
 | 
			
		||||
 | 
			
		||||
        : [step3_21] "=r"(step3_21)
 | 
			
		||||
        : [const_2_power_13] "r" (const_2_power_13),
 | 
			
		||||
          [step2_21] "r" (step2_21), [step2_26] "r" (step2_26),
 | 
			
		||||
          [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)
 | 
			
		||||
    );
 | 
			
		||||
        : [const_2_power_13] "r"(const_2_power_13), [step2_21] "r"(step2_21),
 | 
			
		||||
          [step2_26] "r"(step2_26), [cospi_24_64] "r"(cospi_24_64),
 | 
			
		||||
          [cospi_8_64] "r"(cospi_8_64));
 | 
			
		||||
 | 
			
		||||
    temp21 = -step2_21 * cospi_8_64 + step2_26 * cospi_24_64;
 | 
			
		||||
    step3_26 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
 | 
			
		||||
@@ -588,17 +577,15 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
        "sub      %[step1_2],           %[temp1],       %[temp2]        \n\t"
        "sub      %[step1_3],           %[temp0],       %[temp3]        \n\t"

        : [load1] "=&r" (load1), [load2] "=&r" (load2),
          [load3] "=&r" (load3), [load4] "=&r" (load4),
          [result1] "=&r" (result1), [result2] "=&r" (result2),
          [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),
          [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),
          [step1_0] "=r" (step1_0), [step1_1] "=r" (step1_1),
          [step1_2] "=r" (step1_2), [step1_3] "=r" (step1_3)
        : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
          [load4] "=&r"(load4), [result1] "=&r"(result1),
          [result2] "=&r"(result2), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
          [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_0] "=r"(step1_0),
          [step1_1] "=r"(step1_1), [step1_2] "=r"(step1_2),
          [step1_3] "=r"(step1_3)
        : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
          [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64),
          [cospi_16_64] "r" (cospi_16_64)
    );
          [cospi_16_64] "r"(cospi_16_64));

    __asm__ __volatile__(
        "lh       %[load1],             8(%[input])                     \n\t"
@@ -649,17 +636,15 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
        "add      %[step1_4],           %[temp0],       %[temp1]        \n\t"
        "add      %[step1_7],           %[temp3],       %[temp2]        \n\t"

        : [load1] "=&r" (load1), [load2] "=&r" (load2),
          [load3] "=&r" (load3), [load4] "=&r" (load4),
          [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),
          [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),
          [step1_4] "=r" (step1_4), [step1_5] "=r" (step1_5),
          [step1_6] "=r" (step1_6), [step1_7] "=r" (step1_7)
        : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
          [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
          [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_4] "=r"(step1_4),
          [step1_5] "=r"(step1_5), [step1_6] "=r"(step1_6),
          [step1_7] "=r"(step1_7)
        : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
          [cospi_20_64] "r"(cospi_20_64), [cospi_12_64] "r"(cospi_12_64),
          [cospi_4_64] "r"(cospi_4_64), [cospi_28_64] "r"(cospi_28_64),
          [cospi_16_64] "r" (cospi_16_64)
    );
          [cospi_16_64] "r"(cospi_16_64));

    step2_0 = step1_0 + step1_7;
    step2_1 = step1_1 + step1_6;
@@ -697,8 +682,7 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,

        : [temp0] "=&r"(temp0), [step1_20] "=r"(step1_20)
        : [const_2_power_13] "r"(const_2_power_13), [step2_20] "r"(step2_20),
          [step2_27] "r" (step2_27), [cospi_16_64] "r" (cospi_16_64)
    );
          [step2_27] "r"(step2_27), [cospi_16_64] "r"(cospi_16_64));

    temp21 = (step2_20 + step2_27) * cospi_16_64;
    step1_27 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
@@ -712,8 +696,7 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,

        : [temp0] "=&r"(temp0), [step1_21] "=r"(step1_21)
        : [const_2_power_13] "r"(const_2_power_13), [step2_26] "r"(step2_26),
          [step2_21] "r" (step2_21), [cospi_16_64] "r" (cospi_16_64)
    );
          [step2_21] "r"(step2_21), [cospi_16_64] "r"(cospi_16_64));

    temp21 = (step2_21 + step2_26) * cospi_16_64;
    step1_26 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
@@ -727,8 +710,7 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,

        : [temp0] "=&r"(temp0), [step1_22] "=r"(step1_22)
        : [const_2_power_13] "r"(const_2_power_13), [step2_25] "r"(step2_25),
          [step2_22] "r" (step2_22), [cospi_16_64] "r" (cospi_16_64)
    );
          [step2_22] "r"(step2_22), [cospi_16_64] "r"(cospi_16_64));

    temp21 = (step2_22 + step2_25) * cospi_16_64;
    step1_25 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
@@ -742,8 +724,7 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,

        : [temp0] "=&r"(temp0), [step1_23] "=r"(step1_23)
        : [const_2_power_13] "r"(const_2_power_13), [step2_24] "r"(step2_24),
          [step2_23] "r" (step2_23), [cospi_16_64] "r" (cospi_16_64)
    );
          [step2_23] "r"(step2_23), [cospi_16_64] "r"(cospi_16_64));

    temp21 = (step2_23 + step2_24) * cospi_16_64;
    step1_24 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
@@ -785,12 +766,11 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,

        : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2),
          [temp3] "=&r"(temp3), [dest_pix] "+r"(dest_pix)
        : [cm] "r" (cm), [dest_stride] "r" (dest_stride),
          [step1_0] "r" (step1_0), [step1_1] "r" (step1_1),
          [step1_2] "r" (step1_2), [step1_3] "r" (step1_3),
          [step2_28] "r" (step2_28), [step2_29] "r" (step2_29),
          [step2_30] "r" (step2_30), [step2_31] "r" (step2_31)
    );
        : [cm] "r"(cm), [dest_stride] "r"(dest_stride), [step1_0] "r"(step1_0),
          [step1_1] "r"(step1_1), [step1_2] "r"(step1_2),
          [step1_3] "r"(step1_3), [step2_28] "r"(step2_28),
          [step2_29] "r"(step2_29), [step2_30] "r"(step2_30),
          [step2_31] "r"(step2_31));

    step3_12 = ROUND_POWER_OF_TWO((step1_3 - step2_28), 6);
    step3_13 = ROUND_POWER_OF_TWO((step1_2 - step2_29), 6);
@@ -824,8 +804,7 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
          [temp3] "=&r"(temp3), [dest_pix1] "+r"(dest_pix1)
        : [cm] "r"(cm), [dest_stride] "r"(dest_stride),
          [step3_12] "r"(step3_12), [step3_13] "r"(step3_13),
          [step3_14] "r" (step3_14), [step3_15] "r" (step3_15)
    );
          [step3_14] "r"(step3_14), [step3_15] "r"(step3_15));

    __asm__ __volatile__(
        "lbu      %[temp2],         0(%[dest_pix])                      \n\t"
@@ -864,12 +843,11 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,

        : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2),
          [temp3] "=&r"(temp3), [dest_pix] "+r"(dest_pix)
        : [cm] "r" (cm), [dest_stride] "r" (dest_stride),
          [step1_4] "r" (step1_4), [step1_5] "r" (step1_5),
          [step1_6] "r" (step1_6), [step1_7] "r" (step1_7),
          [step1_24] "r" (step1_24), [step1_25] "r" (step1_25),
          [step1_26] "r" (step1_26), [step1_27] "r" (step1_27)
    );
        : [cm] "r"(cm), [dest_stride] "r"(dest_stride), [step1_4] "r"(step1_4),
          [step1_5] "r"(step1_5), [step1_6] "r"(step1_6),
          [step1_7] "r"(step1_7), [step1_24] "r"(step1_24),
          [step1_25] "r"(step1_25), [step1_26] "r"(step1_26),
          [step1_27] "r"(step1_27));

    step3_12 = ROUND_POWER_OF_TWO((step1_7 - step1_24), 6);
    step3_13 = ROUND_POWER_OF_TWO((step1_6 - step1_25), 6);
@@ -903,8 +881,7 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
          [temp3] "=&r"(temp3), [dest_pix1] "+r"(dest_pix1)
        : [cm] "r"(cm), [dest_stride] "r"(dest_stride),
          [step3_12] "r"(step3_12), [step3_13] "r"(step3_13),
          [step3_14] "r" (step3_14), [step3_15] "r" (step3_15)
    );
          [step3_14] "r"(step3_14), [step3_15] "r"(step3_15));

    __asm__ __volatile__(
        "lbu      %[temp2],         0(%[dest_pix])                      \n\t"
@@ -943,12 +920,11 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,

        : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2),
          [temp3] "=&r"(temp3), [dest_pix] "+r"(dest_pix)
        : [cm] "r" (cm), [dest_stride] "r" (dest_stride),
          [step1_8] "r" (step1_8), [step1_9] "r" (step1_9),
          [step1_10] "r" (step1_10), [step1_11] "r" (step1_11),
          [step1_20] "r" (step1_20), [step1_21] "r" (step1_21),
          [step1_22] "r" (step1_22), [step1_23] "r" (step1_23)
    );
        : [cm] "r"(cm), [dest_stride] "r"(dest_stride), [step1_8] "r"(step1_8),
          [step1_9] "r"(step1_9), [step1_10] "r"(step1_10),
          [step1_11] "r"(step1_11), [step1_20] "r"(step1_20),
          [step1_21] "r"(step1_21), [step1_22] "r"(step1_22),
          [step1_23] "r"(step1_23));

    step3_12 = ROUND_POWER_OF_TWO((step1_11 - step1_20), 6);
    step3_13 = ROUND_POWER_OF_TWO((step1_10 - step1_21), 6);
@@ -982,8 +958,7 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
          [temp3] "=&r"(temp3), [dest_pix1] "+r"(dest_pix1)
        : [cm] "r"(cm), [dest_stride] "r"(dest_stride),
          [step3_12] "r"(step3_12), [step3_13] "r"(step3_13),
          [step3_14] "r" (step3_14), [step3_15] "r" (step3_15)
    );
          [step3_14] "r"(step3_14), [step3_15] "r"(step3_15));

    __asm__ __volatile__(
        "lbu      %[temp2],         0(%[dest_pix])                      \n\t"
@@ -1025,8 +1000,7 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
          [step1_12] "r"(step1_12), [step1_13] "r"(step1_13),
          [step1_14] "r"(step1_14), [step1_15] "r"(step1_15),
          [step2_16] "r"(step2_16), [step2_17] "r"(step2_17),
          [step2_18] "r" (step2_18), [step2_19] "r" (step2_19)
    );
          [step2_18] "r"(step2_18), [step2_19] "r"(step2_19));

    step3_12 = ROUND_POWER_OF_TWO((step1_15 - step2_16), 6);
    step3_13 = ROUND_POWER_OF_TWO((step1_14 - step2_17), 6);
@@ -1059,8 +1033,7 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
          [temp3] "=&r"(temp3), [dest_pix1] "+r"(dest_pix1)
        : [cm] "r"(cm), [dest_stride] "r"(dest_stride),
          [step3_12] "r"(step3_12), [step3_13] "r"(step3_13),
          [step3_14] "r" (step3_14), [step3_15] "r" (step3_15)
    );
          [step3_14] "r"(step3_14), [step3_15] "r"(step3_15));

    input += 32;
  }

@@ -84,8 +84,7 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output,
          "sh     $zero,  1984(%[output])     \n\t"

          :
          : [output] "r" (output)
      );
          : [output] "r"(output));

      output += 1;

@@ -146,17 +145,15 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output,
        "add      %[step1_16],          %[temp0],       %[temp1]        \n\t"
        "add      %[step1_31],          %[temp2],       %[temp3]        \n\t"

        : [load1] "=&r" (load1), [load2] "=&r" (load2),
          [load3] "=&r" (load3), [load4] "=&r" (load4),
          [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),
          [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),
          [step1_16] "=r" (step1_16), [step1_17] "=r" (step1_17),
          [step1_30] "=r" (step1_30), [step1_31] "=r" (step1_31)
        : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
          [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
          [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_16] "=r"(step1_16),
          [step1_17] "=r"(step1_17), [step1_30] "=r"(step1_30),
          [step1_31] "=r"(step1_31)
        : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
          [cospi_31_64] "r"(cospi_31_64), [cospi_1_64] "r"(cospi_1_64),
          [cospi_4_64] "r"(cospi_4_64), [cospi_17_64] "r"(cospi_17_64),
          [cospi_15_64] "r" (cospi_15_64), [cospi_28_64] "r" (cospi_28_64)
    );
          [cospi_15_64] "r"(cospi_15_64), [cospi_28_64] "r"(cospi_28_64));

    __asm__ __volatile__(
        "lh       %[load1],             18(%[input])                    \n\t"
@@ -208,17 +205,15 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output,
        "add      %[step1_19],          %[temp0],       %[temp1]        \n\t"
        "add      %[step1_28],          %[temp2],       %[temp3]        \n\t"

        : [load1] "=&r" (load1), [load2] "=&r" (load2),
          [load3] "=&r" (load3), [load4] "=&r" (load4),
          [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),
          [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),
          [step1_18] "=r" (step1_18), [step1_19] "=r" (step1_19),
          [step1_28] "=r" (step1_28), [step1_29] "=r" (step1_29)
        : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
          [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
          [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_18] "=r"(step1_18),
          [step1_19] "=r"(step1_19), [step1_28] "=r"(step1_28),
          [step1_29] "=r"(step1_29)
        : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
          [cospi_23_64] "r"(cospi_23_64), [cospi_9_64] "r"(cospi_9_64),
          [cospi_4_64] "r"(cospi_4_64), [cospi_7_64] "r"(cospi_7_64),
          [cospi_25_64] "r" (cospi_25_64), [cospi_28_64] "r" (cospi_28_64)
    );
          [cospi_25_64] "r"(cospi_25_64), [cospi_28_64] "r"(cospi_28_64));

    __asm__ __volatile__(
        "lh       %[load1],             10(%[input])                    \n\t"
@@ -270,17 +265,15 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output,
        "add      %[step1_20],          %[temp0],       %[temp1]        \n\t"
        "add      %[step1_27],          %[temp2],       %[temp3]        \n\t"

        : [load1] "=&r" (load1), [load2] "=&r" (load2),
          [load3] "=&r" (load3), [load4] "=&r" (load4),
          [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),
          [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),
          [step1_20] "=r" (step1_20), [step1_21] "=r" (step1_21),
          [step1_26] "=r" (step1_26), [step1_27] "=r" (step1_27)
        : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
          [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
          [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_20] "=r"(step1_20),
          [step1_21] "=r"(step1_21), [step1_26] "=r"(step1_26),
          [step1_27] "=r"(step1_27)
        : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
          [cospi_27_64] "r"(cospi_27_64), [cospi_5_64] "r"(cospi_5_64),
          [cospi_11_64] "r"(cospi_11_64), [cospi_21_64] "r"(cospi_21_64),
          [cospi_12_64] "r" (cospi_12_64), [cospi_20_64] "r" (cospi_20_64)
    );
          [cospi_12_64] "r"(cospi_12_64), [cospi_20_64] "r"(cospi_20_64));

    __asm__ __volatile__(
        "lh       %[load1],             26(%[input])                    \n\t"
@@ -332,17 +325,15 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output,
        "add      %[step1_23],          %[temp0],       %[temp1]        \n\t"
        "add      %[step1_24],          %[temp2],       %[temp3]        \n\t"

        : [load1] "=&r" (load1), [load2] "=&r" (load2),
          [load3] "=&r" (load3), [load4] "=&r" (load4),
          [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),
          [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),
          [step1_22] "=r" (step1_22), [step1_23] "=r" (step1_23),
          [step1_24] "=r" (step1_24), [step1_25] "=r" (step1_25)
        : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
          [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
          [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_22] "=r"(step1_22),
          [step1_23] "=r"(step1_23), [step1_24] "=r"(step1_24),
          [step1_25] "=r"(step1_25)
        : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
          [cospi_19_64] "r"(cospi_19_64), [cospi_13_64] "r"(cospi_13_64),
          [cospi_3_64] "r"(cospi_3_64), [cospi_29_64] "r"(cospi_29_64),
          [cospi_12_64] "r" (cospi_12_64), [cospi_20_64] "r" (cospi_20_64)
    );
          [cospi_12_64] "r"(cospi_12_64), [cospi_20_64] "r"(cospi_20_64));

    __asm__ __volatile__(
        "lh       %[load1],              4(%[input])                    \n\t"
@@ -394,17 +385,15 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output,
        "add      %[step2_8],           %[temp0],       %[temp1]        \n\t"
        "add      %[step2_15],          %[temp2],       %[temp3]        \n\t"

        : [load1] "=&r" (load1), [load2] "=&r" (load2),
          [load3] "=&r" (load3), [load4] "=&r" (load4),
          [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),
          [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),
          [step2_8] "=r" (step2_8), [step2_9] "=r" (step2_9),
          [step2_14] "=r" (step2_14), [step2_15] "=r" (step2_15)
        : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
          [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
          [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step2_8] "=r"(step2_8),
          [step2_9] "=r"(step2_9), [step2_14] "=r"(step2_14),
          [step2_15] "=r"(step2_15)
        : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
          [cospi_30_64] "r"(cospi_30_64), [cospi_2_64] "r"(cospi_2_64),
          [cospi_14_64] "r"(cospi_14_64), [cospi_18_64] "r"(cospi_18_64),
          [cospi_8_64] "r" (cospi_8_64), [cospi_24_64] "r" (cospi_24_64)
    );
          [cospi_8_64] "r"(cospi_8_64), [cospi_24_64] "r"(cospi_24_64));

    __asm__ __volatile__(
        "lh       %[load1],             20(%[input])                    \n\t"
@@ -456,17 +445,15 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output,
        "add      %[step2_11],          %[temp0],       %[temp1]        \n\t"
        "add      %[step2_12],          %[temp2],       %[temp3]        \n\t"

        : [load1] "=&r" (load1), [load2] "=&r" (load2),
          [load3] "=&r" (load3), [load4] "=&r" (load4),
          [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),
          [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),
          [step2_10] "=r" (step2_10), [step2_11] "=r" (step2_11),
          [step2_12] "=r" (step2_12), [step2_13] "=r" (step2_13)
        : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
          [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
          [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step2_10] "=r"(step2_10),
          [step2_11] "=r"(step2_11), [step2_12] "=r"(step2_12),
          [step2_13] "=r"(step2_13)
        : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
          [cospi_22_64] "r"(cospi_22_64), [cospi_10_64] "r"(cospi_10_64),
          [cospi_6_64] "r"(cospi_6_64), [cospi_26_64] "r"(cospi_26_64),
          [cospi_8_64] "r" (cospi_8_64), [cospi_24_64] "r" (cospi_24_64)
    );
          [cospi_8_64] "r"(cospi_8_64), [cospi_24_64] "r"(cospi_24_64));

    __asm__ __volatile__(
        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
@@ -507,18 +494,16 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output,
        "extp     %[step3_11],          $ac2,           31              \n\t"
        "extp     %[step3_12],          $ac3,           31              \n\t"

        : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),
          [step3_8] "=r" (step3_8), [step3_9] "=r" (step3_9),
          [step3_10] "=r" (step3_10), [step3_11] "=r" (step3_11),
          [step3_12] "=r" (step3_12), [step3_13] "=r" (step3_13),
          [step3_14] "=r" (step3_14), [step3_15] "=r" (step3_15)
        : [const_2_power_13] "r" (const_2_power_13),
          [step2_8] "r" (step2_8), [step2_9] "r" (step2_9),
          [step2_10] "r" (step2_10), [step2_11] "r" (step2_11),
          [step2_12] "r" (step2_12), [step2_13] "r" (step2_13),
          [step2_14] "r" (step2_14), [step2_15] "r" (step2_15),
          [cospi_16_64] "r" (cospi_16_64)
    );
        : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [step3_8] "=r"(step3_8),
          [step3_9] "=r"(step3_9), [step3_10] "=r"(step3_10),
          [step3_11] "=r"(step3_11), [step3_12] "=r"(step3_12),
          [step3_13] "=r"(step3_13), [step3_14] "=r"(step3_14),
          [step3_15] "=r"(step3_15)
        : [const_2_power_13] "r"(const_2_power_13), [step2_8] "r"(step2_8),
          [step2_9] "r"(step2_9), [step2_10] "r"(step2_10),
          [step2_11] "r"(step2_11), [step2_12] "r"(step2_12),
          [step2_13] "r"(step2_13), [step2_14] "r"(step2_14),
          [step2_15] "r"(step2_15), [cospi_16_64] "r"(cospi_16_64));

    step2_18 = step1_17 - step1_18;
    step2_29 = step1_30 - step1_29;
@@ -531,10 +516,9 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output,
        "extp     %[step3_18],          $ac0,           31              \n\t"

        : [step3_18] "=r"(step3_18)
        : [const_2_power_13] "r" (const_2_power_13),
          [step2_18] "r" (step2_18), [step2_29] "r" (step2_29),
          [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)
    );
        : [const_2_power_13] "r"(const_2_power_13), [step2_18] "r"(step2_18),
          [step2_29] "r"(step2_29), [cospi_24_64] "r"(cospi_24_64),
          [cospi_8_64] "r"(cospi_8_64));

    temp21 = step2_18 * cospi_24_64 + step2_29 * cospi_8_64;
    step3_29 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
@@ -550,10 +534,9 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output,
        "extp     %[step3_19],          $ac0,           31              \n\t"

        : [step3_19] "=r"(step3_19)
        : [const_2_power_13] "r" (const_2_power_13),
          [step2_19] "r" (step2_19), [step2_28] "r" (step2_28),
          [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)
    );
        : [const_2_power_13] "r"(const_2_power_13), [step2_19] "r"(step2_19),
          [step2_28] "r"(step2_28), [cospi_24_64] "r"(cospi_24_64),
          [cospi_8_64] "r"(cospi_8_64));

    temp21 = step2_19 * cospi_24_64 + step2_28 * cospi_8_64;
    step3_28 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
@@ -574,10 +557,9 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output,
        "extp     %[step3_20],          $ac0,           31              \n\t"

        : [step3_20] "=r"(step3_20)
        : [const_2_power_13] "r" (const_2_power_13),
          [step2_20] "r" (step2_20), [step2_27] "r" (step2_27),
          [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)
    );
        : [const_2_power_13] "r"(const_2_power_13), [step2_20] "r"(step2_20),
          [step2_27] "r"(step2_27), [cospi_24_64] "r"(cospi_24_64),
          [cospi_8_64] "r"(cospi_8_64));

    temp21 = -step2_20 * cospi_8_64 + step2_27 * cospi_24_64;
    step3_27 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
@@ -593,10 +575,9 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output,
        "extp     %[step3_21],          $ac1,           31              \n\t"

        : [step3_21] "=r"(step3_21)
        : [const_2_power_13] "r" (const_2_power_13),
          [step2_21] "r" (step2_21), [step2_26] "r" (step2_26),
          [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)
    );
        : [const_2_power_13] "r"(const_2_power_13), [step2_21] "r"(step2_21),
          [step2_26] "r"(step2_26), [cospi_24_64] "r"(cospi_24_64),
          [cospi_8_64] "r"(cospi_8_64));

    temp21 = -step2_21 * cospi_8_64 + step2_26 * cospi_24_64;
    step3_26 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
@@ -658,16 +639,15 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output,
        "sub      %[step1_2],          %[temp1],        %[temp2]        \n\t"
        "sub      %[step1_3],          %[temp0],        %[temp3]        \n\t"

        : [load1] "=&r" (load1), [load2] "=&r" (load2),
          [load3] "=&r" (load3), [load4] "=&r" (load4),
          [result1] "=&r" (result1), [result2] "=&r" (result2),
          [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),
          [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),
          [step1_0] "=r" (step1_0), [step1_1] "=r" (step1_1),
          [step1_2] "=r" (step1_2), [step1_3] "=r" (step1_3)
        : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
          [load4] "=&r"(load4), [result1] "=&r"(result1),
          [result2] "=&r"(result2), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
          [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_0] "=r"(step1_0),
          [step1_1] "=r"(step1_1), [step1_2] "=r"(step1_2),
          [step1_3] "=r"(step1_3)
        : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
          [cospi_16_64] "r" (cospi_16_64),
          [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)
          [cospi_16_64] "r"(cospi_16_64), [cospi_24_64] "r"(cospi_24_64),
          [cospi_8_64] "r"(cospi_8_64)

            );

@@ -724,17 +704,15 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output,
        "add      %[step1_4],           %[temp0],       %[temp1]        \n\t"
        "add      %[step1_7],           %[temp3],       %[temp2]        \n\t"

        : [load1] "=&r" (load1), [load2] "=&r" (load2),
          [load3] "=&r" (load3), [load4] "=&r" (load4),
          [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),
          [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),
          [step1_4] "=r" (step1_4), [step1_5] "=r" (step1_5),
          [step1_6] "=r" (step1_6), [step1_7] "=r" (step1_7)
        : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
          [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
          [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_4] "=r"(step1_4),
          [step1_5] "=r"(step1_5), [step1_6] "=r"(step1_6),
          [step1_7] "=r"(step1_7)
        : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
          [cospi_20_64] "r"(cospi_20_64), [cospi_12_64] "r"(cospi_12_64),
          [cospi_4_64] "r"(cospi_4_64), [cospi_28_64] "r"(cospi_28_64),
          [cospi_16_64] "r" (cospi_16_64)
    );
          [cospi_16_64] "r"(cospi_16_64));

    step2_0 = step1_0 + step1_7;
    step2_1 = step1_1 + step1_6;
@@ -770,10 +748,8 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output,
        "extp     %[step1_20],          $ac0,           31              \n\t"

        : [temp0] "=&r"(temp0), [step1_20] "=r"(step1_20)
        : [const_2_power_13] "r" (const_2_power_13),
          [step2_20] "r" (step2_20), [step2_27] "r" (step2_27),
          [cospi_16_64] "r" (cospi_16_64)
    );
        : [const_2_power_13] "r"(const_2_power_13), [step2_20] "r"(step2_20),
          [step2_27] "r"(step2_27), [cospi_16_64] "r"(cospi_16_64));

    temp21 = (step2_20 + step2_27) * cospi_16_64;
    step1_27 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
@@ -786,10 +762,8 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output,
        "extp     %[step1_21],          $ac0,           31              \n\t"

        : [temp0] "=&r"(temp0), [step1_21] "=r"(step1_21)
        : [const_2_power_13] "r" (const_2_power_13),
          [step2_26] "r" (step2_26), [step2_21] "r" (step2_21),
          [cospi_16_64] "r" (cospi_16_64)
    );
        : [const_2_power_13] "r"(const_2_power_13), [step2_26] "r"(step2_26),
          [step2_21] "r"(step2_21), [cospi_16_64] "r"(cospi_16_64));

    temp21 = (step2_21 + step2_26) * cospi_16_64;
    step1_26 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
@@ -802,10 +776,8 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output,
        "extp     %[step1_22],          $ac0,           31              \n\t"

        : [temp0] "=&r"(temp0), [step1_22] "=r"(step1_22)
        : [const_2_power_13] "r" (const_2_power_13),
          [step2_25] "r" (step2_25), [step2_22] "r" (step2_22),
          [cospi_16_64] "r" (cospi_16_64)
    );
        : [const_2_power_13] "r"(const_2_power_13), [step2_25] "r"(step2_25),
          [step2_22] "r"(step2_22), [cospi_16_64] "r"(cospi_16_64));

    temp21 = (step2_22 + step2_25) * cospi_16_64;
    step1_25 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
@@ -818,10 +790,8 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output,
        "extp     %[step1_23],          $ac0,           31              \n\t"

        : [temp0] "=&r"(temp0), [step1_23] "=r"(step1_23)
        : [const_2_power_13] "r" (const_2_power_13),
          [step2_24] "r" (step2_24), [step2_23] "r" (step2_23),
          [cospi_16_64] "r" (cospi_16_64)
    );
        : [const_2_power_13] "r"(const_2_power_13), [step2_24] "r"(step2_24),
          [step2_23] "r"(step2_23), [cospi_16_64] "r"(cospi_16_64));

    temp21 = (step2_23 + step2_24) * cospi_16_64;
    step1_24 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
@@ -872,11 +842,9 @@ void vpx_idct32x32_1024_add_dspr2(const int16_t *input, uint8_t *dest,
  uint32_t pos = 45;

  /* bit positon for extract from acc */
  __asm__ __volatile__ (
    "wrdsp      %[pos],     1           \n\t"
  __asm__ __volatile__("wrdsp      %[pos],     1           \n\t"
                       :
    : [pos] "r" (pos)
  );
                       : [pos] "r"(pos));

  // Rows
  idct32_rows_dspr2(input, outptr, 32);
@@ -893,11 +861,9 @@ void vpx_idct32x32_34_add_dspr2(const int16_t *input, uint8_t *dest,
  uint32_t pos = 45;

  /* bit positon for extract from acc */
  __asm__ __volatile__ (
    "wrdsp      %[pos],     1           \n\t"
  __asm__ __volatile__("wrdsp      %[pos],     1           \n\t"
                       :
    : [pos] "r" (pos)
  );
                       : [pos] "r"(pos));

  // Rows
  idct32_rows_dspr2(input, outptr, 8);
@@ -918,8 +884,7 @@ void vpx_idct32x32_34_add_dspr2(const int16_t *input, uint8_t *dest,
      "sw     $zero,     44(%[outptr])     \n\t"

      :
      : [outptr] "r" (outptr)
  );
      : [outptr] "r"(outptr));

  for (i = 0; i < 31; ++i) {
    outptr += 32;
@@ -939,8 +904,7 @@ void vpx_idct32x32_34_add_dspr2(const int16_t *input, uint8_t *dest,
        "sw     $zero,     44(%[outptr])     \n\t"

        :
        : [outptr] "r" (outptr)
    );
        : [outptr] "r"(outptr));
  }

  // Columns
@@ -957,12 +921,10 @@ void vpx_idct32x32_1_add_dspr2(const int16_t *input, uint8_t *dest,
  uint32_t pos = 45;

  /* bit positon for extract from acc */
  __asm__ __volatile__ (
    "wrdsp      %[pos],     1           \n\t"
  __asm__ __volatile__("wrdsp      %[pos],     1           \n\t"

                       :
    : [pos] "r" (pos)
  );
                       : [pos] "r"(pos));

  out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input[0]);
  __asm__ __volatile__(
@@ -970,8 +932,7 @@ void vpx_idct32x32_1_add_dspr2(const int16_t *input, uint8_t *dest,
      "sra      %[a1],     %[out],    6       \n\t"

      : [out] "+r"(out), [a1] "=r"(a1)
      :
  );
      :);

  if (a1 < 0) {
    /* use quad-byte
@@ -981,8 +942,7 @@ void vpx_idct32x32_1_add_dspr2(const int16_t *input, uint8_t *dest,
        "replv.qb   %[vector_a1], %[absa1]      \n\t"

        : [absa1] "=r"(absa1), [vector_a1] "=r"(vector_a1)
        : [a1] "r" (a1)
    );
        : [a1] "r"(a1));

    for (r = 32; r--;) {
      __asm__ __volatile__(
@@ -1018,18 +978,15 @@ void vpx_idct32x32_1_add_dspr2(const int16_t *input, uint8_t *dest,
            [vector_1] "=&r"(vector_1), [vector_2] "=&r"(vector_2),
            [vector_3] "=&r"(vector_3), [vector_4] "=&r"(vector_4),
            [dest] "+&r"(dest)
          : [stride] "r" (stride), [vector_a1] "r" (vector_a1)
      );
          : [stride] "r"(stride), [vector_a1] "r"(vector_a1));
    }
  } else {
    /* use quad-byte
     * input and output memory are four byte aligned */
    __asm__ __volatile__ (
        "replv.qb       %[vector_a1],   %[a1]     \n\t"
    __asm__ __volatile__("replv.qb       %[vector_a1],   %[a1]     \n\t"

                         : [vector_a1] "=r"(vector_a1)
        : [a1] "r" (a1)
    );
                         : [a1] "r"(a1));

    for (r = 32; r--;) {
      __asm__ __volatile__(
@@ -1065,8 +1022,7 @@ void vpx_idct32x32_1_add_dspr2(const int16_t *input, uint8_t *dest,
            [vector_1] "=&r"(vector_1), [vector_2] "=&r"(vector_2),
            [vector_3] "=&r"(vector_3), [vector_4] "=&r"(vector_4),
            [dest] "+&r"(dest)
          : [stride] "r" (stride), [vector_a1] "r" (vector_a1)
      );
          : [stride] "r"(stride), [vector_a1] "r"(vector_a1));
    }
  }
}

@@ -83,16 +83,12 @@ void vpx_idct4_rows_dspr2(const int16_t *input, int16_t *output) {
        "sub      %[Temp3],             %[step_0],      %[step_3]       \n\t"
        "sh       %[Temp3],             24(%[output])                   \n\t"

      : [Temp0] "=&r" (Temp0), [Temp1] "=&r" (Temp1),
        [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
        [step_0] "=&r" (step_0), [step_1] "=&r" (step_1),
        [step_2] "=&r" (step_2), [step_3] "=&r" (step_3),
        [output] "+r" (output)
        : [Temp0] "=&r"(Temp0), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2),
          [Temp3] "=&r"(Temp3), [step_0] "=&r"(step_0), [step_1] "=&r"(step_1),
          [step_2] "=&r"(step_2), [step_3] "=&r"(step_3), [output] "+r"(output)
        : [const_2_power_13] "r"(const_2_power_13),
          [cospi_8_64] "r"(cospi_8_64), [cospi_16_64] "r"(cospi_16_64),
        [cospi_24_64] "r" (cospi_24_64),
        [input] "r" (input)
    );
          [cospi_24_64] "r"(cospi_24_64), [input] "r"(input));

    input += 4;
    output += 1;
@@ -206,16 +202,14 @@ void vpx_idct4_columns_add_blk_dspr2(int16_t *input, uint8_t *dest,
        "lbux     %[Temp2],             %[Temp1](%[cm])                 \n\t"
        "sb       %[Temp2],             0(%[dest_pix])                  \n\t"

      : [Temp0] "=&r" (Temp0), [Temp1] "=&r" (Temp1),
        [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
        [step_0] "=&r" (step_0), [step_1] "=&r" (step_1),
        : [Temp0] "=&r"(Temp0), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2),
          [Temp3] "=&r"(Temp3), [step_0] "=&r"(step_0), [step_1] "=&r"(step_1),
          [step_2] "=&r"(step_2), [step_3] "=&r"(step_3),
          [dest_pix] "+r"(dest_pix)
        : [const_2_power_13] "r"(const_2_power_13),
          [cospi_8_64] "r"(cospi_8_64), [cospi_16_64] "r"(cospi_16_64),
        [cospi_24_64] "r" (cospi_24_64),
        [input] "r" (input), [cm] "r" (cm), [dest_stride] "r" (dest_stride)
    );
          [cospi_24_64] "r"(cospi_24_64), [input] "r"(input), [cm] "r"(cm),
          [dest_stride] "r"(dest_stride));

    input += 4;
  }
@@ -228,11 +222,9 @@ void vpx_idct4x4_16_add_dspr2(const int16_t *input, uint8_t *dest,
  uint32_t pos = 45;

  /* bit positon for extract from acc */
  __asm__ __volatile__ (
    "wrdsp      %[pos],     1           \n\t"
  __asm__ __volatile__("wrdsp      %[pos],     1           \n\t"
                       :
    : [pos] "r" (pos)
  );
                       : [pos] "r"(pos));

  // Rows
  vpx_idct4_rows_dspr2(input, outptr);
@@ -251,12 +243,10 @@ void vpx_idct4x4_1_add_dspr2(const int16_t *input, uint8_t *dest,
  int16_t input_dc = input[0];

  /* bit positon for extract from acc */
  __asm__ __volatile__ (
    "wrdsp      %[pos],     1           \n\t"
  __asm__ __volatile__("wrdsp      %[pos],     1           \n\t"

                       :
    : [pos] "r" (pos)
  );
                       : [pos] "r"(pos));

  out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input_dc);
  __asm__ __volatile__(
@@ -264,8 +254,7 @@ void vpx_idct4x4_1_add_dspr2(const int16_t *input, uint8_t *dest,
      "sra      %[a1],      %[out],    4       \n\t"

      : [out] "+r"(out), [a1] "=r"(a1)
      :
  );
      :);

  if (a1 < 0) {
    /* use quad-byte
@@ -275,8 +264,7 @@ void vpx_idct4x4_1_add_dspr2(const int16_t *input, uint8_t *dest,
        "replv.qb   %[vector_a1], %[absa1]      \n\t"

        : [absa1] "=r"(absa1), [vector_a1] "=r"(vector_a1)
        : [a1] "r" (a1)
    );
        : [a1] "r"(a1));

    for (r = 4; r--;) {
      __asm__ __volatile__(
@@ -285,19 +273,15 @@ void vpx_idct4x4_1_add_dspr2(const int16_t *input, uint8_t *dest,
          "sw             %[vector_a],    0(%[dest])                      \n\t"
          "add            %[dest],        %[dest],        %[dest_stride]  \n\t"

          : [t2] "=&r" (t2), [vector_a] "=&r" (vector_a),
            [dest] "+&r" (dest)
          : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1)
      );
          : [t2] "=&r"(t2), [vector_a] "=&r"(vector_a), [dest] "+&r"(dest)
          : [dest_stride] "r"(dest_stride), [vector_a1] "r"(vector_a1));
    }
  } else {
    /* use quad-byte
     * input and output memory are four byte aligned */
    __asm__ __volatile__ (
        "replv.qb       %[vector_a1],   %[a1]     \n\t"
    __asm__ __volatile__("replv.qb       %[vector_a1],   %[a1]     \n\t"
                         : [vector_a1] "=r"(vector_a1)
        : [a1] "r" (a1)
    );
                         : [a1] "r"(a1));

    for (r = 4; r--;) {
      __asm__ __volatile__(
@@ -306,10 +290,8 @@ void vpx_idct4x4_1_add_dspr2(const int16_t *input, uint8_t *dest,
          "sw           %[vector_a],    0(%[dest])                        \n\t"
          "add          %[dest],        %[dest],          %[dest_stride]  \n\t"

          : [t2] "=&r" (t2), [vector_a] "=&r" (vector_a),
            [dest] "+&r" (dest)
          : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1)
      );
          : [t2] "=&r"(t2), [vector_a] "=&r"(vector_a), [dest] "+&r"(dest)
          : [dest_stride] "r"(dest_stride), [vector_a1] "r"(vector_a1));
    }
  }
}

@@ -178,16 +178,14 @@ void idct8_rows_dspr2(const int16_t *input, int16_t *output, uint32_t no_rows) {
          [step1_2] "=&r"(step1_2), [step1_3] "=&r"(step1_3),
          [step1_4] "=&r"(step1_4), [step1_5] "=&r"(step1_5),
          [step1_6] "=&r"(step1_6), [step1_7] "=&r"(step1_7),
          [Temp0] "=&r" (Temp0), [Temp1] "=&r" (Temp1),
          [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
          [Temp4] "=&r" (Temp4)
          [Temp0] "=&r"(Temp0), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2),
          [Temp3] "=&r"(Temp3), [Temp4] "=&r"(Temp4)
        : [const_2_power_13] "r"(const_2_power_13),
          [cospi_16_64] "r"(cospi_16_64), [cospi_28_64] "r"(cospi_28_64),
          [cospi_4_64] "r"(cospi_4_64), [cospi_12_64] "r"(cospi_12_64),
          [cospi_20_64] "r"(cospi_20_64), [cospi_8_64] "r"(cospi_8_64),
          [cospi_24_64] "r" (cospi_24_64),
          [output] "r" (output), [input] "r" (input)
    );
          [cospi_24_64] "r"(cospi_24_64), [output] "r"(output),
          [input] "r"(input));

    input += 8;
    output += 1;
@@ -427,16 +425,14 @@ void idct8_columns_add_blk_dspr2(int16_t *input, uint8_t *dest,
          [step1_2] "=&r"(step1_2), [step1_3] "=&r"(step1_3),
          [step1_4] "=&r"(step1_4), [step1_5] "=&r"(step1_5),
          [step1_6] "=&r"(step1_6), [step1_7] "=&r"(step1_7),
          [Temp0] "=&r" (Temp0), [Temp1] "=&r" (Temp1),
          [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
          [dest_pix] "+r" (dest_pix)
          [Temp0] "=&r"(Temp0), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2),
          [Temp3] "=&r"(Temp3), [dest_pix] "+r"(dest_pix)
        : [const_2_power_13] "r"(const_2_power_13),
          [cospi_16_64] "r"(cospi_16_64), [cospi_28_64] "r"(cospi_28_64),
          [cospi_4_64] "r"(cospi_4_64), [cospi_12_64] "r"(cospi_12_64),
          [cospi_20_64] "r"(cospi_20_64), [cospi_8_64] "r"(cospi_8_64),
          [cospi_24_64] "r" (cospi_24_64),
          [input] "r" (input), [cm] "r" (cm), [dest_stride] "r" (dest_stride)
    );
          [cospi_24_64] "r"(cospi_24_64), [input] "r"(input), [cm] "r"(cm),
          [dest_stride] "r"(dest_stride));

    input += 8;
  }
@@ -449,11 +445,7 @@ void vpx_idct8x8_64_add_dspr2(const int16_t *input, uint8_t *dest,
  uint32_t pos = 45;

  /* bit positon for extract from acc */
  __asm__ __volatile__ (
    "wrdsp    %[pos],    1    \n\t"
    :
    : [pos] "r" (pos)
  );
  __asm__ __volatile__("wrdsp    %[pos],    1    \n\t" : : [pos] "r"(pos));

  // First transform rows
  idct8_rows_dspr2(input, outptr, 8);
@@ -469,11 +461,7 @@ void vpx_idct8x8_12_add_dspr2(const int16_t *input, uint8_t *dest,
  uint32_t pos = 45;

  /* bit positon for extract from acc */
  __asm__ __volatile__ (
    "wrdsp    %[pos],    1    \n\t"
    :
    : [pos] "r" (pos)
  );
  __asm__ __volatile__("wrdsp    %[pos],    1    \n\t" : : [pos] "r"(pos));

  // First transform rows
  idct8_rows_dspr2(input, outptr, 4);
@@ -499,9 +487,7 @@ void vpx_idct8x8_12_add_dspr2(const int16_t *input, uint8_t *dest,
      "sw  $zero, 116(%[outptr])  \n\t"

      :
      : [outptr] "r" (outptr)
  );

      : [outptr] "r"(outptr));

  // Then transform columns and add to dest
  idct8_columns_add_blk_dspr2(&out[0], dest, dest_stride);
@@ -516,12 +502,10 @@ void vpx_idct8x8_1_add_dspr2(const int16_t *input, uint8_t *dest,
  int32_t t1, t2, vector_a1, vector_1, vector_2;

  /* bit positon for extract from acc */
  __asm__ __volatile__ (
    "wrdsp      %[pos],     1           \n\t"
  __asm__ __volatile__("wrdsp      %[pos],     1           \n\t"

                       :
    : [pos] "r" (pos)
  );
                       : [pos] "r"(pos));

  out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input[0]);
  __asm__ __volatile__(
@@ -529,8 +513,7 @@ void vpx_idct8x8_1_add_dspr2(const int16_t *input, uint8_t *dest,
      "sra      %[a1],      %[out],     5       \n\t"

      : [out] "+r"(out), [a1] "=r"(a1)
      :
  );
      :);

  if (a1 < 0) {
    /* use quad-byte
@@ -540,8 +523,7 @@ void vpx_idct8x8_1_add_dspr2(const int16_t *input, uint8_t *dest,
        "replv.qb   %[vector_a1],   %[absa1]    \n\t"

        : [absa1] "=r"(absa1), [vector_a1] "=r"(vector_a1)
        : [a1] "r" (a1)
    );
        : [a1] "r"(a1));

    for (r = 8; r--;) {
      __asm__ __volatile__(
@@ -553,21 +535,17 @@ void vpx_idct8x8_1_add_dspr2(const int16_t *input, uint8_t *dest,
          "sw           %[vector_2],    4(%[dest])                      \n\t"
          "add          %[dest],        %[dest],        %[dest_stride]  \n\t"

          : [t1] "=&r" (t1), [t2] "=&r" (t2),
            [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2),
            [dest] "+&r" (dest)
          : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1)
      );
          : [t1] "=&r"(t1), [t2] "=&r"(t2), [vector_1] "=&r"(vector_1),
            [vector_2] "=&r"(vector_2), [dest] "+&r"(dest)
          : [dest_stride] "r"(dest_stride), [vector_a1] "r"(vector_a1));
    }
  } else {
    /* use quad-byte
     * input and output memory are four byte aligned */
    __asm__ __volatile__ (
        "replv.qb   %[vector_a1],   %[a1]   \n\t"
    __asm__ __volatile__("replv.qb   %[vector_a1],   %[a1]   \n\t"

                         : [vector_a1] "=r"(vector_a1)
        : [a1] "r" (a1)
    );
                         : [a1] "r"(a1));

    for (r = 8; r--;) {
      __asm__ __volatile__(
@@ -579,11 +557,9 @@ void vpx_idct8x8_1_add_dspr2(const int16_t *input, uint8_t *dest,
          "sw           %[vector_2],    4(%[dest])                      \n\t"
          "add          %[dest],        %[dest],        %[dest_stride]  \n\t"

          : [t1] "=&r" (t1), [t2] "=&r" (t2),
            [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2),
            [dest] "+r" (dest)
          : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1)
      );
          : [t1] "=&r"(t1), [t2] "=&r"(t2), [vector_1] "=&r"(vector_1),
            [vector_2] "=&r"(vector_2), [dest] "+r"(dest)
          : [dest_stride] "r"(dest_stride), [vector_a1] "r"(vector_a1));
    }
  }
}
@@ -602,8 +578,8 @@ void iadst8_dspr2(const int16_t *input, int16_t *output) {
  x7 = input[6];

  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {
    output[0] = output[1] = output[2] = output[3] = output[4]
              = output[5] = output[6] = output[7] = 0;
    output[0] = output[1] = output[2] = output[3] = output[4] = output[5] =
        output[6] = output[7] = 0;
    return;
  }

@@ -11,8 +11,7 @@
#include "vpx_ports/mem.h"
#include "vpx_dsp/mips/loopfilter_msa.h"

int32_t vpx_hz_lpf_t4_and_t8_16w(uint8_t *src, int32_t pitch,
                                 uint8_t *filter48,
int32_t vpx_hz_lpf_t4_and_t8_16w(uint8_t *src, int32_t pitch, uint8_t *filter48,
                                 const uint8_t *b_limit_ptr,
                                 const uint8_t *limit_ptr,
                                 const uint8_t *thresh_ptr) {
@@ -33,8 +32,8 @@ int32_t vpx_hz_lpf_t4_and_t8_16w(uint8_t *src, int32_t pitch,
  limit = (v16u8)__msa_fill_b(*limit_ptr);

  /* mask and hev */
  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
               hev, mask, flat);
  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
               mask, flat);
  VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);

@@ -43,9 +42,8 @@ int32_t vpx_hz_lpf_t4_and_t8_16w(uint8_t *src, int32_t pitch,

    return 1;
  } else {
    ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
               zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r,
               q2_r, q3_r);
    ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero,
               q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r);
    VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
                p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);

@@ -107,8 +105,8 @@ void vpx_hz_lpf_t16_16w(uint8_t *src, int32_t pitch, uint8_t *filter48) {
  } else {
    src -= 7 * pitch;

    ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2,
               zero, p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in,
    ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2, zero,
               p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in,
               p2_r_in, p1_r_in, p0_r_in);

    q0_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q0);
@@ -408,8 +406,7 @@ void vpx_hz_lpf_t16_16w(uint8_t *src, int32_t pitch, uint8_t *filter48) {
void vpx_lpf_horizontal_16_dual_msa(uint8_t *src, int32_t pitch,
                                    const uint8_t *b_limit_ptr,
                                    const uint8_t *limit_ptr,
                                    const uint8_t *thresh_ptr,
                                    int32_t count) {
                                    const uint8_t *thresh_ptr, int32_t count) {
  DECLARE_ALIGNED(32, uint8_t, filter48[16 * 8]);
  uint8_t early_exit = 0;

@@ -426,8 +423,7 @@ void vpx_lpf_horizontal_16_dual_msa(uint8_t *src, int32_t pitch,
static void mb_lpf_horizontal_edge(uint8_t *src, int32_t pitch,
                                   const uint8_t *b_limit_ptr,
                                   const uint8_t *limit_ptr,
                                   const uint8_t *thresh_ptr,
                                   int32_t count) {
                                   const uint8_t *thresh_ptr, int32_t count) {
  if (1 == count) {
    uint64_t p2_d, p1_d, p0_d, q0_d, q1_d, q2_d;
    uint64_t dword0, dword1;
@@ -449,8 +445,8 @@ static void mb_lpf_horizontal_edge(uint8_t *src, int32_t pitch,
    b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
    limit = (v16u8)__msa_fill_b(*limit_ptr);

    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
                 hev, mask, flat);
    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
                 mask, flat);
    VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
    VP9_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
                       q1_out);
@@ -472,9 +468,8 @@ static void mb_lpf_horizontal_edge(uint8_t *src, int32_t pitch,
                  p1_filter8, p0_filter8, q0_filter8, q1_filter8, q2_filter8);

      /* convert 16 bit output data into 8 bit */
      PCKEV_B4_SH(zero, p2_filter8, zero, p1_filter8, zero, p0_filter8,
                  zero, q0_filter8, p2_filter8, p1_filter8, p0_filter8,
                  q0_filter8);
      PCKEV_B4_SH(zero, p2_filter8, zero, p1_filter8, zero, p0_filter8, zero,
                  q0_filter8, p2_filter8, p1_filter8, p0_filter8, q0_filter8);
      PCKEV_B2_SH(zero, q1_filter8, zero, q2_filter8, q1_filter8, q2_filter8);

      /* store pixel values */
@@ -668,8 +663,8 @@ static void transpose_16x8_to_8x16(uint8_t *input, int32_t in_pitch,
  v16i8 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;

  LD_UB8(input, in_pitch,
         p7_org, p6_org, p5_org, p4_org, p3_org, p2_org, p1_org, p0_org);
  LD_UB8(input, in_pitch, p7_org, p6_org, p5_org, p4_org, p3_org, p2_org,
         p1_org, p0_org);
  /* 8x8 transpose */
  TRANSPOSE8x8_UB_UB(p7_org, p6_org, p5_org, p4_org, p3_org, p2_org, p1_org,
                     p0_org, p7, p6, p5, p4, p3, p2, p1, p0);
@@ -699,8 +694,8 @@ static void transpose_8x16_to_16x8(uint8_t *input, int32_t in_pitch,
  ST_UB8(p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o, output, out_pitch);
}

static void transpose_16x16(uint8_t *input, int32_t in_pitch,
                            uint8_t *output, int32_t out_pitch) {
static void transpose_16x16(uint8_t *input, int32_t in_pitch, uint8_t *output,
                            int32_t out_pitch) {
  v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
  v16u8 row8, row9, row10, row11, row12, row13, row14, row15;
  v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
@@ -709,12 +704,11 @@ static void transpose_16x16(uint8_t *input, int32_t in_pitch,

  LD_UB8(input, in_pitch, row0, row1, row2, row3, row4, row5, row6, row7);
  input += (8 * in_pitch);
  LD_UB8(input, in_pitch,
         row8, row9, row10, row11, row12, row13, row14, row15);
  LD_UB8(input, in_pitch, row8, row9, row10, row11, row12, row13, row14, row15);

  TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7,
                      row8, row9, row10, row11, row12, row13, row14, row15,
                      p7, p6, p5, p4, p3, p2, p1, p0);
  TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7, row8,
                      row9, row10, row11, row12, row13, row14, row15, p7, p6,
                      p5, p4, p3, p2, p1, p0);

  /* transpose 16x8 matrix into 8x16 */
  /* total 8 intermediate register and 32 instructions */
@@ -779,8 +773,8 @@ int32_t vpx_vt_lpf_t4_and_t8_8w(uint8_t *src, uint8_t *filter48,
  limit = (v16u8)__msa_fill_b(*limit_ptr);

  /* mask and hev */
  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
               hev, mask, flat);
  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
               mask, flat);
  /* flat4 */
  VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
  /* filter4 */
@@ -794,9 +788,8 @@ int32_t vpx_vt_lpf_t4_and_t8_8w(uint8_t *src, uint8_t *filter48,
    ST4x8_UB(vec2, vec3, (src_org - 2), pitch_org);
    return 1;
  } else {
    ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
               zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r,
               q3_r);
    ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero,
               q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r);
    VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
                p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);

@@ -864,9 +857,9 @@ int32_t vpx_vt_lpf_t16_8w(uint8_t *src, uint8_t *src_org, int32_t pitch,
  } else {
    src -= 7 * 16;

    ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2,
               zero, p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in,
               p3_r_in, p2_r_in, p1_r_in, p0_r_in);
    ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2, zero,
               p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in,
               p2_r_in, p1_r_in, p0_r_in);
    q0_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q0);

    tmp0_r = p7_r_in << 3;
@@ -1056,9 +1049,9 @@ void vpx_lpf_vertical_16_msa(uint8_t *src, int32_t pitch,

  transpose_16x8_to_8x16(src - 8, pitch, transposed_input, 16);

  early_exit = vpx_vt_lpf_t4_and_t8_8w((transposed_input + 16 * 8),
                                       &filter48[0], src, pitch, b_limit_ptr,
                                       limit_ptr, thresh_ptr);
  early_exit =
      vpx_vt_lpf_t4_and_t8_8w((transposed_input + 16 * 8), &filter48[0], src,
                              pitch, b_limit_ptr, limit_ptr, thresh_ptr);

  if (0 == early_exit) {
    early_exit = vpx_vt_lpf_t16_8w((transposed_input + 16 * 8), src, pitch,
@@ -1093,8 +1086,8 @@ int32_t vpx_vt_lpf_t4_and_t8_16w(uint8_t *src, uint8_t *filter48,
  limit = (v16u8)__msa_fill_b(*limit_ptr);

  /* mask and hev */
  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
               hev, mask, flat);
  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
               mask, flat);
  /* flat4 */
  VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
  /* filter4 */
@@ -1113,9 +1106,8 @@ int32_t vpx_vt_lpf_t4_and_t8_16w(uint8_t *src, uint8_t *filter48,

    return 1;
  } else {
    ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
               zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r,
               q3_r);
    ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero,
               q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r);
    VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
                p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
    ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l, p0_l);
@@ -1196,9 +1188,9 @@ int32_t vpx_vt_lpf_t16_16w(uint8_t *src, uint8_t *src_org, int32_t pitch,
  } else {
    src -= 7 * 16;

    ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2,
               zero, p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in,
               p3_r_in, p2_r_in, p1_r_in, p0_r_in);
    ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2, zero,
               p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in,
               p2_r_in, p1_r_in, p0_r_in);
    q0_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q0);

    tmp0_r = p7_r_in << 3;
@@ -1479,9 +1471,9 @@ void vpx_lpf_vertical_16_dual_msa(uint8_t *src, int32_t pitch,

  transpose_16x16((src - 8), pitch, &transposed_input[0], 16);

  early_exit = vpx_vt_lpf_t4_and_t8_16w((transposed_input + 16 * 8),
                                        &filter48[0], src, pitch, b_limit_ptr,
                                        limit_ptr, thresh_ptr);
  early_exit =
      vpx_vt_lpf_t4_and_t8_16w((transposed_input + 16 * 8), &filter48[0], src,
                               pitch, b_limit_ptr, limit_ptr, thresh_ptr);

  if (0 == early_exit) {
    early_exit = vpx_vt_lpf_t16_16w((transposed_input + 16 * 8), src, pitch,

@@ -25,8 +25,8 @@ void vpx_lpf_horizontal_4_msa(uint8_t *src, int32_t pitch,
  b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
  limit = (v16u8)__msa_fill_b(*limit_ptr);

  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
               hev, mask, flat);
  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
               mask, flat);
  VP9_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);

  p1_d = __msa_copy_u_d((v2i64)p1_out, 0);
@@ -61,8 +61,8 @@ void vpx_lpf_horizontal_4_dual_msa(uint8_t *src, int32_t pitch,
  limit1 = (v16u8)__msa_fill_b(*limit1_ptr);
  limit0 = (v16u8)__msa_ilvr_d((v2i64)limit1, (v2i64)limit0);

  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0,
               hev, mask, flat);
  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0, hev,
               mask, flat);
  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1);

  ST_UB4(p1, p0, q0, q1, (src - 2 * pitch), pitch);
@@ -82,10 +82,10 @@ void vpx_lpf_vertical_4_msa(uint8_t *src, int32_t pitch,
  b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
  limit = (v16u8)__msa_fill_b(*limit_ptr);

  TRANSPOSE8x8_UB_UB(p3, p2, p1, p0, q0, q1, q2, q3,
                     p3, p2, p1, p0, q0, q1, q2, q3);
  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
               hev, mask, flat);
  TRANSPOSE8x8_UB_UB(p3, p2, p1, p0, q0, q1, q2, q3, p3, p2, p1, p0, q0, q1, q2,
                     q3);
  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
               mask, flat);
  VP9_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1);
  ILVR_B2_SH(p0, p1, q1, q0, vec0, vec1);
  ILVRL_H2_SH(vec1, vec0, vec2, vec3);
@@ -111,12 +111,12 @@ void vpx_lpf_vertical_4_dual_msa(uint8_t *src, int32_t pitch,
  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;

  LD_UB8(src - 4, pitch, row0, row1, row2, row3, row4, row5, row6, row7);
  LD_UB8(src - 4 + (8 * pitch), pitch,
         row8, row9, row10, row11, row12, row13, row14, row15);
  LD_UB8(src - 4 + (8 * pitch), pitch, row8, row9, row10, row11, row12, row13,
         row14, row15);

  TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7,
                      row8, row9, row10, row11, row12, row13, row14, row15,
                      p3, p2, p1, p0, q0, q1, q2, q3);
  TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7, row8,
                      row9, row10, row11, row12, row13, row14, row15, p3, p2,
                      p1, p0, q0, q1, q2, q3);

  thresh0 = (v16u8)__msa_fill_b(*thresh0_ptr);
  thresh1 = (v16u8)__msa_fill_b(*thresh1_ptr);
@@ -130,8 +130,8 @@ void vpx_lpf_vertical_4_dual_msa(uint8_t *src, int32_t pitch,
  limit1 = (v16u8)__msa_fill_b(*limit1_ptr);
  limit0 = (v16u8)__msa_ilvr_d((v2i64)limit1, (v2i64)limit0);

  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0,
               hev, mask, flat);
  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0, hev,
               mask, flat);
  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1);
  ILVR_B2_SH(p0, p1, q1, q0, tmp0, tmp1);
  ILVRL_H2_SH(tmp1, tmp0, tmp2, tmp3);

@@ -29,8 +29,8 @@ void vpx_lpf_horizontal_8_msa(uint8_t *src, int32_t pitch,
  b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
  limit = (v16u8)__msa_fill_b(*limit_ptr);

  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
               hev, mask, flat);
  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
               mask, flat);
  VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
  VP9_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);

@@ -43,16 +43,14 @@ void vpx_lpf_horizontal_8_msa(uint8_t *src, int32_t pitch,
    q1_d = __msa_copy_u_d((v2i64)q1_out, 0);
    SD4(p1_d, p0_d, q0_d, q1_d, (src - 2 * pitch), pitch);
  } else {
    ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
               zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r,
               q2_r, q3_r);
    ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero,
               q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r);
    VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filter8,
                p1_filter8, p0_filter8, q0_filter8, q1_filter8, q2_filter8);

    /* convert 16 bit output data into 8 bit */
    PCKEV_B4_SH(zero, p2_filter8, zero, p1_filter8, zero, p0_filter8,
                zero, q0_filter8, p2_filter8, p1_filter8, p0_filter8,
                q0_filter8);
    PCKEV_B4_SH(zero, p2_filter8, zero, p1_filter8, zero, p0_filter8, zero,
                q0_filter8, p2_filter8, p1_filter8, p0_filter8, q0_filter8);
    PCKEV_B2_SH(zero, q1_filter8, zero, q2_filter8, q1_filter8, q2_filter8);

    /* store pixel values */
@@ -80,12 +78,9 @@ void vpx_lpf_horizontal_8_msa(uint8_t *src, int32_t pitch,
  }
}

void vpx_lpf_horizontal_8_dual_msa(uint8_t *src, int32_t pitch,
                                   const uint8_t *b_limit0,
                                   const uint8_t *limit0,
                                   const uint8_t *thresh0,
                                   const uint8_t *b_limit1,
                                   const uint8_t *limit1,
void vpx_lpf_horizontal_8_dual_msa(
    uint8_t *src, int32_t pitch, const uint8_t *b_limit0, const uint8_t *limit0,
    const uint8_t *thresh0, const uint8_t *b_limit1, const uint8_t *limit1,
    const uint8_t *thresh1) {
  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
  v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
@@ -112,17 +107,16 @@ void vpx_lpf_horizontal_8_dual_msa(uint8_t *src, int32_t pitch,
  limit = (v16u8)__msa_ilvr_d((v2i64)tmp, (v2i64)limit);

  /* mask and hev */
  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
               hev, mask, flat);
  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
               mask, flat);
  VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);

  if (__msa_test_bz_v(flat)) {
    ST_UB4(p1_out, p0_out, q0_out, q1_out, (src - 2 * pitch), pitch);
  } else {
    ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
               zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r,
               q2_r, q3_r);
    ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero,
               q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r);
    VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
                p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);

@@ -170,16 +164,16 @@ void vpx_lpf_vertical_8_msa(uint8_t *src, int32_t pitch,
  /* load vector elements */
  LD_UB8(src - 4, pitch, p3, p2, p1, p0, q0, q1, q2, q3);

  TRANSPOSE8x8_UB_UB(p3, p2, p1, p0, q0, q1, q2, q3,
                     p3, p2, p1, p0, q0, q1, q2, q3);
  TRANSPOSE8x8_UB_UB(p3, p2, p1, p0, q0, q1, q2, q3, p3, p2, p1, p0, q0, q1, q2,
                     q3);

  thresh = (v16u8)__msa_fill_b(*thresh_ptr);
  b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
  limit = (v16u8)__msa_fill_b(*limit_ptr);

  /* mask and hev */
  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
               hev, mask, flat);
  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
               mask, flat);
  /* flat4 */
  VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
  /* filter4 */
@@ -197,9 +191,8 @@ void vpx_lpf_vertical_8_msa(uint8_t *src, int32_t pitch,
    src += 4 * pitch;
    ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch);
  } else {
    ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
               zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r,
               q3_r);
    ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero,
               q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r);
    VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
                p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
    /* convert 16 bit output data into 8 bit */
@@ -232,11 +225,9 @@ void vpx_lpf_vertical_8_msa(uint8_t *src, int32_t pitch,
}

void vpx_lpf_vertical_8_dual_msa(uint8_t *src, int32_t pitch,
                                 const uint8_t *b_limit0,
                                 const uint8_t *limit0,
                                 const uint8_t *b_limit0, const uint8_t *limit0,
                                 const uint8_t *thresh0,
                                 const uint8_t *b_limit1,
                                 const uint8_t *limit1,
                                 const uint8_t *b_limit1, const uint8_t *limit1,
                                 const uint8_t *thresh1) {
  uint8_t *temp_src;
  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
@@ -257,9 +248,9 @@ void vpx_lpf_vertical_8_dual_msa(uint8_t *src, int32_t pitch,
  LD_UB8(temp_src, pitch, q3, q2, q1, q0, row12, row13, row14, row15);

  /* transpose 16x8 matrix into 8x16 */
  TRANSPOSE16x8_UB_UB(p0, p1, p2, p3, row4, row5, row6, row7,
                      q3, q2, q1, q0, row12, row13, row14, row15,
                      p3, p2, p1, p0, q0, q1, q2, q3);
  TRANSPOSE16x8_UB_UB(p0, p1, p2, p3, row4, row5, row6, row7, q3, q2, q1, q0,
                      row12, row13, row14, row15, p3, p2, p1, p0, q0, q1, q2,
                      q3);

  thresh = (v16u8)__msa_fill_b(*thresh0);
  vec0 = (v8i16)__msa_fill_b(*thresh1);
@@ -274,8 +265,8 @@ void vpx_lpf_vertical_8_dual_msa(uint8_t *src, int32_t pitch,
  limit = (v16u8)__msa_ilvr_d((v2i64)vec0, (v2i64)limit);

  /* mask and hev */
  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
               hev, mask, flat);
  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
               mask, flat);
  /* flat4 */
  VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
  /* filter4 */
@@ -292,9 +283,8 @@ void vpx_lpf_vertical_8_dual_msa(uint8_t *src, int32_t pitch,
    src += 8 * pitch;
    ST4x8_UB(vec4, vec5, src, pitch);
  } else {
    ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
               zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r,
               q3_r);
    ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero,
               q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r);
    VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
                p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);

@@ -19,10 +19,8 @@
#include "vpx_mem/vpx_mem.h"

#if HAVE_DSPR2
void vpx_lpf_horizontal_4_dspr2(unsigned char *s,
                                int pitch,
                                const uint8_t *blimit,
                                const uint8_t *limit,
void vpx_lpf_horizontal_4_dspr2(unsigned char *s, int pitch,
                                const uint8_t *blimit, const uint8_t *limit,
                                const uint8_t *thresh) {
  uint8_t i;
  uint32_t mask;
@@ -44,8 +42,7 @@ void vpx_lpf_horizontal_4_dspr2(unsigned char *s,

      : [thresh_vec] "=&r"(thresh_vec), [flimit_vec] "=&r"(flimit_vec),
        [limit_vec] "=r"(limit_vec)
      : [uthresh] "r" (uthresh), [uflimit] "r" (uflimit), [ulimit] "r" (ulimit)
  );
      : [uthresh] "r"(uthresh), [uflimit] "r"(uflimit), [ulimit] "r"(ulimit));

  /* prefetch data for store */
  prefetch_store(s);
@@ -69,8 +66,7 @@ void vpx_lpf_horizontal_4_dspr2(unsigned char *s,
        "lw     %[p4],  (%[s4])    \n\t"

        : [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4)
        : [s1] "r" (s1), [s2] "r" (s2), [s3] "r" (s3), [s4] "r" (s4)
    );
        : [s1] "r"(s1), [s2] "r"(s2), [s3] "r"(s3), [s4] "r"(s4));

    /* if (p1 - p4 == 0) and (p2 - p3 == 0)
       mask will be zero and filtering is not needed */
@@ -81,14 +77,11 @@ void vpx_lpf_horizontal_4_dspr2(unsigned char *s,
          "lw       %[p5],  (%[s5])    \n\t"
          "lw       %[p6],  (%[s6])    \n\t"

          : [pm1] "=&r" (pm1), [p0] "=&r" (p0), [p5] "=&r" (p5),
            [p6] "=&r" (p6)
          : [sm1] "r" (sm1), [s0] "r" (s0), [s5] "r" (s5), [s6] "r" (s6)
      );
          : [pm1] "=&r"(pm1), [p0] "=&r"(p0), [p5] "=&r"(p5), [p6] "=&r"(p6)
          : [sm1] "r"(sm1), [s0] "r"(s0), [s5] "r"(s5), [s6] "r"(s6));

      filter_hev_mask_dspr2(limit_vec, flimit_vec, p1, p2,
                            pm1, p0, p3, p4, p5, p6,
                            thresh_vec, &hev, &mask);
      filter_hev_mask_dspr2(limit_vec, flimit_vec, p1, p2, pm1, p0, p3, p4, p5,
                            p6, thresh_vec, &hev, &mask);

      /* if mask == 0 do filtering is not needed */
      if (mask) {
@@ -103,8 +96,7 @@ void vpx_lpf_horizontal_4_dspr2(unsigned char *s,

            :
            : [p1] "r"(p1), [p2] "r"(p2), [p3] "r"(p3), [p4] "r"(p4),
              [s1] "r" (s1), [s2] "r" (s2), [s3] "r" (s3), [s4] "r" (s4)
        );
              [s1] "r"(s1), [s2] "r"(s2), [s3] "r"(s3), [s4] "r"(s4));
      }
    }

@@ -112,10 +104,8 @@ void vpx_lpf_horizontal_4_dspr2(unsigned char *s,
  }
}

void vpx_lpf_vertical_4_dspr2(unsigned char *s,
                              int pitch,
                              const uint8_t *blimit,
                              const uint8_t *limit,
void vpx_lpf_vertical_4_dspr2(unsigned char *s, int pitch,
                              const uint8_t *blimit, const uint8_t *limit,
                              const uint8_t *thresh) {
  uint8_t i;
  uint32_t mask, hev;
@@ -137,8 +127,7 @@ void vpx_lpf_vertical_4_dspr2(unsigned char *s,

      : [thresh_vec] "=&r"(thresh_vec), [flimit_vec] "=&r"(flimit_vec),
        [limit_vec] "=r"(limit_vec)
      : [uthresh] "r" (uthresh), [uflimit] "r" (uflimit), [ulimit] "r" (ulimit)
  );
      : [uthresh] "r"(uthresh), [uflimit] "r"(uflimit), [ulimit] "r"(ulimit));

  /* prefetch data for store */
  prefetch_store(s + pitch);
@@ -179,12 +168,10 @@ void vpx_lpf_vertical_4_dspr2(unsigned char *s,
        "append         %[p1],      %[sec3],    16          \n\t"
        "append         %[pm1],     %[sec4],    16          \n\t"

        : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
          [prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
          [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0), [pm1] "+r" (pm1),
          [sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
        :
    );
        : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3),
          [prim4] "=&r"(prim4), [p2] "+r"(p2), [p1] "+r"(p1), [p0] "+r"(p0),
          [pm1] "+r"(pm1), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4)
        :);

    /* transpose p3, p4, p5, p6 */
    __asm__ __volatile__(
@@ -203,20 +190,17 @@ void vpx_lpf_vertical_4_dspr2(unsigned char *s,
        "append         %[p5],      %[sec3],    16          \n\t"
        "append         %[p3],      %[sec4],    16          \n\t"

        : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
          [prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
          [p6] "+r" (p6), [p5] "+r" (p5), [p4] "+r" (p4), [p3] "+r" (p3),
          [sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
        :
    );
        : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3),
          [prim4] "=&r"(prim4), [p6] "+r"(p6), [p5] "+r"(p5), [p4] "+r"(p4),
          [p3] "+r"(p3), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4)
        :);

    /* if (p1 - p4 == 0) and (p2 - p3 == 0)
     * mask will be zero and filtering is not needed
     */
    if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) {
      filter_hev_mask_dspr2(limit_vec, flimit_vec, p1, p2, pm1,
                            p0, p3, p4, p5, p6, thresh_vec,
                            &hev, &mask);
      filter_hev_mask_dspr2(limit_vec, flimit_vec, p1, p2, pm1, p0, p3, p4, p5,
                            p6, thresh_vec, &hev, &mask);

      /* if mask == 0 do filtering is not needed */
      if (mask) {
@@ -235,8 +219,7 @@ void vpx_lpf_vertical_4_dspr2(unsigned char *s,

            :
            : [p4] "r"(p4), [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1),
              [s4] "r" (s4)
        );
              [s4] "r"(s4));

        __asm__ __volatile__(
            "srl    %[p4],  %[p4],  8     \n\t"
@@ -245,8 +228,7 @@ void vpx_lpf_vertical_4_dspr2(unsigned char *s,
            "srl    %[p1],  %[p1],  8     \n\t"

            : [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), [p1] "+r"(p1)
            :
        );
            :);

        __asm__ __volatile__(
            "sb     %[p4],   1(%[s3])    \n\t"
@@ -255,8 +237,7 @@ void vpx_lpf_vertical_4_dspr2(unsigned char *s,
            "sb     %[p1],  -2(%[s3])    \n\t"

            : [p1] "+r"(p1)
            : [p4] "r" (p4), [p3] "r" (p3), [p2] "r" (p2), [s3] "r" (s3)
        );
            : [p4] "r"(p4), [p3] "r"(p3), [p2] "r"(p2), [s3] "r"(s3));

        __asm__ __volatile__(
            "srl    %[p4],  %[p4],  8     \n\t"
@@ -265,8 +246,7 @@ void vpx_lpf_vertical_4_dspr2(unsigned char *s,
            "srl    %[p1],  %[p1],  8     \n\t"

            : [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), [p1] "+r"(p1)
            :
        );
            :);

        __asm__ __volatile__(
            "sb     %[p4],   1(%[s2])    \n\t"
@@ -276,8 +256,7 @@ void vpx_lpf_vertical_4_dspr2(unsigned char *s,

            :
            : [p4] "r"(p4), [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1),
              [s2] "r" (s2)
        );
              [s2] "r"(s2));

        __asm__ __volatile__(
            "srl    %[p4],  %[p4],  8     \n\t"
@@ -286,8 +265,7 @@ void vpx_lpf_vertical_4_dspr2(unsigned char *s,
            "srl    %[p1],  %[p1],  8     \n\t"

            : [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), [p1] "+r"(p1)
            :
        );
            :);

        __asm__ __volatile__(
            "sb     %[p4],   1(%[s1])    \n\t"
@@ -297,37 +275,29 @@ void vpx_lpf_vertical_4_dspr2(unsigned char *s,

            :
            : [p4] "r"(p4), [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1),
              [s1] "r" (s1)
        );
              [s1] "r"(s1));
      }
    }
  }
}

void vpx_lpf_horizontal_4_dual_dspr2(uint8_t *s, int p /* pitch */,
                                     const uint8_t *blimit0,
                                     const uint8_t *limit0,
                                     const uint8_t *thresh0,
                                     const uint8_t *blimit1,
                                     const uint8_t *limit1,
                                     const uint8_t *thresh1) {
void vpx_lpf_horizontal_4_dual_dspr2(
    uint8_t *s, int p /* pitch */, const uint8_t *blimit0,
    const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1,
    const uint8_t *limit1, const uint8_t *thresh1) {
  vpx_lpf_horizontal_4_dspr2(s, p, blimit0, limit0, thresh0);
  vpx_lpf_horizontal_4_dspr2(s + 8, p, blimit1, limit1, thresh1);
}

void vpx_lpf_horizontal_8_dual_dspr2(uint8_t *s, int p /* pitch */,
                                     const uint8_t *blimit0,
                                     const uint8_t *limit0,
                                     const uint8_t *thresh0,
                                     const uint8_t *blimit1,
                                     const uint8_t *limit1,
                                     const uint8_t *thresh1) {
void vpx_lpf_horizontal_8_dual_dspr2(
    uint8_t *s, int p /* pitch */, const uint8_t *blimit0,
    const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1,
    const uint8_t *limit1, const uint8_t *thresh1) {
  vpx_lpf_horizontal_8_dspr2(s, p, blimit0, limit0, thresh0);
  vpx_lpf_horizontal_8_dspr2(s + 8, p, blimit1, limit1, thresh1);
}

void vpx_lpf_vertical_4_dual_dspr2(uint8_t *s, int p,
                                   const uint8_t *blimit0,
void vpx_lpf_vertical_4_dual_dspr2(uint8_t *s, int p, const uint8_t *blimit0,
                                   const uint8_t *limit0,
                                   const uint8_t *thresh0,
                                   const uint8_t *blimit1,
@@ -337,8 +307,7 @@ void vpx_lpf_vertical_4_dual_dspr2(uint8_t *s, int p,
  vpx_lpf_vertical_4_dspr2(s + 8 * p, p, blimit1, limit1, thresh1);
}

void vpx_lpf_vertical_8_dual_dspr2(uint8_t *s, int p,
                                   const uint8_t *blimit0,
void vpx_lpf_vertical_8_dual_dspr2(uint8_t *s, int p, const uint8_t *blimit0,
                                   const uint8_t *limit0,
                                   const uint8_t *thresh0,
                                   const uint8_t *blimit1,
@@ -348,8 +317,7 @@ void vpx_lpf_vertical_8_dual_dspr2(uint8_t *s, int p,
  vpx_lpf_vertical_8_dspr2(s + 8 * p, p, blimit1, limit1, thresh1);
}

void vpx_lpf_vertical_16_dual_dspr2(uint8_t *s, int p,
                                    const uint8_t *blimit,
void vpx_lpf_vertical_16_dual_dspr2(uint8_t *s, int p, const uint8_t *blimit,
                                    const uint8_t *limit,
                                    const uint8_t *thresh) {
  vpx_lpf_vertical_16_dspr2(s, p, blimit, limit, thresh);

@@ -24,9 +24,8 @@ extern "C" {

#if HAVE_DSPR2
/* inputs & outputs are quad-byte vectors */
static INLINE void filter_dspr2(uint32_t mask, uint32_t hev,
                                uint32_t *ps1, uint32_t *ps0,
                                uint32_t *qs0, uint32_t *qs1) {
static INLINE void filter_dspr2(uint32_t mask, uint32_t hev, uint32_t *ps1,
                                uint32_t *ps0, uint32_t *qs0, uint32_t *qs1) {
  int32_t vpx_filter_l, vpx_filter_r;
  int32_t Filter1_l, Filter1_r, Filter2_l, Filter2_r;
  int32_t subr_r, subr_l;
@@ -99,17 +98,14 @@ static INLINE void filter_dspr2(uint32_t mask, uint32_t hev,
      "and          %[vpx_filter_l], %[vpx_filter_l], %[mask_l]       \n\t"
      "and          %[vpx_filter_r], %[vpx_filter_r], %[mask_r]       \n\t"

      : [vpx_filter_l] "=&r" (vpx_filter_l),
        [vpx_filter_r] "=&r" (vpx_filter_r),
      : [vpx_filter_l] "=&r"(vpx_filter_l), [vpx_filter_r] "=&r"(vpx_filter_r),
        [subr_l] "=&r"(subr_l), [subr_r] "=&r"(subr_r),
        [invhev_l] "=&r"(invhev_l), [invhev_r] "=&r"(invhev_r)
      : [vps0_l] "r"(vps0_l), [vps0_r] "r"(vps0_r), [vps1_l] "r"(vps1_l),
        [vps1_r] "r"(vps1_r), [vqs0_l] "r"(vqs0_l), [vqs0_r] "r"(vqs0_r),
        [vqs1_l] "r" (vqs1_l), [vqs1_r] "r" (vqs1_r),
        [mask_l] "r" (mask_l), [mask_r] "r" (mask_r),
        [hev_l] "r" (hev_l), [hev_r] "r" (hev_r),
        [HWM] "r" (HWM)
  );
        [vqs1_l] "r"(vqs1_l), [vqs1_r] "r"(vqs1_r), [mask_l] "r"(mask_l),
        [mask_r] "r"(mask_r), [hev_l] "r"(hev_l), [hev_r] "r"(hev_r),
        [HWM] "r"(HWM));

  /* save bottom 3 bits so that we round one side +4 and the other +3 */
  __asm__ __volatile__(
@@ -139,11 +135,10 @@ static INLINE void filter_dspr2(uint32_t mask, uint32_t hev,

      : [Filter1_l] "=&r"(Filter1_l), [Filter1_r] "=&r"(Filter1_r),
        [Filter2_l] "=&r"(Filter2_l), [Filter2_r] "=&r"(Filter2_r),
        [vps0_l] "+r" (vps0_l), [vps0_r] "+r" (vps0_r),
        [vqs0_l] "+r" (vqs0_l), [vqs0_r] "+r" (vqs0_r)
        [vps0_l] "+r"(vps0_l), [vps0_r] "+r"(vps0_r), [vqs0_l] "+r"(vqs0_l),
        [vqs0_r] "+r"(vqs0_r)
      : [t1] "r"(t1), [t2] "r"(t2), [HWM] "r"(HWM),
        [vpx_filter_l] "r" (vpx_filter_l), [vpx_filter_r] "r" (vpx_filter_r)
  );
        [vpx_filter_l] "r"(vpx_filter_l), [vpx_filter_r] "r"(vpx_filter_r));

  __asm__ __volatile__(
      /* (vpx_filter += 1) >>= 1 */
@@ -163,10 +158,9 @@ static INLINE void filter_dspr2(uint32_t mask, uint32_t hev,
      "subq_s.ph    %[vqs1_r],       %[vqs1_r],       %[Filter1_r]    \n\t"

      : [Filter1_l] "+r"(Filter1_l), [Filter1_r] "+r"(Filter1_r),
        [vps1_l] "+r" (vps1_l), [vps1_r] "+r" (vps1_r),
        [vqs1_l] "+r" (vqs1_l), [vqs1_r] "+r" (vqs1_r)
      : [t3] "r" (t3), [invhev_l] "r" (invhev_l), [invhev_r] "r" (invhev_r)
  );
        [vps1_l] "+r"(vps1_l), [vps1_r] "+r"(vps1_r), [vqs1_l] "+r"(vqs1_l),
        [vqs1_r] "+r"(vqs1_r)
      : [t3] "r"(t3), [invhev_l] "r"(invhev_l), [invhev_r] "r"(invhev_r));

  /* Create quad-bytes from halfword pairs */
  vqs0_l = vqs0_l & HWM;
@@ -180,10 +174,9 @@ static INLINE void filter_dspr2(uint32_t mask, uint32_t hev,
      "shrl.ph      %[vqs1_r],       %[vqs1_r],       8   \n\t"
      "shrl.ph      %[vps1_r],       %[vps1_r],       8   \n\t"

      : [vps1_r] "+r" (vps1_r), [vqs1_r] "+r" (vqs1_r),
        [vps0_r] "+r" (vps0_r), [vqs0_r] "+r" (vqs0_r)
      :
  );
      : [vps1_r] "+r"(vps1_r), [vqs1_r] "+r"(vqs1_r), [vps0_r] "+r"(vps0_r),
        [vqs0_r] "+r"(vqs0_r)
      :);

  vqs0 = vqs0_l | vqs0_r;
  vqs1 = vqs1_l | vqs1_r;
@@ -196,9 +189,8 @@ static INLINE void filter_dspr2(uint32_t mask, uint32_t hev,
  *qs1 = vqs1 ^ N128;
}

static INLINE void filter1_dspr2(uint32_t mask, uint32_t hev,
                                 uint32_t ps1, uint32_t ps0,
                                 uint32_t qs0, uint32_t qs1,
static INLINE void filter1_dspr2(uint32_t mask, uint32_t hev, uint32_t ps1,
                                 uint32_t ps0, uint32_t qs0, uint32_t qs1,
                                 uint32_t *p1_f0, uint32_t *p0_f0,
                                 uint32_t *q0_f0, uint32_t *q1_f0) {
  int32_t vpx_filter_l, vpx_filter_r;
@@ -273,16 +265,14 @@ static INLINE void filter1_dspr2(uint32_t mask, uint32_t hev,
      "and          %[vpx_filter_l], %[vpx_filter_l], %[mask_l]       \n\t"
      "and          %[vpx_filter_r], %[vpx_filter_r], %[mask_r]       \n\t"

      : [vpx_filter_l] "=&r" (vpx_filter_l),
        [vpx_filter_r] "=&r" (vpx_filter_r),
      : [vpx_filter_l] "=&r"(vpx_filter_l), [vpx_filter_r] "=&r"(vpx_filter_r),
        [subr_l] "=&r"(subr_l), [subr_r] "=&r"(subr_r),
        [invhev_l] "=&r"(invhev_l), [invhev_r] "=&r"(invhev_r)
      : [vps0_l] "r"(vps0_l), [vps0_r] "r"(vps0_r), [vps1_l] "r"(vps1_l),
        [vps1_r] "r"(vps1_r), [vqs0_l] "r"(vqs0_l), [vqs0_r] "r"(vqs0_r),
        [vqs1_l] "r" (vqs1_l), [vqs1_r] "r" (vqs1_r),
        [mask_l] "r" (mask_l), [mask_r] "r" (mask_r),
        [hev_l] "r" (hev_l), [hev_r] "r" (hev_r), [HWM] "r" (HWM)
  );
        [vqs1_l] "r"(vqs1_l), [vqs1_r] "r"(vqs1_r), [mask_l] "r"(mask_l),
        [mask_r] "r"(mask_r), [hev_l] "r"(hev_l), [hev_r] "r"(hev_r),
        [HWM] "r"(HWM));

  /* save bottom 3 bits so that we round one side +4 and the other +3 */
  __asm__ __volatile__(
@@ -312,11 +302,10 @@ static INLINE void filter1_dspr2(uint32_t mask, uint32_t hev,

      : [Filter1_l] "=&r"(Filter1_l), [Filter1_r] "=&r"(Filter1_r),
        [Filter2_l] "=&r"(Filter2_l), [Filter2_r] "=&r"(Filter2_r),
        [vps0_l] "+r" (vps0_l), [vps0_r] "+r" (vps0_r),
        [vqs0_l] "+r" (vqs0_l), [vqs0_r] "+r" (vqs0_r)
        [vps0_l] "+r"(vps0_l), [vps0_r] "+r"(vps0_r), [vqs0_l] "+r"(vqs0_l),
        [vqs0_r] "+r"(vqs0_r)
      : [t1] "r"(t1), [t2] "r"(t2), [HWM] "r"(HWM),
        [vpx_filter_l] "r" (vpx_filter_l), [vpx_filter_r] "r" (vpx_filter_r)
  );
        [vpx_filter_l] "r"(vpx_filter_l), [vpx_filter_r] "r"(vpx_filter_r));

  __asm__ __volatile__(
      /* (vpx_filter += 1) >>= 1 */
@@ -336,10 +325,9 @@ static INLINE void filter1_dspr2(uint32_t mask, uint32_t hev,
      "subq_s.ph    %[vqs1_r],       %[vqs1_r],       %[Filter1_r]    \n\t"

      : [Filter1_l] "+r"(Filter1_l), [Filter1_r] "+r"(Filter1_r),
        [vps1_l] "+r" (vps1_l), [vps1_r] "+r" (vps1_r),
        [vqs1_l] "+r" (vqs1_l), [vqs1_r] "+r" (vqs1_r)
      : [t3] "r" (t3), [invhev_l] "r" (invhev_l), [invhev_r] "r" (invhev_r)
  );
        [vps1_l] "+r"(vps1_l), [vps1_r] "+r"(vps1_r), [vqs1_l] "+r"(vqs1_l),
        [vqs1_r] "+r"(vqs1_r)
      : [t3] "r"(t3), [invhev_l] "r"(invhev_l), [invhev_r] "r"(invhev_r));

  /* Create quad-bytes from halfword pairs */
  vqs0_l = vqs0_l & HWM;
@@ -353,10 +341,9 @@ static INLINE void filter1_dspr2(uint32_t mask, uint32_t hev,
      "shrl.ph      %[vqs1_r],       %[vqs1_r],       8   \n\t"
      "shrl.ph      %[vps1_r],       %[vps1_r],       8   \n\t"

      : [vps1_r] "+r" (vps1_r), [vqs1_r] "+r" (vqs1_r),
        [vps0_r] "+r" (vps0_r), [vqs0_r] "+r" (vqs0_r)
      :
  );
      : [vps1_r] "+r"(vps1_r), [vqs1_r] "+r"(vqs1_r), [vps0_r] "+r"(vps0_r),
        [vqs0_r] "+r"(vqs0_r)
      :);

  vqs0 = vqs0_l | vqs0_r;
  vqs1 = vqs1_l | vqs1_r;
@@ -369,9 +356,8 @@ static INLINE void filter1_dspr2(uint32_t mask, uint32_t hev,
  *q1_f0 = vqs1 ^ N128;
}

static INLINE void mbfilter_dspr2(uint32_t *op3, uint32_t *op2,
                                  uint32_t *op1, uint32_t *op0,
                                  uint32_t *oq0, uint32_t *oq1,
static INLINE void mbfilter_dspr2(uint32_t *op3, uint32_t *op2, uint32_t *op1,
                                  uint32_t *op0, uint32_t *oq0, uint32_t *oq1,
                                  uint32_t *oq2, uint32_t *oq3) {
  /* use a 7 tap filter [1, 1, 1, 2, 1, 1, 1] for flat line */
  const uint32_t p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0;
@@ -428,15 +414,12 @@ static INLINE void mbfilter_dspr2(uint32_t *op3, uint32_t *op2,
      "shrl.ph    %[res_op0],        %[res_op0],        3                \n\t"
      "shrl.ph    %[res_oq2],        %[res_oq2],        3                \n\t"

      : [add_p210_q012] "=&r" (add_p210_q012),
        [tmp] "=&r" (tmp), [res_op2] "=&r" (res_op2),
        [res_op1] "=&r" (res_op1), [res_op0] "=&r" (res_op0),
        [res_oq0] "=&r" (res_oq0), [res_oq1] "=&r" (res_oq1),
        [res_oq2] "=&r" (res_oq2)
      : [p0] "r" (p0), [q0] "r" (q0), [p1] "r" (p1), [q1] "r" (q1),
        [p2] "r" (p2), [q2] "r" (q2), [p3] "r" (p3), [q3] "r" (q3),
        [u32Four] "r" (u32Four)
  );
      : [add_p210_q012] "=&r"(add_p210_q012), [tmp] "=&r"(tmp),
        [res_op2] "=&r"(res_op2), [res_op1] "=&r"(res_op1),
        [res_op0] "=&r"(res_op0), [res_oq0] "=&r"(res_oq0),
        [res_oq1] "=&r"(res_oq1), [res_oq2] "=&r"(res_oq2)
      : [p0] "r"(p0), [q0] "r"(q0), [p1] "r"(p1), [q1] "r"(q1), [p2] "r"(p2),
        [q2] "r"(q2), [p3] "r"(p3), [q3] "r"(q3), [u32Four] "r"(u32Four));

  *op2 = res_op2;
  *op1 = res_op1;
@@ -446,11 +429,9 @@ static INLINE void mbfilter_dspr2(uint32_t *op3, uint32_t *op2,
  *oq2 = res_oq2;
}

static INLINE void mbfilter1_dspr2(uint32_t p3, uint32_t p2,
                                   uint32_t p1, uint32_t p0,
                                   uint32_t q0, uint32_t q1,
                                   uint32_t q2, uint32_t q3,
                                   uint32_t *op2_f1,
static INLINE void mbfilter1_dspr2(uint32_t p3, uint32_t p2, uint32_t p1,
                                   uint32_t p0, uint32_t q0, uint32_t q1,
                                   uint32_t q2, uint32_t q3, uint32_t *op2_f1,
                                   uint32_t *op1_f1, uint32_t *op0_f1,
                                   uint32_t *oq0_f1, uint32_t *oq1_f1,
                                   uint32_t *oq2_f1) {
@@ -511,10 +492,8 @@ static INLINE void mbfilter1_dspr2(uint32_t p3, uint32_t p2,
        [res_op2] "=&r"(res_op2), [res_op1] "=&r"(res_op1),
        [res_op0] "=&r"(res_op0), [res_oq0] "=&r"(res_oq0),
        [res_oq1] "=&r"(res_oq1), [res_oq2] "=&r"(res_oq2)
      : [p0] "r" (p0), [q0] "r" (q0), [p1] "r" (p1), [q1] "r" (q1),
        [p2] "r" (p2), [q2] "r" (q2), [p3] "r" (p3), [q3] "r" (q3),
        [u32Four] "r" (u32Four)
  );
      : [p0] "r"(p0), [q0] "r"(q0), [p1] "r"(p1), [q1] "r"(q1), [p2] "r"(p2),
        [q2] "r"(q2), [p3] "r"(p3), [q3] "r"(q3), [u32Four] "r"(u32Four));

  *op2_f1 = res_op2;
  *op1_f1 = res_op1;
@@ -524,14 +503,11 @@ static INLINE void mbfilter1_dspr2(uint32_t p3, uint32_t p2,
  *oq2_f1 = res_oq2;
}

static INLINE void wide_mbfilter_dspr2(uint32_t *op7, uint32_t *op6,
 | 
			
		||||
                                       uint32_t *op5, uint32_t *op4,
 | 
			
		||||
                                       uint32_t *op3, uint32_t *op2,
 | 
			
		||||
                                       uint32_t *op1, uint32_t *op0,
 | 
			
		||||
                                       uint32_t *oq0, uint32_t *oq1,
 | 
			
		||||
                                       uint32_t *oq2, uint32_t *oq3,
 | 
			
		||||
                                       uint32_t *oq4, uint32_t *oq5,
 | 
			
		||||
                                       uint32_t *oq6, uint32_t *oq7) {
 | 
			
		||||
static INLINE void wide_mbfilter_dspr2(
 | 
			
		||||
    uint32_t *op7, uint32_t *op6, uint32_t *op5, uint32_t *op4, uint32_t *op3,
 | 
			
		||||
    uint32_t *op2, uint32_t *op1, uint32_t *op0, uint32_t *oq0, uint32_t *oq1,
 | 
			
		||||
    uint32_t *oq2, uint32_t *oq3, uint32_t *oq4, uint32_t *oq5, uint32_t *oq6,
 | 
			
		||||
    uint32_t *oq7) {
 | 
			
		||||
  const uint32_t p7 = *op7, p6 = *op6, p5 = *op5, p4 = *op4;
 | 
			
		||||
  const uint32_t p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0;
 | 
			
		||||
  const uint32_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3;
 | 
			
		||||
@@ -561,12 +537,10 @@ static INLINE void wide_mbfilter_dspr2(uint32_t *op7, uint32_t *op6,
 | 
			
		||||
      "addu.ph      %[add_p6toq6],     %[add_p6toq6],      %[u32Eight]   \n\t"
 | 
			
		||||
 | 
			
		||||
      : [add_p6toq6] "=&r"(add_p6toq6)
 | 
			
		||||
      : [p6] "r" (p6), [p5] "r" (p5), [p4] "r" (p4),
 | 
			
		||||
        [p3] "r" (p3), [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0),
 | 
			
		||||
        [q0] "r" (q0), [q1] "r" (q1), [q2] "r" (q2), [q3] "r" (q3),
 | 
			
		||||
        [q4] "r" (q4), [q5] "r" (q5), [q6] "r" (q6),
 | 
			
		||||
        [u32Eight] "r" (u32Eight)
 | 
			
		||||
  );
 | 
			
		||||
      : [p6] "r"(p6), [p5] "r"(p5), [p4] "r"(p4), [p3] "r"(p3), [p2] "r"(p2),
 | 
			
		||||
        [p1] "r"(p1), [p0] "r"(p0), [q0] "r"(q0), [q1] "r"(q1), [q2] "r"(q2),
 | 
			
		||||
        [q3] "r"(q3), [q4] "r"(q4), [q5] "r"(q5), [q6] "r"(q6),
 | 
			
		||||
        [u32Eight] "r"(u32Eight));
 | 
			
		||||
 | 
			
		||||
  __asm__ __volatile__(
 | 
			
		||||
      /* *op6 = ROUND_POWER_OF_TWO(p7 * 7 + p6 * 2 + p5 + p4 +
 | 
			
		||||
@@ -647,12 +621,10 @@ static INLINE void wide_mbfilter_dspr2(uint32_t *op7, uint32_t *op6,
 | 
			
		||||
        [res_op4] "=&r"(res_op4), [res_op3] "=&r"(res_op3),
 | 
			
		||||
        [res_op2] "=&r"(res_op2), [res_op1] "=&r"(res_op1),
 | 
			
		||||
        [res_op0] "=&r"(res_op0), [tmp] "=&r"(tmp)
 | 
			
		||||
      : [p7] "r" (p7), [p6] "r" (p6), [p5] "r" (p5), [p4] "r" (p4),
 | 
			
		||||
        [p3] "r" (p3), [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0),
 | 
			
		||||
        [q2] "r" (q2), [q1] "r" (q1),
 | 
			
		||||
      : [p7] "r"(p7), [p6] "r"(p6), [p5] "r"(p5), [p4] "r"(p4), [p3] "r"(p3),
 | 
			
		||||
        [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0), [q2] "r"(q2), [q1] "r"(q1),
 | 
			
		||||
        [q3] "r"(q3), [q4] "r"(q4), [q5] "r"(q5), [q6] "r"(q6),
 | 
			
		||||
        [add_p6toq6] "r" (add_p6toq6)
 | 
			
		||||
  );
 | 
			
		||||
        [add_p6toq6] "r"(add_p6toq6));
 | 
			
		||||
 | 
			
		||||
  *op6 = res_op6;
 | 
			
		||||
  *op5 = res_op5;
 | 
			
		||||
@@ -741,12 +713,10 @@ static INLINE void wide_mbfilter_dspr2(uint32_t *op7, uint32_t *op6,
 | 
			
		||||
        [res_oq4] "=&r"(res_oq4), [res_oq3] "=&r"(res_oq3),
 | 
			
		||||
        [res_oq2] "=&r"(res_oq2), [res_oq1] "=&r"(res_oq1),
 | 
			
		||||
        [res_oq0] "=&r"(res_oq0), [tmp] "=&r"(tmp)
 | 
			
		||||
      : [q7] "r" (q7), [q6] "r" (q6), [q5] "r" (q5), [q4] "r" (q4),
 | 
			
		||||
        [q3] "r" (q3), [q2] "r" (q2), [q1] "r" (q1), [q0] "r" (q0),
 | 
			
		||||
        [p1] "r" (p1), [p2] "r" (p2),
 | 
			
		||||
      : [q7] "r"(q7), [q6] "r"(q6), [q5] "r"(q5), [q4] "r"(q4), [q3] "r"(q3),
 | 
			
		||||
        [q2] "r"(q2), [q1] "r"(q1), [q0] "r"(q0), [p1] "r"(p1), [p2] "r"(p2),
 | 
			
		||||
        [p3] "r"(p3), [p4] "r"(p4), [p5] "r"(p5), [p6] "r"(p6),
 | 
			
		||||
        [add_p6toq6] "r" (add_p6toq6)
 | 
			
		||||
  );
 | 
			
		||||
        [add_p6toq6] "r"(add_p6toq6));
 | 
			
		||||
 | 
			
		||||
  *oq0 = res_oq0;
 | 
			
		||||
  *oq1 = res_oq1;
 | 
			
		||||
 
 | 
			
		||||
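Note: the dominant change in the inline-asm hunks above is clang-format's treatment of GNU extended-asm operand lists. The space between a constraint string and its parenthesized operand is dropped ([p0] "r" (p0) becomes [p0] "r"(p0)), operands are bin-packed up to the 80-column limit, and the closing ");" is folded onto the last operand line instead of standing alone. A minimal, target-independent sketch of the resulting layout (the empty asm template and the name asm_style_demo are illustrative, not part of vpx_dsp; the real code uses MIPS DSPR2 instructions):

#include <stdint.h>

static uint32_t asm_style_demo(uint32_t a, uint32_t b) {
  uint32_t r = a + b;
  /* Operand list in the post-clang-format style: no space before "(",
   * operands packed per line, and ");" on the final operand line. */
  __asm__ __volatile__("" : [r] "+r"(r) : [a] "r"(a), [b] "r"(b));
  return r;
}
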
@@ -22,7 +22,8 @@ extern "C" {
#endif

#if HAVE_DSPR2
#define STORE_F0() {                                                    \
#define STORE_F0()                                                       \
  {                                                                      \
    __asm__ __volatile__(                                                \
        "sb     %[q1_f0],    1(%[s4])           \n\t"                    \
        "sb     %[q0_f0],    0(%[s4])           \n\t"                    \
@@ -30,10 +31,8 @@ extern "C" {
        "sb     %[p1_f0],   -2(%[s4])           \n\t"                    \
                                                                         \
        :                                                                \
        : [q1_f0] "r" (q1_f0), [q0_f0] "r" (q0_f0),                     \
          [p0_f0] "r" (p0_f0), [p1_f0] "r" (p1_f0),                     \
          [s4] "r" (s4)                                                 \
    );                                                                  \
        : [q1_f0] "r"(q1_f0), [q0_f0] "r"(q0_f0), [p0_f0] "r"(p0_f0),    \
          [p1_f0] "r"(p1_f0), [s4] "r"(s4));                             \
                                                                         \
    __asm__ __volatile__(                                                \
        "srl    %[q1_f0],   %[q1_f0],   8       \n\t"                    \
@@ -41,10 +40,9 @@ extern "C" {
        "srl    %[p0_f0],   %[p0_f0],   8       \n\t"                    \
        "srl    %[p1_f0],   %[p1_f0],   8       \n\t"                    \
                                                                         \
        : [q1_f0] "+r" (q1_f0), [q0_f0] "+r" (q0_f0),                   \
          [p0_f0] "+r" (p0_f0), [p1_f0] "+r" (p1_f0)                    \
        :                                                               \
    );                                                                  \
        : [q1_f0] "+r"(q1_f0), [q0_f0] "+r"(q0_f0), [p0_f0] "+r"(p0_f0), \
          [p1_f0] "+r"(p1_f0)                                            \
        :);                                                              \
                                                                         \
    __asm__ __volatile__(                                                \
        "sb     %[q1_f0],    1(%[s3])           \n\t"                    \
@@ -53,9 +51,8 @@ extern "C" {
        "sb     %[p1_f0],   -2(%[s3])           \n\t"                    \
                                                                         \
        : [p1_f0] "+r"(p1_f0)                                            \
        : [q1_f0] "r" (q1_f0), [q0_f0] "r" (q0_f0),                     \
          [s3] "r" (s3), [p0_f0] "r" (p0_f0)                            \
    );                                                                  \
        : [q1_f0] "r"(q1_f0), [q0_f0] "r"(q0_f0), [s3] "r"(s3),          \
          [p0_f0] "r"(p0_f0));                                           \
                                                                         \
    __asm__ __volatile__(                                                \
        "srl    %[q1_f0],   %[q1_f0],   8       \n\t"                    \
@@ -63,10 +60,9 @@ extern "C" {
        "srl    %[p0_f0],   %[p0_f0],   8       \n\t"                    \
        "srl    %[p1_f0],   %[p1_f0],   8       \n\t"                    \
                                                                         \
        : [q1_f0] "+r" (q1_f0), [q0_f0] "+r" (q0_f0),                   \
          [p0_f0] "+r" (p0_f0), [p1_f0] "+r" (p1_f0)                    \
        :                                                               \
    );                                                                  \
        : [q1_f0] "+r"(q1_f0), [q0_f0] "+r"(q0_f0), [p0_f0] "+r"(p0_f0), \
          [p1_f0] "+r"(p1_f0)                                            \
        :);                                                              \
                                                                         \
    __asm__ __volatile__(                                                \
        "sb     %[q1_f0],    1(%[s2])           \n\t"                    \
@@ -75,10 +71,8 @@ extern "C" {
        "sb     %[p1_f0],   -2(%[s2])           \n\t"                    \
                                                                         \
        :                                                                \
        : [q1_f0] "r" (q1_f0), [q0_f0] "r" (q0_f0),                     \
          [p0_f0] "r" (p0_f0), [p1_f0] "r" (p1_f0),                     \
          [s2] "r" (s2)                                                 \
    );                                                                  \
        : [q1_f0] "r"(q1_f0), [q0_f0] "r"(q0_f0), [p0_f0] "r"(p0_f0),    \
          [p1_f0] "r"(p1_f0), [s2] "r"(s2));                             \
                                                                         \
    __asm__ __volatile__(                                                \
        "srl    %[q1_f0],   %[q1_f0],   8       \n\t"                    \
@@ -86,10 +80,9 @@ extern "C" {
        "srl    %[p0_f0],   %[p0_f0],   8       \n\t"                    \
        "srl    %[p1_f0],   %[p1_f0],   8       \n\t"                    \
                                                                         \
        : [q1_f0] "+r" (q1_f0), [q0_f0] "+r" (q0_f0),                   \
          [p0_f0] "+r" (p0_f0), [p1_f0] "+r" (p1_f0)                    \
        :                                                               \
    );                                                                  \
        : [q1_f0] "+r"(q1_f0), [q0_f0] "+r"(q0_f0), [p0_f0] "+r"(p0_f0), \
          [p1_f0] "+r"(p1_f0)                                            \
        :);                                                              \
                                                                         \
    __asm__ __volatile__(                                                \
        "sb     %[q1_f0],    1(%[s1])           \n\t"                    \
@@ -98,13 +91,12 @@ extern "C" {
        "sb     %[p1_f0],   -2(%[s1])           \n\t"                    \
                                                                         \
        :                                                                \
        : [q1_f0] "r" (q1_f0), [q0_f0] "r" (q0_f0),                     \
          [p0_f0] "r" (p0_f0), [p1_f0] "r" (p1_f0),                     \
          [s1] "r" (s1)                                                 \
    );                                                                  \
        : [q1_f0] "r"(q1_f0), [q0_f0] "r"(q0_f0), [p0_f0] "r"(p0_f0),    \
          [p1_f0] "r"(p1_f0), [s1] "r"(s1));                             \
  }

#define STORE_F1() {                                                    \
#define STORE_F1()                                                             \
  {                                                                            \
    __asm__ __volatile__(                                                      \
        "sb     %[q2_r],     2(%[s4])           \n\t"                          \
        "sb     %[q1_r],     1(%[s4])           \n\t"                          \
@@ -115,9 +107,7 @@ extern "C" {
                                                                               \
        :                                                                      \
        : [q2_r] "r"(q2_r), [q1_r] "r"(q1_r), [q0_r] "r"(q0_r),                \
          [p0_r] "r" (p0_r), [p1_r] "r" (p1_r), [p2_r] "r" (p2_r),      \
          [s4] "r" (s4)                                                 \
    );                                                                  \
          [p0_r] "r"(p0_r), [p1_r] "r"(p1_r), [p2_r] "r"(p2_r), [s4] "r"(s4)); \
                                                                               \
    __asm__ __volatile__(                                                      \
        "srl    %[q2_r],    %[q2_r],    16      \n\t"                          \
@@ -129,8 +119,7 @@ extern "C" {
                                                                               \
        : [q2_r] "+r"(q2_r), [q1_r] "+r"(q1_r), [q0_r] "+r"(q0_r),             \
          [p0_r] "+r"(p0_r), [p1_r] "+r"(p1_r), [p2_r] "+r"(p2_r)              \
        :                                                               \
    );                                                                  \
        :);                                                                    \
                                                                               \
    __asm__ __volatile__(                                                      \
        "sb     %[q2_r],     2(%[s3])           \n\t"                          \
@@ -142,9 +131,7 @@ extern "C" {
                                                                               \
        :                                                                      \
        : [q2_r] "r"(q2_r), [q1_r] "r"(q1_r), [q0_r] "r"(q0_r),                \
          [p0_r] "r" (p0_r), [p1_r] "r" (p1_r), [p2_r] "r" (p2_r),      \
          [s3] "r" (s3)                                                 \
    );                                                                  \
          [p0_r] "r"(p0_r), [p1_r] "r"(p1_r), [p2_r] "r"(p2_r), [s3] "r"(s3)); \
                                                                               \
    __asm__ __volatile__(                                                      \
        "sb     %[q2_l],     2(%[s2])           \n\t"                          \
@@ -156,9 +143,7 @@ extern "C" {
                                                                               \
        :                                                                      \
        : [q2_l] "r"(q2_l), [q1_l] "r"(q1_l), [q0_l] "r"(q0_l),                \
          [p0_l] "r" (p0_l), [p1_l] "r" (p1_l), [p2_l] "r" (p2_l),      \
          [s2] "r" (s2)                                                 \
    );                                                                  \
          [p0_l] "r"(p0_l), [p1_l] "r"(p1_l), [p2_l] "r"(p2_l), [s2] "r"(s2)); \
                                                                               \
    __asm__ __volatile__(                                                      \
        "srl    %[q2_l],    %[q2_l],    16      \n\t"                          \
@@ -170,8 +155,7 @@ extern "C" {
                                                                               \
        : [q2_l] "+r"(q2_l), [q1_l] "+r"(q1_l), [q0_l] "+r"(q0_l),             \
          [p0_l] "+r"(p0_l), [p1_l] "+r"(p1_l), [p2_l] "+r"(p2_l)              \
        :                                                               \
    );                                                                  \
        :);                                                                    \
                                                                               \
    __asm__ __volatile__(                                                      \
        "sb     %[q2_l],     2(%[s1])           \n\t"                          \
@@ -183,12 +167,11 @@ extern "C" {
                                                                               \
        :                                                                      \
        : [q2_l] "r"(q2_l), [q1_l] "r"(q1_l), [q0_l] "r"(q0_l),                \
          [p0_l] "r" (p0_l), [p1_l] "r" (p1_l), [p2_l] "r" (p2_l),      \
          [s1] "r" (s1)                                                 \
    );                                                                  \
          [p0_l] "r"(p0_l), [p1_l] "r"(p1_l), [p2_l] "r"(p2_l), [s1] "r"(s1)); \
  }

#define STORE_F2() {                                                    \
#define STORE_F2()                                                 \
  {                                                                \
    __asm__ __volatile__(                                          \
        "sb     %[q6_r],     6(%[s4])           \n\t"              \
        "sb     %[q5_r],     5(%[s4])           \n\t"              \
@@ -208,12 +191,9 @@ extern "C" {
        :                                                          \
        : [q6_r] "r"(q6_r), [q5_r] "r"(q5_r), [q4_r] "r"(q4_r),    \
          [q3_r] "r"(q3_r), [q2_r] "r"(q2_r), [q1_r] "r"(q1_r),    \
          [q0_r] "r" (q0_r),                                            \
          [p0_r] "r" (p0_r), [p1_r] "r" (p1_r), [p2_r] "r" (p2_r),      \
          [p3_r] "r" (p3_r), [p4_r] "r" (p4_r), [p5_r] "r" (p5_r),      \
          [p6_r] "r" (p6_r),                                            \
          [s4] "r" (s4)                                                 \
    );                                                                  \
          [q0_r] "r"(q0_r), [p0_r] "r"(p0_r), [p1_r] "r"(p1_r),    \
          [p2_r] "r"(p2_r), [p3_r] "r"(p3_r), [p4_r] "r"(p4_r),    \
          [p5_r] "r"(p5_r), [p6_r] "r"(p6_r), [s4] "r"(s4));       \
                                                                   \
    __asm__ __volatile__(                                          \
        "srl    %[q6_r],    %[q6_r],    16      \n\t"              \
@@ -233,12 +213,10 @@ extern "C" {
                                                                   \
        : [q6_r] "+r"(q6_r), [q5_r] "+r"(q5_r), [q4_r] "+r"(q4_r), \
          [q3_r] "+r"(q3_r), [q2_r] "+r"(q2_r), [q1_r] "+r"(q1_r), \
          [q0_r] "+r" (q0_r),                                           \
          [p0_r] "+r" (p0_r), [p1_r] "+r" (p1_r), [p2_r] "+r" (p2_r),   \
          [p3_r] "+r" (p3_r), [p4_r] "+r" (p4_r), [p5_r] "+r" (p5_r),   \
          [p6_r] "+r" (p6_r)                                            \
        :                                                               \
    );                                                                  \
          [q0_r] "+r"(q0_r), [p0_r] "+r"(p0_r), [p1_r] "+r"(p1_r), \
          [p2_r] "+r"(p2_r), [p3_r] "+r"(p3_r), [p4_r] "+r"(p4_r), \
          [p5_r] "+r"(p5_r), [p6_r] "+r"(p6_r)                     \
        :);                                                        \
                                                                   \
    __asm__ __volatile__(                                          \
        "sb     %[q6_r],     6(%[s3])           \n\t"              \
@@ -259,12 +237,9 @@ extern "C" {
        :                                                          \
        : [q6_r] "r"(q6_r), [q5_r] "r"(q5_r), [q4_r] "r"(q4_r),    \
          [q3_r] "r"(q3_r), [q2_r] "r"(q2_r), [q1_r] "r"(q1_r),    \
          [q0_r] "r" (q0_r),                                            \
          [p0_r] "r" (p0_r), [p1_r] "r" (p1_r), [p2_r] "r" (p2_r),      \
          [p3_r] "r" (p3_r), [p4_r] "r" (p4_r), [p5_r] "r" (p5_r),      \
          [p6_r] "r" (p6_r),                                            \
          [s3] "r" (s3)                                                 \
    );                                                                  \
          [q0_r] "r"(q0_r), [p0_r] "r"(p0_r), [p1_r] "r"(p1_r),    \
          [p2_r] "r"(p2_r), [p3_r] "r"(p3_r), [p4_r] "r"(p4_r),    \
          [p5_r] "r"(p5_r), [p6_r] "r"(p6_r), [s3] "r"(s3));       \
                                                                   \
    __asm__ __volatile__(                                          \
        "sb     %[q6_l],     6(%[s2])           \n\t"              \
@@ -285,12 +260,9 @@ extern "C" {
        :                                                          \
        : [q6_l] "r"(q6_l), [q5_l] "r"(q5_l), [q4_l] "r"(q4_l),    \
          [q3_l] "r"(q3_l), [q2_l] "r"(q2_l), [q1_l] "r"(q1_l),    \
          [q0_l] "r" (q0_l),                                            \
          [p0_l] "r" (p0_l), [p1_l] "r" (p1_l), [p2_l] "r" (p2_l),      \
          [p3_l] "r" (p3_l), [p4_l] "r" (p4_l), [p5_l] "r" (p5_l),      \
          [p6_l] "r" (p6_l),                                            \
          [s2] "r" (s2)                                                 \
    );                                                                  \
          [q0_l] "r"(q0_l), [p0_l] "r"(p0_l), [p1_l] "r"(p1_l),    \
          [p2_l] "r"(p2_l), [p3_l] "r"(p3_l), [p4_l] "r"(p4_l),    \
          [p5_l] "r"(p5_l), [p6_l] "r"(p6_l), [s2] "r"(s2));       \
                                                                   \
    __asm__ __volatile__(                                          \
        "srl    %[q6_l],    %[q6_l],    16     \n\t"               \
@@ -310,12 +282,10 @@ extern "C" {
                                                                   \
        : [q6_l] "+r"(q6_l), [q5_l] "+r"(q5_l), [q4_l] "+r"(q4_l), \
          [q3_l] "+r"(q3_l), [q2_l] "+r"(q2_l), [q1_l] "+r"(q1_l), \
          [q0_l] "+r" (q0_l),                                           \
          [p0_l] "+r" (p0_l), [p1_l] "+r" (p1_l), [p2_l] "+r" (p2_l),   \
          [p3_l] "+r" (p3_l), [p4_l] "+r" (p4_l), [p5_l] "+r" (p5_l),   \
          [p6_l] "+r" (p6_l)                                            \
        :                                                               \
    );                                                                  \
          [q0_l] "+r"(q0_l), [p0_l] "+r"(p0_l), [p1_l] "+r"(p1_l), \
          [p2_l] "+r"(p2_l), [p3_l] "+r"(p3_l), [p4_l] "+r"(p4_l), \
          [p5_l] "+r"(p5_l), [p6_l] "+r"(p6_l)                     \
        :);                                                        \
                                                                   \
    __asm__ __volatile__(                                          \
        "sb     %[q6_l],     6(%[s1])           \n\t"              \
@@ -336,15 +306,13 @@ extern "C" {
        :                                                          \
        : [q6_l] "r"(q6_l), [q5_l] "r"(q5_l), [q4_l] "r"(q4_l),    \
          [q3_l] "r"(q3_l), [q2_l] "r"(q2_l), [q1_l] "r"(q1_l),    \
          [q0_l] "r" (q0_l),                                            \
          [p0_l] "r" (p0_l), [p1_l] "r" (p1_l), [p2_l] "r" (p2_l),      \
          [p3_l] "r" (p3_l), [p4_l] "r" (p4_l), [p5_l] "r" (p5_l),      \
          [p6_l] "r" (p6_l),                                            \
          [s1] "r" (s1)                                                 \
    );                                                                  \
          [q0_l] "r"(q0_l), [p0_l] "r"(p0_l), [p1_l] "r"(p1_l),    \
          [p2_l] "r"(p2_l), [p3_l] "r"(p3_l), [p4_l] "r"(p4_l),    \
          [p5_l] "r"(p5_l), [p6_l] "r"(p6_l), [s1] "r"(s1));       \
  }

#define PACK_LEFT_0TO3() {                                              \
#define PACK_LEFT_0TO3()                                              \
  {                                                                   \
    __asm__ __volatile__(                                             \
        "preceu.ph.qbl   %[p3_l],   %[p3]   \n\t"                     \
        "preceu.ph.qbl   %[p2_l],   %[p2]   \n\t"                     \
@@ -355,16 +323,15 @@ extern "C" {
        "preceu.ph.qbl   %[q2_l],   %[q2]   \n\t"                     \
        "preceu.ph.qbl   %[q3_l],   %[q3]   \n\t"                     \
                                                                      \
        : [p3_l] "=&r" (p3_l), [p2_l] "=&r" (p2_l),                     \
          [p1_l] "=&r" (p1_l), [p0_l] "=&r" (p0_l),                     \
          [q0_l] "=&r" (q0_l), [q1_l] "=&r" (q1_l),                     \
        : [p3_l] "=&r"(p3_l), [p2_l] "=&r"(p2_l), [p1_l] "=&r"(p1_l), \
          [p0_l] "=&r"(p0_l), [q0_l] "=&r"(q0_l), [q1_l] "=&r"(q1_l), \
          [q2_l] "=&r"(q2_l), [q3_l] "=&r"(q3_l)                      \
        : [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0),     \
          [q0] "r" (q0), [q1] "r" (q1), [q2] "r" (q2), [q3] "r" (q3)    \
    );                                                                  \
          [q0] "r"(q0), [q1] "r"(q1), [q2] "r"(q2), [q3] "r"(q3));    \
  }

#define PACK_LEFT_4TO7() {                                              \
#define PACK_LEFT_4TO7()                                              \
  {                                                                   \
    __asm__ __volatile__(                                             \
        "preceu.ph.qbl   %[p7_l],   %[p7]   \n\t"                     \
        "preceu.ph.qbl   %[p6_l],   %[p6]   \n\t"                     \
@@ -375,16 +342,15 @@ extern "C" {
        "preceu.ph.qbl   %[q6_l],   %[q6]   \n\t"                     \
        "preceu.ph.qbl   %[q7_l],   %[q7]   \n\t"                     \
                                                                      \
        : [p7_l] "=&r" (p7_l), [p6_l] "=&r" (p6_l),                     \
          [p5_l] "=&r" (p5_l), [p4_l] "=&r" (p4_l),                     \
          [q4_l] "=&r" (q4_l), [q5_l] "=&r" (q5_l),                     \
        : [p7_l] "=&r"(p7_l), [p6_l] "=&r"(p6_l), [p5_l] "=&r"(p5_l), \
          [p4_l] "=&r"(p4_l), [q4_l] "=&r"(q4_l), [q5_l] "=&r"(q5_l), \
          [q6_l] "=&r"(q6_l), [q7_l] "=&r"(q7_l)                      \
        : [p7] "r"(p7), [p6] "r"(p6), [p5] "r"(p5), [p4] "r"(p4),     \
          [q4] "r" (q4), [q5] "r" (q5), [q6] "r" (q6), [q7] "r" (q7)    \
    );                                                                  \
          [q4] "r"(q4), [q5] "r"(q5), [q6] "r"(q6), [q7] "r"(q7));    \
  }

#define PACK_RIGHT_0TO3() {                                             \
#define PACK_RIGHT_0TO3()                                             \
  {                                                                   \
    __asm__ __volatile__(                                             \
        "preceu.ph.qbr   %[p3_r],   %[p3]  \n\t"                      \
        "preceu.ph.qbr   %[p2_r],   %[p2]   \n\t"                     \
@@ -395,16 +361,15 @@ extern "C" {
        "preceu.ph.qbr   %[q2_r],   %[q2]   \n\t"                     \
        "preceu.ph.qbr   %[q3_r],   %[q3]   \n\t"                     \
                                                                      \
        : [p3_r] "=&r" (p3_r), [p2_r] "=&r" (p2_r),                     \
          [p1_r] "=&r" (p1_r), [p0_r] "=&r" (p0_r),                     \
          [q0_r] "=&r" (q0_r), [q1_r] "=&r" (q1_r),                     \
        : [p3_r] "=&r"(p3_r), [p2_r] "=&r"(p2_r), [p1_r] "=&r"(p1_r), \
          [p0_r] "=&r"(p0_r), [q0_r] "=&r"(q0_r), [q1_r] "=&r"(q1_r), \
          [q2_r] "=&r"(q2_r), [q3_r] "=&r"(q3_r)                      \
        : [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0),     \
          [q0] "r" (q0), [q1] "r" (q1), [q2] "r" (q2), [q3] "r" (q3)    \
    );                                                                  \
          [q0] "r"(q0), [q1] "r"(q1), [q2] "r"(q2), [q3] "r"(q3));    \
  }

#define PACK_RIGHT_4TO7() {                                             \
#define PACK_RIGHT_4TO7()                                             \
  {                                                                   \
    __asm__ __volatile__(                                             \
        "preceu.ph.qbr   %[p7_r],   %[p7]   \n\t"                     \
        "preceu.ph.qbr   %[p6_r],   %[p6]   \n\t"                     \
@@ -415,16 +380,15 @@ extern "C" {
        "preceu.ph.qbr   %[q6_r],   %[q6]   \n\t"                     \
        "preceu.ph.qbr   %[q7_r],   %[q7]   \n\t"                     \
                                                                      \
        : [p7_r] "=&r" (p7_r), [p6_r] "=&r" (p6_r),                     \
          [p5_r] "=&r" (p5_r), [p4_r] "=&r" (p4_r),                     \
          [q4_r] "=&r" (q4_r), [q5_r] "=&r" (q5_r),                     \
        : [p7_r] "=&r"(p7_r), [p6_r] "=&r"(p6_r), [p5_r] "=&r"(p5_r), \
          [p4_r] "=&r"(p4_r), [q4_r] "=&r"(q4_r), [q5_r] "=&r"(q5_r), \
          [q6_r] "=&r"(q6_r), [q7_r] "=&r"(q7_r)                      \
        : [p7] "r"(p7), [p6] "r"(p6), [p5] "r"(p5), [p4] "r"(p4),     \
          [q4] "r" (q4), [q5] "r" (q5), [q6] "r" (q6), [q7] "r" (q7)    \
    );                                                                  \
          [q4] "r"(q4), [q5] "r"(q5), [q6] "r"(q6), [q7] "r"(q7));    \
  }

#define COMBINE_LEFT_RIGHT_0TO2() {                                     \
#define COMBINE_LEFT_RIGHT_0TO2()                                         \
  {                                                                       \
    __asm__ __volatile__(                                                 \
        "precr.qb.ph    %[p2],  %[p2_l],    %[p2_r]    \n\t"              \
        "precr.qb.ph    %[p1],  %[p1_l],    %[p1_r]    \n\t"              \
@@ -433,18 +397,16 @@ extern "C" {
        "precr.qb.ph    %[q1],  %[q1_l],    %[q1_r]    \n\t"              \
        "precr.qb.ph    %[q2],  %[q2_l],    %[q2_r]    \n\t"              \
                                                                          \
        : [p2] "=&r" (p2), [p1] "=&r" (p1), [p0] "=&r" (p0),            \
          [q0] "=&r" (q0), [q1] "=&r" (q1), [q2] "=&r" (q2)             \
        : [p2_l] "r" (p2_l), [p2_r] "r" (p2_r),                         \
          [p1_l] "r" (p1_l), [p1_r] "r" (p1_r),                         \
          [p0_l] "r" (p0_l), [p0_r] "r" (p0_r),                         \
          [q0_l] "r" (q0_l), [q0_r] "r" (q0_r),                         \
          [q1_l] "r" (q1_l), [q1_r] "r" (q1_r),                         \
          [q2_l] "r" (q2_l), [q2_r] "r" (q2_r)                          \
    );                                                                  \
        : [p2] "=&r"(p2), [p1] "=&r"(p1), [p0] "=&r"(p0), [q0] "=&r"(q0), \
          [q1] "=&r"(q1), [q2] "=&r"(q2)                                  \
        : [p2_l] "r"(p2_l), [p2_r] "r"(p2_r), [p1_l] "r"(p1_l),           \
          [p1_r] "r"(p1_r), [p0_l] "r"(p0_l), [p0_r] "r"(p0_r),           \
          [q0_l] "r"(q0_l), [q0_r] "r"(q0_r), [q1_l] "r"(q1_l),           \
          [q1_r] "r"(q1_r), [q2_l] "r"(q2_l), [q2_r] "r"(q2_r));          \
  }

#define COMBINE_LEFT_RIGHT_3TO6() {                                     \
#define COMBINE_LEFT_RIGHT_3TO6()                                         \
  {                                                                       \
    __asm__ __volatile__(                                                 \
        "precr.qb.ph    %[p6],  %[p6_l],    %[p6_r]    \n\t"              \
        "precr.qb.ph    %[p5],  %[p5_l],    %[p5_r]    \n\t"              \
@@ -455,19 +417,14 @@ extern "C" {
        "precr.qb.ph    %[q5],  %[q5_l],    %[q5_r]    \n\t"              \
        "precr.qb.ph    %[q6],  %[q6_l],    %[q6_r]    \n\t"              \
                                                                          \
        : [p6] "=&r" (p6),[p5] "=&r" (p5),                              \
          [p4] "=&r" (p4),[p3] "=&r" (p3),                              \
          [q3] "=&r" (q3),[q4] "=&r" (q4),                              \
          [q5] "=&r" (q5),[q6] "=&r" (q6)                               \
        : [p6_l] "r" (p6_l), [p5_l] "r" (p5_l),                         \
          [p4_l] "r" (p4_l), [p3_l] "r" (p3_l),                         \
          [p6_r] "r" (p6_r), [p5_r] "r" (p5_r),                         \
          [p4_r] "r" (p4_r), [p3_r] "r" (p3_r),                         \
          [q3_l] "r" (q3_l), [q4_l] "r" (q4_l),                         \
          [q5_l] "r" (q5_l), [q6_l] "r" (q6_l),                         \
          [q3_r] "r" (q3_r), [q4_r] "r" (q4_r),                         \
          [q5_r] "r" (q5_r), [q6_r] "r" (q6_r)                          \
    );                                                                  \
        : [p6] "=&r"(p6), [p5] "=&r"(p5), [p4] "=&r"(p4), [p3] "=&r"(p3), \
          [q3] "=&r"(q3), [q4] "=&r"(q4), [q5] "=&r"(q5), [q6] "=&r"(q6)  \
        : [p6_l] "r"(p6_l), [p5_l] "r"(p5_l), [p4_l] "r"(p4_l),           \
          [p3_l] "r"(p3_l), [p6_r] "r"(p6_r), [p5_r] "r"(p5_r),           \
          [p4_r] "r"(p4_r), [p3_r] "r"(p3_r), [q3_l] "r"(q3_l),           \
          [q4_l] "r"(q4_l), [q5_l] "r"(q5_l), [q6_l] "r"(q6_l),           \
          [q3_r] "r"(q3_r), [q4_r] "r"(q4_r), [q5_r] "r"(q5_r),           \
          [q6_r] "r"(q6_r));                                              \
  }

#endif  // #if HAVE_DSPR2

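The macro hunks above follow a second recurring pattern: a function-like macro whose body previously opened with "{" on the #define line now gets the brace on its own continuation line, with the trailing "\" continuations re-aligned to a common column. A hedged sketch with an invented macro (STORE_SQUARE is not part of vpx_dsp; exact backslash columns depend on the longest line in the body):

#define STORE_SQUARE(dst, v) \
  {                          \
    const int tmp_ = (v);    \
    *(dst) = tmp_ * tmp_;    \
  }
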
@@ -25,9 +25,8 @@ extern "C" {
/* processing 4 pixels at the same time
 * compute hev and mask in the same function */
static INLINE void filter_hev_mask_dspr2(uint32_t limit, uint32_t flimit,
                                         uint32_t p1, uint32_t p0,
                                         uint32_t p3, uint32_t p2,
                                         uint32_t q0, uint32_t q1,
                                         uint32_t p1, uint32_t p0, uint32_t p3,
                                         uint32_t p2, uint32_t q0, uint32_t q1,
                                         uint32_t q2, uint32_t q3,
                                         uint32_t thresh, uint32_t *hev,
                                         uint32_t *mask) {
@@ -88,12 +87,10 @@ static INLINE void filter_hev_mask_dspr2(uint32_t limit, uint32_t flimit,
      "cmpgu.lt.qb    %[c],   %[limit],  %[r_k]       \n\t"
      "or             %[r],   %[r],      %[c]         \n\t"

      : [c] "=&r" (c), [r_k] "=&r" (r_k),
        [r] "=&r" (r), [r3] "=&r" (r3)
      : [limit] "r" (limit), [p3] "r" (p3), [p2] "r" (p2),
        [p1] "r" (p1), [p0] "r" (p0), [q1] "r" (q1), [q0] "r" (q0),
        [q2] "r" (q2), [q3] "r" (q3), [thresh] "r" (thresh)
  );
      : [c] "=&r"(c), [r_k] "=&r"(r_k), [r] "=&r"(r), [r3] "=&r"(r3)
      : [limit] "r"(limit), [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1),
        [p0] "r"(p0), [q1] "r"(q1), [q0] "r"(q0), [q2] "r"(q2), [q3] "r"(q3),
        [thresh] "r"(thresh));

  __asm__ __volatile__(
      /* abs(p0 - q0) */
@@ -121,24 +118,17 @@ static INLINE void filter_hev_mask_dspr2(uint32_t limit, uint32_t flimit,

      : [c] "=&r"(c), [r_k] "=&r"(r_k), [s1] "=&r"(s1), [hev1] "=&r"(hev1),
        [s2] "=&r"(s2), [r] "+r"(r), [s3] "=&r"(s3)
      : [p0] "r" (p0), [q0] "r" (q0), [p1] "r" (p1), [r3] "r" (r3),
        [q1] "r" (q1), [ones] "r" (ones), [flimit] "r" (flimit)
  );
      : [p0] "r"(p0), [q0] "r"(q0), [p1] "r"(p1), [r3] "r"(r3), [q1] "r"(q1),
        [ones] "r"(ones), [flimit] "r"(flimit));

  *hev = hev1;
  *mask = s2;
}

static INLINE void filter_hev_mask_flatmask4_dspr2(uint32_t limit,
                                                   uint32_t flimit,
                                                   uint32_t thresh,
                                                   uint32_t p1, uint32_t p0,
                                                   uint32_t p3, uint32_t p2,
                                                   uint32_t q0, uint32_t q1,
                                                   uint32_t q2, uint32_t q3,
                                                   uint32_t *hev,
                                                   uint32_t *mask,
                                                   uint32_t *flat) {
static INLINE void filter_hev_mask_flatmask4_dspr2(
    uint32_t limit, uint32_t flimit, uint32_t thresh, uint32_t p1, uint32_t p0,
    uint32_t p3, uint32_t p2, uint32_t q0, uint32_t q1, uint32_t q2,
    uint32_t q3, uint32_t *hev, uint32_t *mask, uint32_t *flat) {
  uint32_t c, r, r3, r_k, r_flat;
  uint32_t s1, s2, s3;
  uint32_t ones = 0xFFFFFFFF;
@@ -238,11 +228,9 @@ static INLINE void filter_hev_mask_flatmask4_dspr2(uint32_t limit,

      : [c] "=&r"(c), [r_k] "=&r"(r_k), [r] "=&r"(r), [r3] "=&r"(r3),
        [r_flat] "=&r"(r_flat), [flat1] "=&r"(flat1)
      : [limit] "r" (limit), [p3] "r" (p3), [p2] "r" (p2),
        [p1] "r" (p1), [p0] "r" (p0), [q1] "r" (q1), [q0] "r" (q0),
        [q2] "r" (q2), [q3] "r" (q3), [thresh] "r" (thresh),
        [flat_thresh] "r" (flat_thresh), [ones] "r" (ones)
  );
      : [limit] "r"(limit), [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1),
        [p0] "r"(p0), [q1] "r"(q1), [q0] "r"(q0), [q2] "r"(q2), [q3] "r"(q3),
        [thresh] "r"(thresh), [flat_thresh] "r"(flat_thresh), [ones] "r"(ones));

  __asm__ __volatile__(
      /* abs(p0 - q0) */
@@ -270,21 +258,17 @@ static INLINE void filter_hev_mask_flatmask4_dspr2(uint32_t limit,

      : [c] "=&r"(c), [r_k] "=&r"(r_k), [s1] "=&r"(s1), [hev1] "=&r"(hev1),
        [s2] "=&r"(s2), [r] "+r"(r), [s3] "=&r"(s3)
      : [p0] "r" (p0), [q0] "r" (q0), [p1] "r" (p1), [r3] "r" (r3),
        [q1] "r" (q1), [ones] "r" (ones), [flimit] "r" (flimit)
  );
      : [p0] "r"(p0), [q0] "r"(q0), [p1] "r"(p1), [r3] "r"(r3), [q1] "r"(q1),
        [ones] "r"(ones), [flimit] "r"(flimit));

  *hev = hev1;
  *mask = s2;
  *flat = flat1;
}

static INLINE void flatmask5(uint32_t p4, uint32_t p3,
                             uint32_t p2, uint32_t p1,
                             uint32_t p0, uint32_t q0,
                             uint32_t q1, uint32_t q2,
                             uint32_t q3, uint32_t q4,
                             uint32_t *flat2) {
static INLINE void flatmask5(uint32_t p4, uint32_t p3, uint32_t p2, uint32_t p1,
                             uint32_t p0, uint32_t q0, uint32_t q1, uint32_t q2,
                             uint32_t q3, uint32_t q4, uint32_t *flat2) {
  uint32_t c, r, r_k, r_flat;
  uint32_t ones = 0xFFFFFFFF;
  uint32_t flat_thresh = 0x01010101;
@@ -355,13 +339,11 @@ static INLINE void flatmask5(uint32_t p4, uint32_t p3,
      /* flat & flatmask4(thresh, p3, p2, p1, p0, q0, q1, q2, q3) */
      "and            %[flat1],  %[flat3],        %[flat1]     \n\t"

      : [c] "=&r" (c), [r_k] "=&r" (r_k), [r] "=&r" (r),
        [r_flat] "=&r" (r_flat), [flat1] "=&r" (flat1), [flat3] "=&r" (flat3)
      : [p4] "r" (p4), [p3] "r" (p3), [p2] "r" (p2),
        [p1] "r" (p1), [p0] "r" (p0), [q0] "r" (q0), [q1] "r" (q1),
        [q2] "r" (q2), [q3] "r" (q3), [q4] "r" (q4),
        [flat_thresh] "r" (flat_thresh), [ones] "r" (ones)
  );
      : [c] "=&r"(c), [r_k] "=&r"(r_k), [r] "=&r"(r), [r_flat] "=&r"(r_flat),
        [flat1] "=&r"(flat1), [flat3] "=&r"(flat3)
      : [p4] "r"(p4), [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0),
        [q0] "r"(q0), [q1] "r"(q1), [q2] "r"(q2), [q3] "r"(q3), [q4] "r"(q4),
        [flat_thresh] "r"(flat_thresh), [ones] "r"(ones));

  *flat2 = flat1;
}

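Two prototype-wrapping layouts appear in these hunks: when the parameter list packs into 80-column lines aligned with the first parameter (flatmask5, filter_hev_mask_dspr2), clang-format keeps that alignment; when the function name pushes the open parenthesis too far right (wide_mbfilter_dspr2, filter_hev_mask_flatmask4_dspr2), it breaks directly after "(" and continues at a 4-space indent. A sketch under those assumptions (both names are invented; the exact choice depends on the project's .clang-format penalties):

#include <stdint.h>

/* Continuation lines aligned with the first parameter. */
static void short_name(uint32_t a, uint32_t b, uint32_t c, uint32_t d,
                       uint32_t *out) {
  *out = a + b + c + d;
}

/* Deep parenthesis column, so the break comes right after "(". */
static void a_function_with_a_rather_long_descriptive_name(
    uint32_t a, uint32_t b, uint32_t c, uint32_t d, uint32_t e, uint32_t f,
    uint32_t *out) {
  *out = a + b + c + d + e + f;
}
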
@@ -19,10 +19,8 @@
#include "vpx_mem/vpx_mem.h"

#if HAVE_DSPR2
void vpx_lpf_horizontal_8_dspr2(unsigned char *s,
                                int pitch,
                                const uint8_t *blimit,
                                const uint8_t *limit,
void vpx_lpf_horizontal_8_dspr2(unsigned char *s, int pitch,
                                const uint8_t *blimit, const uint8_t *limit,
                                const uint8_t *thresh) {
  uint32_t mask;
  uint32_t hev, flat;
@@ -47,8 +45,7 @@ void vpx_lpf_horizontal_8_dspr2(unsigned char *s,

      : [thresh_vec] "=&r"(thresh_vec), [flimit_vec] "=&r"(flimit_vec),
        [limit_vec] "=r"(limit_vec)
      : [uthresh] "r" (uthresh), [uflimit] "r" (uflimit), [ulimit] "r" (ulimit)
  );
      : [uthresh] "r"(uthresh), [uflimit] "r"(uflimit), [ulimit] "r"(ulimit));

  /* prefetch data for store */
  prefetch_store(s);
@@ -76,16 +73,13 @@ void vpx_lpf_horizontal_8_dspr2(unsigned char *s,
        : [p3] "=&r"(p3), [p2] "=&r"(p2), [p1] "=&r"(p1), [p0] "=&r"(p0),
          [q3] "=&r"(q3), [q2] "=&r"(q2), [q1] "=&r"(q1), [q0] "=&r"(q0)
        : [sp3] "r"(sp3), [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0),
          [sq3] "r" (sq3), [sq2] "r" (sq2), [sq1] "r" (sq1), [sq0] "r" (sq0)
    );
          [sq3] "r"(sq3), [sq2] "r"(sq2), [sq1] "r"(sq1), [sq0] "r"(sq0));

    filter_hev_mask_flatmask4_dspr2(limit_vec, flimit_vec, thresh_vec,
                                    p1, p0, p3, p2, q0, q1, q2, q3,
                                    &hev, &mask, &flat);
    filter_hev_mask_flatmask4_dspr2(limit_vec, flimit_vec, thresh_vec, p1, p0,
                                    p3, p2, q0, q1, q2, q3, &hev, &mask, &flat);

    if ((flat == 0) && (mask != 0)) {
      filter1_dspr2(mask, hev, p1, p0, q0, q1,
                    &p1_f0, &p0_f0, &q0_f0, &q1_f0);
      filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0);

      __asm__ __volatile__(
          "sw       %[p1_f0],   (%[sp1])    \n\t"
@@ -94,21 +88,17 @@ void vpx_lpf_horizontal_8_dspr2(unsigned char *s,
          "sw       %[q1_f0],   (%[sq1])    \n\t"

          :
          : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0),
            [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0),
            [sp1] "r" (sp1), [sp0] "r" (sp0),
            [sq0] "r" (sq0), [sq1] "r" (sq1)
      );
          : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
            [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0),
            [sq1] "r"(sq1));
    } else if ((mask & flat) == 0xFFFFFFFF) {
      /* left 2 element operation */
      PACK_LEFT_0TO3()
      mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l,
                     &q0_l, &q1_l, &q2_l, &q3_l);
      mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, &q0_l, &q1_l, &q2_l, &q3_l);

      /* right 2 element operation */
      PACK_RIGHT_0TO3()
      mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r,
                     &q0_r, &q1_r, &q2_r, &q3_r);
      mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, &q0_r, &q1_r, &q2_r, &q3_r);

      COMBINE_LEFT_RIGHT_0TO2()

@@ -121,25 +111,20 @@ void vpx_lpf_horizontal_8_dspr2(unsigned char *s,
          "sw       %[q2],      (%[sq2])    \n\t"

          :
          : [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0),
            [q0] "r" (q0), [q1] "r" (q1), [q2] "r" (q2),
            [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0),
            [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2)
      );
          : [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0), [q0] "r"(q0),
            [q1] "r"(q1), [q2] "r"(q2), [sp2] "r"(sp2), [sp1] "r"(sp1),
            [sp0] "r"(sp0), [sq0] "r"(sq0), [sq1] "r"(sq1), [sq2] "r"(sq2));
    } else if ((flat != 0) && (mask != 0)) {
      /* filtering */
      filter1_dspr2(mask, hev, p1, p0, q0, q1,
                    &p1_f0, &p0_f0, &q0_f0, &q1_f0);
      filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0);

      /* left 2 element operation */
      PACK_LEFT_0TO3()
      mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l,
                     &q0_l, &q1_l, &q2_l, &q3_l);
      mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, &q0_l, &q1_l, &q2_l, &q3_l);

      /* right 2 element operation */
      PACK_RIGHT_0TO3()
      mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r,
                     &q0_r, &q1_r, &q2_r, &q3_r);
      mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, &q0_r, &q1_r, &q2_r, &q3_r);

      if (mask & flat & 0x000000FF) {
        __asm__ __volatile__(
@@ -153,9 +138,8 @@ void vpx_lpf_horizontal_8_dspr2(unsigned char *s,
            :
            : [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), [p0_r] "r"(p0_r),
              [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r),
              [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0),
              [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2)
        );
              [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0),
              [sq1] "r"(sq1), [sq2] "r"(sq2));
      } else if (mask & 0x000000FF) {
        __asm__ __volatile__(
            "sb         %[p1_f0],  (%[sp1])    \n\t"
@@ -164,11 +148,9 @@ void vpx_lpf_horizontal_8_dspr2(unsigned char *s,
            "sb         %[q1_f0],  (%[sq1])    \n\t"

            :
            : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0),
              [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0),
              [sp1] "r" (sp1), [sp0] "r" (sp0),
              [sq0] "r" (sq0), [sq1] "r" (sq1)
        );
            : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
              [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0),
              [sq0] "r"(sq0), [sq1] "r"(sq1));
      }

      __asm__ __volatile__(
@@ -185,10 +167,9 @@ void vpx_lpf_horizontal_8_dspr2(unsigned char *s,

          : [p2_r] "+r"(p2_r), [p1_r] "+r"(p1_r), [p0_r] "+r"(p0_r),
            [q0_r] "+r"(q0_r), [q1_r] "+r"(q1_r), [q2_r] "+r"(q2_r),
            [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0),
            [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0)
          :
      );
            [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0),
            [q1_f0] "+r"(q1_f0)
          :);

      if (mask & flat & 0x0000FF00) {
        __asm__ __volatile__(
@@ -202,9 +183,8 @@ void vpx_lpf_horizontal_8_dspr2(unsigned char *s,
            :
            : [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), [p0_r] "r"(p0_r),
              [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r),
              [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0),
              [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2)
        );
              [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0),
              [sq1] "r"(sq1), [sq2] "r"(sq2));
      } else if (mask & 0x0000FF00) {
        __asm__ __volatile__(
            "sb     %[p1_f0],   +1(%[sp1])    \n\t"
@@ -213,11 +193,9 @@ void vpx_lpf_horizontal_8_dspr2(unsigned char *s,
            "sb     %[q1_f0],   +1(%[sq1])    \n\t"

            :
            : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0),
              [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0),
              [sp1] "r" (sp1), [sp0] "r" (sp0),
              [sq0] "r" (sq0), [sq1] "r" (sq1)
        );
            : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
              [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0),
              [sq0] "r"(sq0), [sq1] "r"(sq1));
      }

      __asm__ __volatile__(
@@ -226,12 +204,10 @@ void vpx_lpf_horizontal_8_dspr2(unsigned char *s,
          "srl      %[q0_f0],   %[q0_f0],   8     \n\t"
          "srl      %[q1_f0],   %[q1_f0],   8     \n\t"

          : [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0),
            [q0] "+r" (q0), [q1] "+r" (q1), [q2] "+r" (q2),
            [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0),
            [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0)
          :
      );
          : [p2] "+r"(p2), [p1] "+r"(p1), [p0] "+r"(p0), [q0] "+r"(q0),
            [q1] "+r"(q1), [q2] "+r"(q2), [p1_f0] "+r"(p1_f0),
            [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), [q1_f0] "+r"(q1_f0)
          :);

      if (mask & flat & 0x00FF0000) {
        __asm__ __volatile__(
@@ -245,9 +221,8 @@ void vpx_lpf_horizontal_8_dspr2(unsigned char *s,
            :
            : [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), [p0_l] "r"(p0_l),
              [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l),
              [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0),
              [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2)
        );
              [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0),
              [sq1] "r"(sq1), [sq2] "r"(sq2));
      } else if (mask & 0x00FF0000) {
        __asm__ __volatile__(
            "sb     %[p1_f0],   +2(%[sp1])    \n\t"
@@ -256,11 +231,9 @@ void vpx_lpf_horizontal_8_dspr2(unsigned char *s,
            "sb     %[q1_f0],   +2(%[sq1])    \n\t"

            :
            : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0),
              [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0),
              [sp1] "r" (sp1), [sp0] "r" (sp0),
              [sq0] "r" (sq0), [sq1] "r" (sq1)
        );
            : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
              [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0),
              [sq0] "r"(sq0), [sq1] "r"(sq1));
      }

      __asm__ __volatile__(
@@ -277,10 +250,9 @@ void vpx_lpf_horizontal_8_dspr2(unsigned char *s,

          : [p2_l] "+r"(p2_l), [p1_l] "+r"(p1_l), [p0_l] "+r"(p0_l),
            [q0_l] "+r"(q0_l), [q1_l] "+r"(q1_l), [q2_l] "+r"(q2_l),
            [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0),
            [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0)
          :
      );
            [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0),
            [q1_f0] "+r"(q1_f0)
          :);

      if (mask & flat & 0xFF000000) {
        __asm__ __volatile__(
@@ -294,9 +266,8 @@ void vpx_lpf_horizontal_8_dspr2(unsigned char *s,
            :
            : [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), [p0_l] "r"(p0_l),
              [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l),
              [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0),
              [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2)
        );
              [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0),
              [sq1] "r"(sq1), [sq2] "r"(sq2));
      } else if (mask & 0xFF000000) {
        __asm__ __volatile__(
            "sb     %[p1_f0],   +3(%[sp1])    \n\t"
@@ -305,11 +276,9 @@ void vpx_lpf_horizontal_8_dspr2(unsigned char *s,
            "sb     %[q1_f0],   +3(%[sq1])    \n\t"

            :
            : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0),
 | 
			
		||||
              [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0),
 | 
			
		||||
              [sp1] "r" (sp1), [sp0] "r" (sp0),
 | 
			
		||||
              [sq0] "r" (sq0), [sq1] "r" (sq1)
 | 
			
		||||
        );
 | 
			
		||||
            : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
 | 
			
		||||
              [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0),
 | 
			
		||||
              [sq0] "r"(sq0), [sq1] "r"(sq1));
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
@@ -317,10 +286,8 @@ void vpx_lpf_horizontal_8_dspr2(unsigned char *s,
 | 
			
		||||
  }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
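Throughout these hunks the change is purely mechanical: clang-format closes the gap in "r" (x) to "r"(x) and repacks each extended-asm operand list up to the 80-column limit, so sparse multi-line lists collapse into fewer, denser lines. A minimal sketch of the resulting shape, using a hypothetical MIPS-only add_two helper that is not part of this patch:

#include <stdint.h>

/* Hypothetical helper showing the post-clang-format layout: no space
   between a constraint string and its operand, and the closing paren
   pulled onto the last operand line. */
static inline uint32_t add_two(uint32_t a, uint32_t b) {
  uint32_t out;
  __asm__ __volatile__("addu    %[out],   %[a],   %[b]    \n\t"
                       : [out] "=r"(out)
                       : [a] "r"(a), [b] "r"(b));
  return out;
}
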
void vpx_lpf_vertical_8_dspr2(unsigned char *s,
                              int pitch,
                              const uint8_t *blimit,
                              const uint8_t *limit,
void vpx_lpf_vertical_8_dspr2(unsigned char *s, int pitch,
                              const uint8_t *blimit, const uint8_t *limit,
                              const uint8_t *thresh) {
  uint8_t i;
  uint32_t mask, hev, flat;
@@ -345,8 +312,7 @@ void vpx_lpf_vertical_8_dspr2(unsigned char *s,

      : [thresh_vec] "=&r"(thresh_vec), [flimit_vec] "=&r"(flimit_vec),
        [limit_vec] "=r"(limit_vec)
      : [uthresh] "r" (uthresh), [uflimit] "r" (uflimit), [ulimit] "r" (ulimit)
  );
      : [uthresh] "r"(uthresh), [uflimit] "r"(uflimit), [ulimit] "r"(ulimit));

  prefetch_store(s + pitch);

@@ -369,8 +335,7 @@ void vpx_lpf_vertical_8_dspr2(unsigned char *s,

        : [p3] "=&r"(p3), [p2] "=&r"(p2), [p1] "=&r"(p1), [p0] "=&r"(p0),
          [q0] "=&r"(q0), [q1] "=&r"(q1), [q2] "=&r"(q2), [q3] "=&r"(q3)
        : [s1] "r" (s1), [s2] "r" (s2), [s3] "r" (s3), [s4] "r" (s4)
    );
        : [s1] "r"(s1), [s2] "r"(s2), [s3] "r"(s3), [s4] "r"(s4));

    /* transpose p3, p2, p1, p0
       original (when loaded from memory)
@@ -403,12 +368,10 @@ void vpx_lpf_vertical_8_dspr2(unsigned char *s,
        "append         %[p1],      %[sec3],    16          \n\t"
        "append         %[p3],      %[sec4],    16          \n\t"

        : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
          [prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
          [p0] "+r" (p0), [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3),
          [sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
        :
    );
        : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3),
          [prim4] "=&r"(prim4), [p0] "+r"(p0), [p1] "+r"(p1), [p2] "+r"(p2),
          [p3] "+r"(p3), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4)
        :);

    /* transpose q0, q1, q2, q3
       original (when loaded from memory)
@@ -441,46 +404,37 @@ void vpx_lpf_vertical_8_dspr2(unsigned char *s,
        "append         %[q2],      %[sec3],    16          \n\t"
        "append         %[q0],      %[sec4],    16          \n\t"

        : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
          [prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
          [q3] "+r" (q3), [q2] "+r" (q2), [q1] "+r" (q1), [q0] "+r" (q0),
          [sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
        :
    );
        : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3),
          [prim4] "=&r"(prim4), [q3] "+r"(q3), [q2] "+r"(q2), [q1] "+r"(q1),
          [q0] "+r"(q0), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4)
        :);

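The two blocks above turn four row-ordered 32-bit words into four column-ordered words, so the vertical filter can reuse the horizontal data path. In plain C the equivalent operation is a 4x4 byte transpose; this sketch is illustrative only and is not the DSPr2 instruction sequence, which does the same job with "append" and related packing ops:

#include <stdint.h>

/* 4x4 byte transpose: in[r] holds row r's four pixels (one per byte);
   out[c] gathers column c, one byte per source row. */
static void transpose_4x4_bytes(const uint32_t in[4], uint32_t out[4]) {
  int r, c;
  for (c = 0; c < 4; ++c) {
    uint32_t w = 0;
    for (r = 0; r < 4; ++r) {
      w |= ((in[r] >> (8 * c)) & 0xFFu) << (8 * r);
    }
    out[c] = w;
  }
}
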
    filter_hev_mask_flatmask4_dspr2(limit_vec, flimit_vec, thresh_vec,
                                    p1, p0, p3, p2, q0, q1, q2, q3,
                                    &hev, &mask, &flat);
    filter_hev_mask_flatmask4_dspr2(limit_vec, flimit_vec, thresh_vec, p1, p0,
                                    p3, p2, q0, q1, q2, q3, &hev, &mask, &flat);

    if ((flat == 0) && (mask != 0)) {
      filter1_dspr2(mask, hev, p1, p0, q0, q1,
                    &p1_f0, &p0_f0, &q0_f0, &q1_f0);
      filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0);
      STORE_F0()
    } else if ((mask & flat) == 0xFFFFFFFF) {
      /* left 2 element operation */
      PACK_LEFT_0TO3()
      mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l,
                     &q0_l, &q1_l, &q2_l, &q3_l);
      mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, &q0_l, &q1_l, &q2_l, &q3_l);

      /* right 2 element operation */
      PACK_RIGHT_0TO3()
      mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r,
                     &q0_r, &q1_r, &q2_r, &q3_r);
      mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, &q0_r, &q1_r, &q2_r, &q3_r);

      STORE_F1()
    } else if ((flat != 0) && (mask != 0)) {
      filter1_dspr2(mask, hev, p1, p0, q0, q1,
                    &p1_f0, &p0_f0, &q0_f0, &q1_f0);
      filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0);

      /* left 2 element operation */
      PACK_LEFT_0TO3()
      mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l,
                     &q0_l, &q1_l, &q2_l, &q3_l);
      mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, &q0_l, &q1_l, &q2_l, &q3_l);

      /* right 2 element operation */
      PACK_RIGHT_0TO3()
      mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r,
                     &q0_r, &q1_r, &q2_r, &q3_r);
      mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, &q0_r, &q1_r, &q2_r, &q3_r);

      if (mask & flat & 0x000000FF) {
        __asm__ __volatile__(
@@ -494,8 +448,7 @@ void vpx_lpf_vertical_8_dspr2(unsigned char *s,
            :
            : [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), [p0_r] "r"(p0_r),
              [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r),
              [s4] "r" (s4)
        );
              [s4] "r"(s4));
      } else if (mask & 0x000000FF) {
        __asm__ __volatile__(
            "sb         %[p1_f0],  -2(%[s4])    \n\t"
@@ -504,10 +457,8 @@ void vpx_lpf_vertical_8_dspr2(unsigned char *s,
            "sb         %[q1_f0],  +1(%[s4])    \n\t"

            :
            : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0),
              [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0),
              [s4] "r" (s4)
        );
            : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
              [q1_f0] "r"(q1_f0), [s4] "r"(s4));
      }

      __asm__ __volatile__(
@@ -524,10 +475,9 @@ void vpx_lpf_vertical_8_dspr2(unsigned char *s,

          : [p2_r] "+r"(p2_r), [p1_r] "+r"(p1_r), [p0_r] "+r"(p0_r),
            [q0_r] "+r"(q0_r), [q1_r] "+r"(q1_r), [q2_r] "+r"(q2_r),
            [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0),
            [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0)
          :
      );
            [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0),
            [q1_f0] "+r"(q1_f0)
          :);

      if (mask & flat & 0x0000FF00) {
        __asm__ __volatile__(
@@ -541,8 +491,7 @@ void vpx_lpf_vertical_8_dspr2(unsigned char *s,
            :
            : [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), [p0_r] "r"(p0_r),
              [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r),
              [s3] "r" (s3)
        );
              [s3] "r"(s3));
      } else if (mask & 0x0000FF00) {
        __asm__ __volatile__(
            "sb         %[p1_f0],  -2(%[s3])    \n\t"
@@ -551,10 +500,8 @@ void vpx_lpf_vertical_8_dspr2(unsigned char *s,
            "sb         %[q1_f0],  +1(%[s3])    \n\t"

            :
            : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0),
              [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0),
              [s3] "r" (s3)
        );
            : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
              [q1_f0] "r"(q1_f0), [s3] "r"(s3));
      }

      __asm__ __volatile__(
@@ -563,12 +510,10 @@ void vpx_lpf_vertical_8_dspr2(unsigned char *s,
          "srl      %[q0_f0],   %[q0_f0],   8     \n\t"
          "srl      %[q1_f0],   %[q1_f0],   8     \n\t"

          : [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0),
            [q0] "+r" (q0), [q1] "+r" (q1), [q2] "+r" (q2),
            [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0),
            [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0)
          :
      );
          : [p2] "+r"(p2), [p1] "+r"(p1), [p0] "+r"(p0), [q0] "+r"(q0),
            [q1] "+r"(q1), [q2] "+r"(q2), [p1_f0] "+r"(p1_f0),
            [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), [q1_f0] "+r"(q1_f0)
          :);

      if (mask & flat & 0x00FF0000) {
        __asm__ __volatile__(
@@ -582,8 +527,7 @@ void vpx_lpf_vertical_8_dspr2(unsigned char *s,
            :
            : [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), [p0_l] "r"(p0_l),
              [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l),
            [s2] "r" (s2)
        );
              [s2] "r"(s2));
      } else if (mask & 0x00FF0000) {
        __asm__ __volatile__(
            "sb         %[p1_f0],  -2(%[s2])    \n\t"
@@ -592,10 +536,8 @@ void vpx_lpf_vertical_8_dspr2(unsigned char *s,
            "sb         %[q1_f0],  +1(%[s2])    \n\t"

            :
            : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0),
              [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0),
              [s2] "r" (s2)
        );
            : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
              [q1_f0] "r"(q1_f0), [s2] "r"(s2));
      }

      __asm__ __volatile__(
@@ -612,10 +554,9 @@ void vpx_lpf_vertical_8_dspr2(unsigned char *s,

          : [p2_l] "+r"(p2_l), [p1_l] "+r"(p1_l), [p0_l] "+r"(p0_l),
            [q0_l] "+r"(q0_l), [q1_l] "+r"(q1_l), [q2_l] "+r"(q2_l),
            [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0),
            [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0)
          :
      );
            [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0),
            [q1_f0] "+r"(q1_f0)
          :);

      if (mask & flat & 0xFF000000) {
        __asm__ __volatile__(
@@ -629,8 +570,7 @@ void vpx_lpf_vertical_8_dspr2(unsigned char *s,
            :
            : [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), [p0_l] "r"(p0_l),
              [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l),
              [s1] "r" (s1)
        );
              [s1] "r"(s1));
      } else if (mask & 0xFF000000) {
        __asm__ __volatile__(
            "sb         %[p1_f0],  -2(%[s1])    \n\t"
@@ -640,8 +580,7 @@ void vpx_lpf_vertical_8_dspr2(unsigned char *s,

            :
            : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
              [q1_f0] "r" (q1_f0), [s1] "r" (s1)
        );
              [q1_f0] "r"(q1_f0), [s1] "r"(s1));
      }
    }
  }

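One idea runs through all of the masked store blocks above: the DSPr2 filters keep four adjacent pixels packed one byte per lane in a 32-bit word, so a guard like mask & flat & 0x0000FF00 asks "does pixel 1 need the flat filter?", a single "sb" writes back just that lane, and the "srl ..., 8" runs between blocks shift the next lane's f0 byte into position. A plain-C sketch of the lane selection (the function and names here are hypothetical, and it is simplified: the real code also splits 16-bit intermediates across the _r/_l half-words):

#include <stdint.h>

/* Store each filtered byte lane whose mask byte is set, mirroring the
   "sb" blocks guarded by 0x000000FF .. 0xFF000000 above. */
static void store_masked_lanes(uint8_t *dst, uint32_t mask,
                               uint32_t filtered) {
  int lane;
  for (lane = 0; lane < 4; ++lane) {
    if (mask & (0xFFu << (8 * lane))) {
      dst[lane] = (uint8_t)(filtered >> (8 * lane));
    }
  }
}
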
@@ -19,12 +19,9 @@
#include "vpx_mem/vpx_mem.h"

#if HAVE_DSPR2
static void mb_lpf_horizontal_edge(unsigned char *s,
                                   int pitch,
                                   const uint8_t *blimit,
                                   const uint8_t *limit,
                                   const uint8_t *thresh,
                                   int count) {
static void mb_lpf_horizontal_edge(unsigned char *s, int pitch,
                                   const uint8_t *blimit, const uint8_t *limit,
                                   const uint8_t *thresh, int count) {
  uint32_t mask;
  uint32_t hev, flat, flat2;
  uint8_t i;
@@ -53,8 +50,7 @@ static void mb_lpf_horizontal_edge(unsigned char *s,

      : [thresh_vec] "=&r"(thresh_vec), [flimit_vec] "=&r"(flimit_vec),
        [limit_vec] "=r"(limit_vec)
      : [uthresh] "r" (uthresh), [uflimit] "r" (uflimit), [ulimit] "r" (ulimit)
  );
      : [uthresh] "r"(uthresh), [uflimit] "r"(uflimit), [ulimit] "r"(ulimit));

  /* prefetch data for store */
  prefetch_store(s);
@@ -90,8 +86,7 @@ static void mb_lpf_horizontal_edge(unsigned char *s,
        : [p3] "=&r"(p3), [p2] "=&r"(p2), [p1] "=&r"(p1), [p0] "=&r"(p0),
          [p7] "=&r"(p7), [p6] "=&r"(p6), [p5] "=&r"(p5), [p4] "=&r"(p4)
        : [sp3] "r"(sp3), [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0),
          [sp4] "r" (sp4), [sp5] "r" (sp5), [sp6] "r" (sp6), [sp7] "r" (sp7)
    );
          [sp4] "r"(sp4), [sp5] "r"(sp5), [sp6] "r"(sp6), [sp7] "r"(sp7));

    __asm__ __volatile__(
        "lw     %[q0],      (%[sq0])            \n\t"
@@ -106,20 +101,17 @@ static void mb_lpf_horizontal_edge(unsigned char *s,
        : [q3] "=&r"(q3), [q2] "=&r"(q2), [q1] "=&r"(q1), [q0] "=&r"(q0),
          [q7] "=&r"(q7), [q6] "=&r"(q6), [q5] "=&r"(q5), [q4] "=&r"(q4)
        : [sq3] "r"(sq3), [sq2] "r"(sq2), [sq1] "r"(sq1), [sq0] "r"(sq0),
          [sq4] "r" (sq4), [sq5] "r" (sq5), [sq6] "r" (sq6), [sq7] "r" (sq7)
    );
          [sq4] "r"(sq4), [sq5] "r"(sq5), [sq6] "r"(sq6), [sq7] "r"(sq7));

    filter_hev_mask_flatmask4_dspr2(limit_vec, flimit_vec, thresh_vec,
                                    p1, p0, p3, p2, q0, q1, q2, q3,
                                    &hev, &mask, &flat);
    filter_hev_mask_flatmask4_dspr2(limit_vec, flimit_vec, thresh_vec, p1, p0,
                                    p3, p2, q0, q1, q2, q3, &hev, &mask, &flat);

    flatmask5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, &flat2);

    /* f0 */
    if (((flat2 == 0) && (flat == 0) && (mask != 0)) ||
        ((flat2 != 0) && (flat == 0) && (mask != 0))) {
      filter1_dspr2(mask, hev, p1, p0, q0, q1,
                    &p1_f0, &p0_f0, &q0_f0, &q1_f0);
      filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0);

      __asm__ __volatile__(
          "sw       %[p1_f0],   (%[sp1])            \n\t"
@@ -128,27 +120,23 @@ static void mb_lpf_horizontal_edge(unsigned char *s,
          "sw       %[q1_f0],   (%[sq1])            \n\t"

          :
          : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0),
            [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0),
            [sp1] "r" (sp1), [sp0] "r" (sp0),
            [sq0] "r" (sq0), [sq1] "r" (sq1)
      );
          : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
            [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0),
            [sq1] "r"(sq1));
    } else if ((flat2 == 0XFFFFFFFF) && (flat == 0xFFFFFFFF) &&
               (mask == 0xFFFFFFFF)) {
      /* f2 */
      PACK_LEFT_0TO3()
      PACK_LEFT_4TO7()
      wide_mbfilter_dspr2(&p7_l, &p6_l, &p5_l, &p4_l,
                          &p3_l, &p2_l, &p1_l, &p0_l,
                          &q0_l, &q1_l, &q2_l, &q3_l,
                          &q4_l, &q5_l, &q6_l, &q7_l);
      wide_mbfilter_dspr2(&p7_l, &p6_l, &p5_l, &p4_l, &p3_l, &p2_l, &p1_l,
                          &p0_l, &q0_l, &q1_l, &q2_l, &q3_l, &q4_l, &q5_l,
                          &q6_l, &q7_l);

      PACK_RIGHT_0TO3()
      PACK_RIGHT_4TO7()
      wide_mbfilter_dspr2(&p7_r, &p6_r, &p5_r, &p4_r,
                          &p3_r, &p2_r, &p1_r, &p0_r,
                          &q0_r, &q1_r, &q2_r, &q3_r,
                          &q4_r, &q5_r, &q6_r, &q7_r);
      wide_mbfilter_dspr2(&p7_r, &p6_r, &p5_r, &p4_r, &p3_r, &p2_r, &p1_r,
                          &p0_r, &q0_r, &q1_r, &q2_r, &q3_r, &q4_r, &q5_r,
                          &q6_r, &q7_r);

      COMBINE_LEFT_RIGHT_0TO2()
      COMBINE_LEFT_RIGHT_3TO6()
@@ -164,10 +152,9 @@ static void mb_lpf_horizontal_edge(unsigned char *s,

          :
          : [p6] "r"(p6), [p5] "r"(p5), [p4] "r"(p4), [p3] "r"(p3),
            [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0),
            [sp6] "r" (sp6), [sp5] "r" (sp5), [sp4] "r" (sp4), [sp3] "r" (sp3),
            [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0)
      );
            [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0), [sp6] "r"(sp6),
            [sp5] "r"(sp5), [sp4] "r"(sp4), [sp3] "r"(sp3), [sp2] "r"(sp2),
            [sp1] "r"(sp1), [sp0] "r"(sp0));

      __asm__ __volatile__(
          "sw         %[q6], (%[sq6])    \n\t"
@@ -180,21 +167,18 @@ static void mb_lpf_horizontal_edge(unsigned char *s,

          :
          : [q6] "r"(q6), [q5] "r"(q5), [q4] "r"(q4), [q3] "r"(q3),
            [q2] "r" (q2), [q1] "r" (q1), [q0] "r" (q0),
            [sq6] "r" (sq6), [sq5] "r" (sq5), [sq4] "r" (sq4), [sq3] "r" (sq3),
            [sq2] "r" (sq2), [sq1] "r" (sq1), [sq0] "r" (sq0)
      );
            [q2] "r"(q2), [q1] "r"(q1), [q0] "r"(q0), [sq6] "r"(sq6),
            [sq5] "r"(sq5), [sq4] "r"(sq4), [sq3] "r"(sq3), [sq2] "r"(sq2),
            [sq1] "r"(sq1), [sq0] "r"(sq0));
    } else if ((flat2 == 0) && (flat == 0xFFFFFFFF) && (mask == 0xFFFFFFFF)) {
      /* f1 */
      /* left 2 element operation */
      PACK_LEFT_0TO3()
      mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l,
                     &q0_l, &q1_l, &q2_l, &q3_l);
      mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, &q0_l, &q1_l, &q2_l, &q3_l);

      /* right 2 element operation */
      PACK_RIGHT_0TO3()
      mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r,
                     &q0_r, &q1_r, &q2_r, &q3_r);
      mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, &q0_r, &q1_r, &q2_r, &q3_r);

      COMBINE_LEFT_RIGHT_0TO2()

@@ -207,25 +191,20 @@ static void mb_lpf_horizontal_edge(unsigned char *s,
          "sw         %[q2], (%[sq2])    \n\t"

          :
          : [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0),
            [q0] "r" (q0), [q1] "r" (q1), [q2] "r" (q2),
            [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0),
            [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2)
      );
          : [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0), [q0] "r"(q0),
            [q1] "r"(q1), [q2] "r"(q2), [sp2] "r"(sp2), [sp1] "r"(sp1),
            [sp0] "r"(sp0), [sq0] "r"(sq0), [sq1] "r"(sq1), [sq2] "r"(sq2));
    } else if ((flat2 == 0) && (flat != 0) && (mask != 0)) {
      /* f0+f1 */
      filter1_dspr2(mask, hev, p1, p0, q0, q1,
                    &p1_f0, &p0_f0, &q0_f0, &q1_f0);
      filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0);

      /* left 2 element operation */
      PACK_LEFT_0TO3()
      mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l,
                     &q0_l, &q1_l, &q2_l, &q3_l);
      mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, &q0_l, &q1_l, &q2_l, &q3_l);

      /* right 2 element operation */
      PACK_RIGHT_0TO3()
      mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r,
                     &q0_r, &q1_r, &q2_r, &q3_r);
      mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, &q0_r, &q1_r, &q2_r, &q3_r);

      if (mask & flat & 0x000000FF) {
        __asm__ __volatile__(
@@ -239,9 +218,8 @@ static void mb_lpf_horizontal_edge(unsigned char *s,
            :
            : [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), [p0_r] "r"(p0_r),
              [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r),
              [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0),
              [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2)
        );
              [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0),
              [sq1] "r"(sq1), [sq2] "r"(sq2));
      } else if (mask & 0x000000FF) {
        __asm__ __volatile__(
            "sb         %[p1_f0],  (%[sp1])    \n\t"
@@ -250,11 +228,9 @@ static void mb_lpf_horizontal_edge(unsigned char *s,
            "sb         %[q1_f0],  (%[sq1])    \n\t"

            :
            : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0),
              [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0),
              [sp1] "r" (sp1), [sp0] "r" (sp0),
              [sq0] "r" (sq0), [sq1] "r" (sq1)
        );
            : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
              [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0),
              [sq0] "r"(sq0), [sq1] "r"(sq1));
      }

      __asm__ __volatile__(
@@ -271,10 +247,9 @@ static void mb_lpf_horizontal_edge(unsigned char *s,

          : [p2_r] "+r"(p2_r), [p1_r] "+r"(p1_r), [p0_r] "+r"(p0_r),
            [q0_r] "+r"(q0_r), [q1_r] "+r"(q1_r), [q2_r] "+r"(q2_r),
            [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0),
            [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0)
          :
      );
            [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0),
            [q1_f0] "+r"(q1_f0)
          :);

      if (mask & flat & 0x0000FF00) {
        __asm__ __volatile__(
@@ -288,9 +263,8 @@ static void mb_lpf_horizontal_edge(unsigned char *s,
            :
            : [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), [p0_r] "r"(p0_r),
              [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r),
              [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0),
              [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2)
        );
              [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0),
              [sq1] "r"(sq1), [sq2] "r"(sq2));
      } else if (mask & 0x0000FF00) {
        __asm__ __volatile__(
            "sb         %[p1_f0],  +1(%[sp1])    \n\t"
@@ -299,11 +273,9 @@ static void mb_lpf_horizontal_edge(unsigned char *s,
            "sb         %[q1_f0],  +1(%[sq1])    \n\t"

            :
            : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0),
              [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0),
              [sp1] "r" (sp1), [sp0] "r" (sp0),
              [sq0] "r" (sq0), [sq1] "r" (sq1)
        );
            : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
              [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0),
              [sq0] "r"(sq0), [sq1] "r"(sq1));
      }

      __asm__ __volatile__(
@@ -312,10 +284,9 @@ static void mb_lpf_horizontal_edge(unsigned char *s,
          "srl      %[q0_f0],   %[q0_f0],   8     \n\t"
          "srl      %[q1_f0],   %[q1_f0],   8     \n\t"

          : [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0),
            [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0)
          :
      );
          : [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0),
            [q1_f0] "+r"(q1_f0)
          :);

      if (mask & flat & 0x00FF0000) {
        __asm__ __volatile__(
@@ -329,9 +300,8 @@ static void mb_lpf_horizontal_edge(unsigned char *s,
            :
            : [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), [p0_l] "r"(p0_l),
              [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l),
              [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0),
              [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2)
        );
              [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0),
              [sq1] "r"(sq1), [sq2] "r"(sq2));
      } else if (mask & 0x00FF0000) {
        __asm__ __volatile__(
            "sb         %[p1_f0],  +2(%[sp1])    \n\t"
@@ -340,11 +310,9 @@ static void mb_lpf_horizontal_edge(unsigned char *s,
            "sb         %[q1_f0],  +2(%[sq1])    \n\t"

            :
            : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0),
              [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0),
              [sp1] "r" (sp1), [sp0] "r" (sp0),
              [sq0] "r" (sq0), [sq1] "r" (sq1)
        );
            : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
              [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0),
              [sq0] "r"(sq0), [sq1] "r"(sq1));
      }

      __asm__ __volatile__(
@@ -361,10 +329,9 @@ static void mb_lpf_horizontal_edge(unsigned char *s,

          : [p2_l] "+r"(p2_l), [p1_l] "+r"(p1_l), [p0_l] "+r"(p0_l),
            [q0_l] "+r"(q0_l), [q1_l] "+r"(q1_l), [q2_l] "+r"(q2_l),
            [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0),
            [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0)
          :
      );
            [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0),
            [q1_f0] "+r"(q1_f0)
          :);

      if (mask & flat & 0xFF000000) {
        __asm__ __volatile__(
@@ -378,9 +345,8 @@ static void mb_lpf_horizontal_edge(unsigned char *s,
            :
            : [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), [p0_l] "r"(p0_l),
              [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l),
              [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0),
              [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2)
        );
              [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0),
              [sq1] "r"(sq1), [sq2] "r"(sq2));
      } else if (mask & 0xFF000000) {
        __asm__ __volatile__(
            "sb         %[p1_f0],  +3(%[sp1])    \n\t"
@@ -389,45 +355,36 @@ static void mb_lpf_horizontal_edge(unsigned char *s,
            "sb         %[q1_f0],  +3(%[sq1])    \n\t"

            :
            : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0),
              [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0),
              [sp1] "r" (sp1), [sp0] "r" (sp0),
              [sq0] "r" (sq0), [sq1] "r" (sq1)
        );
            : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
              [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0),
              [sq0] "r"(sq0), [sq1] "r"(sq1));
      }
    } else if ((flat2 != 0) && (flat != 0) && (mask != 0)) {
      /* f0 + f1 + f2 */
      /* f0  function */
      filter1_dspr2(mask, hev, p1, p0, q0, q1,
                    &p1_f0, &p0_f0, &q0_f0, &q1_f0);
      filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0);

      /* f1  function */
      /* left 2 element operation */
      PACK_LEFT_0TO3()
      mbfilter1_dspr2(p3_l, p2_l, p1_l, p0_l,
                      q0_l, q1_l, q2_l, q3_l,
                      &p2_l_f1, &p1_l_f1, &p0_l_f1,
                      &q0_l_f1, &q1_l_f1, &q2_l_f1);
      mbfilter1_dspr2(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, &p2_l_f1,
                      &p1_l_f1, &p0_l_f1, &q0_l_f1, &q1_l_f1, &q2_l_f1);

      /* right 2 element operation */
      PACK_RIGHT_0TO3()
      mbfilter1_dspr2(p3_r, p2_r, p1_r, p0_r,
                      q0_r, q1_r, q2_r, q3_r,
                      &p2_r_f1, &p1_r_f1, &p0_r_f1,
                      &q0_r_f1, &q1_r_f1, &q2_r_f1);
      mbfilter1_dspr2(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, &p2_r_f1,
                      &p1_r_f1, &p0_r_f1, &q0_r_f1, &q1_r_f1, &q2_r_f1);

      /* f2  function */
      PACK_LEFT_4TO7()
      wide_mbfilter_dspr2(&p7_l, &p6_l, &p5_l, &p4_l,
                          &p3_l, &p2_l, &p1_l, &p0_l,
                          &q0_l, &q1_l, &q2_l, &q3_l,
                          &q4_l, &q5_l, &q6_l, &q7_l);
      wide_mbfilter_dspr2(&p7_l, &p6_l, &p5_l, &p4_l, &p3_l, &p2_l, &p1_l,
                          &p0_l, &q0_l, &q1_l, &q2_l, &q3_l, &q4_l, &q5_l,
                          &q6_l, &q7_l);

      PACK_RIGHT_4TO7()
      wide_mbfilter_dspr2(&p7_r, &p6_r, &p5_r, &p4_r,
                          &p3_r, &p2_r, &p1_r, &p0_r,
                          &q0_r, &q1_r, &q2_r, &q3_r,
                          &q4_r, &q5_r, &q6_r, &q7_r);
      wide_mbfilter_dspr2(&p7_r, &p6_r, &p5_r, &p4_r, &p3_r, &p2_r, &p1_r,
                          &p0_r, &q0_r, &q1_r, &q2_r, &q3_r, &q4_r, &q5_r,
                          &q6_r, &q7_r);

      if (mask & flat & flat2 & 0x000000FF) {
        __asm__ __volatile__(
@@ -442,10 +399,8 @@ static void mb_lpf_horizontal_edge(unsigned char *s,
            :
            : [p6_r] "r"(p6_r), [p5_r] "r"(p5_r), [p4_r] "r"(p4_r),
              [p3_r] "r"(p3_r), [p2_r] "r"(p2_r), [p1_r] "r"(p1_r),
              [sp6] "r" (sp6), [sp5] "r" (sp5), [sp4] "r" (sp4),
              [sp3] "r" (sp3), [sp2] "r" (sp2), [sp1] "r" (sp1),
              [p0_r] "r" (p0_r), [sp0] "r" (sp0)
        );
              [sp6] "r"(sp6), [sp5] "r"(sp5), [sp4] "r"(sp4), [sp3] "r"(sp3),
              [sp2] "r"(sp2), [sp1] "r"(sp1), [p0_r] "r"(p0_r), [sp0] "r"(sp0));

        __asm__ __volatile__(
            "sb         %[q0_r],  (%[sq0])    \n\t"
@@ -459,11 +414,8 @@ static void mb_lpf_horizontal_edge(unsigned char *s,
            :
            : [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r),
              [q3_r] "r"(q3_r), [q4_r] "r"(q4_r), [q5_r] "r"(q5_r),
              [q6_r] "r" (q6_r),
              [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2),
              [sq3] "r" (sq3), [sq4] "r" (sq4), [sq5] "r" (sq5),
              [sq6] "r" (sq6)
        );
              [q6_r] "r"(q6_r), [sq0] "r"(sq0), [sq1] "r"(sq1), [sq2] "r"(sq2),
              [sq3] "r"(sq3), [sq4] "r"(sq4), [sq5] "r"(sq5), [sq6] "r"(sq6));
      } else if (mask & flat & 0x000000FF) {
        __asm__ __volatile__(
            "sb         %[p2_r_f1],  (%[sp2])    \n\t"
@@ -476,10 +428,9 @@ static void mb_lpf_horizontal_edge(unsigned char *s,
            :
            : [p2_r_f1] "r"(p2_r_f1), [p1_r_f1] "r"(p1_r_f1),
              [p0_r_f1] "r"(p0_r_f1), [q0_r_f1] "r"(q0_r_f1),
              [q1_r_f1] "r" (q1_r_f1), [q2_r_f1] "r" (q2_r_f1),
              [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0),
              [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2)
        );
              [q1_r_f1] "r"(q1_r_f1), [q2_r_f1] "r"(q2_r_f1), [sp2] "r"(sp2),
              [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), [sq1] "r"(sq1),
              [sq2] "r"(sq2));
      } else if (mask & 0x000000FF) {
        __asm__ __volatile__(
            "sb         %[p1_f0],  (%[sp1])    \n\t"
@@ -490,8 +441,7 @@ static void mb_lpf_horizontal_edge(unsigned char *s,
            :
            : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
              [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0),
              [sq0] "r" (sq0), [sq1] "r" (sq1)
        );
              [sq0] "r"(sq0), [sq1] "r"(sq1));
      }

      __asm__ __volatile__(
@@ -515,8 +465,7 @@ static void mb_lpf_horizontal_edge(unsigned char *s,
            [p6_r] "+r"(p6_r), [p5_r] "+r"(p5_r), [p4_r] "+r"(p4_r),
            [p3_r] "+r"(p3_r), [p2_r] "+r"(p2_r), [p1_r] "+r"(p1_r),
            [q6_r] "+r"(q6_r), [p0_r] "+r"(p0_r)
          :
      );
          :);

      __asm__ __volatile__(
          "srl        %[p2_r_f1], %[p2_r_f1], 16     \n\t"
@@ -533,10 +482,9 @@ static void mb_lpf_horizontal_edge(unsigned char *s,
          : [p2_r_f1] "+r"(p2_r_f1), [p1_r_f1] "+r"(p1_r_f1),
            [p0_r_f1] "+r"(p0_r_f1), [q0_r_f1] "+r"(q0_r_f1),
            [q1_r_f1] "+r"(q1_r_f1), [q2_r_f1] "+r"(q2_r_f1),
            [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0),
            [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0)
          :
      );
            [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0),
            [q1_f0] "+r"(q1_f0)
          :);

      if (mask & flat & flat2 & 0x0000FF00) {
        __asm__ __volatile__(
@@ -551,10 +499,8 @@ static void mb_lpf_horizontal_edge(unsigned char *s,
            :
            : [p6_r] "r"(p6_r), [p5_r] "r"(p5_r), [p4_r] "r"(p4_r),
              [p3_r] "r"(p3_r), [p2_r] "r"(p2_r), [p1_r] "r"(p1_r),
              [p0_r] "r" (p0_r), [sp6] "r" (sp6), [sp5] "r" (sp5),
              [sp4] "r" (sp4), [sp3] "r" (sp3),
              [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0)
        );
              [p0_r] "r"(p0_r), [sp6] "r"(sp6), [sp5] "r"(sp5), [sp4] "r"(sp4),
              [sp3] "r"(sp3), [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0));

        __asm__ __volatile__(
            "sb         %[q0_r],  +1(%[sq0])    \n\t"
@@ -568,10 +514,8 @@ static void mb_lpf_horizontal_edge(unsigned char *s,
            :
            : [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r),
              [q3_r] "r"(q3_r), [q4_r] "r"(q4_r), [q5_r] "r"(q5_r),
              [q6_r] "r" (q6_r), [sq0] "r" (sq0), [sq1] "r" (sq1),
              [sq2] "r" (sq2), [sq3] "r" (sq3),
              [sq4] "r" (sq4), [sq5] "r" (sq5), [sq6] "r" (sq6)
        );
              [q6_r] "r"(q6_r), [sq0] "r"(sq0), [sq1] "r"(sq1), [sq2] "r"(sq2),
              [sq3] "r"(sq3), [sq4] "r"(sq4), [sq5] "r"(sq5), [sq6] "r"(sq6));
      } else if (mask & flat & 0x0000FF00) {
        __asm__ __volatile__(
            "sb         %[p2_r_f1],  +1(%[sp2])    \n\t"
@@ -584,10 +528,9 @@ static void mb_lpf_horizontal_edge(unsigned char *s,
            :
            : [p2_r_f1] "r"(p2_r_f1), [p1_r_f1] "r"(p1_r_f1),
              [p0_r_f1] "r"(p0_r_f1), [q0_r_f1] "r"(q0_r_f1),
              [q1_r_f1] "r" (q1_r_f1), [q2_r_f1] "r" (q2_r_f1),
              [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0),
              [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2)
        );
              [q1_r_f1] "r"(q1_r_f1), [q2_r_f1] "r"(q2_r_f1), [sp2] "r"(sp2),
              [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), [sq1] "r"(sq1),
              [sq2] "r"(sq2));
      } else if (mask & 0x0000FF00) {
        __asm__ __volatile__(
            "sb         %[p1_f0],  +1(%[sp1])    \n\t"
@@ -598,8 +541,7 @@ static void mb_lpf_horizontal_edge(unsigned char *s,
            :
            : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
              [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0),
              [sq0] "r" (sq0), [sq1] "r" (sq1)
        );
              [sq0] "r"(sq0), [sq1] "r"(sq1));
      }

      __asm__ __volatile__(
@@ -608,10 +550,9 @@ static void mb_lpf_horizontal_edge(unsigned char *s,
          "srl        %[q0_f0], %[q0_f0], 8     \n\t"
          "srl        %[q1_f0], %[q1_f0], 8     \n\t"

          : [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0),
            [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0)
          :
      );
          : [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0),
            [q1_f0] "+r"(q1_f0)
          :);

      if (mask & flat & flat2 & 0x00FF0000) {
        __asm__ __volatile__(
@@ -626,10 +567,8 @@ static void mb_lpf_horizontal_edge(unsigned char *s,
            :
            : [p6_l] "r"(p6_l), [p5_l] "r"(p5_l), [p4_l] "r"(p4_l),
              [p3_l] "r"(p3_l), [p2_l] "r"(p2_l), [p1_l] "r"(p1_l),
              [p0_l] "r" (p0_l), [sp6] "r" (sp6), [sp5] "r" (sp5),
              [sp4] "r" (sp4), [sp3] "r" (sp3),
              [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0)
        );
              [p0_l] "r"(p0_l), [sp6] "r"(sp6), [sp5] "r"(sp5), [sp4] "r"(sp4),
              [sp3] "r"(sp3), [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0));

        __asm__ __volatile__(
            "sb         %[q0_l],  +2(%[sq0])    \n\t"
@@ -643,10 +582,8 @@ static void mb_lpf_horizontal_edge(unsigned char *s,
            :
            : [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l),
              [q3_l] "r"(q3_l), [q4_l] "r"(q4_l), [q5_l] "r"(q5_l),
              [q6_l] "r" (q6_l), [sq0] "r" (sq0), [sq1] "r" (sq1),
              [sq2] "r" (sq2), [sq3] "r" (sq3),
              [sq4] "r" (sq4), [sq5] "r" (sq5), [sq6] "r" (sq6)
        );
              [q6_l] "r"(q6_l), [sq0] "r"(sq0), [sq1] "r"(sq1), [sq2] "r"(sq2),
              [sq3] "r"(sq3), [sq4] "r"(sq4), [sq5] "r"(sq5), [sq6] "r"(sq6));
      } else if (mask & flat & 0x00FF0000) {
        __asm__ __volatile__(
            "sb         %[p2_l_f1],  +2(%[sp2])    \n\t"
@@ -659,10 +596,9 @@ static void mb_lpf_horizontal_edge(unsigned char *s,
            :
            : [p2_l_f1] "r"(p2_l_f1), [p1_l_f1] "r"(p1_l_f1),
              [p0_l_f1] "r"(p0_l_f1), [q0_l_f1] "r"(q0_l_f1),
              [q1_l_f1] "r" (q1_l_f1), [q2_l_f1] "r" (q2_l_f1),
              [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0),
              [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2)
        );
              [q1_l_f1] "r"(q1_l_f1), [q2_l_f1] "r"(q2_l_f1), [sp2] "r"(sp2),
              [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), [sq1] "r"(sq1),
              [sq2] "r"(sq2));
      } else if (mask & 0x00FF0000) {
        __asm__ __volatile__(
            "sb         %[p1_f0],  +2(%[sp1])    \n\t"
@@ -673,8 +609,7 @@ static void mb_lpf_horizontal_edge(unsigned char *s,
            :
            : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
              [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0),
              [sq0] "r" (sq0), [sq1] "r" (sq1)
        );
              [sq0] "r"(sq0), [sq1] "r"(sq1));
      }

      __asm__ __volatile__(
@@ -698,8 +633,7 @@ static void mb_lpf_horizontal_edge(unsigned char *s,
            [q6_l] "+r"(q6_l), [p6_l] "+r"(p6_l), [p5_l] "+r"(p5_l),
            [p4_l] "+r"(p4_l), [p3_l] "+r"(p3_l), [p2_l] "+r"(p2_l),
            [p1_l] "+r"(p1_l), [p0_l] "+r"(p0_l)
          :
      );
          :);

      __asm__ __volatile__(
          "srl      %[p2_l_f1],   %[p2_l_f1],   16   \n\t"
@@ -716,10 +650,9 @@ static void mb_lpf_horizontal_edge(unsigned char *s,
          : [p2_l_f1] "+r"(p2_l_f1), [p1_l_f1] "+r"(p1_l_f1),
            [p0_l_f1] "+r"(p0_l_f1), [q0_l_f1] "+r"(q0_l_f1),
            [q1_l_f1] "+r"(q1_l_f1), [q2_l_f1] "+r"(q2_l_f1),
            [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0),
            [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0)
          :
      );
            [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0),
            [q1_f0] "+r"(q1_f0)
          :);

      if (mask & flat & flat2 & 0xFF000000) {
        __asm__ __volatile__(
@@ -734,10 +667,8 @@ static void mb_lpf_horizontal_edge(unsigned char *s,
            :
            : [p6_l] "r"(p6_l), [p5_l] "r"(p5_l), [p4_l] "r"(p4_l),
              [p3_l] "r"(p3_l), [p2_l] "r"(p2_l), [p1_l] "r"(p1_l),
              [p0_l] "r" (p0_l), [sp6] "r" (sp6), [sp5] "r" (sp5),
              [sp4] "r" (sp4), [sp3] "r" (sp3), [sp2] "r" (sp2),
              [sp1] "r" (sp1), [sp0] "r" (sp0)
        );
              [p0_l] "r"(p0_l), [sp6] "r"(sp6), [sp5] "r"(sp5), [sp4] "r"(sp4),
              [sp3] "r"(sp3), [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0));

        __asm__ __volatile__(
            "sb     %[q0_l],    +3(%[sq0])    \n\t"
@@ -749,13 +680,10 @@ static void mb_lpf_horizontal_edge(unsigned char *s,
            "sb     %[q6_l],    +3(%[sq6])    \n\t"

            :
            : [q0_l] "r" (q0_l), [q1_l] "r" (q1_l),
              [q2_l] "r" (q2_l), [q3_l] "r" (q3_l),
              [q4_l] "r" (q4_l), [q5_l] "r" (q5_l),
              [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2),
              [sq3] "r" (sq3), [sq4] "r" (sq4), [sq5] "r" (sq5),
              [q6_l] "r" (q6_l), [sq6] "r" (sq6)
        );
            : [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l),
              [q3_l] "r"(q3_l), [q4_l] "r"(q4_l), [q5_l] "r"(q5_l),
              [sq0] "r"(sq0), [sq1] "r"(sq1), [sq2] "r"(sq2), [sq3] "r"(sq3),
              [sq4] "r"(sq4), [sq5] "r"(sq5), [q6_l] "r"(q6_l), [sq6] "r"(sq6));
      } else if (mask & flat & 0xFF000000) {
        __asm__ __volatile__(
            "sb     %[p2_l_f1],     +3(%[sp2])    \n\t"
@@ -768,10 +696,9 @@ static void mb_lpf_horizontal_edge(unsigned char *s,
            :
            : [p2_l_f1] "r"(p2_l_f1), [p1_l_f1] "r"(p1_l_f1),
              [p0_l_f1] "r"(p0_l_f1), [q0_l_f1] "r"(q0_l_f1),
              [q1_l_f1] "r" (q1_l_f1), [q2_l_f1] "r" (q2_l_f1),
              [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0),
              [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2)
        );
              [q1_l_f1] "r"(q1_l_f1), [q2_l_f1] "r"(q2_l_f1), [sp2] "r"(sp2),
              [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), [sq1] "r"(sq1),
              [sq2] "r"(sq2));
      } else if (mask & 0xFF000000) {
        __asm__ __volatile__(
            "sb     %[p1_f0],   +3(%[sp1])    \n\t"
@@ -780,11 +707,9 @@ static void mb_lpf_horizontal_edge(unsigned char *s,
            "sb     %[q1_f0],   +3(%[sq1])    \n\t"

            :
            : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0),
              [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0),
              [sp1] "r" (sp1), [sp0] "r" (sp0),
              [sq0] "r" (sq0), [sq1] "r" (sq1)
        );
            : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
              [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0),
              [sq0] "r"(sq0), [sq1] "r"(sq1));
      }
    }

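mb_lpf_horizontal_edge picks a filter tier per 4-pixel group from three word-wide flags: mask (filter at all), flat (use the 7-tap mb filter), and flat2 (use the 15-tap wide filter). All-zero or all-ones words take the fast paths f0/f1/f2; anything mixed computes every applicable tier and merges per byte lane. A rough sketch of the dispatch, with stub filters standing in for filter1_dspr2, mbfilter*_dspr2 and wide_mbfilter_dspr2 (only the branch structure is taken from the hunks above):

#include <stdint.h>

static void f0(void) {} /* stub: short 4-pixel filter */
static void f1(void) {} /* stub: 7-tap mb filter */
static void f2(void) {} /* stub: 15-tap wide filter */

static void dispatch_tiers(uint32_t mask, uint32_t flat, uint32_t flat2) {
  if ((flat == 0) && (mask != 0)) {
    f0(); /* no lane is flat: short filter only */
  } else if ((flat2 == 0xFFFFFFFF) && (flat == 0xFFFFFFFF) &&
             (mask == 0xFFFFFFFF)) {
    f2(); /* every lane very flat: wide filter everywhere */
  } else if ((flat2 == 0) && (flat == 0xFFFFFFFF) && (mask == 0xFFFFFFFF)) {
    f1(); /* every lane flat: mb filter everywhere */
  } else if ((flat2 == 0) && (flat != 0) && (mask != 0)) {
    f0(); /* mixed: compute f0 and f1, select per byte lane */
    f1();
  } else if ((flat2 != 0) && (flat != 0) && (mask != 0)) {
    f0(); /* mixed: compute all three tiers, select per byte lane */
    f1();
    f2();
  }
}
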
@@ -19,11 +19,8 @@
#include "vpx_mem/vpx_mem.h"

#if HAVE_DSPR2
void vpx_lpf_vertical_16_dspr2(uint8_t *s,
                               int pitch,
                               const uint8_t *blimit,
                               const uint8_t *limit,
                               const uint8_t *thresh) {
void vpx_lpf_vertical_16_dspr2(uint8_t *s, int pitch, const uint8_t *blimit,
                               const uint8_t *limit, const uint8_t *thresh) {
  uint8_t i;
  uint32_t mask, hev, flat, flat2;
  uint8_t *s1, *s2, *s3, *s4;
@@ -51,8 +48,7 @@ void vpx_lpf_vertical_16_dspr2(uint8_t *s,

      : [thresh_vec] "=&r"(thresh_vec), [flimit_vec] "=&r"(flimit_vec),
        [limit_vec] "=r"(limit_vec)
      : [uthresh] "r" (uthresh), [uflimit] "r" (uflimit), [ulimit] "r" (ulimit)
  );
      : [uthresh] "r"(uthresh), [uflimit] "r"(uflimit), [ulimit] "r"(ulimit));

  prefetch_store(s + pitch);

@@ -73,11 +69,9 @@ void vpx_lpf_vertical_16_dspr2(uint8_t *s,
        "lw     %[p6],  -8(%[s3])    \n\t"
        "lw     %[p7],  -8(%[s4])    \n\t"

        : [p3] "=&r" (p3), [p2] "=&r" (p2), [p1] "=&r" (p1),
          [p0] "=&r" (p0), [p7] "=&r" (p7), [p6] "=&r" (p6),
          [p5] "=&r" (p5), [p4] "=&r" (p4)
        : [s1] "r" (s1), [s2] "r" (s2), [s3] "r" (s3), [s4] "r" (s4)
    );
        : [p3] "=&r"(p3), [p2] "=&r"(p2), [p1] "=&r"(p1), [p0] "=&r"(p0),
          [p7] "=&r"(p7), [p6] "=&r"(p6), [p5] "=&r"(p5), [p4] "=&r"(p4)
        : [s1] "r"(s1), [s2] "r"(s2), [s3] "r"(s3), [s4] "r"(s4));

    __asm__ __volatile__(
        "lw     %[q3],  (%[s1])     \n\t"
@@ -89,11 +83,9 @@ void vpx_lpf_vertical_16_dspr2(uint8_t *s,
        "lw     %[q5],  +4(%[s3])   \n\t"
        "lw     %[q4],  +4(%[s4])   \n\t"

        : [q3] "=&r" (q3), [q2] "=&r" (q2), [q1] "=&r" (q1),
          [q0] "=&r" (q0), [q7] "=&r" (q7), [q6] "=&r" (q6),
          [q5] "=&r" (q5), [q4] "=&r" (q4)
        : [s1] "r" (s1), [s2] "r" (s2), [s3] "r" (s3), [s4] "r" (s4)
    );
        : [q3] "=&r"(q3), [q2] "=&r"(q2), [q1] "=&r"(q1), [q0] "=&r"(q0),
          [q7] "=&r"(q7), [q6] "=&r"(q6), [q5] "=&r"(q5), [q4] "=&r"(q4)
        : [s1] "r"(s1), [s2] "r"(s2), [s3] "r"(s3), [s4] "r"(s4));

    /* transpose p3, p2, p1, p0
       original (when loaded from memory)
@@ -126,12 +118,10 @@ void vpx_lpf_vertical_16_dspr2(uint8_t *s,
        "append         %[p1],      %[sec3],    16          \n\t"
        "append         %[p3],      %[sec4],    16          \n\t"

        : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
          [prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
          [p0] "+r" (p0), [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3),
          [sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
        :
    );
        : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3),
          [prim4] "=&r"(prim4), [p0] "+r"(p0), [p1] "+r"(p1), [p2] "+r"(p2),
          [p3] "+r"(p3), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4)
        :);

    /* transpose q0, q1, q2, q3
       original (when loaded from memory)
@@ -164,12 +154,10 @@ void vpx_lpf_vertical_16_dspr2(uint8_t *s,
        "append         %[q2],      %[sec3],    16          \n\t"
        "append         %[q0],      %[sec4],    16          \n\t"

        : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
          [prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
          [q3] "+r" (q3), [q2] "+r" (q2), [q1] "+r" (q1), [q0] "+r" (q0),
          [sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
		||||
        :
 | 
			
		||||
    );
 | 
			
		||||
        : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3),
 | 
			
		||||
          [prim4] "=&r"(prim4), [q3] "+r"(q3), [q2] "+r"(q2), [q1] "+r"(q1),
 | 
			
		||||
          [q0] "+r"(q0), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4)
 | 
			
		||||
        :);
 | 
			
		||||
 | 
			
		||||
    /* transpose p7, p6, p5, p4
 | 
			
		||||
       original (when loaded from memory)
 | 
			
		||||
@@ -202,12 +190,10 @@ void vpx_lpf_vertical_16_dspr2(uint8_t *s,
 | 
			
		||||
        "append         %[p5],      %[sec3],    16          \n\t"
 | 
			
		||||
        "append         %[p7],      %[sec4],    16          \n\t"
 | 
			
		||||
 | 
			
		||||
        : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
 | 
			
		||||
          [prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
 | 
			
		||||
          [p4] "+r" (p4), [p5] "+r" (p5), [p6] "+r" (p6), [p7] "+r" (p7),
 | 
			
		||||
          [sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
 | 
			
		||||
        :
 | 
			
		||||
    );
 | 
			
		||||
        : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3),
 | 
			
		||||
          [prim4] "=&r"(prim4), [p4] "+r"(p4), [p5] "+r"(p5), [p6] "+r"(p6),
 | 
			
		||||
          [p7] "+r"(p7), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4)
 | 
			
		||||
        :);
 | 
			
		||||
 | 
			
		||||
    /* transpose q4, q5, q6, q7
 | 
			
		||||
       original (when loaded from memory)
 | 
			
		||||
@@ -240,68 +226,57 @@ void vpx_lpf_vertical_16_dspr2(uint8_t *s,
 | 
			
		||||
        "append         %[q6],      %[sec3],    16          \n\t"
 | 
			
		||||
        "append         %[q4],      %[sec4],    16          \n\t"
 | 
			
		||||
 | 
			
		||||
        : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
 | 
			
		||||
          [prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
 | 
			
		||||
          [q7] "+r" (q7), [q6] "+r" (q6), [q5] "+r" (q5), [q4] "+r" (q4),
 | 
			
		||||
          [sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
 | 
			
		||||
        :
 | 
			
		||||
    );
 | 
			
		||||
        : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3),
 | 
			
		||||
          [prim4] "=&r"(prim4), [q7] "+r"(q7), [q6] "+r"(q6), [q5] "+r"(q5),
 | 
			
		||||
          [q4] "+r"(q4), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4)
 | 
			
		||||
        :);
 | 
			
		||||
 | 
			
		||||
    filter_hev_mask_flatmask4_dspr2(limit_vec, flimit_vec, thresh_vec,
 | 
			
		||||
                                    p1, p0, p3, p2, q0, q1, q2, q3,
 | 
			
		||||
                                    &hev, &mask, &flat);
 | 
			
		||||
    filter_hev_mask_flatmask4_dspr2(limit_vec, flimit_vec, thresh_vec, p1, p0,
 | 
			
		||||
                                    p3, p2, q0, q1, q2, q3, &hev, &mask, &flat);
 | 
			
		||||
 | 
			
		||||
    flatmask5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, &flat2);
 | 
			
		||||
 | 
			
		||||
    /* f0 */
 | 
			
		||||
    if (((flat2 == 0) && (flat == 0) && (mask != 0)) ||
 | 
			
		||||
        ((flat2 != 0) && (flat == 0) && (mask != 0))) {
 | 
			
		||||
      filter1_dspr2(mask, hev, p1, p0, q0, q1,
 | 
			
		||||
                    &p1_f0, &p0_f0, &q0_f0, &q1_f0);
 | 
			
		||||
      filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0);
 | 
			
		||||
      STORE_F0()
 | 
			
		||||
    } else if ((flat2 == 0XFFFFFFFF) && (flat == 0xFFFFFFFF) &&
 | 
			
		||||
               (mask == 0xFFFFFFFF)) {
 | 
			
		||||
      /* f2 */
 | 
			
		||||
      PACK_LEFT_0TO3()
 | 
			
		||||
      PACK_LEFT_4TO7()
 | 
			
		||||
      wide_mbfilter_dspr2(&p7_l, &p6_l, &p5_l, &p4_l,
 | 
			
		||||
                          &p3_l, &p2_l, &p1_l, &p0_l,
 | 
			
		||||
                          &q0_l, &q1_l, &q2_l, &q3_l,
 | 
			
		||||
                          &q4_l, &q5_l, &q6_l, &q7_l);
 | 
			
		||||
      wide_mbfilter_dspr2(&p7_l, &p6_l, &p5_l, &p4_l, &p3_l, &p2_l, &p1_l,
 | 
			
		||||
                          &p0_l, &q0_l, &q1_l, &q2_l, &q3_l, &q4_l, &q5_l,
 | 
			
		||||
                          &q6_l, &q7_l);
 | 
			
		||||
 | 
			
		||||
      PACK_RIGHT_0TO3()
 | 
			
		||||
      PACK_RIGHT_4TO7()
 | 
			
		||||
      wide_mbfilter_dspr2(&p7_r, &p6_r, &p5_r, &p4_r,
 | 
			
		||||
                          &p3_r, &p2_r, &p1_r, &p0_r,
 | 
			
		||||
                          &q0_r, &q1_r, &q2_r, &q3_r,
 | 
			
		||||
                          &q4_r, &q5_r, &q6_r, &q7_r);
 | 
			
		||||
      wide_mbfilter_dspr2(&p7_r, &p6_r, &p5_r, &p4_r, &p3_r, &p2_r, &p1_r,
 | 
			
		||||
                          &p0_r, &q0_r, &q1_r, &q2_r, &q3_r, &q4_r, &q5_r,
 | 
			
		||||
                          &q6_r, &q7_r);
 | 
			
		||||
 | 
			
		||||
      STORE_F2()
 | 
			
		||||
    } else if ((flat2 == 0) && (flat == 0xFFFFFFFF) && (mask == 0xFFFFFFFF)) {
 | 
			
		||||
      /* f1 */
 | 
			
		||||
      PACK_LEFT_0TO3()
 | 
			
		||||
      mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l,
 | 
			
		||||
                     &q0_l, &q1_l, &q2_l, &q3_l);
 | 
			
		||||
      mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, &q0_l, &q1_l, &q2_l, &q3_l);
 | 
			
		||||
 | 
			
		||||
      PACK_RIGHT_0TO3()
 | 
			
		||||
      mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r,
 | 
			
		||||
                     &q0_r, &q1_r, &q2_r, &q3_r);
 | 
			
		||||
      mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, &q0_r, &q1_r, &q2_r, &q3_r);
 | 
			
		||||
 | 
			
		||||
      STORE_F1()
 | 
			
		||||
    } else if ((flat2 == 0) && (flat != 0) && (mask != 0)) {
 | 
			
		||||
      /* f0 + f1 */
 | 
			
		||||
      filter1_dspr2(mask, hev, p1, p0, q0, q1,
 | 
			
		||||
                    &p1_f0, &p0_f0, &q0_f0, &q1_f0);
 | 
			
		||||
      filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0);
 | 
			
		||||
 | 
			
		||||
      /* left 2 element operation */
 | 
			
		||||
      PACK_LEFT_0TO3()
 | 
			
		||||
      mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l,
 | 
			
		||||
                     &q0_l, &q1_l, &q2_l, &q3_l);
 | 
			
		||||
      mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, &q0_l, &q1_l, &q2_l, &q3_l);
 | 
			
		||||
 | 
			
		||||
      /* right 2 element operation */
 | 
			
		||||
      PACK_RIGHT_0TO3()
 | 
			
		||||
      mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r,
 | 
			
		||||
                     &q0_r, &q1_r, &q2_r, &q3_r);
 | 
			
		||||
      mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, &q0_r, &q1_r, &q2_r, &q3_r);
 | 
			
		||||
 | 
			
		||||
      if (mask & flat & 0x000000FF) {
 | 
			
		||||
        __asm__ __volatile__(
 | 
			
		||||
@@ -315,8 +290,7 @@ void vpx_lpf_vertical_16_dspr2(uint8_t *s,
 | 
			
		||||
            :
 | 
			
		||||
            : [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), [p0_r] "r"(p0_r),
 | 
			
		||||
              [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r),
 | 
			
		||||
              [s4] "r" (s4)
 | 
			
		||||
        );
 | 
			
		||||
              [s4] "r"(s4));
 | 
			
		||||
      } else if (mask & 0x000000FF) {
 | 
			
		||||
        __asm__ __volatile__(
 | 
			
		||||
            "sb         %[p1_f0],  -2(%[s4])    \n\t"
 | 
			
		||||
@@ -325,10 +299,8 @@ void vpx_lpf_vertical_16_dspr2(uint8_t *s,
 | 
			
		||||
            "sb         %[q1_f0],  +1(%[s4])    \n\t"
 | 
			
		||||
 | 
			
		||||
            :
 | 
			
		||||
            : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0),
 | 
			
		||||
              [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0),
 | 
			
		||||
              [s4] "r" (s4)
 | 
			
		||||
        );
 | 
			
		||||
            : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
 | 
			
		||||
              [q1_f0] "r"(q1_f0), [s4] "r"(s4));
 | 
			
		||||
      }
 | 
			
		||||
 | 
			
		||||
      __asm__ __volatile__(
 | 
			
		||||
@@ -345,10 +317,9 @@ void vpx_lpf_vertical_16_dspr2(uint8_t *s,
 | 
			
		||||
 | 
			
		||||
          : [p2_r] "+r"(p2_r), [p1_r] "+r"(p1_r), [p0_r] "+r"(p0_r),
 | 
			
		||||
            [q0_r] "+r"(q0_r), [q1_r] "+r"(q1_r), [q2_r] "+r"(q2_r),
 | 
			
		||||
            [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0),
 | 
			
		||||
            [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0)
 | 
			
		||||
          :
 | 
			
		||||
      );
 | 
			
		||||
            [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0),
 | 
			
		||||
            [q1_f0] "+r"(q1_f0)
 | 
			
		||||
          :);
 | 
			
		||||
 | 
			
		||||
      if (mask & flat & 0x0000FF00) {
 | 
			
		||||
        __asm__ __volatile__(
 | 
			
		||||
@@ -362,8 +333,7 @@ void vpx_lpf_vertical_16_dspr2(uint8_t *s,
 | 
			
		||||
            :
 | 
			
		||||
            : [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), [p0_r] "r"(p0_r),
 | 
			
		||||
              [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r),
 | 
			
		||||
              [s3] "r" (s3)
 | 
			
		||||
        );
 | 
			
		||||
              [s3] "r"(s3));
 | 
			
		||||
      } else if (mask & 0x0000FF00) {
 | 
			
		||||
        __asm__ __volatile__(
 | 
			
		||||
            "sb     %[p1_f0],   -2(%[s3])    \n\t"
 | 
			
		||||
@@ -372,10 +342,8 @@ void vpx_lpf_vertical_16_dspr2(uint8_t *s,
 | 
			
		||||
            "sb     %[q1_f0],   +1(%[s3])    \n\t"
 | 
			
		||||
 | 
			
		||||
            :
 | 
			
		||||
            : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0),
 | 
			
		||||
              [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0),
 | 
			
		||||
              [s3] "r" (s3)
 | 
			
		||||
        );
 | 
			
		||||
            : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
 | 
			
		||||
              [q1_f0] "r"(q1_f0), [s3] "r"(s3));
 | 
			
		||||
      }
 | 
			
		||||
 | 
			
		||||
      __asm__ __volatile__(
 | 
			
		||||
@@ -384,10 +352,9 @@ void vpx_lpf_vertical_16_dspr2(uint8_t *s,
 | 
			
		||||
          "srl      %[q0_f0],   %[q0_f0],   8     \n\t"
 | 
			
		||||
          "srl      %[q1_f0],   %[q1_f0],   8     \n\t"
 | 
			
		||||
 | 
			
		||||
          : [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0),
 | 
			
		||||
            [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0)
 | 
			
		||||
          :
 | 
			
		||||
      );
 | 
			
		||||
          : [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0),
 | 
			
		||||
            [q1_f0] "+r"(q1_f0)
 | 
			
		||||
          :);
 | 
			
		||||
 | 
			
		||||
      if (mask & flat & 0x00FF0000) {
 | 
			
		||||
        __asm__ __volatile__(
 | 
			
		||||
@@ -401,8 +368,7 @@ void vpx_lpf_vertical_16_dspr2(uint8_t *s,
 | 
			
		||||
            :
 | 
			
		||||
            : [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), [p0_l] "r"(p0_l),
 | 
			
		||||
              [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l),
 | 
			
		||||
            [s2] "r" (s2)
 | 
			
		||||
        );
 | 
			
		||||
              [s2] "r"(s2));
 | 
			
		||||
      } else if (mask & 0x00FF0000) {
 | 
			
		||||
        __asm__ __volatile__(
 | 
			
		||||
            "sb     %[p1_f0],   -2(%[s2])    \n\t"
 | 
			
		||||
@@ -411,10 +377,8 @@ void vpx_lpf_vertical_16_dspr2(uint8_t *s,
 | 
			
		||||
            "sb     %[q1_f0],   +1(%[s2])    \n\t"
 | 
			
		||||
 | 
			
		||||
            :
 | 
			
		||||
            : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0),
 | 
			
		||||
              [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0),
 | 
			
		||||
              [s2] "r" (s2)
 | 
			
		||||
        );
 | 
			
		||||
            : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
 | 
			
		||||
              [q1_f0] "r"(q1_f0), [s2] "r"(s2));
 | 
			
		||||
      }
 | 
			
		||||
 | 
			
		||||
      __asm__ __volatile__(
 | 
			
		||||
@@ -431,10 +395,9 @@ void vpx_lpf_vertical_16_dspr2(uint8_t *s,
 | 
			
		||||
 | 
			
		||||
          : [p2_l] "+r"(p2_l), [p1_l] "+r"(p1_l), [p0_l] "+r"(p0_l),
 | 
			
		||||
            [q0_l] "+r"(q0_l), [q1_l] "+r"(q1_l), [q2_l] "+r"(q2_l),
 | 
			
		||||
            [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0),
 | 
			
		||||
            [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0)
 | 
			
		||||
          :
 | 
			
		||||
      );
 | 
			
		||||
            [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0),
 | 
			
		||||
            [q1_f0] "+r"(q1_f0)
 | 
			
		||||
          :);
 | 
			
		||||
 | 
			
		||||
      if (mask & flat & 0xFF000000) {
 | 
			
		||||
        __asm__ __volatile__(
 | 
			
		||||
@@ -448,8 +411,7 @@ void vpx_lpf_vertical_16_dspr2(uint8_t *s,
 | 
			
		||||
            :
 | 
			
		||||
            : [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), [p0_l] "r"(p0_l),
 | 
			
		||||
              [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l),
 | 
			
		||||
              [s1] "r" (s1)
 | 
			
		||||
        );
 | 
			
		||||
              [s1] "r"(s1));
 | 
			
		||||
      } else if (mask & 0xFF000000) {
 | 
			
		||||
        __asm__ __volatile__(
 | 
			
		||||
            "sb     %[p1_f0],   -2(%[s1])    \n\t"
 | 
			
		||||
@@ -458,39 +420,30 @@ void vpx_lpf_vertical_16_dspr2(uint8_t *s,
 | 
			
		||||
            "sb     %[q1_f0],   +1(%[s1])    \n\t"
 | 
			
		||||
 | 
			
		||||
            :
 | 
			
		||||
            : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0),
 | 
			
		||||
              [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0),
 | 
			
		||||
              [s1] "r" (s1)
 | 
			
		||||
        );
 | 
			
		||||
            : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
 | 
			
		||||
              [q1_f0] "r"(q1_f0), [s1] "r"(s1));
 | 
			
		||||
      }
 | 
			
		||||
    } else if ((flat2 != 0) && (flat != 0) && (mask != 0)) {
 | 
			
		||||
      /* f0+f1+f2 */
 | 
			
		||||
      filter1_dspr2(mask, hev, p1, p0, q0, q1,
 | 
			
		||||
                    &p1_f0, &p0_f0, &q0_f0, &q1_f0);
 | 
			
		||||
      filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0);
 | 
			
		||||
 | 
			
		||||
      PACK_LEFT_0TO3()
 | 
			
		||||
      mbfilter1_dspr2(p3_l, p2_l, p1_l, p0_l,
 | 
			
		||||
                      q0_l, q1_l, q2_l, q3_l,
 | 
			
		||||
                      &p2_l_f1, &p1_l_f1, &p0_l_f1,
 | 
			
		||||
                      &q0_l_f1, &q1_l_f1, &q2_l_f1);
 | 
			
		||||
      mbfilter1_dspr2(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, &p2_l_f1,
 | 
			
		||||
                      &p1_l_f1, &p0_l_f1, &q0_l_f1, &q1_l_f1, &q2_l_f1);
 | 
			
		||||
 | 
			
		||||
      PACK_RIGHT_0TO3()
 | 
			
		||||
      mbfilter1_dspr2(p3_r, p2_r, p1_r, p0_r,
 | 
			
		||||
                      q0_r, q1_r, q2_r, q3_r,
 | 
			
		||||
                      &p2_r_f1, &p1_r_f1, &p0_r_f1,
 | 
			
		||||
                      &q0_r_f1, &q1_r_f1, &q2_r_f1);
 | 
			
		||||
      mbfilter1_dspr2(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, &p2_r_f1,
 | 
			
		||||
                      &p1_r_f1, &p0_r_f1, &q0_r_f1, &q1_r_f1, &q2_r_f1);
 | 
			
		||||
 | 
			
		||||
      PACK_LEFT_4TO7()
 | 
			
		||||
      wide_mbfilter_dspr2(&p7_l, &p6_l, &p5_l, &p4_l,
 | 
			
		||||
                          &p3_l, &p2_l, &p1_l, &p0_l,
 | 
			
		||||
                          &q0_l, &q1_l, &q2_l, &q3_l,
 | 
			
		||||
                          &q4_l, &q5_l, &q6_l, &q7_l);
 | 
			
		||||
      wide_mbfilter_dspr2(&p7_l, &p6_l, &p5_l, &p4_l, &p3_l, &p2_l, &p1_l,
 | 
			
		||||
                          &p0_l, &q0_l, &q1_l, &q2_l, &q3_l, &q4_l, &q5_l,
 | 
			
		||||
                          &q6_l, &q7_l);
 | 
			
		||||
 | 
			
		||||
      PACK_RIGHT_4TO7()
 | 
			
		||||
      wide_mbfilter_dspr2(&p7_r, &p6_r, &p5_r, &p4_r,
 | 
			
		||||
                          &p3_r, &p2_r, &p1_r, &p0_r,
 | 
			
		||||
                          &q0_r, &q1_r, &q2_r, &q3_r,
 | 
			
		||||
                          &q4_r, &q5_r, &q6_r, &q7_r);
 | 
			
		||||
      wide_mbfilter_dspr2(&p7_r, &p6_r, &p5_r, &p4_r, &p3_r, &p2_r, &p1_r,
 | 
			
		||||
                          &p0_r, &q0_r, &q1_r, &q2_r, &q3_r, &q4_r, &q5_r,
 | 
			
		||||
                          &q6_r, &q7_r);
 | 
			
		||||
 | 
			
		||||
      if (mask & flat & flat2 & 0x000000FF) {
 | 
			
		||||
        __asm__ __volatile__(
 | 
			
		||||
@@ -503,11 +456,9 @@ void vpx_lpf_vertical_16_dspr2(uint8_t *s,
 | 
			
		||||
            "sb     %[p0_r],    -1(%[s4])    \n\t"
 | 
			
		||||
 | 
			
		||||
            :
 | 
			
		||||
            : [p6_r] "r" (p6_r), [p5_r] "r" (p5_r),
 | 
			
		||||
              [p4_r] "r" (p4_r), [p3_r] "r" (p3_r),
 | 
			
		||||
              [p2_r] "r" (p2_r), [p1_r] "r" (p1_r),
 | 
			
		||||
              [p0_r] "r" (p0_r), [s4] "r" (s4)
 | 
			
		||||
        );
 | 
			
		||||
            : [p6_r] "r"(p6_r), [p5_r] "r"(p5_r), [p4_r] "r"(p4_r),
 | 
			
		||||
              [p3_r] "r"(p3_r), [p2_r] "r"(p2_r), [p1_r] "r"(p1_r),
 | 
			
		||||
              [p0_r] "r"(p0_r), [s4] "r"(s4));
 | 
			
		||||
 | 
			
		||||
        __asm__ __volatile__(
 | 
			
		||||
            "sb     %[q0_r],      (%[s4])    \n\t"
 | 
			
		||||
@@ -519,11 +470,9 @@ void vpx_lpf_vertical_16_dspr2(uint8_t *s,
 | 
			
		||||
            "sb     %[q6_r],    +6(%[s4])    \n\t"
 | 
			
		||||
 | 
			
		||||
            :
 | 
			
		||||
            : [q0_r] "r" (q0_r), [q1_r] "r" (q1_r),
 | 
			
		||||
              [q2_r] "r" (q2_r), [q3_r] "r" (q3_r),
 | 
			
		||||
              [q4_r] "r" (q4_r), [q5_r] "r" (q5_r),
 | 
			
		||||
              [q6_r] "r" (q6_r), [s4] "r" (s4)
 | 
			
		||||
        );
 | 
			
		||||
            : [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r),
 | 
			
		||||
              [q3_r] "r"(q3_r), [q4_r] "r"(q4_r), [q5_r] "r"(q5_r),
 | 
			
		||||
              [q6_r] "r"(q6_r), [s4] "r"(s4));
 | 
			
		||||
      } else if (mask & flat & 0x000000FF) {
 | 
			
		||||
        __asm__ __volatile__(
 | 
			
		||||
            "sb     %[p2_r_f1],     -3(%[s4])    \n\t"
 | 
			
		||||
@@ -536,9 +485,7 @@ void vpx_lpf_vertical_16_dspr2(uint8_t *s,
 | 
			
		||||
            :
 | 
			
		||||
            : [p2_r_f1] "r"(p2_r_f1), [p1_r_f1] "r"(p1_r_f1),
 | 
			
		||||
              [p0_r_f1] "r"(p0_r_f1), [q0_r_f1] "r"(q0_r_f1),
 | 
			
		||||
              [q1_r_f1] "r" (q1_r_f1), [q2_r_f1] "r" (q2_r_f1),
 | 
			
		||||
              [s4] "r" (s4)
 | 
			
		||||
        );
 | 
			
		||||
              [q1_r_f1] "r"(q1_r_f1), [q2_r_f1] "r"(q2_r_f1), [s4] "r"(s4));
 | 
			
		||||
      } else if (mask & 0x000000FF) {
 | 
			
		||||
        __asm__ __volatile__(
 | 
			
		||||
            "sb     %[p1_f0],   -2(%[s4])    \n\t"
 | 
			
		||||
@@ -547,10 +494,8 @@ void vpx_lpf_vertical_16_dspr2(uint8_t *s,
 | 
			
		||||
            "sb     %[q1_f0],   +1(%[s4])    \n\t"
 | 
			
		||||
 | 
			
		||||
            :
 | 
			
		||||
            : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0),
 | 
			
		||||
              [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0),
 | 
			
		||||
              [s4] "r" (s4)
 | 
			
		||||
        );
 | 
			
		||||
            : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
 | 
			
		||||
              [q1_f0] "r"(q1_f0), [s4] "r"(s4));
 | 
			
		||||
      }
 | 
			
		||||
 | 
			
		||||
      __asm__ __volatile__(
 | 
			
		||||
@@ -569,15 +514,12 @@ void vpx_lpf_vertical_16_dspr2(uint8_t *s,
 | 
			
		||||
          "srl      %[q5_r],        %[q5_r],        16     \n\t"
 | 
			
		||||
          "srl      %[q6_r],        %[q6_r],        16     \n\t"
 | 
			
		||||
 | 
			
		||||
          : [q0_r] "+r" (q0_r), [q1_r] "+r" (q1_r),
 | 
			
		||||
            [q2_r] "+r" (q2_r), [q3_r] "+r" (q3_r),
 | 
			
		||||
            [q4_r] "+r" (q4_r), [q5_r] "+r" (q5_r),
 | 
			
		||||
            [q6_r] "+r" (q6_r), [p6_r] "+r" (p6_r),
 | 
			
		||||
            [p5_r] "+r" (p5_r), [p4_r] "+r" (p4_r),
 | 
			
		||||
            [p3_r] "+r" (p3_r), [p2_r] "+r" (p2_r),
 | 
			
		||||
          : [q0_r] "+r"(q0_r), [q1_r] "+r"(q1_r), [q2_r] "+r"(q2_r),
 | 
			
		||||
            [q3_r] "+r"(q3_r), [q4_r] "+r"(q4_r), [q5_r] "+r"(q5_r),
 | 
			
		||||
            [q6_r] "+r"(q6_r), [p6_r] "+r"(p6_r), [p5_r] "+r"(p5_r),
 | 
			
		||||
            [p4_r] "+r"(p4_r), [p3_r] "+r"(p3_r), [p2_r] "+r"(p2_r),
 | 
			
		||||
            [p1_r] "+r"(p1_r), [p0_r] "+r"(p0_r)
 | 
			
		||||
          :
 | 
			
		||||
      );
 | 
			
		||||
          :);
 | 
			
		||||
 | 
			
		||||
      __asm__ __volatile__(
 | 
			
		||||
          "srl      %[p2_r_f1],     %[p2_r_f1],     16      \n\t"
 | 
			
		||||
@@ -594,10 +536,9 @@ void vpx_lpf_vertical_16_dspr2(uint8_t *s,
 | 
			
		||||
          : [p2_r_f1] "+r"(p2_r_f1), [p1_r_f1] "+r"(p1_r_f1),
 | 
			
		||||
            [p0_r_f1] "+r"(p0_r_f1), [q0_r_f1] "+r"(q0_r_f1),
 | 
			
		||||
            [q1_r_f1] "+r"(q1_r_f1), [q2_r_f1] "+r"(q2_r_f1),
 | 
			
		||||
            [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0),
 | 
			
		||||
            [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0)
 | 
			
		||||
          :
 | 
			
		||||
      );
 | 
			
		||||
            [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0),
 | 
			
		||||
            [q1_f0] "+r"(q1_f0)
 | 
			
		||||
          :);
 | 
			
		||||
 | 
			
		||||
      if (mask & flat & flat2 & 0x0000FF00) {
 | 
			
		||||
        __asm__ __volatile__(
 | 
			
		||||
@@ -612,8 +553,7 @@ void vpx_lpf_vertical_16_dspr2(uint8_t *s,
 | 
			
		||||
            :
 | 
			
		||||
            : [p6_r] "r"(p6_r), [p5_r] "r"(p5_r), [p4_r] "r"(p4_r),
 | 
			
		||||
              [p3_r] "r"(p3_r), [p2_r] "r"(p2_r), [p1_r] "r"(p1_r),
 | 
			
		||||
              [p0_r] "r" (p0_r), [s3] "r" (s3)
 | 
			
		||||
        );
 | 
			
		||||
              [p0_r] "r"(p0_r), [s3] "r"(s3));
 | 
			
		||||
 | 
			
		||||
        __asm__ __volatile__(
 | 
			
		||||
            "sb     %[q0_r],      (%[s3])    \n\t"
 | 
			
		||||
@@ -625,11 +565,9 @@ void vpx_lpf_vertical_16_dspr2(uint8_t *s,
 | 
			
		||||
            "sb     %[q6_r],    +6(%[s3])    \n\t"
 | 
			
		||||
 | 
			
		||||
            :
 | 
			
		||||
            : [q0_r] "r" (q0_r), [q1_r] "r" (q1_r),
 | 
			
		||||
              [q2_r] "r" (q2_r), [q3_r] "r" (q3_r),
 | 
			
		||||
              [q4_r] "r" (q4_r), [q5_r] "r" (q5_r),
 | 
			
		||||
              [q6_r] "r" (q6_r), [s3] "r" (s3)
 | 
			
		||||
        );
 | 
			
		||||
            : [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r),
 | 
			
		||||
              [q3_r] "r"(q3_r), [q4_r] "r"(q4_r), [q5_r] "r"(q5_r),
 | 
			
		||||
              [q6_r] "r"(q6_r), [s3] "r"(s3));
 | 
			
		||||
      } else if (mask & flat & 0x0000FF00) {
 | 
			
		||||
        __asm__ __volatile__(
 | 
			
		||||
            "sb     %[p2_r_f1],     -3(%[s3])    \n\t"
 | 
			
		||||
@@ -642,9 +580,7 @@ void vpx_lpf_vertical_16_dspr2(uint8_t *s,
 | 
			
		||||
            :
 | 
			
		||||
            : [p2_r_f1] "r"(p2_r_f1), [p1_r_f1] "r"(p1_r_f1),
 | 
			
		||||
              [p0_r_f1] "r"(p0_r_f1), [q0_r_f1] "r"(q0_r_f1),
 | 
			
		||||
              [q1_r_f1] "r" (q1_r_f1), [q2_r_f1] "r" (q2_r_f1),
 | 
			
		||||
              [s3] "r" (s3)
 | 
			
		||||
        );
 | 
			
		||||
              [q1_r_f1] "r"(q1_r_f1), [q2_r_f1] "r"(q2_r_f1), [s3] "r"(s3));
 | 
			
		||||
      } else if (mask & 0x0000FF00) {
 | 
			
		||||
        __asm__ __volatile__(
 | 
			
		||||
            "sb     %[p1_f0],   -2(%[s3])    \n\t"
 | 
			
		||||
@@ -653,10 +589,8 @@ void vpx_lpf_vertical_16_dspr2(uint8_t *s,
 | 
			
		||||
            "sb     %[q1_f0],   +1(%[s3])    \n\t"
 | 
			
		||||
 | 
			
		||||
            :
 | 
			
		||||
            : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0),
 | 
			
		||||
              [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0),
 | 
			
		||||
              [s3] "r" (s3)
 | 
			
		||||
        );
 | 
			
		||||
            : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
 | 
			
		||||
              [q1_f0] "r"(q1_f0), [s3] "r"(s3));
 | 
			
		||||
      }
 | 
			
		||||
 | 
			
		||||
      __asm__ __volatile__(
 | 
			
		||||
@@ -665,10 +599,9 @@ void vpx_lpf_vertical_16_dspr2(uint8_t *s,
 | 
			
		||||
          "srl      %[q0_f0],   %[q0_f0],   8     \n\t"
 | 
			
		||||
          "srl      %[q1_f0],   %[q1_f0],   8     \n\t"
 | 
			
		||||
 | 
			
		||||
          : [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0),
 | 
			
		||||
            [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0)
 | 
			
		||||
          :
 | 
			
		||||
      );
 | 
			
		||||
          : [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0),
 | 
			
		||||
            [q1_f0] "+r"(q1_f0)
 | 
			
		||||
          :);
 | 
			
		||||
 | 
			
		||||
      if (mask & flat & flat2 & 0x00FF0000) {
 | 
			
		||||
        __asm__ __volatile__(
 | 
			
		||||
@@ -683,8 +616,7 @@ void vpx_lpf_vertical_16_dspr2(uint8_t *s,
 | 
			
		||||
            :
 | 
			
		||||
            : [p6_l] "r"(p6_l), [p5_l] "r"(p5_l), [p4_l] "r"(p4_l),
 | 
			
		||||
              [p3_l] "r"(p3_l), [p2_l] "r"(p2_l), [p1_l] "r"(p1_l),
 | 
			
		||||
              [p0_l] "r" (p0_l), [s2] "r" (s2)
 | 
			
		||||
        );
 | 
			
		||||
              [p0_l] "r"(p0_l), [s2] "r"(s2));
 | 
			
		||||
 | 
			
		||||
        __asm__ __volatile__(
 | 
			
		||||
            "sb     %[q0_l],      (%[s2])    \n\t"
 | 
			
		||||
@@ -698,8 +630,7 @@ void vpx_lpf_vertical_16_dspr2(uint8_t *s,
 | 
			
		||||
            :
 | 
			
		||||
            : [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l),
 | 
			
		||||
              [q3_l] "r"(q3_l), [q4_l] "r"(q4_l), [q5_l] "r"(q5_l),
 | 
			
		||||
              [q6_l] "r" (q6_l), [s2] "r" (s2)
 | 
			
		||||
        );
 | 
			
		||||
              [q6_l] "r"(q6_l), [s2] "r"(s2));
 | 
			
		||||
      } else if (mask & flat & 0x00FF0000) {
 | 
			
		||||
        __asm__ __volatile__(
 | 
			
		||||
            "sb     %[p2_l_f1],     -3(%[s2])    \n\t"
 | 
			
		||||
@@ -712,9 +643,7 @@ void vpx_lpf_vertical_16_dspr2(uint8_t *s,
 | 
			
		||||
            :
 | 
			
		||||
            : [p2_l_f1] "r"(p2_l_f1), [p1_l_f1] "r"(p1_l_f1),
 | 
			
		||||
              [p0_l_f1] "r"(p0_l_f1), [q0_l_f1] "r"(q0_l_f1),
 | 
			
		||||
              [q1_l_f1] "r" (q1_l_f1), [q2_l_f1] "r" (q2_l_f1),
 | 
			
		||||
              [s2] "r" (s2)
 | 
			
		||||
        );
 | 
			
		||||
              [q1_l_f1] "r"(q1_l_f1), [q2_l_f1] "r"(q2_l_f1), [s2] "r"(s2));
 | 
			
		||||
      } else if (mask & 0x00FF0000) {
 | 
			
		||||
        __asm__ __volatile__(
 | 
			
		||||
            "sb     %[p1_f0],   -2(%[s2])    \n\t"
 | 
			
		||||
@@ -723,10 +652,8 @@ void vpx_lpf_vertical_16_dspr2(uint8_t *s,
 | 
			
		||||
            "sb     %[q1_f0],   +1(%[s2])    \n\t"
 | 
			
		||||
 | 
			
		||||
            :
 | 
			
		||||
            : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0),
 | 
			
		||||
              [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0),
 | 
			
		||||
              [s2] "r" (s2)
 | 
			
		||||
        );
 | 
			
		||||
            : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
 | 
			
		||||
              [q1_f0] "r"(q1_f0), [s2] "r"(s2));
 | 
			
		||||
      }
 | 
			
		||||
 | 
			
		||||
      __asm__ __volatile__(
 | 
			
		||||
@@ -750,8 +677,7 @@ void vpx_lpf_vertical_16_dspr2(uint8_t *s,
 | 
			
		||||
            [q6_l] "+r"(q6_l), [p6_l] "+r"(p6_l), [p5_l] "+r"(p5_l),
 | 
			
		||||
            [p4_l] "+r"(p4_l), [p3_l] "+r"(p3_l), [p2_l] "+r"(p2_l),
 | 
			
		||||
            [p1_l] "+r"(p1_l), [p0_l] "+r"(p0_l)
 | 
			
		||||
          :
 | 
			
		||||
      );
 | 
			
		||||
          :);
 | 
			
		||||
 | 
			
		||||
      __asm__ __volatile__(
 | 
			
		||||
          "srl      %[p2_l_f1],     %[p2_l_f1],     16      \n\t"
 | 
			
		||||
@@ -768,10 +694,9 @@ void vpx_lpf_vertical_16_dspr2(uint8_t *s,
 | 
			
		||||
          : [p2_l_f1] "+r"(p2_l_f1), [p1_l_f1] "+r"(p1_l_f1),
 | 
			
		||||
            [p0_l_f1] "+r"(p0_l_f1), [q0_l_f1] "+r"(q0_l_f1),
 | 
			
		||||
            [q1_l_f1] "+r"(q1_l_f1), [q2_l_f1] "+r"(q2_l_f1),
 | 
			
		||||
            [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0),
 | 
			
		||||
            [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0)
 | 
			
		||||
          :
 | 
			
		||||
      );
 | 
			
		||||
            [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0),
 | 
			
		||||
            [q1_f0] "+r"(q1_f0)
 | 
			
		||||
          :);
 | 
			
		||||
 | 
			
		||||
      if (mask & flat & flat2 & 0xFF000000) {
 | 
			
		||||
        __asm__ __volatile__(
 | 
			
		||||
@@ -786,9 +711,7 @@ void vpx_lpf_vertical_16_dspr2(uint8_t *s,
 | 
			
		||||
            :
 | 
			
		||||
            : [p6_l] "r"(p6_l), [p5_l] "r"(p5_l), [p4_l] "r"(p4_l),
 | 
			
		||||
              [p3_l] "r"(p3_l), [p2_l] "r"(p2_l), [p1_l] "r"(p1_l),
 | 
			
		||||
              [p0_l] "r" (p0_l),
 | 
			
		||||
              [s1] "r" (s1)
 | 
			
		||||
        );
 | 
			
		||||
              [p0_l] "r"(p0_l), [s1] "r"(s1));
 | 
			
		||||
 | 
			
		||||
        __asm__ __volatile__(
 | 
			
		||||
            "sb     %[q0_l],     (%[s1])    \n\t"
 | 
			
		||||
@@ -802,9 +725,7 @@ void vpx_lpf_vertical_16_dspr2(uint8_t *s,
 | 
			
		||||
            :
 | 
			
		||||
            : [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l),
 | 
			
		||||
              [q3_l] "r"(q3_l), [q4_l] "r"(q4_l), [q5_l] "r"(q5_l),
 | 
			
		||||
              [q6_l] "r" (q6_l),
 | 
			
		||||
              [s1] "r" (s1)
 | 
			
		||||
        );
 | 
			
		||||
              [q6_l] "r"(q6_l), [s1] "r"(s1));
 | 
			
		||||
      } else if (mask & flat & 0xFF000000) {
 | 
			
		||||
        __asm__ __volatile__(
 | 
			
		||||
            "sb     %[p2_l_f1],     -3(%[s1])    \n\t"
 | 
			
		||||
@@ -817,9 +738,7 @@ void vpx_lpf_vertical_16_dspr2(uint8_t *s,
 | 
			
		||||
            :
 | 
			
		||||
            : [p2_l_f1] "r"(p2_l_f1), [p1_l_f1] "r"(p1_l_f1),
 | 
			
		||||
              [p0_l_f1] "r"(p0_l_f1), [q0_l_f1] "r"(q0_l_f1),
 | 
			
		||||
              [q1_l_f1] "r" (q1_l_f1), [q2_l_f1] "r" (q2_l_f1),
 | 
			
		||||
              [s1] "r" (s1)
 | 
			
		||||
        );
 | 
			
		||||
              [q1_l_f1] "r"(q1_l_f1), [q2_l_f1] "r"(q2_l_f1), [s1] "r"(s1));
 | 
			
		||||
      } else if (mask & 0xFF000000) {
 | 
			
		||||
        __asm__ __volatile__(
 | 
			
		||||
            "sb     %[p1_f0],   -2(%[s1])    \n\t"
 | 
			
		||||
@@ -828,10 +747,8 @@ void vpx_lpf_vertical_16_dspr2(uint8_t *s,
 | 
			
		||||
            "sb     %[q1_f0],   +1(%[s1])    \n\t"
 | 
			
		||||
 | 
			
		||||
            :
 | 
			
		||||
            : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0),
 | 
			
		||||
              [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0),
 | 
			
		||||
              [s1] "r" (s1)
 | 
			
		||||
        );
 | 
			
		||||
            : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
 | 
			
		||||
              [q1_f0] "r"(q1_f0), [s1] "r"(s1));
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
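Note: the dspr2 hunks above are purely mechanical. clang-format deletes the space between an inline-asm constraint string and its operand ("r" (sq0) becomes "r"(sq0)), packs the operand list to the 80-column limit, and folds the closing ); onto the final operand line. A minimal sketch of the resulting shape (hypothetical store_two helper, not from this commit; the empty asm template keeps it compilable on any GCC/Clang target):

#include <stdint.h>

/* Hypothetical illustration only: operand pairs written the way
 * clang-format now emits them -- constraint string tight against the
 * operand, list packed to the column limit, ");" on the last line. */
static inline void store_two(uint8_t *sp1, uint8_t *sq1, uint32_t p1_f0,
                             uint32_t q1_f0) {
  __asm__ __volatile__(""
                       :
                       : [p1_f0] "r"(p1_f0), [q1_f0] "r"(q1_f0),
                         [sp1] "r"(sp1), [sq1] "r"(sq1)
                       : "memory");
}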
 
@@ -14,7 +14,8 @@
#include "vpx_dsp/mips/macros_msa.h"

#define VP9_LPF_FILTER4_8W(p1_in, p0_in, q0_in, q1_in, mask_in, hev_in, \
                           p1_out, p0_out, q0_out, q1_out) {             \
                           p1_out, p0_out, q0_out, q1_out)              \
  {                                                                     \
    v16i8 p1_m, p0_m, q0_m, q1_m, q0_sub_p0, filt_sign;                 \
    v16i8 filt, filt1, filt2, cnst4b, cnst3b;                           \
    v8i16 q0_sub_p0_r, filt_r, cnst3h;                                  \
@@ -64,7 +65,8 @@
  }

#define VP9_LPF_FILTER4_4W(p1_in, p0_in, q0_in, q1_in, mask_in, hev_in, \
                           p1_out, p0_out, q0_out, q1_out) {             \
                           p1_out, p0_out, q0_out, q1_out)              \
  {                                                                     \
    v16i8 p1_m, p0_m, q0_m, q1_m, q0_sub_p0, filt_sign;                 \
    v16i8 filt, filt1, filt2, cnst4b, cnst3b;                           \
    v8i16 q0_sub_p0_r, q0_sub_p0_l, filt_l, filt_r, cnst3h;             \
@@ -120,7 +122,8 @@
    p1_out = __msa_xori_b((v16u8)p1_m, 0x80);                           \
  }

#define VP9_FLAT4(p3_in, p2_in, p0_in, q0_in, q2_in, q3_in, flat_out) {  \
#define VP9_FLAT4(p3_in, p2_in, p0_in, q0_in, q2_in, q3_in, flat_out) \
  {                                                                   \
    v16u8 tmp, p2_a_sub_p0, q2_a_sub_q0, p3_a_sub_p0, q3_a_sub_q0;    \
    v16u8 zero_in = { 0 };                                            \
                                                                      \
@@ -140,8 +143,9 @@
    flat_out = flat_out & (mask);                                     \
  }

#define VP9_FLAT5(p7_in, p6_in, p5_in, p4_in, p0_in, q0_in, q4_in,  \
                  q5_in, q6_in, q7_in, flat_in, flat2_out) {        \
#define VP9_FLAT5(p7_in, p6_in, p5_in, p4_in, p0_in, q0_in, q4_in, q5_in, \
                  q6_in, q7_in, flat_in, flat2_out)                       \
  {                                                                       \
    v16u8 tmp, zero_in = { 0 };                                           \
    v16u8 p4_a_sub_p0, q4_a_sub_q0, p5_a_sub_p0, q5_a_sub_q0;             \
    v16u8 p6_a_sub_p0, q6_a_sub_q0, p7_a_sub_p0, q7_a_sub_q0;             \
@@ -169,10 +173,10 @@
    flat2_out = flat2_out & flat_in;                                      \
  }

#define VP9_FILTER8(p3_in, p2_in, p1_in, p0_in,                  \
                    q0_in, q1_in, q2_in, q3_in,                  \
                    p2_filt8_out, p1_filt8_out, p0_filt8_out,    \
                    q0_filt8_out, q1_filt8_out, q2_filt8_out) {  \
#define VP9_FILTER8(p3_in, p2_in, p1_in, p0_in, q0_in, q1_in, q2_in, q3_in, \
                    p2_filt8_out, p1_filt8_out, p0_filt8_out, q0_filt8_out, \
                    q1_filt8_out, q2_filt8_out)                             \
  {                                                                         \
    v8u16 tmp0, tmp1, tmp2;                                                 \
                                                                            \
    tmp2 = p2_in + p1_in + p0_in;                                           \
@@ -207,10 +211,10 @@
    q1_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp1, 3);                    \
  }

#define LPF_MASK_HEV(p3_in, p2_in, p1_in, p0_in,                 \
                     q0_in, q1_in, q2_in, q3_in,                 \
                     limit_in, b_limit_in, thresh_in,            \
                     hev_out, mask_out, flat_out) {              \
#define LPF_MASK_HEV(p3_in, p2_in, p1_in, p0_in, q0_in, q1_in, q2_in, q3_in, \
                     limit_in, b_limit_in, thresh_in, hev_out, mask_out,     \
                     flat_out)                                               \
  {                                                                          \
    v16u8 p3_asub_p2_m, p2_asub_p1_m, p1_asub_p0_m, q1_asub_q0_m;            \
    v16u8 p1_asub_q1_m, p0_asub_q0_m, q3_asub_q2_m, q2_asub_q1_m;            \
                                                                             \
 
File diff suppressed because it is too large
@@ -11,7 +11,8 @@
#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/mips/macros_msa.h"

#define SAD_INSVE_W4(RTYPE, in0, in1, in2, in3, out) {    \
#define SAD_INSVE_W4(RTYPE, in0, in1, in2, in3, out)       \
  {                                                        \
    out = (RTYPE)__msa_insve_w((v4i32)out, 0, (v4i32)in0); \
    out = (RTYPE)__msa_insve_w((v4i32)out, 1, (v4i32)in1); \
    out = (RTYPE)__msa_insve_w((v4i32)out, 2, (v4i32)in2); \
@@ -58,8 +59,8 @@ static uint32_t sad_8width_msa(const uint8_t *src, int32_t src_stride,
    LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3);
    ref += (4 * ref_stride);

    PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2,
                src0, src1, ref0, ref1);
    PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2, src0, src1,
                ref0, ref1);
    sad += SAD_UB2_UH(src0, src1, ref0, ref1);
  }

@@ -214,8 +215,8 @@ static void sad_8width_x3_msa(const uint8_t *src, int32_t src_stride,
    src += (4 * src_stride);
    LD_UB4(ref, ref_stride, ref00, ref11, ref22, ref33);
    ref += (4 * ref_stride);
    PCKEV_D4_UB(src1, src0, src3, src2, ref11, ref00, ref33, ref22,
                src0, src1, ref0, ref1);
    PCKEV_D4_UB(src1, src0, src3, src2, ref11, ref00, ref33, ref22, src0, src1,
                ref0, ref1);
    sad0 += SAD_UB2_UH(src0, src1, ref0, ref1);

    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
@@ -473,8 +474,8 @@ static void sad_8width_x8_msa(const uint8_t *src, int32_t src_stride,
    src += (4 * src_stride);
    LD_UB4(ref, ref_stride, ref00, ref11, ref22, ref33);
    ref += (4 * ref_stride);
    PCKEV_D4_UB(src1, src0, src3, src2, ref11, ref00, ref33, ref22,
                src0, src1, ref0, ref1);
    PCKEV_D4_UB(src1, src0, src3, src2, ref11, ref00, ref33, ref22, src0, src1,
                ref0, ref1);
    sad0 += SAD_UB2_UH(src0, src1, ref0, ref1);

    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
@@ -794,8 +795,8 @@ static void sad_64width_x8_msa(const uint8_t *src, int32_t src_stride,

static void sad_4width_x4d_msa(const uint8_t *src_ptr, int32_t src_stride,
                               const uint8_t *const aref_ptr[],
                               int32_t ref_stride,
                               int32_t height, uint32_t *sad_array) {
                               int32_t ref_stride, int32_t height,
                               uint32_t *sad_array) {
  const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
  int32_t ht_cnt;
  uint32_t src0, src1, src2, src3;
@@ -855,8 +856,8 @@ static void sad_4width_x4d_msa(const uint8_t *src_ptr, int32_t src_stride,

static void sad_8width_x4d_msa(const uint8_t *src_ptr, int32_t src_stride,
                               const uint8_t *const aref_ptr[],
                               int32_t ref_stride,
                               int32_t height, uint32_t *sad_array) {
                               int32_t ref_stride, int32_t height,
                               uint32_t *sad_array) {
  int32_t ht_cnt;
  const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
  v16u8 src0, src1, src2, src3;
@@ -906,8 +907,8 @@ static void sad_8width_x4d_msa(const uint8_t *src_ptr, int32_t src_stride,

static void sad_16width_x4d_msa(const uint8_t *src_ptr, int32_t src_stride,
                                const uint8_t *const aref_ptr[],
                                int32_t ref_stride,
                                int32_t height, uint32_t *sad_array) {
                                int32_t ref_stride, int32_t height,
                                uint32_t *sad_array) {
  int32_t ht_cnt;
  const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
  v16u8 src, ref0, ref1, ref2, ref3, diff;
@@ -971,8 +972,8 @@ static void sad_16width_x4d_msa(const uint8_t *src_ptr, int32_t src_stride,

static void sad_32width_x4d_msa(const uint8_t *src, int32_t src_stride,
                                const uint8_t *const aref_ptr[],
                                int32_t ref_stride,
                                int32_t height, uint32_t *sad_array) {
                                int32_t ref_stride, int32_t height,
                                uint32_t *sad_array) {
  const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
  int32_t ht_cnt;
  v16u8 src0, src1, ref0, ref1;
@@ -1015,8 +1016,8 @@ static void sad_32width_x4d_msa(const uint8_t *src, int32_t src_stride,

static void sad_64width_x4d_msa(const uint8_t *src, int32_t src_stride,
                                const uint8_t *const aref_ptr[],
                                int32_t ref_stride,
                                int32_t height, uint32_t *sad_array) {
                                int32_t ref_stride, int32_t height,
                                uint32_t *sad_array) {
  const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3;
@@ -1114,8 +1115,8 @@ static uint32_t avgsad_8width_msa(const uint8_t *src, int32_t src_stride,
    ref += (4 * ref_stride);
    LD_UB2(sec_pred, 16, pred0, pred1);
    sec_pred += 32;
    PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2,
                src0, src1, ref0, ref1);
    PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2, src0, src1,
                ref0, ref1);
    AVER_UB2_UB(pred0, ref0, pred1, ref1, diff0, diff1);
    sad += SAD_UB2_UH(src0, src1, diff0, diff1);
  }
@@ -1213,8 +1214,8 @@ static uint32_t avgsad_64width_msa(const uint8_t *src, int32_t src_stride,
    ref += ref_stride;
    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
    sec_pred += 64;
    AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3,
                comp0, comp1, comp2, comp3);
    AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3, comp0,
                comp1, comp2, comp3);
    sad0 += SAD_UB2_UH(src0, src1, comp0, comp1);
    sad1 += SAD_UB2_UH(src2, src3, comp2, comp3);

@@ -1224,8 +1225,8 @@ static uint32_t avgsad_64width_msa(const uint8_t *src, int32_t src_stride,
    ref += ref_stride;
    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
    sec_pred += 64;
    AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3,
                comp0, comp1, comp2, comp3);
    AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3, comp0,
                comp1, comp2, comp3);
    sad0 += SAD_UB2_UH(src0, src1, comp0, comp1);
    sad1 += SAD_UB2_UH(src2, src3, comp2, comp3);

@@ -1235,8 +1236,8 @@ static uint32_t avgsad_64width_msa(const uint8_t *src, int32_t src_stride,
    ref += ref_stride;
    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
    sec_pred += 64;
    AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3,
                comp0, comp1, comp2, comp3);
    AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3, comp0,
                comp1, comp2, comp3);
    sad0 += SAD_UB2_UH(src0, src1, comp0, comp1);
    sad1 += SAD_UB2_UH(src2, src3, comp2, comp3);

@@ -1246,8 +1247,8 @@ static uint32_t avgsad_64width_msa(const uint8_t *src, int32_t src_stride,
    ref += ref_stride;
    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
    sec_pred += 64;
    AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3,
                comp0, comp1, comp2, comp3);
    AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3, comp0,
                comp1, comp2, comp3);
    sad0 += SAD_UB2_UH(src0, src1, comp0, comp1);
    sad1 += SAD_UB2_UH(src2, src3, comp2, comp3);
  }
@@ -1397,40 +1398,40 @@ void vpx_sad64x##height##x4d_msa(const uint8_t *src, int32_t src_stride,  \
  uint32_t vpx_sad4x##height##_avg_msa(const uint8_t *src, int32_t src_stride, \
                                       const uint8_t *ref, int32_t ref_stride, \
                                       const uint8_t *second_pred) {           \
  return avgsad_4width_msa(src, src_stride, ref, ref_stride,                  \
                           height, second_pred);                              \
    return avgsad_4width_msa(src, src_stride, ref, ref_stride, height,         \
                             second_pred);                                     \
  }

#define VPX_AVGSAD_8xHEIGHT_MSA(height)                                        \
  uint32_t vpx_sad8x##height##_avg_msa(const uint8_t *src, int32_t src_stride, \
                                       const uint8_t *ref, int32_t ref_stride, \
                                       const uint8_t *second_pred) {           \
  return avgsad_8width_msa(src, src_stride, ref, ref_stride,                  \
                           height, second_pred);                              \
    return avgsad_8width_msa(src, src_stride, ref, ref_stride, height,         \
                             second_pred);                                     \
  }

#define VPX_AVGSAD_16xHEIGHT_MSA(height)                                \
uint32_t vpx_sad16x##height##_avg_msa(const uint8_t *src, int32_t src_stride,  \
                                      const uint8_t *ref, int32_t ref_stride,  \
                                      const uint8_t *second_pred) {            \
  return avgsad_16width_msa(src, src_stride, ref, ref_stride,                  \
                            height, second_pred);                              \
  uint32_t vpx_sad16x##height##_avg_msa(                                \
      const uint8_t *src, int32_t src_stride, const uint8_t *ref,       \
      int32_t ref_stride, const uint8_t *second_pred) {                 \
    return avgsad_16width_msa(src, src_stride, ref, ref_stride, height, \
                              second_pred);                             \
  }

#define VPX_AVGSAD_32xHEIGHT_MSA(height)                                \
uint32_t vpx_sad32x##height##_avg_msa(const uint8_t *src, int32_t src_stride,  \
                                      const uint8_t *ref, int32_t ref_stride,  \
                                      const uint8_t *second_pred) {            \
  return avgsad_32width_msa(src, src_stride, ref, ref_stride,                  \
                            height, second_pred);                              \
  uint32_t vpx_sad32x##height##_avg_msa(                                \
      const uint8_t *src, int32_t src_stride, const uint8_t *ref,       \
      int32_t ref_stride, const uint8_t *second_pred) {                 \
    return avgsad_32width_msa(src, src_stride, ref, ref_stride, height, \
                              second_pred);                             \
  }

#define VPX_AVGSAD_64xHEIGHT_MSA(height)                                \
uint32_t vpx_sad64x##height##_avg_msa(const uint8_t *src, int32_t src_stride,  \
                                      const uint8_t *ref, int32_t ref_stride,  \
                                      const uint8_t *second_pred) {            \
  return avgsad_64width_msa(src, src_stride, ref, ref_stride,                  \
                            height, second_pred);                              \
  uint32_t vpx_sad64x##height##_avg_msa(                                \
      const uint8_t *src, int32_t src_stride, const uint8_t *ref,       \
      int32_t ref_stride, const uint8_t *second_pred) {                 \
    return avgsad_64width_msa(src, src_stride, ref, ref_stride, height, \
                              second_pred);                             \
  }

// 64x64
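Note: the sad_msa.c hunks apply the same rule to C argument lists: lines are filled to 80 columns rather than broken at the hand-chosen src/ref group boundaries, so a break can now land mid-group. A hypothetical prototype (not a declaration from this commit) showing the before/after shape:

/* Before (hand-aligned, one logical group per line):
 *   static void sad_x4d(const uint8_t *src, int32_t src_stride,
 *                       const uint8_t *const ref[],
 *                       int32_t ref_stride,
 *                       int32_t height, uint32_t *sad_array);
 * After (packed to the column limit): */
static void sad_x4d(const uint8_t *src, int32_t src_stride,
                    const uint8_t *const ref[], int32_t ref_stride,
                    int32_t height, uint32_t *sad_array);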
 
File diff suppressed because it is too large
@@ -68,8 +68,8 @@ static void sub_blk_16x16_msa(const uint8_t *src, int32_t src_stride,
    LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
    src += (8 * src_stride);

    LD_SB8(pred, pred_stride,
           pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7);
    LD_SB8(pred, pred_stride, pred0, pred1, pred2, pred3, pred4, pred5, pred6,
           pred7);
    pred += (8 * pred_stride);

    ILVRL_B2_UB(src0, pred0, src_l0, src_l1);
@@ -226,31 +226,31 @@ static void sub_blk_64x64_msa(const uint8_t *src, int32_t src_stride,
  }
}

void vpx_subtract_block_msa(int32_t rows, int32_t cols,
                            int16_t *diff_ptr, ptrdiff_t diff_stride,
                            const uint8_t *src_ptr, ptrdiff_t src_stride,
                            const uint8_t *pred_ptr, ptrdiff_t pred_stride) {
void vpx_subtract_block_msa(int32_t rows, int32_t cols, int16_t *diff_ptr,
                            ptrdiff_t diff_stride, const uint8_t *src_ptr,
                            ptrdiff_t src_stride, const uint8_t *pred_ptr,
                            ptrdiff_t pred_stride) {
  if (rows == cols) {
    switch (rows) {
      case 4:
        sub_blk_4x4_msa(src_ptr, src_stride, pred_ptr, pred_stride,
                        diff_ptr, diff_stride);
        sub_blk_4x4_msa(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr,
                        diff_stride);
        break;
      case 8:
        sub_blk_8x8_msa(src_ptr, src_stride, pred_ptr, pred_stride,
                        diff_ptr, diff_stride);
        sub_blk_8x8_msa(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr,
                        diff_stride);
        break;
      case 16:
        sub_blk_16x16_msa(src_ptr, src_stride, pred_ptr, pred_stride,
                          diff_ptr, diff_stride);
        sub_blk_16x16_msa(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr,
                          diff_stride);
        break;
      case 32:
        sub_blk_32x32_msa(src_ptr, src_stride, pred_ptr, pred_stride,
                          diff_ptr, diff_stride);
        sub_blk_32x32_msa(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr,
                          diff_stride);
        break;
      case 64:
        sub_blk_64x64_msa(src_ptr, src_stride, pred_ptr, pred_stride,
                          diff_ptr, diff_stride);
        sub_blk_64x64_msa(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr,
                          diff_stride);
        break;
      default:
        vpx_subtract_block_c(rows, cols, diff_ptr, diff_stride, src_ptr,

@@ -13,7 +13,8 @@

#include "vpx_dsp/mips/macros_msa.h"

#define DOTP_CONST_PAIR(reg0, reg1, cnst0, cnst1, out0, out1) {      \
#define DOTP_CONST_PAIR(reg0, reg1, cnst0, cnst1, out0, out1) \
  {                                                           \
    v8i16 k0_m = __msa_fill_h(cnst0);                         \
    v4i32 s0_m, s1_m, s2_m, s3_m;                             \
                                                              \
@@ -31,24 +32,26 @@
    out1 = __msa_pckev_h((v8i16)s0_m, (v8i16)s1_m);           \
  }

#define DOT_ADD_SUB_SRARI_PCK(in0, in1, in2, in3, in4, in5, in6, in7,      \
                              dst0, dst1, dst2, dst3) {                    \
#define DOT_ADD_SUB_SRARI_PCK(in0, in1, in2, in3, in4, in5, in6, in7, dst0,   \
                              dst1, dst2, dst3)                               \
  {                                                                           \
    v4i32 tp0_m, tp1_m, tp2_m, tp3_m, tp4_m;                                  \
    v4i32 tp5_m, tp6_m, tp7_m, tp8_m, tp9_m;                                  \
                                                                              \
  DOTP_SH4_SW(in0, in1, in0, in1, in4, in4, in5, in5,                      \
              tp0_m, tp2_m, tp3_m, tp4_m);                                 \
  DOTP_SH4_SW(in2, in3, in2, in3, in6, in6, in7, in7,                      \
              tp5_m, tp6_m, tp7_m, tp8_m);                                 \
    DOTP_SH4_SW(in0, in1, in0, in1, in4, in4, in5, in5, tp0_m, tp2_m, tp3_m,  \
                tp4_m);                                                       \
    DOTP_SH4_SW(in2, in3, in2, in3, in6, in6, in7, in7, tp5_m, tp6_m, tp7_m,  \
                tp8_m);                                                       \
    BUTTERFLY_4(tp0_m, tp3_m, tp7_m, tp5_m, tp1_m, tp9_m, tp7_m, tp5_m);      \
    BUTTERFLY_4(tp2_m, tp4_m, tp8_m, tp6_m, tp3_m, tp0_m, tp4_m, tp2_m);      \
    SRARI_W4_SW(tp1_m, tp9_m, tp7_m, tp5_m, DCT_CONST_BITS);                  \
    SRARI_W4_SW(tp3_m, tp0_m, tp4_m, tp2_m, DCT_CONST_BITS);                  \
  PCKEV_H4_SH(tp1_m, tp3_m, tp9_m, tp0_m, tp7_m, tp4_m, tp5_m, tp2_m,      \
              dst0, dst1, dst2, dst3);                                     \
    PCKEV_H4_SH(tp1_m, tp3_m, tp9_m, tp0_m, tp7_m, tp4_m, tp5_m, tp2_m, dst0, \
                dst1, dst2, dst3);                                            \
  }

#define DOT_SHIFT_RIGHT_PCK_H(in0, in1, in2) ({       \
#define DOT_SHIFT_RIGHT_PCK_H(in0, in1, in2)           \
  ({                                                   \
    v8i16 dst_m;                                       \
    v4i32 tp0_m, tp1_m;                                \
                                                       \
@@ -59,34 +62,34 @@
    dst_m;                                             \
  })

#define MADD_SHORT(m0, m1, c0, c1, res0, res1) {                    \
#define MADD_SHORT(m0, m1, c0, c1, res0, res1)                              \
  {                                                                         \
    v4i32 madd0_m, madd1_m, madd2_m, madd3_m;                               \
    v8i16 madd_s0_m, madd_s1_m;                                             \
                                                                            \
    ILVRL_H2_SH(m1, m0, madd_s0_m, madd_s1_m);                              \
  DOTP_SH4_SW(madd_s0_m, madd_s1_m, madd_s0_m, madd_s1_m,           \
              c0, c0, c1, c1, madd0_m, madd1_m, madd2_m, madd3_m);  \
    DOTP_SH4_SW(madd_s0_m, madd_s1_m, madd_s0_m, madd_s1_m, c0, c0, c1, c1, \
                madd0_m, madd1_m, madd2_m, madd3_m);                        \
    SRARI_W4_SW(madd0_m, madd1_m, madd2_m, madd3_m, DCT_CONST_BITS);        \
    PCKEV_H2_SH(madd1_m, madd0_m, madd3_m, madd2_m, res0, res1);            \
  }

#define MADD_BF(inp0, inp1, inp2, inp3, cst0, cst1, cst2, cst3,         \
                out0, out1, out2, out3) {                               \
#define MADD_BF(inp0, inp1, inp2, inp3, cst0, cst1, cst2, cst3, out0, out1,   \
                out2, out3)                                                   \
  {                                                                           \
    v8i16 madd_s0_m, madd_s1_m, madd_s2_m, madd_s3_m;                         \
    v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m, m4_m, m5_m;                         \
                                                                              \
    ILVRL_H2_SH(inp1, inp0, madd_s0_m, madd_s1_m);                            \
    ILVRL_H2_SH(inp3, inp2, madd_s2_m, madd_s3_m);                            \
  DOTP_SH4_SW(madd_s0_m, madd_s1_m, madd_s2_m, madd_s3_m,               \
              cst0, cst0, cst2, cst2, tmp0_m, tmp1_m, tmp2_m, tmp3_m);  \
  BUTTERFLY_4(tmp0_m, tmp1_m, tmp3_m, tmp2_m,                           \
              m4_m, m5_m, tmp3_m, tmp2_m);                              \
    DOTP_SH4_SW(madd_s0_m, madd_s1_m, madd_s2_m, madd_s3_m, cst0, cst0, cst2, \
                cst2, tmp0_m, tmp1_m, tmp2_m, tmp3_m);                        \
    BUTTERFLY_4(tmp0_m, tmp1_m, tmp3_m, tmp2_m, m4_m, m5_m, tmp3_m, tmp2_m);  \
    SRARI_W4_SW(m4_m, m5_m, tmp2_m, tmp3_m, DCT_CONST_BITS);                  \
    PCKEV_H2_SH(m5_m, m4_m, tmp3_m, tmp2_m, out0, out1);                      \
  DOTP_SH4_SW(madd_s0_m, madd_s1_m, madd_s2_m, madd_s3_m,               \
              cst1, cst1, cst3, cst3, tmp0_m, tmp1_m, tmp2_m, tmp3_m);  \
  BUTTERFLY_4(tmp0_m, tmp1_m, tmp3_m, tmp2_m,                           \
              m4_m, m5_m, tmp3_m, tmp2_m);                              \
    DOTP_SH4_SW(madd_s0_m, madd_s1_m, madd_s2_m, madd_s3_m, cst1, cst1, cst3, \
                cst3, tmp0_m, tmp1_m, tmp2_m, tmp3_m);                        \
    BUTTERFLY_4(tmp0_m, tmp1_m, tmp3_m, tmp2_m, m4_m, m5_m, tmp3_m, tmp2_m);  \
    SRARI_W4_SW(m4_m, m5_m, tmp2_m, tmp3_m, DCT_CONST_BITS);                  \
    PCKEV_H2_SH(m5_m, m4_m, tmp3_m, tmp2_m, out2, out3);                      \
  }

@@ -11,7 +11,8 @@
#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/mips/macros_msa.h"

#define CALC_MSE_B(src, ref, var) {                                \
#define CALC_MSE_B(src, ref, var)                                   \
  {                                                                 \
    v16u8 src_l0_m, src_l1_m;                                       \
    v8i16 res_l0_m, res_l1_m;                                       \
                                                                    \
@@ -20,7 +21,8 @@
    DPADD_SH2_SW(res_l0_m, res_l1_m, res_l0_m, res_l1_m, var, var); \
  }

#define CALC_MSE_AVG_B(src, ref, var, sub) {                       \
#define CALC_MSE_AVG_B(src, ref, var, sub)                          \
  {                                                                 \
    v16u8 src_l0_m, src_l1_m;                                       \
    v8i16 res_l0_m, res_l1_m;                                       \
                                                                    \
@@ -31,8 +33,7 @@
    sub += res_l0_m + res_l1_m;                                     \
  }

#define VARIANCE_WxH(sse, diff, shift) \
  sse - (((uint32_t)diff * diff) >> shift)
#define VARIANCE_WxH(sse, diff, shift) sse - (((uint32_t)diff * diff) >> shift)

#define VARIANCE_LARGE_WxH(sse, diff, shift) \
  sse - (((int64_t)diff * diff) >> shift)
@@ -80,8 +81,8 @@ static uint32_t sse_diff_8width_msa(const uint8_t *src_ptr, int32_t src_stride,
    LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
    ref_ptr += (4 * ref_stride);

    PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2,
                src0, src1, ref0, ref1);
    PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2, src0, src1,
                ref0, ref1);
    CALC_MSE_AVG_B(src0, ref0, var, avg);
    CALC_MSE_AVG_B(src1, ref1, var, avg);
  }
@@ -370,8 +371,8 @@ static uint32_t sse_8width_msa(const uint8_t *src_ptr, int32_t src_stride,
    LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
    ref_ptr += (4 * ref_stride);

    PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2,
                src0, src1, ref0, ref1);
    PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2, src0, src1,
                ref0, ref1);
    CALC_MSE_B(src0, ref0, var);
    CALC_MSE_B(src1, ref1, var);
  }
@@ -527,15 +528,13 @@ uint32_t vpx_get4x4sse_cs_msa(const uint8_t *src_ptr, int32_t src_stride,
#define VARIANCE_64Wx64H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 12);

#define VPX_VARIANCE_WDXHT_MSA(wd, ht)                                         \
uint32_t vpx_variance##wd##x##ht##_msa(const uint8_t *src,           \
                                       int32_t src_stride,           \
                                       const uint8_t *ref,           \
                                       int32_t ref_stride,           \
                                       uint32_t *sse) {              \
  uint32_t vpx_variance##wd##x##ht##_msa(                                      \
      const uint8_t *src, int32_t src_stride, const uint8_t *ref,              \
      int32_t ref_stride, uint32_t *sse) {                                     \
    int32_t diff;                                                              \
                                                                               \
  *sse = sse_diff_##wd##width_msa(src, src_stride, ref, ref_stride,  \
                                  ht, &diff);                        \
    *sse =                                                                     \
        sse_diff_##wd##width_msa(src, src_stride, ref, ref_stride, ht, &diff); \
                                                                               \
    return VARIANCE_##wd##Wx##ht##H(*sse, diff);                               \
  }
@@ -585,8 +584,7 @@ uint32_t vpx_variance64x64_msa(const uint8_t *src, int32_t src_stride,
}

uint32_t vpx_mse8x8_msa(const uint8_t *src, int32_t src_stride,
                        const uint8_t *ref, int32_t ref_stride,
                        uint32_t *sse) {
                        const uint8_t *ref, int32_t ref_stride, uint32_t *sse) {
  *sse = sse_8width_msa(src, src_stride, ref, ref_stride, 8);

  return *sse;
@@ -617,17 +615,15 @@ uint32_t vpx_mse16x16_msa(const uint8_t *src, int32_t src_stride,
}

void vpx_get8x8var_msa(const uint8_t *src, int32_t src_stride,
                       const uint8_t *ref, int32_t ref_stride,
                       uint32_t *sse, int32_t *sum) {
                       const uint8_t *ref, int32_t ref_stride, uint32_t *sse,
                       int32_t *sum) {
  *sse = sse_diff_8width_msa(src, src_stride, ref, ref_stride, 8, sum);
}

void vpx_get16x16var_msa(const uint8_t *src, int32_t src_stride,
                         const uint8_t *ref, int32_t ref_stride,
                         uint32_t *sse, int32_t *sum) {
                         const uint8_t *ref, int32_t ref_stride, uint32_t *sse,
                         int32_t *sum) {
  *sse = sse_diff_16width_msa(src, src_stride, ref, ref_stride, 16, sum);
}

uint32_t vpx_get_mb_ss_msa(const int16_t *src) {
  return get_mb_ss_msa(src);
}
uint32_t vpx_get_mb_ss_msa(const int16_t *src) { return get_mb_ss_msa(src); }

@@ -13,8 +13,7 @@
#include "vpx_dsp/mips/vpx_convolve_msa.h"

static void common_hz_8t_and_aver_dst_4x4_msa(const uint8_t *src,
                                              int32_t src_stride,
                                              uint8_t *dst,
                                              int32_t src_stride, uint8_t *dst,
                                              int32_t dst_stride,
                                              int8_t *filter) {
  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
@@ -48,8 +47,7 @@ static void common_hz_8t_and_aver_dst_4x4_msa(const uint8_t *src,
}

static void common_hz_8t_and_aver_dst_4x8_msa(const uint8_t *src,
                                              int32_t src_stride,
                                              uint8_t *dst,
                                              int32_t src_stride, uint8_t *dst,
                                              int32_t dst_stride,
                                              int8_t *filter) {
  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
@@ -92,10 +90,8 @@ static void common_hz_8t_and_aver_dst_4x8_msa(const uint8_t *src,
}

static void common_hz_8t_and_aver_dst_4w_msa(const uint8_t *src,
                                             int32_t src_stride,
                                             uint8_t *dst,
                                             int32_t dst_stride,
                                             int8_t *filter,
                                             int32_t src_stride, uint8_t *dst,
                                             int32_t dst_stride, int8_t *filter,
                                             int32_t height) {
  if (4 == height) {
    common_hz_8t_and_aver_dst_4x4_msa(src, src_stride, dst, dst_stride, filter);
@@ -105,10 +101,8 @@ static void common_hz_8t_and_aver_dst_4w_msa(const uint8_t *src,
}

static void common_hz_8t_and_aver_dst_8w_msa(const uint8_t *src,
                                             int32_t src_stride,
                                             uint8_t *dst,
                                             int32_t dst_stride,
                                             int8_t *filter,
                                             int32_t src_stride, uint8_t *dst,
                                             int32_t dst_stride, int8_t *filter,
                                             int32_t height) {
  int32_t loop_cnt;
  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
@@ -136,18 +130,16 @@ static void common_hz_8t_and_aver_dst_8w_msa(const uint8_t *src,
    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
    SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
    SAT_SH4_SH(out0, out1, out2, out3, 7);
    CONVERT_UB_AVG_ST8x4_UB(out0, out1, out2, out3, dst0, dst1, dst2, dst3,
                            dst, dst_stride);
    CONVERT_UB_AVG_ST8x4_UB(out0, out1, out2, out3, dst0, dst1, dst2, dst3, dst,
                            dst_stride);
    dst += (4 * dst_stride);
  }
}

static void common_hz_8t_and_aver_dst_16w_msa(const uint8_t *src,
                                              int32_t src_stride,
                                              uint8_t *dst,
                                              int32_t src_stride, uint8_t *dst,
                                              int32_t dst_stride,
                                              int8_t *filter,
                                              int32_t height) {
                                              int8_t *filter, int32_t height) {
  int32_t loop_cnt;
  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
  v16u8 mask0, mask1, mask2, mask3, dst0, dst1;
@@ -199,11 +191,9 @@ static void common_hz_8t_and_aver_dst_16w_msa(const uint8_t *src,
}

static void common_hz_8t_and_aver_dst_32w_msa(const uint8_t *src,
                                              int32_t src_stride,
                                              uint8_t *dst,
                                              int32_t src_stride, uint8_t *dst,
                                              int32_t dst_stride,
                                              int8_t *filter,
                                              int32_t height) {
                                              int8_t *filter, int32_t height) {
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
  v16u8 dst1, dst2, mask0, mask1, mask2, mask3;
@@ -256,11 +246,9 @@ static void common_hz_8t_and_aver_dst_32w_msa(const uint8_t *src,
}

static void common_hz_8t_and_aver_dst_64w_msa(const uint8_t *src,
                                              int32_t src_stride,
                                              uint8_t *dst,
                                              int32_t src_stride, uint8_t *dst,
                                              int32_t dst_stride,
                                              int8_t *filter,
                                              int32_t height) {
                                              int8_t *filter, int32_t height) {
  uint32_t loop_cnt, cnt;
  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
  v16u8 dst1, dst2, mask0, mask1, mask2, mask3;
@@ -318,8 +306,7 @@ static void common_hz_8t_and_aver_dst_64w_msa(const uint8_t *src,
}

static void common_hz_2t_and_aver_dst_4x4_msa(const uint8_t *src,
                                              int32_t src_stride,
                                              uint8_t *dst,
                                              int32_t src_stride, uint8_t *dst,
                                              int32_t dst_stride,
                                              int8_t *filter) {
  v16i8 src0, src1, src2, src3, mask;
@@ -344,8 +331,7 @@ static void common_hz_2t_and_aver_dst_4x4_msa(const uint8_t *src,
}

static void common_hz_2t_and_aver_dst_4x8_msa(const uint8_t *src,
                                              int32_t src_stride,
                                              uint8_t *dst,
                                              int32_t src_stride, uint8_t *dst,
                                              int32_t dst_stride,
                                              int8_t *filter) {
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
@@ -378,10 +364,8 @@ static void common_hz_2t_and_aver_dst_4x8_msa(const uint8_t *src,
}

static void common_hz_2t_and_aver_dst_4w_msa(const uint8_t *src,
                                             int32_t src_stride,
                                             uint8_t *dst,
                                             int32_t dst_stride,
                                             int8_t *filter,
                                             int32_t src_stride, uint8_t *dst,
                                             int32_t dst_stride, int8_t *filter,
                                             int32_t height) {
  if (4 == height) {
    common_hz_2t_and_aver_dst_4x4_msa(src, src_stride, dst, dst_stride, filter);
@@ -391,8 +375,7 @@ static void common_hz_2t_and_aver_dst_4w_msa(const uint8_t *src,
}

static void common_hz_2t_and_aver_dst_8x4_msa(const uint8_t *src,
                                              int32_t src_stride,
                                              uint8_t *dst,
                                              int32_t src_stride, uint8_t *dst,
                                              int32_t dst_stride,
                                              int8_t *filter) {
  v16i8 src0, src1, src2, src3, mask;
@@ -412,16 +395,13 @@ static void common_hz_2t_and_aver_dst_8x4_msa(const uint8_t *src,
              vec2, vec3);
  SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
  PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3,
                     dst, dst_stride);
  PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, dst,
                     dst_stride);
}

static void common_hz_2t_and_aver_dst_8x8mult_msa(const uint8_t *src,
                                                  int32_t src_stride,
                                                  uint8_t *dst,
                                                  int32_t dst_stride,
                                                  int8_t *filter,
                                                  int32_t height) {
static void common_hz_2t_and_aver_dst_8x8mult_msa(
    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
    int8_t *filter, int32_t height) {
  v16i8 src0, src1, src2, src3, mask;
  v16u8 filt0, dst0, dst1, dst2, dst3;
  v8u16 vec0, vec1, vec2, vec3, filt;
@@ -442,8 +422,8 @@ static void common_hz_2t_and_aver_dst_8x8mult_msa(const uint8_t *src,
  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
  LD_SB4(src, src_stride, src0, src1, src2, src3);
  src += (4 * src_stride);
  PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3,
                     dst, dst_stride);
  PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, dst,
                     dst_stride);
  dst += (4 * dst_stride);

  VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
@@ -452,8 +432,8 @@ static void common_hz_2t_and_aver_dst_8x8mult_msa(const uint8_t *src,
              vec2, vec3);
  SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
  PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3,
                     dst, dst_stride);
  PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, dst,
                     dst_stride);
  dst += (4 * dst_stride);

  if (16 == height) {
@@ -467,8 +447,8 @@ static void common_hz_2t_and_aver_dst_8x8mult_msa(const uint8_t *src,
    SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
    LD_SB4(src, src_stride, src0, src1, src2, src3);
    PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3,
                       dst, dst_stride);
    PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, dst,
                       dst_stride);
    dst += (4 * dst_stride);

    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
@@ -477,16 +457,14 @@ static void common_hz_2t_and_aver_dst_8x8mult_msa(const uint8_t *src,
                vec2, vec3);
    SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
    PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3,
                       dst, dst_stride);
    PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, dst,
                       dst_stride);
  }
}

static void common_hz_2t_and_aver_dst_8w_msa(const uint8_t *src,
                                             int32_t src_stride,
                                             uint8_t *dst,
                                             int32_t dst_stride,
                                             int8_t *filter,
                                             int32_t src_stride, uint8_t *dst,
                                             int32_t dst_stride, int8_t *filter,
                                             int32_t height) {
  if (4 == height) {
    common_hz_2t_and_aver_dst_8x4_msa(src, src_stride, dst, dst_stride, filter);
@@ -497,11 +475,9 @@ static void common_hz_2t_and_aver_dst_8w_msa(const uint8_t *src,
}

static void common_hz_2t_and_aver_dst_16w_msa(const uint8_t *src,
                                              int32_t src_stride,
                                              uint8_t *dst,
                                              int32_t src_stride, uint8_t *dst,
                                              int32_t dst_stride,
                                              int8_t *filter,
                                              int32_t height) {
                                              int8_t *filter, int32_t height) {
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
  v16u8 filt0, dst0, dst1, dst2, dst3;
@@ -566,11 +542,9 @@ static void common_hz_2t_and_aver_dst_16w_msa(const uint8_t *src,
}

static void common_hz_2t_and_aver_dst_32w_msa(const uint8_t *src,
                                              int32_t src_stride,
                                              uint8_t *dst,
                                              int32_t src_stride, uint8_t *dst,
                                              int32_t dst_stride,
                                              int8_t *filter,
                                              int32_t height) {
                                              int8_t *filter, int32_t height) {
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
  v16u8 filt0, dst0, dst1, dst2, dst3;
@@ -617,11 +591,9 @@ static void common_hz_2t_and_aver_dst_32w_msa(const uint8_t *src,
}

static void common_hz_2t_and_aver_dst_64w_msa(const uint8_t *src,
                                              int32_t src_stride,
                                              uint8_t *dst,
                                              int32_t src_stride, uint8_t *dst,
                                              int32_t dst_stride,
                                              int8_t *filter,
                                              int32_t height) {
                                              int8_t *filter, int32_t height) {
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
  v16u8 filt0, dst0, dst1, dst2, dst3;
@@ -662,8 +634,8 @@ static void common_hz_2t_and_aver_dst_64w_msa(const uint8_t *src,
void vpx_convolve8_avg_horiz_msa(const uint8_t *src, ptrdiff_t src_stride,
                                 uint8_t *dst, ptrdiff_t dst_stride,
                                 const int16_t *filter_x, int x_step_q4,
                                 const int16_t *filter_y, int y_step_q4,
                                 int w, int h) {
                                 const int16_t *filter_y, int y_step_q4, int w,
                                 int h) {
  int8_t cnt, filt_hor[8];

  assert(x_step_q4 == 16);
@@ -676,67 +648,55 @@ void vpx_convolve8_avg_horiz_msa(const uint8_t *src, ptrdiff_t src_stride,
  if (((const int32_t *)filter_x)[0] == 0) {
    switch (w) {
      case 4:
        common_hz_2t_and_aver_dst_4w_msa(src, (int32_t)src_stride,
                                         dst, (int32_t)dst_stride,
                                         &filt_hor[3], h);
        common_hz_2t_and_aver_dst_4w_msa(src, (int32_t)src_stride, dst,
                                         (int32_t)dst_stride, &filt_hor[3], h);
        break;
      case 8:
        common_hz_2t_and_aver_dst_8w_msa(src, (int32_t)src_stride,
                                         dst, (int32_t)dst_stride,
                                         &filt_hor[3], h);
        common_hz_2t_and_aver_dst_8w_msa(src, (int32_t)src_stride, dst,
                                         (int32_t)dst_stride, &filt_hor[3], h);
        break;
      case 16:
        common_hz_2t_and_aver_dst_16w_msa(src, (int32_t)src_stride,
                                          dst, (int32_t)dst_stride,
                                          &filt_hor[3], h);
        common_hz_2t_and_aver_dst_16w_msa(src, (int32_t)src_stride, dst,
                                          (int32_t)dst_stride, &filt_hor[3], h);
        break;
      case 32:
        common_hz_2t_and_aver_dst_32w_msa(src, (int32_t)src_stride,
                                          dst, (int32_t)dst_stride,
                                          &filt_hor[3], h);
        common_hz_2t_and_aver_dst_32w_msa(src, (int32_t)src_stride, dst,
                                          (int32_t)dst_stride, &filt_hor[3], h);
        break;
      case 64:
        common_hz_2t_and_aver_dst_64w_msa(src, (int32_t)src_stride,
                                          dst, (int32_t)dst_stride,
                                          &filt_hor[3], h);
        common_hz_2t_and_aver_dst_64w_msa(src, (int32_t)src_stride, dst,
                                          (int32_t)dst_stride, &filt_hor[3], h);
        break;
      default:
        vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride,
                                  filter_x, x_step_q4, filter_y, y_step_q4,
                                  w, h);
        vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter_x,
                                  x_step_q4, filter_y, y_step_q4, w, h);
        break;
    }
  } else {
    switch (w) {
      case 4:
        common_hz_8t_and_aver_dst_4w_msa(src, (int32_t)src_stride,
                                         dst, (int32_t)dst_stride,
                                         filt_hor, h);
        common_hz_8t_and_aver_dst_4w_msa(src, (int32_t)src_stride, dst,
                                         (int32_t)dst_stride, filt_hor, h);
        break;
      case 8:
        common_hz_8t_and_aver_dst_8w_msa(src, (int32_t)src_stride,
                                         dst, (int32_t)dst_stride,
                                         filt_hor, h);
        common_hz_8t_and_aver_dst_8w_msa(src, (int32_t)src_stride, dst,
                                         (int32_t)dst_stride, filt_hor, h);
        break;
      case 16:
        common_hz_8t_and_aver_dst_16w_msa(src, (int32_t)src_stride,
                                          dst, (int32_t)dst_stride,
                                          filt_hor, h);
        common_hz_8t_and_aver_dst_16w_msa(src, (int32_t)src_stride, dst,
                                          (int32_t)dst_stride, filt_hor, h);
        break;
      case 32:
        common_hz_8t_and_aver_dst_32w_msa(src, (int32_t)src_stride,
                                          dst, (int32_t)dst_stride,
                                          filt_hor, h);
        common_hz_8t_and_aver_dst_32w_msa(src, (int32_t)src_stride, dst,
                                          (int32_t)dst_stride, filt_hor, h);
        break;
      case 64:
        common_hz_8t_and_aver_dst_64w_msa(src, (int32_t)src_stride,
                                          dst, (int32_t)dst_stride,
                                          filt_hor, h);
        common_hz_8t_and_aver_dst_64w_msa(src, (int32_t)src_stride, dst,
                                          (int32_t)dst_stride, filt_hor, h);
        break;
      default:
        vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride,
                                  filter_x, x_step_q4, filter_y, y_step_q4,
                                  w, h);
        vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter_x,
                                  x_step_q4, filter_y, y_step_q4, w, h);
        break;
    }
  }

@@ -12,13 +12,9 @@
#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/mips/vpx_convolve_msa.h"

static void common_hv_8ht_8vt_and_aver_dst_4w_msa(const uint8_t *src,
                                                  int32_t src_stride,
                                                  uint8_t *dst,
                                                  int32_t dst_stride,
                                                  int8_t *filter_horiz,
                                                  int8_t *filter_vert,
                                                  int32_t height) {
static void common_hv_8ht_8vt_and_aver_dst_4w_msa(
    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
    int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
  v16u8 dst0, dst1, dst2, dst3, mask0, mask1, mask2, mask3, tmp0, tmp1;
@@ -64,15 +60,15 @@ static void common_hv_8ht_8vt_and_aver_dst_4w_msa(const uint8_t *src,
    src += (4 * src_stride);

    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
    hz_out7 = HORIZ_8TAP_FILT(src7, src8, mask0, mask1, mask2, mask3,
                              filt_hz0, filt_hz1, filt_hz2, filt_hz3);
    hz_out7 = HORIZ_8TAP_FILT(src7, src8, mask0, mask1, mask2, mask3, filt_hz0,
                              filt_hz1, filt_hz2, filt_hz3);
    hz_out6 = (v8i16)__msa_sldi_b((v16i8)hz_out7, (v16i8)hz_out5, 8);
    vec3 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6);
    res0 = FILT_8TAP_DPADD_S_H(vec0, vec1, vec2, vec3, filt_vt0, filt_vt1,
                               filt_vt2, filt_vt3);

    hz_out9 = HORIZ_8TAP_FILT(src9, src10, mask0, mask1, mask2, mask3,
                              filt_hz0, filt_hz1, filt_hz2, filt_hz3);
    hz_out9 = HORIZ_8TAP_FILT(src9, src10, mask0, mask1, mask2, mask3, filt_hz0,
                              filt_hz1, filt_hz2, filt_hz3);
    hz_out8 = (v8i16)__msa_sldi_b((v16i8)hz_out9, (v16i8)hz_out7, 8);
    vec4 = (v8i16)__msa_ilvev_b((v16i8)hz_out9, (v16i8)hz_out8);
    res1 = FILT_8TAP_DPADD_S_H(vec1, vec2, vec3, vec4, filt_vt0, filt_vt1,
@@ -94,13 +90,9 @@ static void common_hv_8ht_8vt_and_aver_dst_4w_msa(const uint8_t *src,
  }
}

static void common_hv_8ht_8vt_and_aver_dst_8w_msa(const uint8_t *src,
                                                  int32_t src_stride,
                                                  uint8_t *dst,
                                                  int32_t dst_stride,
                                                  int8_t *filter_horiz,
                                                  int8_t *filter_vert,
                                                  int32_t height) {
static void common_hv_8ht_8vt_and_aver_dst_8w_msa(
    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
    int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
  v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3;
@@ -154,20 +146,20 @@ static void common_hv_8ht_8vt_and_aver_dst_8w_msa(const uint8_t *src,

    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);

    hz_out7 = HORIZ_8TAP_FILT(src7, src7, mask0, mask1, mask2, mask3,
                              filt_hz0, filt_hz1, filt_hz2, filt_hz3);
    hz_out7 = HORIZ_8TAP_FILT(src7, src7, mask0, mask1, mask2, mask3, filt_hz0,
                              filt_hz1, filt_hz2, filt_hz3);
    out3 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6);
    tmp0 = FILT_8TAP_DPADD_S_H(out0, out1, out2, out3, filt_vt0, filt_vt1,
                               filt_vt2, filt_vt3);

    hz_out8 = HORIZ_8TAP_FILT(src8, src8, mask0, mask1, mask2, mask3,
                              filt_hz0, filt_hz1, filt_hz2, filt_hz3);
    hz_out8 = HORIZ_8TAP_FILT(src8, src8, mask0, mask1, mask2, mask3, filt_hz0,
                              filt_hz1, filt_hz2, filt_hz3);
    out7 = (v8i16)__msa_ilvev_b((v16i8)hz_out8, (v16i8)hz_out7);
    tmp1 = FILT_8TAP_DPADD_S_H(out4, out5, out6, out7, filt_vt0, filt_vt1,
                               filt_vt2, filt_vt3);

    hz_out9 = HORIZ_8TAP_FILT(src9, src9, mask0, mask1, mask2, mask3,
                              filt_hz0, filt_hz1, filt_hz2, filt_hz3);
    hz_out9 = HORIZ_8TAP_FILT(src9, src9, mask0, mask1, mask2, mask3, filt_hz0,
                              filt_hz1, filt_hz2, filt_hz3);
    out8 = (v8i16)__msa_ilvev_b((v16i8)hz_out9, (v16i8)hz_out8);
    tmp2 = FILT_8TAP_DPADD_S_H(out1, out2, out3, out8, filt_vt0, filt_vt1,
                               filt_vt2, filt_vt3);
@@ -180,8 +172,8 @@ static void common_hv_8ht_8vt_and_aver_dst_8w_msa(const uint8_t *src,

    SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
    SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
    CONVERT_UB_AVG_ST8x4_UB(tmp0, tmp1, tmp2, tmp3, dst0, dst1, dst2, dst3,
                            dst, dst_stride);
    CONVERT_UB_AVG_ST8x4_UB(tmp0, tmp1, tmp2, tmp3, dst0, dst1, dst2, dst3, dst,
                            dst_stride);
    dst += (4 * dst_stride);

    hz_out6 = hz_out10;
@@ -194,13 +186,9 @@ static void common_hv_8ht_8vt_and_aver_dst_8w_msa(const uint8_t *src,
  }
}

static void common_hv_8ht_8vt_and_aver_dst_16w_msa(const uint8_t *src,
                                                   int32_t src_stride,
                                                   uint8_t *dst,
                                                   int32_t dst_stride,
                                                   int8_t *filter_horiz,
                                                   int8_t *filter_vert,
                                                   int32_t height) {
static void common_hv_8ht_8vt_and_aver_dst_16w_msa(
    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
    int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
  int32_t multiple8_cnt;
  for (multiple8_cnt = 2; multiple8_cnt--;) {
    common_hv_8ht_8vt_and_aver_dst_8w_msa(src, src_stride, dst, dst_stride,
@@ -210,13 +198,9 @@ static void common_hv_8ht_8vt_and_aver_dst_16w_msa(const uint8_t *src,
  }
}

static void common_hv_8ht_8vt_and_aver_dst_32w_msa(const uint8_t *src,
                                                   int32_t src_stride,
                                                   uint8_t *dst,
                                                   int32_t dst_stride,
                                                   int8_t *filter_horiz,
                                                   int8_t *filter_vert,
                                                   int32_t height) {
static void common_hv_8ht_8vt_and_aver_dst_32w_msa(
    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
    int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
  int32_t multiple8_cnt;
  for (multiple8_cnt = 4; multiple8_cnt--;) {
    common_hv_8ht_8vt_and_aver_dst_8w_msa(src, src_stride, dst, dst_stride,
@@ -226,13 +210,9 @@ static void common_hv_8ht_8vt_and_aver_dst_32w_msa(const uint8_t *src,
  }
}

static void common_hv_8ht_8vt_and_aver_dst_64w_msa(const uint8_t *src,
                                                   int32_t src_stride,
                                                   uint8_t *dst,
                                                   int32_t dst_stride,
                                                   int8_t *filter_horiz,
                                                   int8_t *filter_vert,
                                                   int32_t height) {
static void common_hv_8ht_8vt_and_aver_dst_64w_msa(
    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
    int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
  int32_t multiple8_cnt;
  for (multiple8_cnt = 8; multiple8_cnt--;) {
    common_hv_8ht_8vt_and_aver_dst_8w_msa(src, src_stride, dst, dst_stride,
@@ -242,12 +222,9 @@ static void common_hv_8ht_8vt_and_aver_dst_64w_msa(const uint8_t *src,
  }
}

static void common_hv_2ht_2vt_and_aver_dst_4x4_msa(const uint8_t *src,
                                                   int32_t src_stride,
                                                   uint8_t *dst,
                                                   int32_t dst_stride,
                                                   int8_t *filter_horiz,
                                                   int8_t *filter_vert) {
static void common_hv_2ht_2vt_and_aver_dst_4x4_msa(
    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
    int8_t *filter_horiz, int8_t *filter_vert) {
  v16i8 src0, src1, src2, src3, src4, mask;
  v16u8 filt_hz, filt_vt, vec0, vec1;
  v16u8 dst0, dst1, dst2, dst3, res0, res1;
@@ -280,12 +257,9 @@ static void common_hv_2ht_2vt_and_aver_dst_4x4_msa(const uint8_t *src,
  ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
}

static void common_hv_2ht_2vt_and_aver_dst_4x8_msa(const uint8_t *src,
                                                   int32_t src_stride,
                                                   uint8_t *dst,
                                                   int32_t dst_stride,
                                                   int8_t *filter_horiz,
                                                   int8_t *filter_vert) {
static void common_hv_2ht_2vt_and_aver_dst_4x8_msa(
    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
    int8_t *filter_horiz, int8_t *filter_vert) {
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask;
  v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3, res0, res1, res2, res3;
  v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
@@ -316,29 +290,25 @@ static void common_hv_2ht_2vt_and_aver_dst_4x8_msa(const uint8_t *src,
  hz_out7 = (v8u16)__msa_pckod_d((v2i64)hz_out8, (v2i64)hz_out6);

  LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
  ILVR_W4_UB(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, dst0, dst2,
             dst4, dst6);
  ILVR_W4_UB(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, dst0, dst2, dst4,
             dst6);
  ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
  ILVEV_B2_UB(hz_out4, hz_out5, hz_out6, hz_out7, vec2, vec3);
  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt_vt, filt_vt, filt_vt, filt_vt,
              tmp0, tmp1, tmp2, tmp3);
  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt_vt, filt_vt, filt_vt, filt_vt, tmp0,
              tmp1, tmp2, tmp3);
  SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
  PCKEV_B4_UB(tmp0, tmp0, tmp1, tmp1, tmp2, tmp2, tmp3, tmp3, res0, res1,
              res2, res3);
  AVER_UB4_UB(res0, dst0, res1, dst2, res2, dst4, res3, dst6, res0, res1,
              res2, res3);
  PCKEV_B4_UB(tmp0, tmp0, tmp1, tmp1, tmp2, tmp2, tmp3, tmp3, res0, res1, res2,
              res3);
  AVER_UB4_UB(res0, dst0, res1, dst2, res2, dst4, res3, dst6, res0, res1, res2,
              res3);
  ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
  dst += (4 * dst_stride);
  ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride);
}

static void common_hv_2ht_2vt_and_aver_dst_4w_msa(const uint8_t *src,
                                                  int32_t src_stride,
                                                  uint8_t *dst,
                                                  int32_t dst_stride,
                                                  int8_t *filter_horiz,
                                                  int8_t *filter_vert,
                                                  int32_t height) {
static void common_hv_2ht_2vt_and_aver_dst_4w_msa(
    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
    int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
  if (4 == height) {
    common_hv_2ht_2vt_and_aver_dst_4x4_msa(src, src_stride, dst, dst_stride,
                                           filter_horiz, filter_vert);
@@ -348,12 +318,9 @@ static void common_hv_2ht_2vt_and_aver_dst_4w_msa(const uint8_t *src,
  }
}

static void common_hv_2ht_2vt_and_aver_dst_8x4_msa(const uint8_t *src,
                                                   int32_t src_stride,
                                                   uint8_t *dst,
                                                   int32_t dst_stride,
                                                   int8_t *filter_horiz,
                                                   int8_t *filter_vert) {
static void common_hv_2ht_2vt_and_aver_dst_8x4_msa(
    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
    int8_t *filter_horiz, int8_t *filter_vert) {
  v16i8 src0, src1, src2, src3, src4, mask;
  v16u8 filt_hz, filt_vt, dst0, dst1, dst2, dst3, vec0, vec1, vec2, vec3;
  v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
@@ -390,17 +357,13 @@ static void common_hv_2ht_2vt_and_aver_dst_8x4_msa(const uint8_t *src,
  tmp3 = __msa_dotp_u_h(vec3, filt_vt);

  SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
  PCKEV_AVG_ST8x4_UB(tmp0, dst0, tmp1, dst1, tmp2, dst2, tmp3, dst3,
                     dst, dst_stride);
  PCKEV_AVG_ST8x4_UB(tmp0, dst0, tmp1, dst1, tmp2, dst2, tmp3, dst3, dst,
                     dst_stride);
}

static void common_hv_2ht_2vt_and_aver_dst_8x8mult_msa(const uint8_t *src,
                                                       int32_t src_stride,
                                                       uint8_t *dst,
                                                       int32_t dst_stride,
                                                       int8_t *filter_horiz,
                                                       int8_t *filter_vert,
                                                       int32_t height) {
static void common_hv_2ht_2vt_and_aver_dst_8x8mult_msa(
    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
    int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, src4, mask;
  v16u8 filt_hz, filt_vt, vec0, dst0, dst1, dst2, dst3;
@@ -445,36 +408,27 @@ static void common_hv_2ht_2vt_and_aver_dst_8x8mult_msa(const uint8_t *src,

    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
    PCKEV_AVG_ST8x4_UB(tmp0, dst0, tmp1, dst1, tmp2, dst2, tmp3, dst3,
                       dst, dst_stride);
    PCKEV_AVG_ST8x4_UB(tmp0, dst0, tmp1, dst1, tmp2, dst2, tmp3, dst3, dst,
                       dst_stride);
    dst += (4 * dst_stride);
  }
}

static void common_hv_2ht_2vt_and_aver_dst_8w_msa(const uint8_t *src,
                                                  int32_t src_stride,
                                                  uint8_t *dst,
                                                  int32_t dst_stride,
                                                  int8_t *filter_horiz,
                                                  int8_t *filter_vert,
                                                  int32_t height) {
static void common_hv_2ht_2vt_and_aver_dst_8w_msa(
    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
    int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
  if (4 == height) {
    common_hv_2ht_2vt_and_aver_dst_8x4_msa(src, src_stride, dst, dst_stride,
                                           filter_horiz, filter_vert);
  } else {
    common_hv_2ht_2vt_and_aver_dst_8x8mult_msa(src, src_stride, dst, dst_stride,
                                               filter_horiz, filter_vert,
                                               height);
    common_hv_2ht_2vt_and_aver_dst_8x8mult_msa(
        src, src_stride, dst, dst_stride, filter_horiz, filter_vert, height);
  }
}

static void common_hv_2ht_2vt_and_aver_dst_16w_msa(const uint8_t *src,
                                                   int32_t src_stride,
                                                   uint8_t *dst,
                                                   int32_t dst_stride,
                                                   int8_t *filter_horiz,
                                                   int8_t *filter_vert,
                                                   int32_t height) {
static void common_hv_2ht_2vt_and_aver_dst_16w_msa(
    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
    int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
 | 
			
		||||
  uint32_t loop_cnt;
 | 
			
		||||
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
 | 
			
		||||
  v16u8 filt_hz, filt_vt, vec0, vec1, dst0, dst1, dst2, dst3;
 | 
			
		||||
@@ -536,13 +490,9 @@ static void common_hv_2ht_2vt_and_aver_dst_16w_msa(const uint8_t *src,
 | 
			
		||||
  }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
static void common_hv_2ht_2vt_and_aver_dst_32w_msa(const uint8_t *src,
 | 
			
		||||
                                                   int32_t src_stride,
 | 
			
		||||
                                                   uint8_t *dst,
 | 
			
		||||
                                                   int32_t dst_stride,
 | 
			
		||||
                                                   int8_t *filter_horiz,
 | 
			
		||||
                                                   int8_t *filter_vert,
 | 
			
		||||
                                                   int32_t height) {
 | 
			
		||||
static void common_hv_2ht_2vt_and_aver_dst_32w_msa(
 | 
			
		||||
    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
 | 
			
		||||
    int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
 | 
			
		||||
  int32_t multiple8_cnt;
 | 
			
		||||
  for (multiple8_cnt = 2; multiple8_cnt--;) {
 | 
			
		||||
    common_hv_2ht_2vt_and_aver_dst_16w_msa(src, src_stride, dst, dst_stride,
 | 
			
		||||
@@ -552,13 +502,9 @@ static void common_hv_2ht_2vt_and_aver_dst_32w_msa(const uint8_t *src,
 | 
			
		||||
  }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
static void common_hv_2ht_2vt_and_aver_dst_64w_msa(const uint8_t *src,
 | 
			
		||||
                                                   int32_t src_stride,
 | 
			
		||||
                                                   uint8_t *dst,
 | 
			
		||||
                                                   int32_t dst_stride,
 | 
			
		||||
                                                   int8_t *filter_horiz,
 | 
			
		||||
                                                   int8_t *filter_vert,
 | 
			
		||||
                                                   int32_t height) {
 | 
			
		||||
static void common_hv_2ht_2vt_and_aver_dst_64w_msa(
 | 
			
		||||
    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
 | 
			
		||||
    int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
 | 
			
		||||
  int32_t multiple8_cnt;
 | 
			
		||||
  for (multiple8_cnt = 4; multiple8_cnt--;) {
 | 
			
		||||
    common_hv_2ht_2vt_and_aver_dst_16w_msa(src, src_stride, dst, dst_stride,
 | 
			
		||||
@@ -571,8 +517,8 @@ static void common_hv_2ht_2vt_and_aver_dst_64w_msa(const uint8_t *src,
 | 
			
		||||
void vpx_convolve8_avg_msa(const uint8_t *src, ptrdiff_t src_stride,
 | 
			
		||||
                           uint8_t *dst, ptrdiff_t dst_stride,
 | 
			
		||||
                           const int16_t *filter_x, int x_step_q4,
 | 
			
		||||
                           const int16_t *filter_y, int y_step_q4,
 | 
			
		||||
                           int w, int h) {
 | 
			
		||||
                           const int16_t *filter_y, int y_step_q4, int w,
 | 
			
		||||
                           int h) {
 | 
			
		||||
  int8_t cnt, filt_hor[8], filt_ver[8];
 | 
			
		||||
 | 
			
		||||
  assert(x_step_q4 == 16);
 | 
			
		||||
@@ -589,72 +535,69 @@ void vpx_convolve8_avg_msa(const uint8_t *src, ptrdiff_t src_stride,
 | 
			
		||||
      ((const int32_t *)filter_y)[0] == 0) {
 | 
			
		||||
    switch (w) {
 | 
			
		||||
      case 4:
 | 
			
		||||
        common_hv_2ht_2vt_and_aver_dst_4w_msa(src, (int32_t)src_stride,
 | 
			
		||||
                                              dst, (int32_t)dst_stride,
 | 
			
		||||
                                              &filt_hor[3], &filt_ver[3], h);
 | 
			
		||||
        common_hv_2ht_2vt_and_aver_dst_4w_msa(src, (int32_t)src_stride, dst,
 | 
			
		||||
                                              (int32_t)dst_stride, &filt_hor[3],
 | 
			
		||||
                                              &filt_ver[3], h);
 | 
			
		||||
        break;
 | 
			
		||||
      case 8:
 | 
			
		||||
        common_hv_2ht_2vt_and_aver_dst_8w_msa(src, (int32_t)src_stride,
 | 
			
		||||
                                              dst, (int32_t)dst_stride,
 | 
			
		||||
                                              &filt_hor[3], &filt_ver[3], h);
 | 
			
		||||
        common_hv_2ht_2vt_and_aver_dst_8w_msa(src, (int32_t)src_stride, dst,
 | 
			
		||||
                                              (int32_t)dst_stride, &filt_hor[3],
 | 
			
		||||
                                              &filt_ver[3], h);
 | 
			
		||||
        break;
 | 
			
		||||
      case 16:
 | 
			
		||||
        common_hv_2ht_2vt_and_aver_dst_16w_msa(src, (int32_t)src_stride,
 | 
			
		||||
                                               dst, (int32_t)dst_stride,
 | 
			
		||||
        common_hv_2ht_2vt_and_aver_dst_16w_msa(src, (int32_t)src_stride, dst,
 | 
			
		||||
                                               (int32_t)dst_stride,
 | 
			
		||||
                                               &filt_hor[3], &filt_ver[3], h);
 | 
			
		||||
        break;
 | 
			
		||||
      case 32:
 | 
			
		||||
        common_hv_2ht_2vt_and_aver_dst_32w_msa(src, (int32_t)src_stride,
 | 
			
		||||
                                               dst, (int32_t)dst_stride,
 | 
			
		||||
        common_hv_2ht_2vt_and_aver_dst_32w_msa(src, (int32_t)src_stride, dst,
 | 
			
		||||
                                               (int32_t)dst_stride,
 | 
			
		||||
                                               &filt_hor[3], &filt_ver[3], h);
 | 
			
		||||
        break;
 | 
			
		||||
      case 64:
 | 
			
		||||
        common_hv_2ht_2vt_and_aver_dst_64w_msa(src, (int32_t)src_stride,
 | 
			
		||||
                                               dst, (int32_t)dst_stride,
 | 
			
		||||
        common_hv_2ht_2vt_and_aver_dst_64w_msa(src, (int32_t)src_stride, dst,
 | 
			
		||||
                                               (int32_t)dst_stride,
 | 
			
		||||
                                               &filt_hor[3], &filt_ver[3], h);
 | 
			
		||||
        break;
 | 
			
		||||
      default:
 | 
			
		||||
        vpx_convolve8_avg_c(src, src_stride, dst, dst_stride,
 | 
			
		||||
                            filter_x, x_step_q4, filter_y, y_step_q4,
 | 
			
		||||
                            w, h);
 | 
			
		||||
        vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter_x,
 | 
			
		||||
                            x_step_q4, filter_y, y_step_q4, w, h);
 | 
			
		||||
        break;
 | 
			
		||||
    }
 | 
			
		||||
  } else if (((const int32_t *)filter_x)[0] == 0 ||
 | 
			
		||||
             ((const int32_t *)filter_y)[0] == 0) {
 | 
			
		||||
    vpx_convolve8_avg_c(src, src_stride, dst, dst_stride,
 | 
			
		||||
                        filter_x, x_step_q4, filter_y, y_step_q4,
 | 
			
		||||
                        w, h);
 | 
			
		||||
    vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4,
 | 
			
		||||
                        filter_y, y_step_q4, w, h);
 | 
			
		||||
  } else {
 | 
			
		||||
    switch (w) {
 | 
			
		||||
      case 4:
 | 
			
		||||
        common_hv_8ht_8vt_and_aver_dst_4w_msa(src, (int32_t)src_stride,
 | 
			
		||||
                                              dst, (int32_t)dst_stride,
 | 
			
		||||
                                              filt_hor, filt_ver, h);
 | 
			
		||||
        common_hv_8ht_8vt_and_aver_dst_4w_msa(src, (int32_t)src_stride, dst,
 | 
			
		||||
                                              (int32_t)dst_stride, filt_hor,
 | 
			
		||||
                                              filt_ver, h);
 | 
			
		||||
        break;
 | 
			
		||||
      case 8:
 | 
			
		||||
        common_hv_8ht_8vt_and_aver_dst_8w_msa(src, (int32_t)src_stride,
 | 
			
		||||
                                              dst, (int32_t)dst_stride,
 | 
			
		||||
                                              filt_hor, filt_ver, h);
 | 
			
		||||
        common_hv_8ht_8vt_and_aver_dst_8w_msa(src, (int32_t)src_stride, dst,
 | 
			
		||||
                                              (int32_t)dst_stride, filt_hor,
 | 
			
		||||
                                              filt_ver, h);
 | 
			
		||||
        break;
 | 
			
		||||
      case 16:
 | 
			
		||||
        common_hv_8ht_8vt_and_aver_dst_16w_msa(src, (int32_t)src_stride,
 | 
			
		||||
                                               dst, (int32_t)dst_stride,
 | 
			
		||||
                                               filt_hor, filt_ver, h);
 | 
			
		||||
        common_hv_8ht_8vt_and_aver_dst_16w_msa(src, (int32_t)src_stride, dst,
 | 
			
		||||
                                               (int32_t)dst_stride, filt_hor,
 | 
			
		||||
                                               filt_ver, h);
 | 
			
		||||
        break;
 | 
			
		||||
      case 32:
 | 
			
		||||
        common_hv_8ht_8vt_and_aver_dst_32w_msa(src, (int32_t)src_stride,
 | 
			
		||||
                                               dst, (int32_t)dst_stride,
 | 
			
		||||
                                               filt_hor, filt_ver, h);
 | 
			
		||||
        common_hv_8ht_8vt_and_aver_dst_32w_msa(src, (int32_t)src_stride, dst,
 | 
			
		||||
                                               (int32_t)dst_stride, filt_hor,
 | 
			
		||||
                                               filt_ver, h);
 | 
			
		||||
        break;
 | 
			
		||||
      case 64:
 | 
			
		||||
        common_hv_8ht_8vt_and_aver_dst_64w_msa(src, (int32_t)src_stride,
 | 
			
		||||
                                               dst, (int32_t)dst_stride,
 | 
			
		||||
                                               filt_hor, filt_ver, h);
 | 
			
		||||
        common_hv_8ht_8vt_and_aver_dst_64w_msa(src, (int32_t)src_stride, dst,
 | 
			
		||||
                                               (int32_t)dst_stride, filt_hor,
 | 
			
		||||
                                               filt_ver, h);
 | 
			
		||||
        break;
 | 
			
		||||
      default:
 | 
			
		||||
        vpx_convolve8_avg_c(src, src_stride, dst, dst_stride,
 | 
			
		||||
                            filter_x, x_step_q4, filter_y, y_step_q4,
 | 
			
		||||
                            w, h);
 | 
			
		||||
        vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter_x,
 | 
			
		||||
                            x_step_q4, filter_y, y_step_q4, w, h);
 | 
			
		||||
        break;
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
 
 | 
			
		||||
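The churn in the hunks above is mechanical rather than semantic: clang-format
keeps arguments aligned under the opening parenthesis while the result fits in
80 columns, and otherwise breaks right after the parenthesis and packs the
arguments at a four-space continuation indent. A minimal sketch of the two
shapes, using hypothetical declarations rather than anything from libvpx:

#include <stdint.h>

/* Fits in 80 columns: arguments stay aligned under the opening parenthesis. */
void short_name(const uint8_t *src, int32_t src_stride, uint8_t *dst,
                int32_t dst_stride);

/* Too wide for aligned wrapping: clang-format breaks after the opening
 * parenthesis and packs the arguments at a four-space indent instead. */
void a_name_long_enough_that_aligned_arguments_would_overflow_the_line(
    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
    int8_t *filter_horiz, int8_t *filter_vert, int32_t height);
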
@@ -13,10 +13,8 @@
#include "vpx_dsp/mips/vpx_convolve_msa.h"

static void common_vt_8t_and_aver_dst_4w_msa(const uint8_t *src,
                                             int32_t src_stride,
                                             uint8_t *dst,
                                             int32_t dst_stride,
                                             int8_t *filter,
                                             int32_t src_stride, uint8_t *dst,
                                             int32_t dst_stride, int8_t *filter,
                                             int32_t height) {
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
@@ -73,10 +71,8 @@ static void common_vt_8t_and_aver_dst_4w_msa(const uint8_t *src,
}

static void common_vt_8t_and_aver_dst_8w_msa(const uint8_t *src,
                                             int32_t src_stride,
                                             uint8_t *dst,
                                             int32_t dst_stride,
                                             int8_t *filter,
                                             int32_t src_stride, uint8_t *dst,
                                             int32_t dst_stride, int8_t *filter,
                                             int32_t height) {
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
@@ -106,18 +102,18 @@ static void common_vt_8t_and_aver_dst_8w_msa(const uint8_t *src,
    XORI_B4_128_SB(src7, src8, src9, src10);
    ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
               src87_r, src98_r, src109_r);
    out0 = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0,
                               filt1, filt2, filt3);
    out1 = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0,
                               filt1, filt2, filt3);
    out2 = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0,
                               filt1, filt2, filt3);
    out0 = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0, filt1,
                               filt2, filt3);
    out1 = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0, filt1,
                               filt2, filt3);
    out2 = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0, filt1,
                               filt2, filt3);
    out3 = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, filt0,
                               filt1, filt2, filt3);
    SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
    SAT_SH4_SH(out0, out1, out2, out3, 7);
    CONVERT_UB_AVG_ST8x4_UB(out0, out1, out2, out3, dst0, dst1, dst2, dst3,
                            dst, dst_stride);
    CONVERT_UB_AVG_ST8x4_UB(out0, out1, out2, out3, dst0, dst1, dst2, dst3, dst,
                            dst_stride);
    dst += (4 * dst_stride);

    src10_r = src54_r;
@@ -130,13 +126,9 @@ static void common_vt_8t_and_aver_dst_8w_msa(const uint8_t *src,
  }
}

static void common_vt_8t_and_aver_dst_16w_mult_msa(const uint8_t *src,
                                                   int32_t src_stride,
                                                   uint8_t *dst,
                                                   int32_t dst_stride,
                                                   int8_t *filter,
                                                   int32_t height,
                                                   int32_t width) {
static void common_vt_8t_and_aver_dst_16w_mult_msa(
    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
    int8_t *filter, int32_t height, int32_t width) {
  const uint8_t *src_tmp;
  uint8_t *dst_tmp;
  uint32_t loop_cnt, cnt;
@@ -227,38 +219,31 @@ static void common_vt_8t_and_aver_dst_16w_mult_msa(const uint8_t *src,
}

static void common_vt_8t_and_aver_dst_16w_msa(const uint8_t *src,
                                              int32_t src_stride,
                                              uint8_t *dst,
                                              int32_t src_stride, uint8_t *dst,
                                              int32_t dst_stride,
                                              int8_t *filter,
                                              int32_t height) {
                                              int8_t *filter, int32_t height) {
  common_vt_8t_and_aver_dst_16w_mult_msa(src, src_stride, dst, dst_stride,
                                         filter, height, 16);
}

static void common_vt_8t_and_aver_dst_32w_msa(const uint8_t *src,
                                              int32_t src_stride,
                                              uint8_t *dst,
                                              int32_t src_stride, uint8_t *dst,
                                              int32_t dst_stride,
                                              int8_t *filter,
                                              int32_t height) {
                                              int8_t *filter, int32_t height) {
  common_vt_8t_and_aver_dst_16w_mult_msa(src, src_stride, dst, dst_stride,
                                         filter, height, 32);
}

static void common_vt_8t_and_aver_dst_64w_msa(const uint8_t *src,
                                              int32_t src_stride,
                                              uint8_t *dst,
                                              int32_t src_stride, uint8_t *dst,
                                              int32_t dst_stride,
                                              int8_t *filter,
                                              int32_t height) {
                                              int8_t *filter, int32_t height) {
  common_vt_8t_and_aver_dst_16w_mult_msa(src, src_stride, dst, dst_stride,
                                         filter, height, 64);
}

static void common_vt_2t_and_aver_dst_4x4_msa(const uint8_t *src,
                                              int32_t src_stride,
                                              uint8_t *dst,
                                              int32_t src_stride, uint8_t *dst,
                                              int32_t dst_stride,
                                              int8_t *filter) {
  v16i8 src0, src1, src2, src3, src4;
@@ -292,8 +277,7 @@ static void common_vt_2t_and_aver_dst_4x4_msa(const uint8_t *src,
}

static void common_vt_2t_and_aver_dst_4x8_msa(const uint8_t *src,
                                              int32_t src_stride,
                                              uint8_t *dst,
                                              int32_t src_stride, uint8_t *dst,
                                              int32_t dst_stride,
                                              int8_t *filter) {
  v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
@@ -311,15 +295,15 @@ static void common_vt_2t_and_aver_dst_4x8_msa(const uint8_t *src,
  src8 = LD_SB(src);

  LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
  ILVR_W4_UB(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, dst0, dst1,
             dst2, dst3);
  ILVR_W4_UB(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, dst0, dst1, dst2,
             dst3);
  ILVR_D2_UB(dst1, dst0, dst3, dst2, dst0, dst1);
  ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
             src32_r, src43_r);
  ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
             src76_r, src87_r);
  ILVR_D4_UB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r,
             src87_r, src76_r, src2110, src4332, src6554, src8776);
  ILVR_D4_UB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, src87_r,
             src76_r, src2110, src4332, src6554, src8776);
  DOTP_UB4_UH(src2110, src4332, src6554, src8776, filt0, filt0, filt0, filt0,
              tmp0, tmp1, tmp2, tmp3);
  SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
@@ -331,10 +315,8 @@ static void common_vt_2t_and_aver_dst_4x8_msa(const uint8_t *src,
}

static void common_vt_2t_and_aver_dst_4w_msa(const uint8_t *src,
                                             int32_t src_stride,
                                             uint8_t *dst,
                                             int32_t dst_stride,
                                             int8_t *filter,
                                             int32_t src_stride, uint8_t *dst,
                                             int32_t dst_stride, int8_t *filter,
                                             int32_t height) {
  if (4 == height) {
    common_vt_2t_and_aver_dst_4x4_msa(src, src_stride, dst, dst_stride, filter);
@@ -344,8 +326,7 @@ static void common_vt_2t_and_aver_dst_4w_msa(const uint8_t *src,
}

static void common_vt_2t_and_aver_dst_8x4_msa(const uint8_t *src,
                                              int32_t src_stride,
                                              uint8_t *dst,
                                              int32_t src_stride, uint8_t *dst,
                                              int32_t dst_stride,
                                              int8_t *filter) {
  v16u8 src0, src1, src2, src3, src4;
@@ -364,16 +345,13 @@ static void common_vt_2t_and_aver_dst_8x4_msa(const uint8_t *src,
  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1,
              tmp2, tmp3);
  SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
  PCKEV_AVG_ST8x4_UB(tmp0, dst0, tmp1, dst1, tmp2, dst2, tmp3, dst3,
                     dst, dst_stride);
  PCKEV_AVG_ST8x4_UB(tmp0, dst0, tmp1, dst1, tmp2, dst2, tmp3, dst3, dst,
                     dst_stride);
}

static void common_vt_2t_and_aver_dst_8x8mult_msa(const uint8_t *src,
                                                  int32_t src_stride,
                                                  uint8_t *dst,
                                                  int32_t dst_stride,
                                                  int8_t *filter,
                                                  int32_t height) {
static void common_vt_2t_and_aver_dst_8x8mult_msa(
    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
    int8_t *filter, int32_t height) {
  uint32_t loop_cnt;
  v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
  v16u8 dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
@@ -393,22 +371,22 @@ static void common_vt_2t_and_aver_dst_8x8mult_msa(const uint8_t *src,
    src += (8 * src_stride);
    LD_UB8(dst, dst_stride, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8);

    ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, vec0, vec1,
               vec2, vec3);
    ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7, vec4, vec5,
               vec6, vec7);
    ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, vec0, vec1, vec2,
               vec3);
    ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7, vec4, vec5, vec6,
               vec7);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1,
                tmp2, tmp3);
    SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
    PCKEV_AVG_ST8x4_UB(tmp0, dst1, tmp1, dst2, tmp2, dst3, tmp3, dst4,
                       dst, dst_stride);
    PCKEV_AVG_ST8x4_UB(tmp0, dst1, tmp1, dst2, tmp2, dst3, tmp3, dst4, dst,
                       dst_stride);
    dst += (4 * dst_stride);

    DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, tmp0, tmp1,
                tmp2, tmp3);
    SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
    PCKEV_AVG_ST8x4_UB(tmp0, dst5, tmp1, dst6, tmp2, dst7, tmp3, dst8,
                       dst, dst_stride);
    PCKEV_AVG_ST8x4_UB(tmp0, dst5, tmp1, dst6, tmp2, dst7, tmp3, dst8, dst,
                       dst_stride);
    dst += (4 * dst_stride);

    src0 = src8;
@@ -416,10 +394,8 @@ static void common_vt_2t_and_aver_dst_8x8mult_msa(const uint8_t *src,
}

static void common_vt_2t_and_aver_dst_8w_msa(const uint8_t *src,
                                             int32_t src_stride,
                                             uint8_t *dst,
                                             int32_t dst_stride,
                                             int8_t *filter,
                                             int32_t src_stride, uint8_t *dst,
                                             int32_t dst_stride, int8_t *filter,
                                             int32_t height) {
  if (4 == height) {
    common_vt_2t_and_aver_dst_8x4_msa(src, src_stride, dst, dst_stride, filter);
@@ -430,11 +406,9 @@ static void common_vt_2t_and_aver_dst_8w_msa(const uint8_t *src,
}

static void common_vt_2t_and_aver_dst_16w_msa(const uint8_t *src,
                                              int32_t src_stride,
                                              uint8_t *dst,
                                              int32_t src_stride, uint8_t *dst,
                                              int32_t dst_stride,
                                              int8_t *filter,
                                              int32_t height) {
                                              int8_t *filter, int32_t height) {
  uint32_t loop_cnt;
  v16u8 src0, src1, src2, src3, src4, dst0, dst1, dst2, dst3, filt0;
  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
@@ -481,11 +455,9 @@ static void common_vt_2t_and_aver_dst_16w_msa(const uint8_t *src,
}

static void common_vt_2t_and_aver_dst_32w_msa(const uint8_t *src,
                                              int32_t src_stride,
                                              uint8_t *dst,
                                              int32_t src_stride, uint8_t *dst,
                                              int32_t dst_stride,
                                              int8_t *filter,
                                              int32_t height) {
                                              int8_t *filter, int32_t height) {
  uint32_t loop_cnt;
  v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9;
  v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
@@ -554,11 +526,9 @@ static void common_vt_2t_and_aver_dst_32w_msa(const uint8_t *src,
}

static void common_vt_2t_and_aver_dst_64w_msa(const uint8_t *src,
                                              int32_t src_stride,
                                              uint8_t *dst,
                                              int32_t src_stride, uint8_t *dst,
                                              int32_t dst_stride,
                                              int8_t *filter,
                                              int32_t height) {
                                              int8_t *filter, int32_t height) {
  uint32_t loop_cnt;
  v16u8 src0, src1, src2, src3, src4, src5;
  v16u8 src6, src7, src8, src9, src10, src11, filt0;
@@ -636,8 +606,8 @@ static void common_vt_2t_and_aver_dst_64w_msa(const uint8_t *src,
void vpx_convolve8_avg_vert_msa(const uint8_t *src, ptrdiff_t src_stride,
                                uint8_t *dst, ptrdiff_t dst_stride,
                                const int16_t *filter_x, int x_step_q4,
                                const int16_t *filter_y, int y_step_q4,
                                int w, int h) {
                                const int16_t *filter_y, int y_step_q4, int w,
                                int h) {
  int8_t cnt, filt_ver[8];

  assert(y_step_q4 == 16);
@@ -650,68 +620,56 @@ void vpx_convolve8_avg_vert_msa(const uint8_t *src, ptrdiff_t src_stride,
  if (((const int32_t *)filter_y)[0] == 0) {
    switch (w) {
      case 4:
        common_vt_2t_and_aver_dst_4w_msa(src, (int32_t)src_stride,
                                         dst, (int32_t)dst_stride,
                                         &filt_ver[3], h);
        common_vt_2t_and_aver_dst_4w_msa(src, (int32_t)src_stride, dst,
                                         (int32_t)dst_stride, &filt_ver[3], h);
        break;
      case 8:
        common_vt_2t_and_aver_dst_8w_msa(src, (int32_t)src_stride,
                                         dst, (int32_t)dst_stride,
                                         &filt_ver[3], h);
        common_vt_2t_and_aver_dst_8w_msa(src, (int32_t)src_stride, dst,
                                         (int32_t)dst_stride, &filt_ver[3], h);
        break;
      case 16:
        common_vt_2t_and_aver_dst_16w_msa(src, (int32_t)src_stride,
                                          dst, (int32_t)dst_stride,
                                          &filt_ver[3], h);
        common_vt_2t_and_aver_dst_16w_msa(src, (int32_t)src_stride, dst,
                                          (int32_t)dst_stride, &filt_ver[3], h);
        break;
      case 32:
        common_vt_2t_and_aver_dst_32w_msa(src, (int32_t)src_stride,
                                          dst, (int32_t)dst_stride,
                                          &filt_ver[3], h);
        common_vt_2t_and_aver_dst_32w_msa(src, (int32_t)src_stride, dst,
                                          (int32_t)dst_stride, &filt_ver[3], h);
        break;
      case 64:
        common_vt_2t_and_aver_dst_64w_msa(src, (int32_t)src_stride,
                                          dst, (int32_t)dst_stride,
                                          &filt_ver[3], h);
        common_vt_2t_and_aver_dst_64w_msa(src, (int32_t)src_stride, dst,
                                          (int32_t)dst_stride, &filt_ver[3], h);
        break;
      default:
        vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride,
                                 filter_x, x_step_q4, filter_y, y_step_q4,
                                 w, h);
        vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter_x,
                                 x_step_q4, filter_y, y_step_q4, w, h);
        break;
    }
  } else {
    switch (w) {
      case 4:
        common_vt_8t_and_aver_dst_4w_msa(src, (int32_t)src_stride,
                                         dst, (int32_t)dst_stride,
                                         filt_ver, h);
        common_vt_8t_and_aver_dst_4w_msa(src, (int32_t)src_stride, dst,
                                         (int32_t)dst_stride, filt_ver, h);
        break;
      case 8:
        common_vt_8t_and_aver_dst_8w_msa(src, (int32_t)src_stride,
                                         dst, (int32_t)dst_stride,
                                         filt_ver, h);
        common_vt_8t_and_aver_dst_8w_msa(src, (int32_t)src_stride, dst,
                                         (int32_t)dst_stride, filt_ver, h);
        break;
      case 16:
        common_vt_8t_and_aver_dst_16w_msa(src, (int32_t)src_stride,
                                          dst, (int32_t)dst_stride,
                                          filt_ver, h);
        common_vt_8t_and_aver_dst_16w_msa(src, (int32_t)src_stride, dst,
                                          (int32_t)dst_stride, filt_ver, h);

        break;
      case 32:
        common_vt_8t_and_aver_dst_32w_msa(src, (int32_t)src_stride,
                                          dst, (int32_t)dst_stride,
                                          filt_ver, h);
        common_vt_8t_and_aver_dst_32w_msa(src, (int32_t)src_stride, dst,
                                          (int32_t)dst_stride, filt_ver, h);
        break;
      case 64:
        common_vt_8t_and_aver_dst_64w_msa(src, (int32_t)src_stride,
                                          dst, (int32_t)dst_stride,
                                          filt_ver, h);
        common_vt_8t_and_aver_dst_64w_msa(src, (int32_t)src_stride, dst,
                                          (int32_t)dst_stride, filt_ver, h);
        break;
      default:
        vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride,
                                 filter_x, x_step_q4, filter_y, y_step_q4,
                                 w, h);
        vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter_x,
                                 x_step_q4, filter_y, y_step_q4, w, h);
        break;
    }
  }

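The ((const int32_t *)filter_y)[0] == 0 test the dispatcher above branches on
reads the first two int16_t taps of the 8-tap subpel filter as one 32-bit
word. For the bilinear filters only taps 3 and 4 are nonzero, so a zero word
selects the 2-tap kernels, which is also why those kernels receive
&filt_ver[3] rather than the full 8-tap array. A sketch of the check in
isolation (the helper name is ours; it assumes libvpx's int16_t taps[8]
filter layout):

#include <stdint.h>

/* Taps 0 and 1 read as one word; zero here is how the MSA dispatchers
 * detect a filter that degenerates to 2 taps (bilinear), whose only
 * nonzero coefficients sit at indices 3 and 4. */
static int is_2tap(const int16_t *filter) {
  return ((const int32_t *)filter)[0] == 0;
}
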
@@ -355,8 +355,8 @@ static void common_hz_2t_4x8_msa(const uint8_t *src, int32_t src_stride,
  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec4, vec5,
              vec6, vec7);
  SRARI_H4_UH(vec4, vec5, vec6, vec7, FILTER_BITS);
  PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, res0, res1,
              res2, res3);
  PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, res0, res1, res2,
              res3);
  ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
  dst += (4 * dst_stride);
  ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride);
@@ -622,8 +622,8 @@ static void common_hz_2t_64w_msa(const uint8_t *src, int32_t src_stride,
void vpx_convolve8_horiz_msa(const uint8_t *src, ptrdiff_t src_stride,
                             uint8_t *dst, ptrdiff_t dst_stride,
                             const int16_t *filter_x, int x_step_q4,
                             const int16_t *filter_y, int y_step_q4,
                             int w, int h) {
                             const int16_t *filter_y, int y_step_q4, int w,
                             int h) {
  int8_t cnt, filt_hor[8];

  assert(x_step_q4 == 16);
@@ -636,67 +636,55 @@ void vpx_convolve8_horiz_msa(const uint8_t *src, ptrdiff_t src_stride,
  if (((const int32_t *)filter_x)[0] == 0) {
    switch (w) {
      case 4:
        common_hz_2t_4w_msa(src, (int32_t)src_stride,
                            dst, (int32_t)dst_stride,
        common_hz_2t_4w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
                            &filt_hor[3], h);
        break;
      case 8:
        common_hz_2t_8w_msa(src, (int32_t)src_stride,
                            dst, (int32_t)dst_stride,
        common_hz_2t_8w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
                            &filt_hor[3], h);
        break;
      case 16:
        common_hz_2t_16w_msa(src, (int32_t)src_stride,
                             dst, (int32_t)dst_stride,
        common_hz_2t_16w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
                             &filt_hor[3], h);
        break;
      case 32:
        common_hz_2t_32w_msa(src, (int32_t)src_stride,
                             dst, (int32_t)dst_stride,
        common_hz_2t_32w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
                             &filt_hor[3], h);
        break;
      case 64:
        common_hz_2t_64w_msa(src, (int32_t)src_stride,
                             dst, (int32_t)dst_stride,
        common_hz_2t_64w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
                             &filt_hor[3], h);
        break;
      default:
        vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride,
                              filter_x, x_step_q4, filter_y, y_step_q4,
                              w, h);
        vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter_x,
                              x_step_q4, filter_y, y_step_q4, w, h);
        break;
    }
  } else {
    switch (w) {
      case 4:
        common_hz_8t_4w_msa(src, (int32_t)src_stride,
                            dst, (int32_t)dst_stride,
        common_hz_8t_4w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
                            filt_hor, h);
        break;
      case 8:
        common_hz_8t_8w_msa(src, (int32_t)src_stride,
                            dst, (int32_t)dst_stride,
        common_hz_8t_8w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
                            filt_hor, h);
        break;
      case 16:
        common_hz_8t_16w_msa(src, (int32_t)src_stride,
                             dst, (int32_t)dst_stride,
        common_hz_8t_16w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
                             filt_hor, h);
        break;
      case 32:
        common_hz_8t_32w_msa(src, (int32_t)src_stride,
                             dst, (int32_t)dst_stride,
        common_hz_8t_32w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
                             filt_hor, h);
        break;
      case 64:
        common_hz_8t_64w_msa(src, (int32_t)src_stride,
                             dst, (int32_t)dst_stride,
        common_hz_8t_64w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
                             filt_hor, h);
        break;
      default:
        vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride,
                              filter_x, x_step_q4, filter_y, y_step_q4,
                              w, h);
        vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter_x,
                              x_step_q4, filter_y, y_step_q4, w, h);
        break;
    }
  }

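SRARI_H4_UH in the hunk above applies MSA's "shift right arithmetic rounded
immediate" to four vectors at once; per 16-bit lane it is the usual
round-to-nearest convolution scaling by FILTER_BITS (7 in libvpx). A scalar
sketch of one lane (the helper name is ours):

#include <stdint.h>

#define FILTER_BITS 7

/* Round-to-nearest right shift: add half of 1 << FILTER_BITS, then shift. */
static uint16_t srari_lane(uint16_t v) {
  return (uint16_t)((v + (1 << (FILTER_BITS - 1))) >> FILTER_BITS);
}
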
@@ -69,15 +69,15 @@ static void common_hv_8ht_8vt_4w_msa(const uint8_t *src, int32_t src_stride,
    XORI_B4_128_SB(src7, src8, src9, src10);
    src += (4 * src_stride);

    hz_out7 = HORIZ_8TAP_FILT(src7, src8, mask0, mask1, mask2, mask3,
                              filt_hz0, filt_hz1, filt_hz2, filt_hz3);
    hz_out7 = HORIZ_8TAP_FILT(src7, src8, mask0, mask1, mask2, mask3, filt_hz0,
                              filt_hz1, filt_hz2, filt_hz3);
    hz_out6 = (v8i16)__msa_sldi_b((v16i8)hz_out7, (v16i8)hz_out5, 8);
    out3 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6);
    tmp0 = FILT_8TAP_DPADD_S_H(out0, out1, out2, out3, filt_vt0, filt_vt1,
                               filt_vt2, filt_vt3);

    hz_out9 = HORIZ_8TAP_FILT(src9, src10, mask0, mask1, mask2, mask3,
                              filt_hz0, filt_hz1, filt_hz2, filt_hz3);
    hz_out9 = HORIZ_8TAP_FILT(src9, src10, mask0, mask1, mask2, mask3, filt_hz0,
                              filt_hz1, filt_hz2, filt_hz3);
    hz_out8 = (v8i16)__msa_sldi_b((v16i8)hz_out9, (v16i8)hz_out7, 8);
    out4 = (v8i16)__msa_ilvev_b((v16i8)hz_out9, (v16i8)hz_out8);
    tmp1 = FILT_8TAP_DPADD_S_H(out1, out2, out3, out4, filt_vt0, filt_vt1,
@@ -151,20 +151,20 @@ static void common_hv_8ht_8vt_8w_msa(const uint8_t *src, int32_t src_stride,

    XORI_B4_128_SB(src7, src8, src9, src10);

    hz_out7 = HORIZ_8TAP_FILT(src7, src7, mask0, mask1, mask2, mask3,
                              filt_hz0, filt_hz1, filt_hz2, filt_hz3);
    hz_out7 = HORIZ_8TAP_FILT(src7, src7, mask0, mask1, mask2, mask3, filt_hz0,
                              filt_hz1, filt_hz2, filt_hz3);
    out3 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6);
    tmp0 = FILT_8TAP_DPADD_S_H(out0, out1, out2, out3, filt_vt0, filt_vt1,
                               filt_vt2, filt_vt3);

    hz_out8 = HORIZ_8TAP_FILT(src8, src8, mask0, mask1, mask2, mask3,
                              filt_hz0, filt_hz1, filt_hz2, filt_hz3);
    hz_out8 = HORIZ_8TAP_FILT(src8, src8, mask0, mask1, mask2, mask3, filt_hz0,
                              filt_hz1, filt_hz2, filt_hz3);
    out7 = (v8i16)__msa_ilvev_b((v16i8)hz_out8, (v16i8)hz_out7);
    tmp1 = FILT_8TAP_DPADD_S_H(out4, out5, out6, out7, filt_vt0, filt_vt1,
                               filt_vt2, filt_vt3);

    hz_out9 = HORIZ_8TAP_FILT(src9, src9, mask0, mask1, mask2, mask3,
                              filt_hz0, filt_hz1, filt_hz2, filt_hz3);
    hz_out9 = HORIZ_8TAP_FILT(src9, src9, mask0, mask1, mask2, mask3, filt_hz0,
                              filt_hz1, filt_hz2, filt_hz3);
    out8 = (v8i16)__msa_ilvev_b((v16i8)hz_out9, (v16i8)hz_out8);
    tmp2 = FILT_8TAP_DPADD_S_H(out1, out2, out3, out8, filt_vt0, filt_vt1,
                               filt_vt2, filt_vt3);
@@ -295,11 +295,11 @@ static void common_hv_2ht_2vt_4x8_msa(const uint8_t *src, int32_t src_stride,

  ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
  ILVEV_B2_UB(hz_out4, hz_out5, hz_out6, hz_out7, vec2, vec3);
  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt_vt, filt_vt, filt_vt, filt_vt,
              vec4, vec5, vec6, vec7);
  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt_vt, filt_vt, filt_vt, filt_vt, vec4,
              vec5, vec6, vec7);
  SRARI_H4_UH(vec4, vec5, vec6, vec7, FILTER_BITS);
  PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, res0, res1,
              res2, res3);
  PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, res0, res1, res2,
              res3);
  ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
  dst += (4 * dst_stride);
  ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride);
@@ -361,12 +361,10 @@ static void common_hv_2ht_2vt_8x4_msa(const uint8_t *src, int32_t src_stride,
}

static void common_hv_2ht_2vt_8x8mult_msa(const uint8_t *src,
                                          int32_t src_stride,
                                          uint8_t *dst,
                                          int32_t src_stride, uint8_t *dst,
                                          int32_t dst_stride,
                                          int8_t *filter_horiz,
                                          int8_t *filter_vert,
                                          int32_t height) {
                                          int8_t *filter_vert, int32_t height) {
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, src4, mask, out0, out1;
  v16u8 filt_hz, filt_vt, vec0;
@@ -542,11 +540,10 @@ static void common_hv_2ht_2vt_64w_msa(const uint8_t *src, int32_t src_stride,
  }
}

void vpx_convolve8_msa(const uint8_t *src, ptrdiff_t src_stride,
                       uint8_t *dst, ptrdiff_t dst_stride,
                       const int16_t *filter_x, int32_t x_step_q4,
                       const int16_t *filter_y, int32_t y_step_q4,
                       int32_t w, int32_t h) {
void vpx_convolve8_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
                       ptrdiff_t dst_stride, const int16_t *filter_x,
                       int32_t x_step_q4, const int16_t *filter_y,
                       int32_t y_step_q4, int32_t w, int32_t h) {
  int8_t cnt, filt_hor[8], filt_ver[8];

  assert(x_step_q4 == 16);
@@ -563,72 +560,69 @@ void vpx_convolve8_msa(const uint8_t *src, ptrdiff_t src_stride,
      ((const int32_t *)filter_y)[0] == 0) {
    switch (w) {
      case 4:
        common_hv_2ht_2vt_4w_msa(src, (int32_t)src_stride,
                                 dst, (int32_t)dst_stride,
                                 &filt_hor[3], &filt_ver[3], (int32_t)h);
        common_hv_2ht_2vt_4w_msa(src, (int32_t)src_stride, dst,
                                 (int32_t)dst_stride, &filt_hor[3],
                                 &filt_ver[3], (int32_t)h);
        break;
      case 8:
        common_hv_2ht_2vt_8w_msa(src, (int32_t)src_stride,
                                 dst, (int32_t)dst_stride,
                                 &filt_hor[3], &filt_ver[3], (int32_t)h);
        common_hv_2ht_2vt_8w_msa(src, (int32_t)src_stride, dst,
                                 (int32_t)dst_stride, &filt_hor[3],
                                 &filt_ver[3], (int32_t)h);
        break;
      case 16:
        common_hv_2ht_2vt_16w_msa(src, (int32_t)src_stride,
                                  dst, (int32_t)dst_stride,
                                  &filt_hor[3], &filt_ver[3], (int32_t)h);
        common_hv_2ht_2vt_16w_msa(src, (int32_t)src_stride, dst,
                                  (int32_t)dst_stride, &filt_hor[3],
                                  &filt_ver[3], (int32_t)h);
        break;
      case 32:
        common_hv_2ht_2vt_32w_msa(src, (int32_t)src_stride,
                                  dst, (int32_t)dst_stride,
                                  &filt_hor[3], &filt_ver[3], (int32_t)h);
        common_hv_2ht_2vt_32w_msa(src, (int32_t)src_stride, dst,
                                  (int32_t)dst_stride, &filt_hor[3],
                                  &filt_ver[3], (int32_t)h);
        break;
      case 64:
        common_hv_2ht_2vt_64w_msa(src, (int32_t)src_stride,
                                  dst, (int32_t)dst_stride,
                                  &filt_hor[3], &filt_ver[3], (int32_t)h);
        common_hv_2ht_2vt_64w_msa(src, (int32_t)src_stride, dst,
                                  (int32_t)dst_stride, &filt_hor[3],
                                  &filt_ver[3], (int32_t)h);
        break;
      default:
        vpx_convolve8_c(src, src_stride, dst, dst_stride,
                        filter_x, x_step_q4, filter_y, y_step_q4,
                        w, h);
        vpx_convolve8_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4,
                        filter_y, y_step_q4, w, h);
        break;
    }
  } else if (((const int32_t *)filter_x)[0] == 0 ||
             ((const int32_t *)filter_y)[0] == 0) {
    vpx_convolve8_c(src, src_stride, dst, dst_stride,
                    filter_x, x_step_q4, filter_y, y_step_q4,
                    w, h);
    vpx_convolve8_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4,
                    filter_y, y_step_q4, w, h);
  } else {
    switch (w) {
      case 4:
        common_hv_8ht_8vt_4w_msa(src, (int32_t)src_stride,
                                 dst, (int32_t)dst_stride,
                                 filt_hor, filt_ver, (int32_t)h);
        common_hv_8ht_8vt_4w_msa(src, (int32_t)src_stride, dst,
                                 (int32_t)dst_stride, filt_hor, filt_ver,
                                 (int32_t)h);
        break;
      case 8:
        common_hv_8ht_8vt_8w_msa(src, (int32_t)src_stride,
                                 dst, (int32_t)dst_stride,
                                 filt_hor, filt_ver, (int32_t)h);
        common_hv_8ht_8vt_8w_msa(src, (int32_t)src_stride, dst,
                                 (int32_t)dst_stride, filt_hor, filt_ver,
                                 (int32_t)h);
        break;
      case 16:
        common_hv_8ht_8vt_16w_msa(src, (int32_t)src_stride,
                                  dst, (int32_t)dst_stride,
                                  filt_hor, filt_ver, (int32_t)h);
        common_hv_8ht_8vt_16w_msa(src, (int32_t)src_stride, dst,
                                  (int32_t)dst_stride, filt_hor, filt_ver,
                                  (int32_t)h);
        break;
      case 32:
        common_hv_8ht_8vt_32w_msa(src, (int32_t)src_stride,
                                  dst, (int32_t)dst_stride,
                                  filt_hor, filt_ver, (int32_t)h);
        common_hv_8ht_8vt_32w_msa(src, (int32_t)src_stride, dst,
                                  (int32_t)dst_stride, filt_hor, filt_ver,
                                  (int32_t)h);
        break;
      case 64:
        common_hv_8ht_8vt_64w_msa(src, (int32_t)src_stride,
                                  dst, (int32_t)dst_stride,
                                  filt_hor, filt_ver, (int32_t)h);
        common_hv_8ht_8vt_64w_msa(src, (int32_t)src_stride, dst,
                                  (int32_t)dst_stride, filt_hor, filt_ver,
                                  (int32_t)h);
        break;
      default:
        vpx_convolve8_c(src, src_stride, dst, dst_stride,
                        filter_x, x_step_q4, filter_y, y_step_q4,
                        w, h);
        vpx_convolve8_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4,
                        filter_y, y_step_q4, w, h);
        break;
    }
  }

@@ -222,11 +222,11 @@ static void common_vt_8t_16w_mult_msa(const uint8_t *src, int32_t src_stride,
    LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
    src_tmp += (7 * src_stride);
    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r,
               src32_r, src54_r, src21_r);
    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
               src54_r, src21_r);
    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
    ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l,
               src32_l, src54_l, src21_l);
    ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l, src32_l,
               src54_l, src21_l);
    ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
@@ -344,8 +344,8 @@ static void common_vt_2t_4x8_msa(const uint8_t *src, int32_t src_stride,
             src32_r, src43_r);
  ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
             src76_r, src87_r);
  ILVR_D4_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r,
             src87_r, src76_r, src2110, src4332, src6554, src8776);
  ILVR_D4_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, src87_r,
             src76_r, src2110, src4332, src6554, src8776);
  DOTP_UB4_UH(src2110, src4332, src6554, src8776, filt0, filt0, filt0, filt0,
              tmp0, tmp1, tmp2, tmp3);
  SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
@@ -407,10 +407,10 @@ static void common_vt_2t_8x8mult_msa(const uint8_t *src, int32_t src_stride,
    LD_UB8(src, src_stride, src1, src2, src3, src4, src5, src6, src7, src8);
    src += (8 * src_stride);

    ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, vec0, vec1,
               vec2, vec3);
    ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7, vec4, vec5,
               vec6, vec7);
    ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, vec0, vec1, vec2,
               vec3);
    ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7, vec4, vec5, vec6,
               vec7);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1,
                tmp2, tmp3);
    SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
@@ -629,8 +629,8 @@ static void common_vt_2t_64w_msa(const uint8_t *src, int32_t src_stride,
void vpx_convolve8_vert_msa(const uint8_t *src, ptrdiff_t src_stride,
                            uint8_t *dst, ptrdiff_t dst_stride,
                            const int16_t *filter_x, int x_step_q4,
                            const int16_t *filter_y, int y_step_q4,
                            int w, int h) {
                            const int16_t *filter_y, int y_step_q4, int w,
                            int h) {
  int8_t cnt, filt_ver[8];

  assert(y_step_q4 == 16);
@@ -643,67 +643,55 @@ void vpx_convolve8_vert_msa(const uint8_t *src, ptrdiff_t src_stride,
  if (((const int32_t *)filter_y)[0] == 0) {
    switch (w) {
      case 4:
        common_vt_2t_4w_msa(src, (int32_t)src_stride,
                            dst, (int32_t)dst_stride,
        common_vt_2t_4w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
                            &filt_ver[3], h);
        break;
      case 8:
        common_vt_2t_8w_msa(src, (int32_t)src_stride,
                            dst, (int32_t)dst_stride,
        common_vt_2t_8w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
                            &filt_ver[3], h);
        break;
      case 16:
        common_vt_2t_16w_msa(src, (int32_t)src_stride,
                             dst, (int32_t)dst_stride,
        common_vt_2t_16w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
                             &filt_ver[3], h);
        break;
      case 32:
        common_vt_2t_32w_msa(src, (int32_t)src_stride,
                             dst, (int32_t)dst_stride,
        common_vt_2t_32w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
                             &filt_ver[3], h);
        break;
      case 64:
        common_vt_2t_64w_msa(src, (int32_t)src_stride,
                             dst, (int32_t)dst_stride,
        common_vt_2t_64w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
                             &filt_ver[3], h);
        break;
      default:
        vpx_convolve8_vert_c(src, src_stride, dst, dst_stride,
                             filter_x, x_step_q4, filter_y, y_step_q4,
                             w, h);
        vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x,
                             x_step_q4, filter_y, y_step_q4, w, h);
        break;
    }
  } else {
    switch (w) {
      case 4:
        common_vt_8t_4w_msa(src, (int32_t)src_stride,
                            dst, (int32_t)dst_stride,
        common_vt_8t_4w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
                            filt_ver, h);
        break;
      case 8:
        common_vt_8t_8w_msa(src, (int32_t)src_stride,
                            dst, (int32_t)dst_stride,
        common_vt_8t_8w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
                            filt_ver, h);
        break;
      case 16:
        common_vt_8t_16w_msa(src, (int32_t)src_stride,
                             dst, (int32_t)dst_stride,
        common_vt_8t_16w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
                             filt_ver, h);
        break;
      case 32:
        common_vt_8t_32w_msa(src, (int32_t)src_stride,
                             dst, (int32_t)dst_stride,
        common_vt_8t_32w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
                             filt_ver, h);
        break;
      case 64:
        common_vt_8t_64w_msa(src, (int32_t)src_stride,
                             dst, (int32_t)dst_stride,
        common_vt_8t_64w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
                             filt_ver, h);
        break;
      default:
        vpx_convolve8_vert_c(src, src_stride, dst, dst_stride,
                             filter_x, x_step_q4, filter_y, y_step_q4,
                             w, h);
        vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x,
                             x_step_q4, filter_y, y_step_q4, w, h);
        break;
    }
  }

@@ -10,8 +10,8 @@

#include "vpx_dsp/mips/macros_msa.h"

static void avg_width4_msa(const uint8_t *src, int32_t src_stride,
                           uint8_t *dst, int32_t dst_stride, int32_t height) {
static void avg_width4_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst,
                           int32_t dst_stride, int32_t height) {
  int32_t cnt;
  uint32_t out0, out1, out2, out3;
  v16u8 src0, src1, src2, src3;
@@ -24,8 +24,8 @@ static void avg_width4_msa(const uint8_t *src, int32_t src_stride,

      LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);

      AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3,
                  dst0, dst1, dst2, dst3);
      AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, dst0, dst1,
                  dst2, dst3);

      out0 = __msa_copy_u_w((v4i32)dst0, 0);
      out1 = __msa_copy_u_w((v4i32)dst1, 0);
@@ -53,8 +53,8 @@ static void avg_width4_msa(const uint8_t *src, int32_t src_stride,
  }
}

static void avg_width8_msa(const uint8_t *src, int32_t src_stride,
                           uint8_t *dst, int32_t dst_stride, int32_t height) {
static void avg_width8_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst,
                           int32_t dst_stride, int32_t height) {
  int32_t cnt;
  uint64_t out0, out1, out2, out3;
  v16u8 src0, src1, src2, src3;
@@ -65,8 +65,8 @@ static void avg_width8_msa(const uint8_t *src, int32_t src_stride,
    src += (4 * src_stride);
    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);

    AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3,
                dst0, dst1, dst2, dst3);
    AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, dst0, dst1,
                dst2, dst3);

    out0 = __msa_copy_u_d((v2i64)dst0, 0);
    out1 = __msa_copy_u_d((v2i64)dst1, 0);
@@ -88,10 +88,10 @@ static void avg_width16_msa(const uint8_t *src, int32_t src_stride,
    src += (8 * src_stride);
    LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);

    AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3,
                dst0, dst1, dst2, dst3);
    AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7,
                dst4, dst5, dst6, dst7);
    AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, dst0, dst1,
                dst2, dst3);
    AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7, dst4, dst5,
                dst6, dst7);
    ST_UB8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst, dst_stride);
    dst += (8 * dst_stride);
  }
@@ -120,14 +120,14 @@ static void avg_width32_msa(const uint8_t *src, int32_t src_stride,
    LD_UB4(dst_dup + 16, dst_stride, dst9, dst11, dst13, dst15);
    dst_dup += (4 * dst_stride);

    AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3,
                dst0, dst1, dst2, dst3);
    AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7,
                dst4, dst5, dst6, dst7);
    AVER_UB4_UB(src8, dst8, src9, dst9, src10, dst10, src11, dst11,
                dst8, dst9, dst10, dst11);
    AVER_UB4_UB(src12, dst12, src13, dst13, src14, dst14, src15, dst15,
                dst12, dst13, dst14, dst15);
    AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, dst0, dst1,
                dst2, dst3);
    AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7, dst4, dst5,
                dst6, dst7);
    AVER_UB4_UB(src8, dst8, src9, dst9, src10, dst10, src11, dst11, dst8, dst9,
                dst10, dst11);
    AVER_UB4_UB(src12, dst12, src13, dst13, src14, dst14, src15, dst15, dst12,
                dst13, dst14, dst15);

    ST_UB4(dst0, dst2, dst4, dst6, dst, dst_stride);
    ST_UB4(dst1, dst3, dst5, dst7, dst + 16, dst_stride);
@@ -166,14 +166,14 @@ static void avg_width64_msa(const uint8_t *src, int32_t src_stride,
    LD_UB4(dst_dup, 16, dst12, dst13, dst14, dst15);
    dst_dup += dst_stride;

    AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3,
                dst0, dst1, dst2, dst3);
    AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7,
                dst4, dst5, dst6, dst7);
    AVER_UB4_UB(src8, dst8, src9, dst9, src10, dst10, src11, dst11,
                dst8, dst9, dst10, dst11);
    AVER_UB4_UB(src12, dst12, src13, dst13, src14, dst14, src15, dst15,
                dst12, dst13, dst14, dst15);
    AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, dst0, dst1,
                dst2, dst3);
    AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7, dst4, dst5,
                dst6, dst7);
    AVER_UB4_UB(src8, dst8, src9, dst9, src10, dst10, src11, dst11, dst8, dst9,
                dst10, dst11);
    AVER_UB4_UB(src12, dst12, src13, dst13, src14, dst14, src15, dst15, dst12,
                dst13, dst14, dst15);

    ST_UB4(dst0, dst1, dst2, dst3, dst, 16);
    dst += dst_stride;

@@ -105,12 +105,12 @@ static void copy_16multx8mult_msa(const uint8_t *src, int32_t src_stride,
    dst_tmp = dst;

    for (loop_cnt = (height >> 3); loop_cnt--;) {
      LD_UB8(src_tmp, src_stride,
             src0, src1, src2, src3, src4, src5, src6, src7);
      LD_UB8(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6,
             src7);
      src_tmp += (8 * src_stride);

      ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7,
             dst_tmp, dst_stride);
      ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst_tmp,
             dst_stride);
      dst_tmp += (8 * dst_stride);
    }

@@ -16,8 +16,9 @@

extern const uint8_t mc_filt_mask_arr[16 * 3];

#define FILT_8TAP_DPADD_S_H(vec0, vec1, vec2, vec3,         \
                            filt0, filt1, filt2, filt3) ({  \
#define FILT_8TAP_DPADD_S_H(vec0, vec1, vec2, vec3, filt0, filt1, filt2, \
                            filt3)                                       \
  ({                                                                     \
    v8i16 tmp0, tmp1;                                                    \
                                                                         \
    tmp0 = __msa_dotp_s_h((v16i8)vec0, (v16i8)filt0);                    \
@@ -29,15 +30,16 @@ extern const uint8_t mc_filt_mask_arr[16 * 3];
    tmp0;                                                                \
  })

#define HORIZ_8TAP_FILT(src0, src1, mask0, mask1, mask2, mask3,        \
                        filt_h0, filt_h1, filt_h2, filt_h3) ({         \
#define HORIZ_8TAP_FILT(src0, src1, mask0, mask1, mask2, mask3, filt_h0,       \
                        filt_h1, filt_h2, filt_h3)                             \
  ({                                                                           \
    v16i8 vec0_m, vec1_m, vec2_m, vec3_m;                                      \
    v8i16 hz_out_m;                                                            \
                                                                               \
  VSHF_B4_SB(src0, src1, mask0, mask1, mask2, mask3,                   \
             vec0_m, vec1_m, vec2_m, vec3_m);                          \
  hz_out_m = FILT_8TAP_DPADD_S_H(vec0_m, vec1_m, vec2_m, vec3_m,       \
                                 filt_h0, filt_h1, filt_h2, filt_h3);  \
    VSHF_B4_SB(src0, src1, mask0, mask1, mask2, mask3, vec0_m, vec1_m, vec2_m, \
               vec3_m);                                                        \
    hz_out_m = FILT_8TAP_DPADD_S_H(vec0_m, vec1_m, vec2_m, vec3_m, filt_h0,    \
                                   filt_h1, filt_h2, filt_h3);                 \
                                                                               \
    hz_out_m = __msa_srari_h(hz_out_m, FILTER_BITS);                           \
    hz_out_m = __msa_sat_s_h(hz_out_m, 7);                                     \
@@ -45,10 +47,10 @@ extern const uint8_t mc_filt_mask_arr[16 * 3];
    hz_out_m;                                                                  \
  })

#define HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3,               \
                                   mask0, mask1, mask2, mask3,           \
                                   filt0, filt1, filt2, filt3,           \
                                   out0, out1) {                         \
#define HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,     \
                                   mask2, mask3, filt0, filt1, filt2, filt3, \
                                   out0, out1)                               \
  {                                                                          \
    v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m;    \
    v8i16 res0_m, res1_m, res2_m, res3_m;                                    \
                                                                             \
@@ -63,10 +65,10 @@ extern const uint8_t mc_filt_mask_arr[16 * 3];
    ADDS_SH2_SH(res0_m, res2_m, res1_m, res3_m, out0, out1);                 \
  }

#define HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3,                  \
                                   mask0, mask1, mask2, mask3,              \
                                   filt0, filt1, filt2, filt3,              \
                                   out0, out1, out2, out3) {                \
#define HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,     \
                                   mask2, mask3, filt0, filt1, filt2, filt3, \
                                   out0, out1, out2, out3)                   \
  {                                                                          \
    v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m;    \
    v8i16 res0_m, res1_m, res2_m, res3_m, res4_m, res5_m, res6_m, res7_m;    \
                                                                             \
@@ -90,7 +92,8 @@ extern const uint8_t mc_filt_mask_arr[16 * 3];
                res7_m, out0, out1, out2, out3);                             \
  }

#define PCKEV_XORI128_AVG_ST_UB(in0, in1, dst, pdst) {  \
#define PCKEV_XORI128_AVG_ST_UB(in0, in1, dst, pdst) \
  {                                                  \
    v16u8 tmp_m;                                     \
                                                     \
    tmp_m = PCKEV_XORI128_UB(in1, in0);              \
@@ -98,7 +101,8 @@ extern const uint8_t mc_filt_mask_arr[16 * 3];
    ST_UB(tmp_m, (pdst));                            \
  }

#define PCKEV_AVG_ST_UB(in0, in1, dst, pdst) {           \
#define PCKEV_AVG_ST_UB(in0, in1, dst, pdst)              \
  {                                                       \
    v16u8 tmp_m;                                          \
                                                          \
    tmp_m = (v16u8)__msa_pckev_b((v16i8)in0, (v16i8)in1); \
@@ -106,8 +110,9 @@ extern const uint8_t mc_filt_mask_arr[16 * 3];
    ST_UB(tmp_m, (pdst));                                 \
  }

#define PCKEV_AVG_ST8x4_UB(in1, dst0, in2, dst1, in3, dst2, in4, dst3,  \
                           pdst, stride) {                              \
#define PCKEV_AVG_ST8x4_UB(in1, dst0, in2, dst1, in3, dst2, in4, dst3, pdst, \
                           stride)                                           \
  {                                                                          \
    v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                    \
    uint8_t *pdst_m = (uint8_t *)(pdst);                                     \
                                                                             \

Some files were not shown because too many files have changed in this diff.