vpx_dsp: apply clang-format

Change-Id: I3ea3e77364879928bd916f2b0a7838073ade5975
This commit is contained in:
clang-format 2016-07-22 20:07:03 -07:00 committed by James Zern
parent 82070ae939
commit 099bd7f07e
146 changed files with 20984 additions and 22178 deletions

View File

@ -48,7 +48,7 @@ int vpx_setup_noise(double sigma, int8_t *noise, int size) {
// set up a 256 entry lookup that matches gaussian distribution // set up a 256 entry lookup that matches gaussian distribution
for (i = -32; i < 32; ++i) { for (i = -32; i < 32; ++i) {
const int a_i = (int) (0.5 + 256 * gaussian(sigma, 0, i)); const int a_i = (int)(0.5 + 256 * gaussian(sigma, 0, i));
if (a_i) { if (a_i) {
for (j = 0; j < a_i; ++j) { for (j = 0; j < a_i; ++j) {
char_dist[next + j] = (char)i; char_dist[next + j] = (char)i;

View File

@ -198,27 +198,24 @@ int vpx_vector_var_neon(int16_t const *ref, int16_t const *src, const int bwl) {
} }
} }
void vpx_minmax_8x8_neon(const uint8_t *a, int a_stride, void vpx_minmax_8x8_neon(const uint8_t *a, int a_stride, const uint8_t *b,
const uint8_t *b, int b_stride, int b_stride, int *min, int *max) {
int *min, int *max) {
// Load and concatenate. // Load and concatenate.
const uint8x16_t a01 = vcombine_u8(vld1_u8(a), const uint8x16_t a01 = vcombine_u8(vld1_u8(a), vld1_u8(a + a_stride));
vld1_u8(a + a_stride)); const uint8x16_t a23 =
const uint8x16_t a23 = vcombine_u8(vld1_u8(a + 2 * a_stride), vcombine_u8(vld1_u8(a + 2 * a_stride), vld1_u8(a + 3 * a_stride));
vld1_u8(a + 3 * a_stride)); const uint8x16_t a45 =
const uint8x16_t a45 = vcombine_u8(vld1_u8(a + 4 * a_stride), vcombine_u8(vld1_u8(a + 4 * a_stride), vld1_u8(a + 5 * a_stride));
vld1_u8(a + 5 * a_stride)); const uint8x16_t a67 =
const uint8x16_t a67 = vcombine_u8(vld1_u8(a + 6 * a_stride), vcombine_u8(vld1_u8(a + 6 * a_stride), vld1_u8(a + 7 * a_stride));
vld1_u8(a + 7 * a_stride));
const uint8x16_t b01 = vcombine_u8(vld1_u8(b), const uint8x16_t b01 = vcombine_u8(vld1_u8(b), vld1_u8(b + b_stride));
vld1_u8(b + b_stride)); const uint8x16_t b23 =
const uint8x16_t b23 = vcombine_u8(vld1_u8(b + 2 * b_stride), vcombine_u8(vld1_u8(b + 2 * b_stride), vld1_u8(b + 3 * b_stride));
vld1_u8(b + 3 * b_stride)); const uint8x16_t b45 =
const uint8x16_t b45 = vcombine_u8(vld1_u8(b + 4 * b_stride), vcombine_u8(vld1_u8(b + 4 * b_stride), vld1_u8(b + 5 * b_stride));
vld1_u8(b + 5 * b_stride)); const uint8x16_t b67 =
const uint8x16_t b67 = vcombine_u8(vld1_u8(b + 6 * b_stride), vcombine_u8(vld1_u8(b + 6 * b_stride), vld1_u8(b + 7 * b_stride));
vld1_u8(b + 7 * b_stride));
// Absolute difference. // Absolute difference.
const uint8x16_t ab01_diff = vabdq_u8(a01, b01); const uint8x16_t ab01_diff = vabdq_u8(a01, b01);

View File

@ -131,14 +131,14 @@ void vpx_fdct8x8_neon(const int16_t *input, int16_t *final_output, int stride) {
// 14 15 16 17 54 55 56 57 // 14 15 16 17 54 55 56 57
// 24 25 26 27 64 65 66 67 // 24 25 26 27 64 65 66 67
// 34 35 36 37 74 75 76 77 // 34 35 36 37 74 75 76 77
const int32x4x2_t r02_s32 = vtrnq_s32(vreinterpretq_s32_s16(out_0), const int32x4x2_t r02_s32 =
vreinterpretq_s32_s16(out_2)); vtrnq_s32(vreinterpretq_s32_s16(out_0), vreinterpretq_s32_s16(out_2));
const int32x4x2_t r13_s32 = vtrnq_s32(vreinterpretq_s32_s16(out_1), const int32x4x2_t r13_s32 =
vreinterpretq_s32_s16(out_3)); vtrnq_s32(vreinterpretq_s32_s16(out_1), vreinterpretq_s32_s16(out_3));
const int32x4x2_t r46_s32 = vtrnq_s32(vreinterpretq_s32_s16(out_4), const int32x4x2_t r46_s32 =
vreinterpretq_s32_s16(out_6)); vtrnq_s32(vreinterpretq_s32_s16(out_4), vreinterpretq_s32_s16(out_6));
const int32x4x2_t r57_s32 = vtrnq_s32(vreinterpretq_s32_s16(out_5), const int32x4x2_t r57_s32 =
vreinterpretq_s32_s16(out_7)); vtrnq_s32(vreinterpretq_s32_s16(out_5), vreinterpretq_s32_s16(out_7));
const int16x8x2_t r01_s16 = const int16x8x2_t r01_s16 =
vtrnq_s16(vreinterpretq_s16_s32(r02_s32.val[0]), vtrnq_s16(vreinterpretq_s16_s32(r02_s32.val[0]),
vreinterpretq_s16_s32(r13_s32.val[0])); vreinterpretq_s16_s32(r13_s32.val[0]));

View File

@ -12,9 +12,8 @@
#include "./vpx_dsp_rtcd.h" #include "./vpx_dsp_rtcd.h"
static void hadamard8x8_one_pass(int16x8_t *a0, int16x8_t *a1, static void hadamard8x8_one_pass(int16x8_t *a0, int16x8_t *a1, int16x8_t *a2,
int16x8_t *a2, int16x8_t *a3, int16x8_t *a3, int16x8_t *a4, int16x8_t *a5,
int16x8_t *a4, int16x8_t *a5,
int16x8_t *a6, int16x8_t *a7) { int16x8_t *a6, int16x8_t *a7) {
const int16x8_t b0 = vaddq_s16(*a0, *a1); const int16x8_t b0 = vaddq_s16(*a0, *a1);
const int16x8_t b1 = vsubq_s16(*a0, *a1); const int16x8_t b1 = vsubq_s16(*a0, *a1);
@ -47,9 +46,8 @@ static void hadamard8x8_one_pass(int16x8_t *a0, int16x8_t *a1,
// TODO(johannkoenig): Make a transpose library and dedup with idct. Consider // TODO(johannkoenig): Make a transpose library and dedup with idct. Consider
// reversing transpose order which may make it easier for the compiler to // reversing transpose order which may make it easier for the compiler to
// reconcile the vtrn.64 moves. // reconcile the vtrn.64 moves.
static void transpose8x8(int16x8_t *a0, int16x8_t *a1, static void transpose8x8(int16x8_t *a0, int16x8_t *a1, int16x8_t *a2,
int16x8_t *a2, int16x8_t *a3, int16x8_t *a3, int16x8_t *a4, int16x8_t *a5,
int16x8_t *a4, int16x8_t *a5,
int16x8_t *a6, int16x8_t *a7) { int16x8_t *a6, int16x8_t *a7) {
// Swap 64 bit elements. Goes from: // Swap 64 bit elements. Goes from:
// a0: 00 01 02 03 04 05 06 07 // a0: 00 01 02 03 04 05 06 07
@ -91,14 +89,14 @@ static void transpose8x8(int16x8_t *a0, int16x8_t *a1,
// a1657_hi: // a1657_hi:
// 12 13 28 29 44 45 60 61 // 12 13 28 29 44 45 60 61
// 14 15 30 31 46 47 62 63 // 14 15 30 31 46 47 62 63
const int32x4x2_t a0246_lo = vtrnq_s32(vreinterpretq_s32_s16(a04_lo), const int32x4x2_t a0246_lo =
vreinterpretq_s32_s16(a26_lo)); vtrnq_s32(vreinterpretq_s32_s16(a04_lo), vreinterpretq_s32_s16(a26_lo));
const int32x4x2_t a1357_lo = vtrnq_s32(vreinterpretq_s32_s16(a15_lo), const int32x4x2_t a1357_lo =
vreinterpretq_s32_s16(a37_lo)); vtrnq_s32(vreinterpretq_s32_s16(a15_lo), vreinterpretq_s32_s16(a37_lo));
const int32x4x2_t a0246_hi = vtrnq_s32(vreinterpretq_s32_s16(a04_hi), const int32x4x2_t a0246_hi =
vreinterpretq_s32_s16(a26_hi)); vtrnq_s32(vreinterpretq_s32_s16(a04_hi), vreinterpretq_s32_s16(a26_hi));
const int32x4x2_t a1357_hi = vtrnq_s32(vreinterpretq_s32_s16(a15_hi), const int32x4x2_t a1357_hi =
vreinterpretq_s32_s16(a37_hi)); vtrnq_s32(vreinterpretq_s32_s16(a15_hi), vreinterpretq_s32_s16(a37_hi));
// Swap 16 bit elements resulting in: // Swap 16 bit elements resulting in:
// b0: // b0:

View File

@ -13,10 +13,7 @@
#include "vpx_dsp/inv_txfm.h" #include "vpx_dsp/inv_txfm.h"
#include "vpx_ports/mem.h" #include "vpx_ports/mem.h"
void vpx_idct16x16_1_add_neon( void vpx_idct16x16_1_add_neon(int16_t *input, uint8_t *dest, int dest_stride) {
int16_t *input,
uint8_t *dest,
int dest_stride) {
uint8x8_t d2u8, d3u8, d30u8, d31u8; uint8x8_t d2u8, d3u8, d30u8, d31u8;
uint64x1_t d2u64, d3u64, d4u64, d5u64; uint64x1_t d2u64, d3u64, d4u64, d5u64;
uint16x8_t q0u16, q9u16, q10u16, q11u16, q12u16; uint16x8_t q0u16, q9u16, q10u16, q11u16, q12u16;

View File

@ -13,15 +13,10 @@
#include "./vpx_config.h" #include "./vpx_config.h"
#include "vpx_dsp/txfm_common.h" #include "vpx_dsp/txfm_common.h"
static INLINE void TRANSPOSE8X8( static INLINE void TRANSPOSE8X8(int16x8_t *q8s16, int16x8_t *q9s16,
int16x8_t *q8s16, int16x8_t *q10s16, int16x8_t *q11s16,
int16x8_t *q9s16, int16x8_t *q12s16, int16x8_t *q13s16,
int16x8_t *q10s16, int16x8_t *q14s16, int16x8_t *q15s16) {
int16x8_t *q11s16,
int16x8_t *q12s16,
int16x8_t *q13s16,
int16x8_t *q14s16,
int16x8_t *q15s16) {
int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16; int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16; int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
int32x4x2_t q0x2s32, q1x2s32, q2x2s32, q3x2s32; int32x4x2_t q0x2s32, q1x2s32, q2x2s32, q3x2s32;
@ -53,14 +48,14 @@ static INLINE void TRANSPOSE8X8(
*q14s16 = vcombine_s16(d21s16, d29s16); *q14s16 = vcombine_s16(d21s16, d29s16);
*q15s16 = vcombine_s16(d23s16, d31s16); *q15s16 = vcombine_s16(d23s16, d31s16);
q0x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q8s16), q0x2s32 =
vreinterpretq_s32_s16(*q10s16)); vtrnq_s32(vreinterpretq_s32_s16(*q8s16), vreinterpretq_s32_s16(*q10s16));
q1x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q9s16), q1x2s32 =
vreinterpretq_s32_s16(*q11s16)); vtrnq_s32(vreinterpretq_s32_s16(*q9s16), vreinterpretq_s32_s16(*q11s16));
q2x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q12s16), q2x2s32 =
vreinterpretq_s32_s16(*q14s16)); vtrnq_s32(vreinterpretq_s32_s16(*q12s16), vreinterpretq_s32_s16(*q14s16));
q3x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q13s16), q3x2s32 =
vreinterpretq_s32_s16(*q15s16)); vtrnq_s32(vreinterpretq_s32_s16(*q13s16), vreinterpretq_s32_s16(*q15s16));
q0x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[0]), // q8 q0x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[0]), // q8
vreinterpretq_s16_s32(q1x2s32.val[0])); // q9 vreinterpretq_s16_s32(q1x2s32.val[0])); // q9
@ -82,9 +77,7 @@ static INLINE void TRANSPOSE8X8(
return; return;
} }
void vpx_idct16x16_256_add_neon_pass1( void vpx_idct16x16_256_add_neon_pass1(int16_t *in, int16_t *out,
int16_t *in,
int16_t *out,
int output_stride) { int output_stride) {
int16x4_t d0s16, d1s16, d2s16, d3s16; int16x4_t d0s16, d1s16, d2s16, d3s16;
int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16; int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
@ -122,8 +115,8 @@ void vpx_idct16x16_256_add_neon_pass1(
q0x2s16 = vld2q_s16(in); q0x2s16 = vld2q_s16(in);
q15s16 = q0x2s16.val[0]; q15s16 = q0x2s16.val[0];
TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
&q12s16, &q13s16, &q14s16, &q15s16); &q15s16);
d16s16 = vget_low_s16(q8s16); d16s16 = vget_low_s16(q8s16);
d17s16 = vget_high_s16(q8s16); d17s16 = vget_high_s16(q8s16);
@ -320,13 +313,9 @@ void vpx_idct16x16_256_add_neon_pass1(
return; return;
} }
void vpx_idct16x16_256_add_neon_pass2( void vpx_idct16x16_256_add_neon_pass2(int16_t *src, int16_t *out,
int16_t *src, int16_t *pass1Output, int16_t skip_adding,
int16_t *out, uint8_t *dest, int dest_stride) {
int16_t *pass1Output,
int16_t skip_adding,
uint8_t *dest,
int dest_stride) {
uint8_t *d; uint8_t *d;
uint8x8_t d12u8, d13u8; uint8x8_t d12u8, d13u8;
int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16; int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16;
@ -367,8 +356,8 @@ void vpx_idct16x16_256_add_neon_pass2(
q0x2s16 = vld2q_s16(src); q0x2s16 = vld2q_s16(src);
q15s16 = q0x2s16.val[0]; q15s16 = q0x2s16.val[0];
TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
&q12s16, &q13s16, &q14s16, &q15s16); &q15s16);
d16s16 = vget_low_s16(q8s16); d16s16 = vget_low_s16(q8s16);
d17s16 = vget_high_s16(q8s16); d17s16 = vget_high_s16(q8s16);
@ -602,10 +591,10 @@ void vpx_idct16x16_256_add_neon_pass2(
q13s16 = vaddq_s16(q1s16, q14s16); q13s16 = vaddq_s16(q1s16, q14s16);
q12s16 = vrshrq_n_s16(q12s16, 6); q12s16 = vrshrq_n_s16(q12s16, 6);
q13s16 = vrshrq_n_s16(q13s16, 6); q13s16 = vrshrq_n_s16(q13s16, 6);
q12u16 = vaddw_u8(vreinterpretq_u16_s16(q12s16), q12u16 =
vreinterpret_u8_s64(d12s64)); vaddw_u8(vreinterpretq_u16_s16(q12s16), vreinterpret_u8_s64(d12s64));
q13u16 = vaddw_u8(vreinterpretq_u16_s16(q13s16), q13u16 =
vreinterpret_u8_s64(d13s64)); vaddw_u8(vreinterpretq_u16_s16(q13s16), vreinterpret_u8_s64(d13s64));
d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16)); d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16));
d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16)); d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16));
vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
@ -627,10 +616,10 @@ void vpx_idct16x16_256_add_neon_pass2(
q13s16 = vaddq_s16(q11s16, q4s16); q13s16 = vaddq_s16(q11s16, q4s16);
q12s16 = vrshrq_n_s16(q12s16, 6); q12s16 = vrshrq_n_s16(q12s16, 6);
q13s16 = vrshrq_n_s16(q13s16, 6); q13s16 = vrshrq_n_s16(q13s16, 6);
q12u16 = vaddw_u8(vreinterpretq_u16_s16(q12s16), q12u16 =
vreinterpret_u8_s64(d12s64)); vaddw_u8(vreinterpretq_u16_s16(q12s16), vreinterpret_u8_s64(d12s64));
q13u16 = vaddw_u8(vreinterpretq_u16_s16(q13s16), q13u16 =
vreinterpret_u8_s64(d13s64)); vaddw_u8(vreinterpretq_u16_s16(q13s16), vreinterpret_u8_s64(d13s64));
d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16)); d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16));
d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16)); d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16));
vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
@ -652,10 +641,10 @@ void vpx_idct16x16_256_add_neon_pass2(
q13s16 = vaddq_s16(q1s16, q2s16); q13s16 = vaddq_s16(q1s16, q2s16);
q12s16 = vrshrq_n_s16(q12s16, 6); q12s16 = vrshrq_n_s16(q12s16, 6);
q13s16 = vrshrq_n_s16(q13s16, 6); q13s16 = vrshrq_n_s16(q13s16, 6);
q12u16 = vaddw_u8(vreinterpretq_u16_s16(q12s16), q12u16 =
vreinterpret_u8_s64(d12s64)); vaddw_u8(vreinterpretq_u16_s16(q12s16), vreinterpret_u8_s64(d12s64));
q13u16 = vaddw_u8(vreinterpretq_u16_s16(q13s16), q13u16 =
vreinterpret_u8_s64(d13s64)); vaddw_u8(vreinterpretq_u16_s16(q13s16), vreinterpret_u8_s64(d13s64));
d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16)); d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16));
d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16)); d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16));
vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
@ -676,10 +665,10 @@ void vpx_idct16x16_256_add_neon_pass2(
q13s16 = vaddq_s16(q11s16, q8s16); q13s16 = vaddq_s16(q11s16, q8s16);
q12s16 = vrshrq_n_s16(q12s16, 6); q12s16 = vrshrq_n_s16(q12s16, 6);
q13s16 = vrshrq_n_s16(q13s16, 6); q13s16 = vrshrq_n_s16(q13s16, 6);
q12u16 = vaddw_u8(vreinterpretq_u16_s16(q12s16), q12u16 =
vreinterpret_u8_s64(d12s64)); vaddw_u8(vreinterpretq_u16_s16(q12s16), vreinterpret_u8_s64(d12s64));
q13u16 = vaddw_u8(vreinterpretq_u16_s16(q13s16), q13u16 =
vreinterpret_u8_s64(d13s64)); vaddw_u8(vreinterpretq_u16_s16(q13s16), vreinterpret_u8_s64(d13s64));
d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16)); d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16));
d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16)); d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16));
vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
@ -693,8 +682,7 @@ void vpx_idct16x16_256_add_neon_pass2(
d12s64 = vld1_s64((int64_t *)dest); d12s64 = vld1_s64((int64_t *)dest);
dest += dest_stride; dest += dest_stride;
q8s16 = vrshrq_n_s16(q8s16, 6); q8s16 = vrshrq_n_s16(q8s16, 6);
q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_s64(d12s64));
vreinterpret_u8_s64(d12s64));
d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16)); d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
d += dest_stride; d += dest_stride;
@ -702,8 +690,7 @@ void vpx_idct16x16_256_add_neon_pass2(
d12s64 = vld1_s64((int64_t *)dest); d12s64 = vld1_s64((int64_t *)dest);
dest += dest_stride; dest += dest_stride;
q9s16 = vrshrq_n_s16(q9s16, 6); q9s16 = vrshrq_n_s16(q9s16, 6);
q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_s64(d12s64));
vreinterpret_u8_s64(d12s64));
d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
d += dest_stride; d += dest_stride;
@ -711,8 +698,7 @@ void vpx_idct16x16_256_add_neon_pass2(
d12s64 = vld1_s64((int64_t *)dest); d12s64 = vld1_s64((int64_t *)dest);
dest += dest_stride; dest += dest_stride;
q2s16 = vrshrq_n_s16(q2s16, 6); q2s16 = vrshrq_n_s16(q2s16, 6);
q2u16 = vaddw_u8(vreinterpretq_u16_s16(q2s16), q2u16 = vaddw_u8(vreinterpretq_u16_s16(q2s16), vreinterpret_u8_s64(d12s64));
vreinterpret_u8_s64(d12s64));
d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q2u16)); d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q2u16));
vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
d += dest_stride; d += dest_stride;
@ -720,8 +706,7 @@ void vpx_idct16x16_256_add_neon_pass2(
d12s64 = vld1_s64((int64_t *)dest); d12s64 = vld1_s64((int64_t *)dest);
dest += dest_stride; dest += dest_stride;
q3s16 = vrshrq_n_s16(q3s16, 6); q3s16 = vrshrq_n_s16(q3s16, 6);
q3u16 = vaddw_u8(vreinterpretq_u16_s16(q3s16), q3u16 = vaddw_u8(vreinterpretq_u16_s16(q3s16), vreinterpret_u8_s64(d12s64));
vreinterpret_u8_s64(d12s64));
d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q3u16)); d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q3u16));
vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
d += dest_stride; d += dest_stride;
@ -729,8 +714,7 @@ void vpx_idct16x16_256_add_neon_pass2(
d12s64 = vld1_s64((int64_t *)dest); d12s64 = vld1_s64((int64_t *)dest);
dest += dest_stride; dest += dest_stride;
q4s16 = vrshrq_n_s16(q4s16, 6); q4s16 = vrshrq_n_s16(q4s16, 6);
q4u16 = vaddw_u8(vreinterpretq_u16_s16(q4s16), q4u16 = vaddw_u8(vreinterpretq_u16_s16(q4s16), vreinterpret_u8_s64(d12s64));
vreinterpret_u8_s64(d12s64));
d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q4u16)); d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q4u16));
vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
d += dest_stride; d += dest_stride;
@ -738,8 +722,7 @@ void vpx_idct16x16_256_add_neon_pass2(
d12s64 = vld1_s64((int64_t *)dest); d12s64 = vld1_s64((int64_t *)dest);
dest += dest_stride; dest += dest_stride;
q5s16 = vrshrq_n_s16(q5s16, 6); q5s16 = vrshrq_n_s16(q5s16, 6);
q5u16 = vaddw_u8(vreinterpretq_u16_s16(q5s16), q5u16 = vaddw_u8(vreinterpretq_u16_s16(q5s16), vreinterpret_u8_s64(d12s64));
vreinterpret_u8_s64(d12s64));
d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q5u16)); d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q5u16));
vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
d += dest_stride; d += dest_stride;
@ -747,16 +730,16 @@ void vpx_idct16x16_256_add_neon_pass2(
d12s64 = vld1_s64((int64_t *)dest); d12s64 = vld1_s64((int64_t *)dest);
dest += dest_stride; dest += dest_stride;
q14s16 = vrshrq_n_s16(q14s16, 6); q14s16 = vrshrq_n_s16(q14s16, 6);
q14u16 = vaddw_u8(vreinterpretq_u16_s16(q14s16), q14u16 =
vreinterpret_u8_s64(d12s64)); vaddw_u8(vreinterpretq_u16_s16(q14s16), vreinterpret_u8_s64(d12s64));
d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q14u16)); d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q14u16));
vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
d += dest_stride; d += dest_stride;
d12s64 = vld1_s64((int64_t *)dest); d12s64 = vld1_s64((int64_t *)dest);
q15s16 = vrshrq_n_s16(q15s16, 6); q15s16 = vrshrq_n_s16(q15s16, 6);
q15u16 = vaddw_u8(vreinterpretq_u16_s16(q15s16), q15u16 =
vreinterpret_u8_s64(d12s64)); vaddw_u8(vreinterpretq_u16_s16(q15s16), vreinterpret_u8_s64(d12s64));
d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q15u16)); d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q15u16));
vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
} else { // skip_adding_dest } else { // skip_adding_dest
@ -879,9 +862,7 @@ void vpx_idct16x16_256_add_neon_pass2(
return; return;
} }
void vpx_idct16x16_10_add_neon_pass1( void vpx_idct16x16_10_add_neon_pass1(int16_t *in, int16_t *out,
int16_t *in,
int16_t *out,
int output_stride) { int output_stride) {
int16x4_t d4s16; int16x4_t d4s16;
int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16; int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
@ -917,8 +898,8 @@ void vpx_idct16x16_10_add_neon_pass1(
q0x2s16 = vld2q_s16(in); q0x2s16 = vld2q_s16(in);
q15s16 = q0x2s16.val[0]; q15s16 = q0x2s16.val[0];
TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
&q12s16, &q13s16, &q14s16, &q15s16); &q15s16);
// stage 3 // stage 3
q0s16 = vdupq_n_s16(cospi_28_64 * 2); q0s16 = vdupq_n_s16(cospi_28_64 * 2);
@ -1017,13 +998,9 @@ void vpx_idct16x16_10_add_neon_pass1(
return; return;
} }
void vpx_idct16x16_10_add_neon_pass2( void vpx_idct16x16_10_add_neon_pass2(int16_t *src, int16_t *out,
int16_t *src, int16_t *pass1Output, int16_t skip_adding,
int16_t *out, uint8_t *dest, int dest_stride) {
int16_t *pass1Output,
int16_t skip_adding,
uint8_t *dest,
int dest_stride) {
int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16; int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16;
int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16; int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
int16x4_t d20s16, d21s16, d22s16, d23s16; int16x4_t d20s16, d21s16, d22s16, d23s16;
@ -1064,8 +1041,8 @@ void vpx_idct16x16_10_add_neon_pass2(
q0x2s16 = vld2q_s16(src); q0x2s16 = vld2q_s16(src);
q15s16 = q0x2s16.val[0]; q15s16 = q0x2s16.val[0];
TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
&q12s16, &q13s16, &q14s16, &q15s16); &q15s16);
// stage 3 // stage 3
q6s16 = vdupq_n_s16(cospi_30_64 * 2); q6s16 = vdupq_n_s16(cospi_30_64 * 2);

View File

@ -10,24 +10,16 @@
#include "vpx_dsp/vpx_dsp_common.h" #include "vpx_dsp/vpx_dsp_common.h"
void vpx_idct16x16_256_add_neon_pass1(const int16_t *input, void vpx_idct16x16_256_add_neon_pass1(const int16_t *input, int16_t *output,
int16_t *output,
int output_stride); int output_stride);
void vpx_idct16x16_256_add_neon_pass2(const int16_t *src, void vpx_idct16x16_256_add_neon_pass2(const int16_t *src, int16_t *output,
int16_t *output, int16_t *pass1Output, int16_t skip_adding,
int16_t *pass1Output, uint8_t *dest, int dest_stride);
int16_t skip_adding, void vpx_idct16x16_10_add_neon_pass1(const int16_t *input, int16_t *output,
uint8_t *dest,
int dest_stride);
void vpx_idct16x16_10_add_neon_pass1(const int16_t *input,
int16_t *output,
int output_stride); int output_stride);
void vpx_idct16x16_10_add_neon_pass2(const int16_t *src, void vpx_idct16x16_10_add_neon_pass2(const int16_t *src, int16_t *output,
int16_t *output, int16_t *pass1Output, int16_t skip_adding,
int16_t *pass1Output, uint8_t *dest, int dest_stride);
int16_t skip_adding,
uint8_t *dest,
int dest_stride);
#if HAVE_NEON_ASM #if HAVE_NEON_ASM
/* For ARM NEON, d8-d15 are callee-saved registers, and need to be saved. */ /* For ARM NEON, d8-d15 are callee-saved registers, and need to be saved. */
@ -35,13 +27,13 @@ extern void vpx_push_neon(int64_t *store);
extern void vpx_pop_neon(int64_t *store); extern void vpx_pop_neon(int64_t *store);
#endif // HAVE_NEON_ASM #endif // HAVE_NEON_ASM
void vpx_idct16x16_256_add_neon(const int16_t *input, void vpx_idct16x16_256_add_neon(const int16_t *input, uint8_t *dest,
uint8_t *dest, int dest_stride) { int dest_stride) {
#if HAVE_NEON_ASM #if HAVE_NEON_ASM
int64_t store_reg[8]; int64_t store_reg[8];
#endif #endif
int16_t pass1_output[16*16] = {0}; int16_t pass1_output[16 * 16] = { 0 };
int16_t row_idct_output[16*16] = {0}; int16_t row_idct_output[16 * 16] = { 0 };
#if HAVE_NEON_ASM #if HAVE_NEON_ASM
// save d8-d15 register values. // save d8-d15 register values.
@ -56,27 +48,19 @@ void vpx_idct16x16_256_add_neon(const int16_t *input,
// Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
// with result in pass1(pass1_output) to calculate final result in stage 7 // with result in pass1(pass1_output) to calculate final result in stage 7
// which will be saved into row_idct_output. // which will be saved into row_idct_output.
vpx_idct16x16_256_add_neon_pass2(input+1, vpx_idct16x16_256_add_neon_pass2(input + 1, row_idct_output, pass1_output, 0,
row_idct_output, dest, dest_stride);
pass1_output,
0,
dest,
dest_stride);
/* Parallel idct on the lower 8 rows */ /* Parallel idct on the lower 8 rows */
// First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
// stage 6 result in pass1_output. // stage 6 result in pass1_output.
vpx_idct16x16_256_add_neon_pass1(input+8*16, pass1_output, 8); vpx_idct16x16_256_add_neon_pass1(input + 8 * 16, pass1_output, 8);
// Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
// with result in pass1(pass1_output) to calculate final result in stage 7 // with result in pass1(pass1_output) to calculate final result in stage 7
// which will be saved into row_idct_output. // which will be saved into row_idct_output.
vpx_idct16x16_256_add_neon_pass2(input+8*16+1, vpx_idct16x16_256_add_neon_pass2(input + 8 * 16 + 1, row_idct_output + 8,
row_idct_output+8, pass1_output, 0, dest, dest_stride);
pass1_output,
0,
dest,
dest_stride);
/* Parallel idct on the left 8 columns */ /* Parallel idct on the left 8 columns */
// First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
@ -86,27 +70,20 @@ void vpx_idct16x16_256_add_neon(const int16_t *input,
// Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
// with result in pass1(pass1_output) to calculate final result in stage 7. // with result in pass1(pass1_output) to calculate final result in stage 7.
// Then add the result to the destination data. // Then add the result to the destination data.
vpx_idct16x16_256_add_neon_pass2(row_idct_output+1, vpx_idct16x16_256_add_neon_pass2(row_idct_output + 1, row_idct_output,
row_idct_output, pass1_output, 1, dest, dest_stride);
pass1_output,
1,
dest,
dest_stride);
/* Parallel idct on the right 8 columns */ /* Parallel idct on the right 8 columns */
// First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
// stage 6 result in pass1_output. // stage 6 result in pass1_output.
vpx_idct16x16_256_add_neon_pass1(row_idct_output+8*16, pass1_output, 8); vpx_idct16x16_256_add_neon_pass1(row_idct_output + 8 * 16, pass1_output, 8);
// Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
// with result in pass1(pass1_output) to calculate final result in stage 7. // with result in pass1(pass1_output) to calculate final result in stage 7.
// Then add the result to the destination data. // Then add the result to the destination data.
vpx_idct16x16_256_add_neon_pass2(row_idct_output+8*16+1, vpx_idct16x16_256_add_neon_pass2(row_idct_output + 8 * 16 + 1,
row_idct_output+8, row_idct_output + 8, pass1_output, 1,
pass1_output, dest + 8, dest_stride);
1,
dest+8,
dest_stride);
#if HAVE_NEON_ASM #if HAVE_NEON_ASM
// restore d8-d15 register values. // restore d8-d15 register values.
@ -116,13 +93,13 @@ void vpx_idct16x16_256_add_neon(const int16_t *input,
return; return;
} }
void vpx_idct16x16_10_add_neon(const int16_t *input, void vpx_idct16x16_10_add_neon(const int16_t *input, uint8_t *dest,
uint8_t *dest, int dest_stride) { int dest_stride) {
#if HAVE_NEON_ASM #if HAVE_NEON_ASM
int64_t store_reg[8]; int64_t store_reg[8];
#endif #endif
int16_t pass1_output[16*16] = {0}; int16_t pass1_output[16 * 16] = { 0 };
int16_t row_idct_output[16*16] = {0}; int16_t row_idct_output[16 * 16] = { 0 };
#if HAVE_NEON_ASM #if HAVE_NEON_ASM
// save d8-d15 register values. // save d8-d15 register values.
@ -137,12 +114,8 @@ void vpx_idct16x16_10_add_neon(const int16_t *input,
// Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
// with result in pass1(pass1_output) to calculate final result in stage 7 // with result in pass1(pass1_output) to calculate final result in stage 7
// which will be saved into row_idct_output. // which will be saved into row_idct_output.
vpx_idct16x16_10_add_neon_pass2(input+1, vpx_idct16x16_10_add_neon_pass2(input + 1, row_idct_output, pass1_output, 0,
row_idct_output, dest, dest_stride);
pass1_output,
0,
dest,
dest_stride);
/* Skip Parallel idct on the lower 8 rows as they are all 0s */ /* Skip Parallel idct on the lower 8 rows as they are all 0s */
@ -154,27 +127,20 @@ void vpx_idct16x16_10_add_neon(const int16_t *input,
// Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
// with result in pass1(pass1_output) to calculate final result in stage 7. // with result in pass1(pass1_output) to calculate final result in stage 7.
// Then add the result to the destination data. // Then add the result to the destination data.
vpx_idct16x16_256_add_neon_pass2(row_idct_output+1, vpx_idct16x16_256_add_neon_pass2(row_idct_output + 1, row_idct_output,
row_idct_output, pass1_output, 1, dest, dest_stride);
pass1_output,
1,
dest,
dest_stride);
/* Parallel idct on the right 8 columns */ /* Parallel idct on the right 8 columns */
// First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
// stage 6 result in pass1_output. // stage 6 result in pass1_output.
vpx_idct16x16_256_add_neon_pass1(row_idct_output+8*16, pass1_output, 8); vpx_idct16x16_256_add_neon_pass1(row_idct_output + 8 * 16, pass1_output, 8);
// Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
// with result in pass1(pass1_output) to calculate final result in stage 7. // with result in pass1(pass1_output) to calculate final result in stage 7.
// Then add the result to the destination data. // Then add the result to the destination data.
vpx_idct16x16_256_add_neon_pass2(row_idct_output+8*16+1, vpx_idct16x16_256_add_neon_pass2(row_idct_output + 8 * 16 + 1,
row_idct_output+8, row_idct_output + 8, pass1_output, 1,
pass1_output, dest + 8, dest_stride);
1,
dest+8,
dest_stride);
#if HAVE_NEON_ASM #if HAVE_NEON_ASM
// restore d8-d15 register values. // restore d8-d15 register values.

View File

@ -15,16 +15,10 @@
#include "vpx_dsp/inv_txfm.h" #include "vpx_dsp/inv_txfm.h"
#include "vpx_ports/mem.h" #include "vpx_ports/mem.h"
static INLINE void LD_16x8( static INLINE void LD_16x8(uint8_t *d, int d_stride, uint8x16_t *q8u8,
uint8_t *d, uint8x16_t *q9u8, uint8x16_t *q10u8,
int d_stride, uint8x16_t *q11u8, uint8x16_t *q12u8,
uint8x16_t *q8u8, uint8x16_t *q13u8, uint8x16_t *q14u8,
uint8x16_t *q9u8,
uint8x16_t *q10u8,
uint8x16_t *q11u8,
uint8x16_t *q12u8,
uint8x16_t *q13u8,
uint8x16_t *q14u8,
uint8x16_t *q15u8) { uint8x16_t *q15u8) {
*q8u8 = vld1q_u8(d); *q8u8 = vld1q_u8(d);
d += d_stride; d += d_stride;
@ -44,15 +38,10 @@ static INLINE void LD_16x8(
return; return;
} }
static INLINE void ADD_DIFF_16x8( static INLINE void ADD_DIFF_16x8(uint8x16_t qdiffu8, uint8x16_t *q8u8,
uint8x16_t qdiffu8, uint8x16_t *q9u8, uint8x16_t *q10u8,
uint8x16_t *q8u8, uint8x16_t *q11u8, uint8x16_t *q12u8,
uint8x16_t *q9u8, uint8x16_t *q13u8, uint8x16_t *q14u8,
uint8x16_t *q10u8,
uint8x16_t *q11u8,
uint8x16_t *q12u8,
uint8x16_t *q13u8,
uint8x16_t *q14u8,
uint8x16_t *q15u8) { uint8x16_t *q15u8) {
*q8u8 = vqaddq_u8(*q8u8, qdiffu8); *q8u8 = vqaddq_u8(*q8u8, qdiffu8);
*q9u8 = vqaddq_u8(*q9u8, qdiffu8); *q9u8 = vqaddq_u8(*q9u8, qdiffu8);
@ -65,15 +54,10 @@ static INLINE void ADD_DIFF_16x8(
return; return;
} }
static INLINE void SUB_DIFF_16x8( static INLINE void SUB_DIFF_16x8(uint8x16_t qdiffu8, uint8x16_t *q8u8,
uint8x16_t qdiffu8, uint8x16_t *q9u8, uint8x16_t *q10u8,
uint8x16_t *q8u8, uint8x16_t *q11u8, uint8x16_t *q12u8,
uint8x16_t *q9u8, uint8x16_t *q13u8, uint8x16_t *q14u8,
uint8x16_t *q10u8,
uint8x16_t *q11u8,
uint8x16_t *q12u8,
uint8x16_t *q13u8,
uint8x16_t *q14u8,
uint8x16_t *q15u8) { uint8x16_t *q15u8) {
*q8u8 = vqsubq_u8(*q8u8, qdiffu8); *q8u8 = vqsubq_u8(*q8u8, qdiffu8);
*q9u8 = vqsubq_u8(*q9u8, qdiffu8); *q9u8 = vqsubq_u8(*q9u8, qdiffu8);
@ -86,16 +70,10 @@ static INLINE void SUB_DIFF_16x8(
return; return;
} }
static INLINE void ST_16x8( static INLINE void ST_16x8(uint8_t *d, int d_stride, uint8x16_t *q8u8,
uint8_t *d, uint8x16_t *q9u8, uint8x16_t *q10u8,
int d_stride, uint8x16_t *q11u8, uint8x16_t *q12u8,
uint8x16_t *q8u8, uint8x16_t *q13u8, uint8x16_t *q14u8,
uint8x16_t *q9u8,
uint8x16_t *q10u8,
uint8x16_t *q11u8,
uint8x16_t *q12u8,
uint8x16_t *q13u8,
uint8x16_t *q14u8,
uint8x16_t *q15u8) { uint8x16_t *q15u8) {
vst1q_u8(d, *q8u8); vst1q_u8(d, *q8u8);
d += d_stride; d += d_stride;
@ -115,10 +93,7 @@ static INLINE void ST_16x8(
return; return;
} }
void vpx_idct32x32_1_add_neon( void vpx_idct32x32_1_add_neon(int16_t *input, uint8_t *dest, int dest_stride) {
int16_t *input,
uint8_t *dest,
int dest_stride) {
uint8x16_t q0u8, q8u8, q9u8, q10u8, q11u8, q12u8, q13u8, q14u8, q15u8; uint8x16_t q0u8, q8u8, q9u8, q10u8, q11u8, q12u8, q13u8, q14u8, q15u8;
int i, j, dest_stride8; int i, j, dest_stride8;
uint8_t *d; uint8_t *d;
@ -135,12 +110,12 @@ void vpx_idct32x32_1_add_neon(
for (i = 0; i < 2; i++, dest += 16) { // diff_positive_32_32_loop for (i = 0; i < 2; i++, dest += 16) { // diff_positive_32_32_loop
d = dest; d = dest;
for (j = 0; j < 4; j++) { for (j = 0; j < 4; j++) {
LD_16x8(d, dest_stride, &q8u8, &q9u8, &q10u8, &q11u8, LD_16x8(d, dest_stride, &q8u8, &q9u8, &q10u8, &q11u8, &q12u8, &q13u8,
&q12u8, &q13u8, &q14u8, &q15u8); &q14u8, &q15u8);
ADD_DIFF_16x8(q0u8, &q8u8, &q9u8, &q10u8, &q11u8, ADD_DIFF_16x8(q0u8, &q8u8, &q9u8, &q10u8, &q11u8, &q12u8, &q13u8,
&q12u8, &q13u8, &q14u8, &q15u8); &q14u8, &q15u8);
ST_16x8(d, dest_stride, &q8u8, &q9u8, &q10u8, &q11u8, ST_16x8(d, dest_stride, &q8u8, &q9u8, &q10u8, &q11u8, &q12u8, &q13u8,
&q12u8, &q13u8, &q14u8, &q15u8); &q14u8, &q15u8);
d += dest_stride8; d += dest_stride8;
} }
} }
@ -151,12 +126,12 @@ void vpx_idct32x32_1_add_neon(
for (i = 0; i < 2; i++, dest += 16) { // diff_negative_32_32_loop for (i = 0; i < 2; i++, dest += 16) { // diff_negative_32_32_loop
d = dest; d = dest;
for (j = 0; j < 4; j++) { for (j = 0; j < 4; j++) {
LD_16x8(d, dest_stride, &q8u8, &q9u8, &q10u8, &q11u8, LD_16x8(d, dest_stride, &q8u8, &q9u8, &q10u8, &q11u8, &q12u8, &q13u8,
&q12u8, &q13u8, &q14u8, &q15u8); &q14u8, &q15u8);
SUB_DIFF_16x8(q0u8, &q8u8, &q9u8, &q10u8, &q11u8, SUB_DIFF_16x8(q0u8, &q8u8, &q9u8, &q10u8, &q11u8, &q12u8, &q13u8,
&q12u8, &q13u8, &q14u8, &q15u8); &q14u8, &q15u8);
ST_16x8(d, dest_stride, &q8u8, &q9u8, &q10u8, &q11u8, ST_16x8(d, dest_stride, &q8u8, &q9u8, &q10u8, &q11u8, &q12u8, &q13u8,
&q12u8, &q13u8, &q14u8, &q15u8); &q14u8, &q15u8);
d += dest_stride8; d += dest_stride8;
} }
} }

View File

@ -26,13 +26,9 @@
vst1q_s16(out + second * 32, qB); vst1q_s16(out + second * 32, qB);
#define STORE_COMBINE_CENTER_RESULTS(r10, r9) \ #define STORE_COMBINE_CENTER_RESULTS(r10, r9) \
__STORE_COMBINE_CENTER_RESULTS(r10, r9, stride, \ __STORE_COMBINE_CENTER_RESULTS(r10, r9, stride, q6s16, q7s16, q8s16, q9s16);
q6s16, q7s16, q8s16, q9s16); static INLINE void __STORE_COMBINE_CENTER_RESULTS(uint8_t *p1, uint8_t *p2,
static INLINE void __STORE_COMBINE_CENTER_RESULTS( int stride, int16x8_t q6s16,
uint8_t *p1,
uint8_t *p2,
int stride,
int16x8_t q6s16,
int16x8_t q7s16, int16x8_t q7s16,
int16x8_t q8s16, int16x8_t q8s16,
int16x8_t q9s16) { int16x8_t q9s16) {
@ -50,14 +46,14 @@ static INLINE void __STORE_COMBINE_CENTER_RESULTS(
q9s16 = vrshrq_n_s16(q9s16, 6); q9s16 = vrshrq_n_s16(q9s16, 6);
q6s16 = vrshrq_n_s16(q6s16, 6); q6s16 = vrshrq_n_s16(q6s16, 6);
q7s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q7s16), q7s16 = vreinterpretq_s16_u16(
vreinterpret_u8_s16(d9s16))); vaddw_u8(vreinterpretq_u16_s16(q7s16), vreinterpret_u8_s16(d9s16)));
q8s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q8s16), q8s16 = vreinterpretq_s16_u16(
vreinterpret_u8_s16(d10s16))); vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_s16(d10s16)));
q9s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q9s16), q9s16 = vreinterpretq_s16_u16(
vreinterpret_u8_s16(d11s16))); vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_s16(d11s16)));
q6s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q6s16), q6s16 = vreinterpretq_s16_u16(
vreinterpret_u8_s16(d8s16))); vaddw_u8(vreinterpretq_u16_s16(q6s16), vreinterpret_u8_s16(d8s16)));
d9s16 = vreinterpret_s16_u8(vqmovun_s16(q7s16)); d9s16 = vreinterpret_s16_u8(vqmovun_s16(q7s16));
d10s16 = vreinterpret_s16_u8(vqmovun_s16(q8s16)); d10s16 = vreinterpret_s16_u8(vqmovun_s16(q8s16));
@ -73,14 +69,10 @@ static INLINE void __STORE_COMBINE_CENTER_RESULTS(
return; return;
} }
#define STORE_COMBINE_EXTREME_RESULTS(r7, r6); \ #define STORE_COMBINE_EXTREME_RESULTS(r7, r6) \
__STORE_COMBINE_EXTREME_RESULTS(r7, r6, stride, \ __STORE_COMBINE_EXTREME_RESULTS(r7, r6, stride, q4s16, q5s16, q6s16, q7s16);
q4s16, q5s16, q6s16, q7s16); static INLINE void __STORE_COMBINE_EXTREME_RESULTS(uint8_t *p1, uint8_t *p2,
static INLINE void __STORE_COMBINE_EXTREME_RESULTS( int stride, int16x8_t q4s16,
uint8_t *p1,
uint8_t *p2,
int stride,
int16x8_t q4s16,
int16x8_t q5s16, int16x8_t q5s16,
int16x8_t q6s16, int16x8_t q6s16,
int16x8_t q7s16) { int16x8_t q7s16) {
@ -98,14 +90,14 @@ static INLINE void __STORE_COMBINE_EXTREME_RESULTS(
q7s16 = vrshrq_n_s16(q7s16, 6); q7s16 = vrshrq_n_s16(q7s16, 6);
q4s16 = vrshrq_n_s16(q4s16, 6); q4s16 = vrshrq_n_s16(q4s16, 6);
q5s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q5s16), q5s16 = vreinterpretq_s16_u16(
vreinterpret_u8_s16(d5s16))); vaddw_u8(vreinterpretq_u16_s16(q5s16), vreinterpret_u8_s16(d5s16)));
q6s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q6s16), q6s16 = vreinterpretq_s16_u16(
vreinterpret_u8_s16(d6s16))); vaddw_u8(vreinterpretq_u16_s16(q6s16), vreinterpret_u8_s16(d6s16)));
q7s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q7s16), q7s16 = vreinterpretq_s16_u16(
vreinterpret_u8_s16(d7s16))); vaddw_u8(vreinterpretq_u16_s16(q7s16), vreinterpret_u8_s16(d7s16)));
q4s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q4s16), q4s16 = vreinterpretq_s16_u16(
vreinterpret_u8_s16(d4s16))); vaddw_u8(vreinterpretq_u16_s16(q4s16), vreinterpret_u8_s16(d4s16)));
d5s16 = vreinterpret_s16_u8(vqmovun_s16(q5s16)); d5s16 = vreinterpret_s16_u8(vqmovun_s16(q5s16));
d6s16 = vreinterpret_s16_u8(vqmovun_s16(q6s16)); d6s16 = vreinterpret_s16_u8(vqmovun_s16(q6s16));
@ -123,13 +115,9 @@ static INLINE void __STORE_COMBINE_EXTREME_RESULTS(
#define DO_BUTTERFLY_STD(const_1, const_2, qA, qB) \ #define DO_BUTTERFLY_STD(const_1, const_2, qA, qB) \
DO_BUTTERFLY(q14s16, q13s16, const_1, const_2, qA, qB); DO_BUTTERFLY(q14s16, q13s16, const_1, const_2, qA, qB);
static INLINE void DO_BUTTERFLY( static INLINE void DO_BUTTERFLY(int16x8_t q14s16, int16x8_t q13s16,
int16x8_t q14s16, int16_t first_const, int16_t second_const,
int16x8_t q13s16, int16x8_t *qAs16, int16x8_t *qBs16) {
int16_t first_const,
int16_t second_const,
int16x8_t *qAs16,
int16x8_t *qBs16) {
int16x4_t d30s16, d31s16; int16x4_t d30s16, d31s16;
int32x4_t q8s32, q9s32, q10s32, q11s32, q12s32, q15s32; int32x4_t q8s32, q9s32, q10s32, q11s32, q12s32, q15s32;
int16x4_t dCs16, dDs16, dAs16, dBs16; int16x4_t dCs16, dDs16, dAs16, dBs16;
@ -158,16 +146,12 @@ static INLINE void DO_BUTTERFLY(
q11s32 = vaddq_s32(q12s32, q11s32); q11s32 = vaddq_s32(q12s32, q11s32);
q10s32 = vaddq_s32(q10s32, q15s32); q10s32 = vaddq_s32(q10s32, q15s32);
*qAs16 = vcombine_s16(vqrshrn_n_s32(q8s32, 14), *qAs16 = vcombine_s16(vqrshrn_n_s32(q8s32, 14), vqrshrn_n_s32(q9s32, 14));
vqrshrn_n_s32(q9s32, 14)); *qBs16 = vcombine_s16(vqrshrn_n_s32(q11s32, 14), vqrshrn_n_s32(q10s32, 14));
*qBs16 = vcombine_s16(vqrshrn_n_s32(q11s32, 14),
vqrshrn_n_s32(q10s32, 14));
return; return;
} }
static INLINE void idct32_transpose_pair( static INLINE void idct32_transpose_pair(int16_t *input, int16_t *t_buf) {
int16_t *input,
int16_t *t_buf) {
int16_t *in; int16_t *in;
int i; int i;
const int stride = 32; const int stride = 32;
@ -221,14 +205,14 @@ static INLINE void idct32_transpose_pair(
q14s16 = vcombine_s16(d21s16, d29s16); q14s16 = vcombine_s16(d21s16, d29s16);
q15s16 = vcombine_s16(d23s16, d31s16); q15s16 = vcombine_s16(d23s16, d31s16);
q0x2s32 = vtrnq_s32(vreinterpretq_s32_s16(q8s16), q0x2s32 =
vreinterpretq_s32_s16(q10s16)); vtrnq_s32(vreinterpretq_s32_s16(q8s16), vreinterpretq_s32_s16(q10s16));
q1x2s32 = vtrnq_s32(vreinterpretq_s32_s16(q9s16), q1x2s32 =
vreinterpretq_s32_s16(q11s16)); vtrnq_s32(vreinterpretq_s32_s16(q9s16), vreinterpretq_s32_s16(q11s16));
q2x2s32 = vtrnq_s32(vreinterpretq_s32_s16(q12s16), q2x2s32 =
vreinterpretq_s32_s16(q14s16)); vtrnq_s32(vreinterpretq_s32_s16(q12s16), vreinterpretq_s32_s16(q14s16));
q3x2s32 = vtrnq_s32(vreinterpretq_s32_s16(q13s16), q3x2s32 =
vreinterpretq_s32_s16(q15s16)); vtrnq_s32(vreinterpretq_s32_s16(q13s16), vreinterpretq_s32_s16(q15s16));
q0x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[0]), // q8 q0x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[0]), // q8
vreinterpretq_s16_s32(q1x2s32.val[0])); // q9 vreinterpretq_s16_s32(q1x2s32.val[0])); // q9
@ -259,19 +243,12 @@ static INLINE void idct32_transpose_pair(
return; return;
} }
static INLINE void idct32_bands_end_1st_pass( static INLINE void idct32_bands_end_1st_pass(int16_t *out, int16x8_t q2s16,
int16_t *out, int16x8_t q3s16, int16x8_t q6s16,
int16x8_t q2s16, int16x8_t q7s16, int16x8_t q8s16,
int16x8_t q3s16, int16x8_t q9s16, int16x8_t q10s16,
int16x8_t q6s16, int16x8_t q11s16, int16x8_t q12s16,
int16x8_t q7s16, int16x8_t q13s16, int16x8_t q14s16,
int16x8_t q8s16,
int16x8_t q9s16,
int16x8_t q10s16,
int16x8_t q11s16,
int16x8_t q12s16,
int16x8_t q13s16,
int16x8_t q14s16,
int16x8_t q15s16) { int16x8_t q15s16) {
int16x8_t q0s16, q1s16, q4s16, q5s16; int16x8_t q0s16, q1s16, q4s16, q5s16;
@ -355,30 +332,20 @@ static INLINE void idct32_bands_end_1st_pass(
} }
static INLINE void idct32_bands_end_2nd_pass( static INLINE void idct32_bands_end_2nd_pass(
int16_t *out, int16_t *out, uint8_t *dest, int stride, int16x8_t q2s16, int16x8_t q3s16,
uint8_t *dest, int16x8_t q6s16, int16x8_t q7s16, int16x8_t q8s16, int16x8_t q9s16,
int stride, int16x8_t q10s16, int16x8_t q11s16, int16x8_t q12s16, int16x8_t q13s16,
int16x8_t q2s16, int16x8_t q14s16, int16x8_t q15s16) {
int16x8_t q3s16,
int16x8_t q6s16,
int16x8_t q7s16,
int16x8_t q8s16,
int16x8_t q9s16,
int16x8_t q10s16,
int16x8_t q11s16,
int16x8_t q12s16,
int16x8_t q13s16,
int16x8_t q14s16,
int16x8_t q15s16) {
uint8_t *r6 = dest + 31 * stride; uint8_t *r6 = dest + 31 * stride;
uint8_t *r7 = dest/* + 0 * stride*/; uint8_t *r7 = dest /* + 0 * stride*/;
uint8_t *r9 = dest + 15 * stride; uint8_t *r9 = dest + 15 * stride;
uint8_t *r10 = dest + 16 * stride; uint8_t *r10 = dest + 16 * stride;
int str2 = stride << 1; int str2 = stride << 1;
int16x8_t q0s16, q1s16, q4s16, q5s16; int16x8_t q0s16, q1s16, q4s16, q5s16;
STORE_COMBINE_CENTER_RESULTS(r10, r9); STORE_COMBINE_CENTER_RESULTS(r10, r9);
r10 += str2; r9 -= str2; r10 += str2;
r9 -= str2;
LOAD_FROM_OUTPUT(17, 30, 31, q0s16, q1s16) LOAD_FROM_OUTPUT(17, 30, 31, q0s16, q1s16)
q4s16 = vaddq_s16(q2s16, q1s16); q4s16 = vaddq_s16(q2s16, q1s16);
@ -386,7 +353,8 @@ static INLINE void idct32_bands_end_2nd_pass(
q6s16 = vsubq_s16(q3s16, q0s16); q6s16 = vsubq_s16(q3s16, q0s16);
q7s16 = vsubq_s16(q2s16, q1s16); q7s16 = vsubq_s16(q2s16, q1s16);
STORE_COMBINE_EXTREME_RESULTS(r7, r6); STORE_COMBINE_EXTREME_RESULTS(r7, r6);
r7 += str2; r6 -= str2; r7 += str2;
r6 -= str2;
LOAD_FROM_OUTPUT(31, 12, 13, q0s16, q1s16) LOAD_FROM_OUTPUT(31, 12, 13, q0s16, q1s16)
q2s16 = vaddq_s16(q10s16, q1s16); q2s16 = vaddq_s16(q10s16, q1s16);
@ -400,7 +368,8 @@ static INLINE void idct32_bands_end_2nd_pass(
q6s16 = vsubq_s16(q5s16, q0s16); q6s16 = vsubq_s16(q5s16, q0s16);
q7s16 = vsubq_s16(q4s16, q1s16); q7s16 = vsubq_s16(q4s16, q1s16);
STORE_COMBINE_CENTER_RESULTS(r10, r9); STORE_COMBINE_CENTER_RESULTS(r10, r9);
r10 += str2; r9 -= str2; r10 += str2;
r9 -= str2;
LOAD_FROM_OUTPUT(19, 28, 29, q0s16, q1s16) LOAD_FROM_OUTPUT(19, 28, 29, q0s16, q1s16)
q4s16 = vaddq_s16(q2s16, q1s16); q4s16 = vaddq_s16(q2s16, q1s16);
@ -408,7 +377,8 @@ static INLINE void idct32_bands_end_2nd_pass(
q6s16 = vsubq_s16(q3s16, q0s16); q6s16 = vsubq_s16(q3s16, q0s16);
q7s16 = vsubq_s16(q2s16, q1s16); q7s16 = vsubq_s16(q2s16, q1s16);
STORE_COMBINE_EXTREME_RESULTS(r7, r6); STORE_COMBINE_EXTREME_RESULTS(r7, r6);
r7 += str2; r6 -= str2; r7 += str2;
r6 -= str2;
LOAD_FROM_OUTPUT(29, 10, 11, q0s16, q1s16) LOAD_FROM_OUTPUT(29, 10, 11, q0s16, q1s16)
q2s16 = vaddq_s16(q12s16, q1s16); q2s16 = vaddq_s16(q12s16, q1s16);
@ -422,7 +392,8 @@ static INLINE void idct32_bands_end_2nd_pass(
q6s16 = vsubq_s16(q5s16, q0s16); q6s16 = vsubq_s16(q5s16, q0s16);
q7s16 = vsubq_s16(q4s16, q1s16); q7s16 = vsubq_s16(q4s16, q1s16);
STORE_COMBINE_CENTER_RESULTS(r10, r9); STORE_COMBINE_CENTER_RESULTS(r10, r9);
r10 += str2; r9 -= str2; r10 += str2;
r9 -= str2;
LOAD_FROM_OUTPUT(21, 26, 27, q0s16, q1s16) LOAD_FROM_OUTPUT(21, 26, 27, q0s16, q1s16)
q4s16 = vaddq_s16(q2s16, q1s16); q4s16 = vaddq_s16(q2s16, q1s16);
@ -430,7 +401,8 @@ static INLINE void idct32_bands_end_2nd_pass(
q6s16 = vsubq_s16(q3s16, q0s16); q6s16 = vsubq_s16(q3s16, q0s16);
q7s16 = vsubq_s16(q2s16, q1s16); q7s16 = vsubq_s16(q2s16, q1s16);
STORE_COMBINE_EXTREME_RESULTS(r7, r6); STORE_COMBINE_EXTREME_RESULTS(r7, r6);
r7 += str2; r6 -= str2; r7 += str2;
r6 -= str2;
LOAD_FROM_OUTPUT(27, 8, 9, q0s16, q1s16) LOAD_FROM_OUTPUT(27, 8, 9, q0s16, q1s16)
q2s16 = vaddq_s16(q14s16, q1s16); q2s16 = vaddq_s16(q14s16, q1s16);
@ -454,10 +426,7 @@ static INLINE void idct32_bands_end_2nd_pass(
return; return;
} }
void vpx_idct32x32_1024_add_neon( void vpx_idct32x32_1024_add_neon(int16_t *input, uint8_t *dest, int stride) {
int16_t *input,
uint8_t *dest,
int stride) {
int i, idct32_pass_loop; int i, idct32_pass_loop;
int16_t trans_buf[32 * 8]; int16_t trans_buf[32 * 8];
int16_t pass1[32 * 32]; int16_t pass1[32 * 32];
@ -466,14 +435,11 @@ void vpx_idct32x32_1024_add_neon(
int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16; int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16; int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
for (idct32_pass_loop = 0, out = pass1; for (idct32_pass_loop = 0, out = pass1; idct32_pass_loop < 2;
idct32_pass_loop < 2;
idct32_pass_loop++, idct32_pass_loop++,
input = pass1, // the input of pass2 is the result of pass1 input = pass1, // the input of pass2 is the result of pass1
out = pass2) { out = pass2) {
for (i = 0; for (i = 0; i < 4; i++, input += 32 * 8, out += 8) { // idct32_bands_loop
i < 4; i++,
input += 32 * 8, out += 8) { // idct32_bands_loop
idct32_transpose_pair(input, trans_buf); idct32_transpose_pair(input, trans_buf);
// ----------------------------------------- // -----------------------------------------
@ -603,8 +569,7 @@ void vpx_idct32x32_1024_add_neon(
// part of stage 7 // part of stage 7
DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q13s16, &q14s16) DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q13s16, &q14s16)
STORE_IN_OUTPUT(29, 20, 27, q13s16, q14s16) STORE_IN_OUTPUT(29, 20, 27, q13s16, q14s16)
DO_BUTTERFLY(q0s16, q1s16, cospi_16_64, cospi_16_64, DO_BUTTERFLY(q0s16, q1s16, cospi_16_64, cospi_16_64, &q1s16, &q0s16);
&q1s16, &q0s16);
STORE_IN_OUTPUT(27, 21, 26, q1s16, q0s16) STORE_IN_OUTPUT(27, 21, 26, q1s16, q0s16)
// ----------------------------------------- // -----------------------------------------
@ -704,13 +669,13 @@ void vpx_idct32x32_1024_add_neon(
q7s16 = vsubq_s16(q4s16, q1s16); q7s16 = vsubq_s16(q4s16, q1s16);
if (idct32_pass_loop == 0) { if (idct32_pass_loop == 0) {
idct32_bands_end_1st_pass(out, idct32_bands_end_1st_pass(out, q2s16, q3s16, q6s16, q7s16, q8s16, q9s16,
q2s16, q3s16, q6s16, q7s16, q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16,
q10s16, q11s16, q12s16, q13s16, q14s16, q15s16); q15s16);
} else { } else {
idct32_bands_end_2nd_pass(out, dest, stride, idct32_bands_end_2nd_pass(out, dest, stride, q2s16, q3s16, q6s16, q7s16,
q2s16, q3s16, q6s16, q7s16, q8s16, q9s16, q8s16, q9s16, q10s16, q11s16, q12s16, q13s16,
q10s16, q11s16, q12s16, q13s16, q14s16, q15s16); q14s16, q15s16);
dest += 8; dest += 8;
} }
} }

View File

@ -13,10 +13,7 @@
#include "vpx_dsp/inv_txfm.h" #include "vpx_dsp/inv_txfm.h"
#include "vpx_ports/mem.h" #include "vpx_ports/mem.h"
void vpx_idct4x4_1_add_neon( void vpx_idct4x4_1_add_neon(int16_t *input, uint8_t *dest, int dest_stride) {
int16_t *input,
uint8_t *dest,
int dest_stride) {
uint8x8_t d6u8; uint8x8_t d6u8;
uint32x2_t d2u32 = vdup_n_u32(0); uint32x2_t d2u32 = vdup_n_u32(0);
uint16x8_t q8u16; uint16x8_t q8u16;
@ -37,8 +34,7 @@ void vpx_idct4x4_1_add_neon(
d2u32 = vld1_lane_u32((const uint32_t *)d1, d2u32, 1); d2u32 = vld1_lane_u32((const uint32_t *)d1, d2u32, 1);
d1 += dest_stride; d1 += dest_stride;
q8u16 = vaddw_u8(vreinterpretq_u16_s16(q0s16), q8u16 = vaddw_u8(vreinterpretq_u16_s16(q0s16), vreinterpret_u8_u32(d2u32));
vreinterpret_u8_u32(d2u32));
d6u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16)); d6u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
vst1_lane_u32((uint32_t *)d2, vreinterpret_u32_u8(d6u8), 0); vst1_lane_u32((uint32_t *)d2, vreinterpret_u32_u8(d6u8), 0);

View File

@ -10,10 +10,7 @@
#include <arm_neon.h> #include <arm_neon.h>
void vpx_idct4x4_16_add_neon( void vpx_idct4x4_16_add_neon(int16_t *input, uint8_t *dest, int dest_stride) {
int16_t *input,
uint8_t *dest,
int dest_stride) {
uint8x8_t d26u8, d27u8; uint8x8_t d26u8, d27u8;
uint32x2_t d26u32, d27u32; uint32x2_t d26u32, d27u32;
uint16x8_t q8u16, q9u16; uint16x8_t q8u16, q9u16;
@ -46,8 +43,8 @@ void vpx_idct4x4_16_add_neon(
d20s16 = vdup_n_s16(cospi_8_64); d20s16 = vdup_n_s16(cospi_8_64);
d21s16 = vdup_n_s16(cospi_16_64); d21s16 = vdup_n_s16(cospi_16_64);
q0x2s32 = vtrnq_s32(vreinterpretq_s32_s16(q8s16), q0x2s32 =
vreinterpretq_s32_s16(q9s16)); vtrnq_s32(vreinterpretq_s32_s16(q8s16), vreinterpretq_s32_s16(q9s16));
d16s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[0])); d16s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[0]));
d17s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[0])); d17s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[0]));
d18s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[1])); d18s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[1]));
@ -88,8 +85,8 @@ void vpx_idct4x4_16_add_neon(
q8s16 = vcombine_s16(d0x2s16.val[0], d0x2s16.val[1]); q8s16 = vcombine_s16(d0x2s16.val[0], d0x2s16.val[1]);
q9s16 = vcombine_s16(d1x2s16.val[0], d1x2s16.val[1]); q9s16 = vcombine_s16(d1x2s16.val[0], d1x2s16.val[1]);
q0x2s32 = vtrnq_s32(vreinterpretq_s32_s16(q8s16), q0x2s32 =
vreinterpretq_s32_s16(q9s16)); vtrnq_s32(vreinterpretq_s32_s16(q8s16), vreinterpretq_s32_s16(q9s16));
d16s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[0])); d16s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[0]));
d17s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[0])); d17s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[0]));
d18s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[1])); d18s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[1]));
@ -131,10 +128,8 @@ void vpx_idct4x4_16_add_neon(
d += dest_stride; d += dest_stride;
d27u32 = vld1_lane_u32((const uint32_t *)d, d27u32, 0); d27u32 = vld1_lane_u32((const uint32_t *)d, d27u32, 0);
q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u32(d26u32));
vreinterpret_u8_u32(d26u32)); q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u32(d27u32));
q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16),
vreinterpret_u8_u32(d27u32));
d26u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16)); d26u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
d27u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); d27u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));

View File

@ -13,10 +13,7 @@
#include "vpx_dsp/inv_txfm.h" #include "vpx_dsp/inv_txfm.h"
#include "vpx_ports/mem.h" #include "vpx_ports/mem.h"
void vpx_idct8x8_1_add_neon( void vpx_idct8x8_1_add_neon(int16_t *input, uint8_t *dest, int dest_stride) {
int16_t *input,
uint8_t *dest,
int dest_stride) {
uint8x8_t d2u8, d3u8, d30u8, d31u8; uint8x8_t d2u8, d3u8, d30u8, d31u8;
uint64x1_t d2u64, d3u64, d4u64, d5u64; uint64x1_t d2u64, d3u64, d4u64, d5u64;
uint16x8_t q0u16, q9u16, q10u16, q11u16, q12u16; uint16x8_t q0u16, q9u16, q10u16, q11u16, q12u16;

View File

@ -13,15 +13,10 @@
#include "./vpx_config.h" #include "./vpx_config.h"
#include "vpx_dsp/txfm_common.h" #include "vpx_dsp/txfm_common.h"
static INLINE void TRANSPOSE8X8( static INLINE void TRANSPOSE8X8(int16x8_t *q8s16, int16x8_t *q9s16,
int16x8_t *q8s16, int16x8_t *q10s16, int16x8_t *q11s16,
int16x8_t *q9s16, int16x8_t *q12s16, int16x8_t *q13s16,
int16x8_t *q10s16, int16x8_t *q14s16, int16x8_t *q15s16) {
int16x8_t *q11s16,
int16x8_t *q12s16,
int16x8_t *q13s16,
int16x8_t *q14s16,
int16x8_t *q15s16) {
int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16; int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16; int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
int32x4x2_t q0x2s32, q1x2s32, q2x2s32, q3x2s32; int32x4x2_t q0x2s32, q1x2s32, q2x2s32, q3x2s32;
@ -53,14 +48,14 @@ static INLINE void TRANSPOSE8X8(
*q14s16 = vcombine_s16(d21s16, d29s16); *q14s16 = vcombine_s16(d21s16, d29s16);
*q15s16 = vcombine_s16(d23s16, d31s16); *q15s16 = vcombine_s16(d23s16, d31s16);
q0x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q8s16), q0x2s32 =
vreinterpretq_s32_s16(*q10s16)); vtrnq_s32(vreinterpretq_s32_s16(*q8s16), vreinterpretq_s32_s16(*q10s16));
q1x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q9s16), q1x2s32 =
vreinterpretq_s32_s16(*q11s16)); vtrnq_s32(vreinterpretq_s32_s16(*q9s16), vreinterpretq_s32_s16(*q11s16));
q2x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q12s16), q2x2s32 =
vreinterpretq_s32_s16(*q14s16)); vtrnq_s32(vreinterpretq_s32_s16(*q12s16), vreinterpretq_s32_s16(*q14s16));
q3x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q13s16), q3x2s32 =
vreinterpretq_s32_s16(*q15s16)); vtrnq_s32(vreinterpretq_s32_s16(*q13s16), vreinterpretq_s32_s16(*q15s16));
q0x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[0]), // q8 q0x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[0]), // q8
vreinterpretq_s16_s32(q1x2s32.val[0])); // q9 vreinterpretq_s16_s32(q1x2s32.val[0])); // q9
@ -82,15 +77,10 @@ static INLINE void TRANSPOSE8X8(
return; return;
} }
static INLINE void IDCT8x8_1D( static INLINE void IDCT8x8_1D(int16x8_t *q8s16, int16x8_t *q9s16,
int16x8_t *q8s16, int16x8_t *q10s16, int16x8_t *q11s16,
int16x8_t *q9s16, int16x8_t *q12s16, int16x8_t *q13s16,
int16x8_t *q10s16, int16x8_t *q14s16, int16x8_t *q15s16) {
int16x8_t *q11s16,
int16x8_t *q12s16,
int16x8_t *q13s16,
int16x8_t *q14s16,
int16x8_t *q15s16) {
int16x4_t d0s16, d1s16, d2s16, d3s16; int16x4_t d0s16, d1s16, d2s16, d3s16;
int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16; int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16; int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
@ -238,10 +228,7 @@ static INLINE void IDCT8x8_1D(
return; return;
} }
void vpx_idct8x8_64_add_neon( void vpx_idct8x8_64_add_neon(int16_t *input, uint8_t *dest, int dest_stride) {
int16_t *input,
uint8_t *dest,
int dest_stride) {
uint8_t *d1, *d2; uint8_t *d1, *d2;
uint8x8_t d0u8, d1u8, d2u8, d3u8; uint8x8_t d0u8, d1u8, d2u8, d3u8;
uint64x1_t d0u64, d1u64, d2u64, d3u64; uint64x1_t d0u64, d1u64, d2u64, d3u64;
@ -257,17 +244,17 @@ void vpx_idct8x8_64_add_neon(
q14s16 = vld1q_s16(input + 48); q14s16 = vld1q_s16(input + 48);
q15s16 = vld1q_s16(input + 56); q15s16 = vld1q_s16(input + 56);
TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
&q12s16, &q13s16, &q14s16, &q15s16); &q15s16);
IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16, IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
&q12s16, &q13s16, &q14s16, &q15s16); &q15s16);
TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
&q12s16, &q13s16, &q14s16, &q15s16); &q15s16);
IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16, IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
&q12s16, &q13s16, &q14s16, &q15s16); &q15s16);
q8s16 = vrshrq_n_s16(q8s16, 5); q8s16 = vrshrq_n_s16(q8s16, 5);
q9s16 = vrshrq_n_s16(q9s16, 5); q9s16 = vrshrq_n_s16(q9s16, 5);
@ -289,14 +276,10 @@ void vpx_idct8x8_64_add_neon(
d3u64 = vld1_u64((uint64_t *)d1); d3u64 = vld1_u64((uint64_t *)d1);
d1 += dest_stride; d1 += dest_stride;
q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u64(d0u64));
vreinterpret_u8_u64(d0u64)); q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u64(d1u64));
q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16), vreinterpret_u8_u64(d2u64));
vreinterpret_u8_u64(d1u64)); q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16), vreinterpret_u8_u64(d3u64));
q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16),
vreinterpret_u8_u64(d2u64));
q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16),
vreinterpret_u8_u64(d3u64));
d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16)); d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
@ -326,14 +309,10 @@ void vpx_idct8x8_64_add_neon(
d3u64 = vld1_u64((uint64_t *)d1); d3u64 = vld1_u64((uint64_t *)d1);
d1 += dest_stride; d1 += dest_stride;
q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u64(d0u64));
vreinterpret_u8_u64(d0u64)); q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u64(d1u64));
q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16), vreinterpret_u8_u64(d2u64));
vreinterpret_u8_u64(d1u64)); q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16), vreinterpret_u8_u64(d3u64));
q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16),
vreinterpret_u8_u64(d2u64));
q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16),
vreinterpret_u8_u64(d3u64));
d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16)); d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
@ -351,10 +330,7 @@ void vpx_idct8x8_64_add_neon(
return; return;
} }
void vpx_idct8x8_12_add_neon( void vpx_idct8x8_12_add_neon(int16_t *input, uint8_t *dest, int dest_stride) {
int16_t *input,
uint8_t *dest,
int dest_stride) {
uint8_t *d1, *d2; uint8_t *d1, *d2;
uint8x8_t d0u8, d1u8, d2u8, d3u8; uint8x8_t d0u8, d1u8, d2u8, d3u8;
int16x4_t d10s16, d11s16, d12s16, d13s16, d16s16; int16x4_t d10s16, d11s16, d12s16, d13s16, d16s16;
@ -374,8 +350,8 @@ void vpx_idct8x8_12_add_neon(
q14s16 = vld1q_s16(input + 48); q14s16 = vld1q_s16(input + 48);
q15s16 = vld1q_s16(input + 56); q15s16 = vld1q_s16(input + 56);
TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
&q12s16, &q13s16, &q14s16, &q15s16); &q15s16);
// First transform rows // First transform rows
// stage 1 // stage 1
@ -451,11 +427,11 @@ void vpx_idct8x8_12_add_neon(
q14s16 = vsubq_s16(q1s16, q6s16); q14s16 = vsubq_s16(q1s16, q6s16);
q15s16 = vsubq_s16(q0s16, q7s16); q15s16 = vsubq_s16(q0s16, q7s16);
TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
&q12s16, &q13s16, &q14s16, &q15s16); &q15s16);
IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16, IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
&q12s16, &q13s16, &q14s16, &q15s16); &q15s16);
q8s16 = vrshrq_n_s16(q8s16, 5); q8s16 = vrshrq_n_s16(q8s16, 5);
q9s16 = vrshrq_n_s16(q9s16, 5); q9s16 = vrshrq_n_s16(q9s16, 5);
@ -477,14 +453,10 @@ void vpx_idct8x8_12_add_neon(
d3u64 = vld1_u64((uint64_t *)d1); d3u64 = vld1_u64((uint64_t *)d1);
d1 += dest_stride; d1 += dest_stride;
q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u64(d0u64));
vreinterpret_u8_u64(d0u64)); q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u64(d1u64));
q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16), vreinterpret_u8_u64(d2u64));
vreinterpret_u8_u64(d1u64)); q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16), vreinterpret_u8_u64(d3u64));
q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16),
vreinterpret_u8_u64(d2u64));
q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16),
vreinterpret_u8_u64(d3u64));
d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16)); d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
@ -514,14 +486,10 @@ void vpx_idct8x8_12_add_neon(
d3u64 = vld1_u64((uint64_t *)d1); d3u64 = vld1_u64((uint64_t *)d1);
d1 += dest_stride; d1 += dest_stride;
q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u64(d0u64));
vreinterpret_u8_u64(d0u64)); q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u64(d1u64));
q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16), vreinterpret_u8_u64(d2u64));
vreinterpret_u8_u64(d1u64)); q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16), vreinterpret_u8_u64(d3u64));
q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16),
vreinterpret_u8_u64(d2u64));
q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16),
vreinterpret_u8_u64(d3u64));
d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16)); d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));

View File

@ -18,9 +18,8 @@
// DC 4x4 // DC 4x4
// 'do_above' and 'do_left' facilitate branch removal when inlined. // 'do_above' and 'do_left' facilitate branch removal when inlined.
static INLINE void dc_4x4(uint8_t *dst, ptrdiff_t stride, static INLINE void dc_4x4(uint8_t *dst, ptrdiff_t stride, const uint8_t *above,
const uint8_t *above, const uint8_t *left, const uint8_t *left, int do_above, int do_left) {
int do_above, int do_left) {
uint16x8_t sum_top; uint16x8_t sum_top;
uint16x8_t sum_left; uint16x8_t sum_left;
uint8x8_t dc0; uint8x8_t dc0;
@ -54,7 +53,7 @@ static INLINE void dc_4x4(uint8_t *dst, ptrdiff_t stride,
const uint8x8_t dc = vdup_lane_u8(dc0, 0); const uint8x8_t dc = vdup_lane_u8(dc0, 0);
int i; int i;
for (i = 0; i < 4; ++i) { for (i = 0; i < 4; ++i) {
vst1_lane_u32((uint32_t*)(dst + i * stride), vreinterpret_u32_u8(dc), 0); vst1_lane_u32((uint32_t *)(dst + i * stride), vreinterpret_u32_u8(dc), 0);
} }
} }
} }
@ -87,9 +86,8 @@ void vpx_dc_128_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
// DC 8x8 // DC 8x8
// 'do_above' and 'do_left' facilitate branch removal when inlined. // 'do_above' and 'do_left' facilitate branch removal when inlined.
static INLINE void dc_8x8(uint8_t *dst, ptrdiff_t stride, static INLINE void dc_8x8(uint8_t *dst, ptrdiff_t stride, const uint8_t *above,
const uint8_t *above, const uint8_t *left, const uint8_t *left, int do_above, int do_left) {
int do_above, int do_left) {
uint16x8_t sum_top; uint16x8_t sum_top;
uint16x8_t sum_left; uint16x8_t sum_left;
uint8x8_t dc0; uint8x8_t dc0;
@ -125,7 +123,7 @@ static INLINE void dc_8x8(uint8_t *dst, ptrdiff_t stride,
const uint8x8_t dc = vdup_lane_u8(dc0, 0); const uint8x8_t dc = vdup_lane_u8(dc0, 0);
int i; int i;
for (i = 0; i < 8; ++i) { for (i = 0; i < 8; ++i) {
vst1_u32((uint32_t*)(dst + i * stride), vreinterpret_u32_u8(dc)); vst1_u32((uint32_t *)(dst + i * stride), vreinterpret_u32_u8(dc));
} }
} }
} }
@ -425,8 +423,7 @@ void vpx_v_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
(void)left; (void)left;
d0u8 = vld1_u8(above); d0u8 = vld1_u8(above);
for (i = 0; i < 8; i++, dst += stride) for (i = 0; i < 8; i++, dst += stride) vst1_u8(dst, d0u8);
vst1_u8(dst, d0u8);
} }
void vpx_v_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, void vpx_v_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
@ -436,8 +433,7 @@ void vpx_v_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
(void)left; (void)left;
q0u8 = vld1q_u8(above); q0u8 = vld1q_u8(above);
for (i = 0; i < 16; i++, dst += stride) for (i = 0; i < 16; i++, dst += stride) vst1q_u8(dst, q0u8);
vst1q_u8(dst, q0u8);
} }
void vpx_v_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, void vpx_v_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
@ -608,8 +604,8 @@ void vpx_tm_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
q3u16 = vsubl_u8(vreinterpret_u8_u32(d2u32), d0u8); q3u16 = vsubl_u8(vreinterpret_u8_u32(d2u32), d0u8);
for (i = 0; i < 4; i++, dst += stride) { for (i = 0; i < 4; i++, dst += stride) {
q1u16 = vdupq_n_u16((uint16_t)left[i]); q1u16 = vdupq_n_u16((uint16_t)left[i]);
q1s16 = vaddq_s16(vreinterpretq_s16_u16(q1u16), q1s16 =
vreinterpretq_s16_u16(q3u16)); vaddq_s16(vreinterpretq_s16_u16(q1u16), vreinterpretq_s16_u16(q3u16));
d0u8 = vqmovun_s16(q1s16); d0u8 = vqmovun_s16(q1s16);
vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0); vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
} }
@ -631,26 +627,26 @@ void vpx_tm_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
d20u16 = vget_low_u16(q10u16); d20u16 = vget_low_u16(q10u16);
for (j = 0; j < 2; j++, d20u16 = vget_high_u16(q10u16)) { for (j = 0; j < 2; j++, d20u16 = vget_high_u16(q10u16)) {
q0u16 = vdupq_lane_u16(d20u16, 0); q0u16 = vdupq_lane_u16(d20u16, 0);
q0s16 = vaddq_s16(vreinterpretq_s16_u16(q3u16), q0s16 =
vreinterpretq_s16_u16(q0u16)); vaddq_s16(vreinterpretq_s16_u16(q3u16), vreinterpretq_s16_u16(q0u16));
d0u8 = vqmovun_s16(q0s16); d0u8 = vqmovun_s16(q0s16);
vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8)); vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8));
dst += stride; dst += stride;
q0u16 = vdupq_lane_u16(d20u16, 1); q0u16 = vdupq_lane_u16(d20u16, 1);
q0s16 = vaddq_s16(vreinterpretq_s16_u16(q3u16), q0s16 =
vreinterpretq_s16_u16(q0u16)); vaddq_s16(vreinterpretq_s16_u16(q3u16), vreinterpretq_s16_u16(q0u16));
d0u8 = vqmovun_s16(q0s16); d0u8 = vqmovun_s16(q0s16);
vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8)); vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8));
dst += stride; dst += stride;
q0u16 = vdupq_lane_u16(d20u16, 2); q0u16 = vdupq_lane_u16(d20u16, 2);
q0s16 = vaddq_s16(vreinterpretq_s16_u16(q3u16), q0s16 =
vreinterpretq_s16_u16(q0u16)); vaddq_s16(vreinterpretq_s16_u16(q3u16), vreinterpretq_s16_u16(q0u16));
d0u8 = vqmovun_s16(q0s16); d0u8 = vqmovun_s16(q0s16);
vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8)); vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8));
dst += stride; dst += stride;
q0u16 = vdupq_lane_u16(d20u16, 3); q0u16 = vdupq_lane_u16(d20u16, 3);
q0s16 = vaddq_s16(vreinterpretq_s16_u16(q3u16), q0s16 =
vreinterpretq_s16_u16(q0u16)); vaddq_s16(vreinterpretq_s16_u16(q3u16), vreinterpretq_s16_u16(q0u16));
d0u8 = vqmovun_s16(q0s16); d0u8 = vqmovun_s16(q0s16);
vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8)); vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8));
dst += stride; dst += stride;
@ -677,14 +673,14 @@ void vpx_tm_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
for (j = 0; j < 2; j++, d20u16 = vget_high_u16(q10u16)) { for (j = 0; j < 2; j++, d20u16 = vget_high_u16(q10u16)) {
q0u16 = vdupq_lane_u16(d20u16, 0); q0u16 = vdupq_lane_u16(d20u16, 0);
q8u16 = vdupq_lane_u16(d20u16, 1); q8u16 = vdupq_lane_u16(d20u16, 1);
q1s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), q1s16 =
vreinterpretq_s16_u16(q2u16)); vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q2u16));
q0s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), q0s16 =
vreinterpretq_s16_u16(q3u16)); vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q3u16));
q11s16 = vaddq_s16(vreinterpretq_s16_u16(q8u16), q11s16 =
vreinterpretq_s16_u16(q2u16)); vaddq_s16(vreinterpretq_s16_u16(q8u16), vreinterpretq_s16_u16(q2u16));
q8s16 = vaddq_s16(vreinterpretq_s16_u16(q8u16), q8s16 =
vreinterpretq_s16_u16(q3u16)); vaddq_s16(vreinterpretq_s16_u16(q8u16), vreinterpretq_s16_u16(q3u16));
d2u8 = vqmovun_s16(q1s16); d2u8 = vqmovun_s16(q1s16);
d3u8 = vqmovun_s16(q0s16); d3u8 = vqmovun_s16(q0s16);
d22u8 = vqmovun_s16(q11s16); d22u8 = vqmovun_s16(q11s16);
@ -698,14 +694,14 @@ void vpx_tm_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
q0u16 = vdupq_lane_u16(d20u16, 2); q0u16 = vdupq_lane_u16(d20u16, 2);
q8u16 = vdupq_lane_u16(d20u16, 3); q8u16 = vdupq_lane_u16(d20u16, 3);
q1s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), q1s16 =
vreinterpretq_s16_u16(q2u16)); vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q2u16));
q0s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), q0s16 =
vreinterpretq_s16_u16(q3u16)); vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q3u16));
q11s16 = vaddq_s16(vreinterpretq_s16_u16(q8u16), q11s16 =
vreinterpretq_s16_u16(q2u16)); vaddq_s16(vreinterpretq_s16_u16(q8u16), vreinterpretq_s16_u16(q2u16));
q8s16 = vaddq_s16(vreinterpretq_s16_u16(q8u16), q8s16 =
vreinterpretq_s16_u16(q3u16)); vaddq_s16(vreinterpretq_s16_u16(q8u16), vreinterpretq_s16_u16(q3u16));
d2u8 = vqmovun_s16(q1s16); d2u8 = vqmovun_s16(q1s16);
d3u8 = vqmovun_s16(q0s16); d3u8 = vqmovun_s16(q0s16);
d22u8 = vqmovun_s16(q11s16); d22u8 = vqmovun_s16(q11s16);
@ -742,10 +738,10 @@ void vpx_tm_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
d6u16 = vget_low_u16(q3u16); d6u16 = vget_low_u16(q3u16);
for (j = 0; j < 2; j++, d6u16 = vget_high_u16(q3u16)) { for (j = 0; j < 2; j++, d6u16 = vget_high_u16(q3u16)) {
q0u16 = vdupq_lane_u16(d6u16, 0); q0u16 = vdupq_lane_u16(d6u16, 0);
q12s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), q12s16 =
vreinterpretq_s16_u16(q8u16)); vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q8u16));
q13s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), q13s16 =
vreinterpretq_s16_u16(q9u16)); vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q9u16));
q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
vreinterpretq_s16_u16(q10u16)); vreinterpretq_s16_u16(q10u16));
q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
@ -761,10 +757,10 @@ void vpx_tm_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
dst += stride; dst += stride;
q0u16 = vdupq_lane_u16(d6u16, 1); q0u16 = vdupq_lane_u16(d6u16, 1);
q12s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), q12s16 =
vreinterpretq_s16_u16(q8u16)); vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q8u16));
q13s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), q13s16 =
vreinterpretq_s16_u16(q9u16)); vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q9u16));
q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
vreinterpretq_s16_u16(q10u16)); vreinterpretq_s16_u16(q10u16));
q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
@ -780,10 +776,10 @@ void vpx_tm_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
dst += stride; dst += stride;
q0u16 = vdupq_lane_u16(d6u16, 2); q0u16 = vdupq_lane_u16(d6u16, 2);
q12s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), q12s16 =
vreinterpretq_s16_u16(q8u16)); vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q8u16));
q13s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), q13s16 =
vreinterpretq_s16_u16(q9u16)); vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q9u16));
q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
vreinterpretq_s16_u16(q10u16)); vreinterpretq_s16_u16(q10u16));
q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
@ -799,10 +795,10 @@ void vpx_tm_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
dst += stride; dst += stride;
q0u16 = vdupq_lane_u16(d6u16, 3); q0u16 = vdupq_lane_u16(d6u16, 3);
q12s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), q12s16 =
vreinterpretq_s16_u16(q8u16)); vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q8u16));
q13s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), q13s16 =
vreinterpretq_s16_u16(q9u16)); vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q9u16));
q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
vreinterpretq_s16_u16(q10u16)); vreinterpretq_s16_u16(q10u16));
q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),

View File

@ -14,8 +14,7 @@
#include "./vpx_config.h" #include "./vpx_config.h"
#include "vpx/vpx_integer.h" #include "vpx/vpx_integer.h"
static INLINE void loop_filter_neon_16( static INLINE void loop_filter_neon_16(uint8x16_t qblimit, // blimit
uint8x16_t qblimit, // blimit
uint8x16_t qlimit, // limit uint8x16_t qlimit, // limit
uint8x16_t qthresh, // thresh uint8x16_t qthresh, // thresh
uint8x16_t q3, // p3 uint8x16_t q3, // p3
@ -78,8 +77,7 @@ static INLINE void loop_filter_neon_16(
q9 = vcgeq_u8(qblimit, q9); q9 = vcgeq_u8(qblimit, q9);
q1s8 = vqsubq_s8(vreinterpretq_s8_u8(q5), q1s8 = vqsubq_s8(vreinterpretq_s8_u8(q5), vreinterpretq_s8_u8(q8));
vreinterpretq_s8_u8(q8));
q14u8 = vorrq_u8(q13u8, q14u8); q14u8 = vorrq_u8(q13u8, q14u8);
@ -124,13 +122,10 @@ static INLINE void loop_filter_neon_16(
return; return;
} }
void vpx_lpf_horizontal_4_dual_neon(uint8_t *s, int p /* pitch */, void vpx_lpf_horizontal_4_dual_neon(
const uint8_t *blimit0, uint8_t *s, int p /* pitch */, const uint8_t *blimit0,
const uint8_t *limit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1,
const uint8_t *thresh0, const uint8_t *limit1, const uint8_t *thresh1) {
const uint8_t *blimit1,
const uint8_t *limit1,
const uint8_t *thresh1) {
uint8x8_t dblimit0, dlimit0, dthresh0, dblimit1, dlimit1, dthresh1; uint8x8_t dblimit0, dlimit0, dthresh0, dblimit1, dlimit1, dthresh1;
uint8x16_t qblimit, qlimit, qthresh; uint8x16_t qblimit, qlimit, qthresh;
uint8x16_t q3u8, q4u8, q5u8, q6u8, q7u8, q8u8, q9u8, q10u8; uint8x16_t q3u8, q4u8, q5u8, q6u8, q7u8, q8u8, q9u8, q10u8;
@ -163,9 +158,8 @@ void vpx_lpf_horizontal_4_dual_neon(uint8_t *s, int p /* pitch */,
s += p; s += p;
q10u8 = vld1q_u8(s); q10u8 = vld1q_u8(s);
loop_filter_neon_16(qblimit, qlimit, qthresh, loop_filter_neon_16(qblimit, qlimit, qthresh, q3u8, q4u8, q5u8, q6u8, q7u8,
q3u8, q4u8, q5u8, q6u8, q7u8, q8u8, q9u8, q10u8, q8u8, q9u8, q10u8, &q5u8, &q6u8, &q7u8, &q8u8);
&q5u8, &q6u8, &q7u8, &q8u8);
s -= (p * 5); s -= (p * 5);
vst1q_u8(s, q5u8); vst1q_u8(s, q5u8);

View File

@ -12,8 +12,7 @@
#include "./vpx_dsp_rtcd.h" #include "./vpx_dsp_rtcd.h"
static INLINE void loop_filter_neon( static INLINE void loop_filter_neon(uint8x8_t dblimit, // flimit
uint8x8_t dblimit, // flimit
uint8x8_t dlimit, // limit uint8x8_t dlimit, // limit
uint8x8_t dthresh, // thresh uint8x8_t dthresh, // thresh
uint8x8_t d3u8, // p3 uint8x8_t d3u8, // p3
@ -66,13 +65,11 @@ static INLINE void loop_filter_neon(
d19u8 = vdup_n_u8(3); d19u8 = vdup_n_u8(3);
d28s8 = vsub_s8(vreinterpret_s8_u8(d7u8), d28s8 = vsub_s8(vreinterpret_s8_u8(d7u8), vreinterpret_s8_u8(d6u8));
vreinterpret_s8_u8(d6u8));
d17u8 = vcge_u8(dblimit, d17u8); d17u8 = vcge_u8(dblimit, d17u8);
d27s8 = vqsub_s8(vreinterpret_s8_u8(d5u8), d27s8 = vqsub_s8(vreinterpret_s8_u8(d5u8), vreinterpret_s8_u8(d16u8));
vreinterpret_s8_u8(d16u8));
d22u8 = vorr_u8(d21u8, d22u8); d22u8 = vorr_u8(d21u8, d22u8);
@ -110,12 +107,8 @@ static INLINE void loop_filter_neon(
return; return;
} }
void vpx_lpf_horizontal_4_neon( void vpx_lpf_horizontal_4_neon(uint8_t *src, int pitch, const uint8_t *blimit,
uint8_t *src, const uint8_t *limit, const uint8_t *thresh) {
int pitch,
const uint8_t *blimit,
const uint8_t *limit,
const uint8_t *thresh) {
int i; int i;
uint8_t *s, *psrc; uint8_t *s, *psrc;
uint8x8_t dblimit, dlimit, dthresh; uint8x8_t dblimit, dlimit, dthresh;
@ -145,9 +138,8 @@ void vpx_lpf_horizontal_4_neon(
s += pitch; s += pitch;
d18u8 = vld1_u8(s); d18u8 = vld1_u8(s);
loop_filter_neon(dblimit, dlimit, dthresh, loop_filter_neon(dblimit, dlimit, dthresh, d3u8, d4u8, d5u8, d6u8, d7u8,
d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8, d16u8, d17u8, d18u8, &d4u8, &d5u8, &d6u8, &d7u8);
&d4u8, &d5u8, &d6u8, &d7u8);
s -= (pitch * 5); s -= (pitch * 5);
vst1_u8(s, d4u8); vst1_u8(s, d4u8);
@ -161,12 +153,8 @@ void vpx_lpf_horizontal_4_neon(
return; return;
} }
void vpx_lpf_vertical_4_neon( void vpx_lpf_vertical_4_neon(uint8_t *src, int pitch, const uint8_t *blimit,
uint8_t *src, const uint8_t *limit, const uint8_t *thresh) {
int pitch,
const uint8_t *blimit,
const uint8_t *limit,
const uint8_t *thresh) {
int i, pitch8; int i, pitch8;
uint8_t *s; uint8_t *s;
uint8x8_t dblimit, dlimit, dthresh; uint8x8_t dblimit, dlimit, dthresh;
@ -200,14 +188,10 @@ void vpx_lpf_vertical_4_neon(
s += pitch; s += pitch;
d18u8 = vld1_u8(s); d18u8 = vld1_u8(s);
d2tmp0 = vtrn_u32(vreinterpret_u32_u8(d3u8), d2tmp0 = vtrn_u32(vreinterpret_u32_u8(d3u8), vreinterpret_u32_u8(d7u8));
vreinterpret_u32_u8(d7u8)); d2tmp1 = vtrn_u32(vreinterpret_u32_u8(d4u8), vreinterpret_u32_u8(d16u8));
d2tmp1 = vtrn_u32(vreinterpret_u32_u8(d4u8), d2tmp2 = vtrn_u32(vreinterpret_u32_u8(d5u8), vreinterpret_u32_u8(d17u8));
vreinterpret_u32_u8(d16u8)); d2tmp3 = vtrn_u32(vreinterpret_u32_u8(d6u8), vreinterpret_u32_u8(d18u8));
d2tmp2 = vtrn_u32(vreinterpret_u32_u8(d5u8),
vreinterpret_u32_u8(d17u8));
d2tmp3 = vtrn_u32(vreinterpret_u32_u8(d6u8),
vreinterpret_u32_u8(d18u8));
d2tmp4 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[0]), d2tmp4 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[0]),
vreinterpret_u16_u32(d2tmp2.val[0])); vreinterpret_u16_u32(d2tmp2.val[0]));
@ -236,9 +220,8 @@ void vpx_lpf_vertical_4_neon(
d17u8 = d2tmp11.val[0]; d17u8 = d2tmp11.val[0];
d18u8 = d2tmp11.val[1]; d18u8 = d2tmp11.val[1];
loop_filter_neon(dblimit, dlimit, dthresh, loop_filter_neon(dblimit, dlimit, dthresh, d3u8, d4u8, d5u8, d6u8, d7u8,
d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8, d16u8, d17u8, d18u8, &d4u8, &d5u8, &d6u8, &d7u8);
&d4u8, &d5u8, &d6u8, &d7u8);
d4Result.val[0] = d4u8; d4Result.val[0] = d4u8;
d4Result.val[1] = d5u8; d4Result.val[1] = d5u8;

View File

@ -12,8 +12,7 @@
#include "./vpx_dsp_rtcd.h" #include "./vpx_dsp_rtcd.h"
static INLINE void mbloop_filter_neon( static INLINE void mbloop_filter_neon(uint8x8_t dblimit, // mblimit
uint8x8_t dblimit, // mblimit
uint8x8_t dlimit, // limit uint8x8_t dlimit, // limit
uint8x8_t dthresh, // thresh uint8x8_t dthresh, // thresh
uint8x8_t d3u8, // p2 uint8x8_t d3u8, // p2
@ -64,10 +63,8 @@ static INLINE void mbloop_filter_neon(
d23u8 = vabd_u8(d5u8, d16u8); d23u8 = vabd_u8(d5u8, d16u8);
d24u8 = vqadd_u8(d24u8, d24u8); d24u8 = vqadd_u8(d24u8, d24u8);
d19u8 = vcge_u8(dlimit, d19u8); d19u8 = vcge_u8(dlimit, d19u8);
d25u8 = vmax_u8(d25u8, d26u8); d25u8 = vmax_u8(d25u8, d26u8);
d26u8 = vmax_u8(d27u8, d28u8); d26u8 = vmax_u8(d27u8, d28u8);
@ -96,8 +93,7 @@ static INLINE void mbloop_filter_neon(
d23u8 = vorr_u8(d21u8, d23u8); d23u8 = vorr_u8(d21u8, d23u8);
q10u16 = vcombine_u16(vreinterpret_u16_u8(d20u8), q10u16 = vcombine_u16(vreinterpret_u16_u8(d20u8), vreinterpret_u16_u8(d21u8));
vreinterpret_u16_u8(d21u8));
d30u8 = vshrn_n_u16(q10u16, 4); d30u8 = vshrn_n_u16(q10u16, 4);
flat = vget_lane_u32(vreinterpret_u32_u8(d30u8), 0); flat = vget_lane_u32(vreinterpret_u32_u8(d30u8), 0);
@ -263,12 +259,8 @@ static INLINE void mbloop_filter_neon(
return; return;
} }
void vpx_lpf_horizontal_8_neon( void vpx_lpf_horizontal_8_neon(uint8_t *src, int pitch, const uint8_t *blimit,
uint8_t *src, const uint8_t *limit, const uint8_t *thresh) {
int pitch,
const uint8_t *blimit,
const uint8_t *limit,
const uint8_t *thresh) {
int i; int i;
uint8_t *s, *psrc; uint8_t *s, *psrc;
uint8x8_t dblimit, dlimit, dthresh; uint8x8_t dblimit, dlimit, dthresh;
@ -299,9 +291,9 @@ void vpx_lpf_horizontal_8_neon(
s += pitch; s += pitch;
d18u8 = vld1_u8(s); d18u8 = vld1_u8(s);
mbloop_filter_neon(dblimit, dlimit, dthresh, mbloop_filter_neon(dblimit, dlimit, dthresh, d3u8, d4u8, d5u8, d6u8, d7u8,
d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8, d16u8, d17u8, d18u8, &d0u8, &d1u8, &d2u8, &d3u8, &d4u8,
&d0u8, &d1u8, &d2u8, &d3u8, &d4u8, &d5u8); &d5u8);
s -= (pitch * 6); s -= (pitch * 6);
vst1_u8(s, d0u8); vst1_u8(s, d0u8);
@ -319,12 +311,8 @@ void vpx_lpf_horizontal_8_neon(
return; return;
} }
void vpx_lpf_vertical_8_neon( void vpx_lpf_vertical_8_neon(uint8_t *src, int pitch, const uint8_t *blimit,
uint8_t *src, const uint8_t *limit, const uint8_t *thresh) {
int pitch,
const uint8_t *blimit,
const uint8_t *limit,
const uint8_t *thresh) {
int i; int i;
uint8_t *s; uint8_t *s;
uint8x8_t dblimit, dlimit, dthresh; uint8x8_t dblimit, dlimit, dthresh;
@ -359,14 +347,10 @@ void vpx_lpf_vertical_8_neon(
s += pitch; s += pitch;
d18u8 = vld1_u8(s); d18u8 = vld1_u8(s);
d2tmp0 = vtrn_u32(vreinterpret_u32_u8(d3u8), d2tmp0 = vtrn_u32(vreinterpret_u32_u8(d3u8), vreinterpret_u32_u8(d7u8));
vreinterpret_u32_u8(d7u8)); d2tmp1 = vtrn_u32(vreinterpret_u32_u8(d4u8), vreinterpret_u32_u8(d16u8));
d2tmp1 = vtrn_u32(vreinterpret_u32_u8(d4u8), d2tmp2 = vtrn_u32(vreinterpret_u32_u8(d5u8), vreinterpret_u32_u8(d17u8));
vreinterpret_u32_u8(d16u8)); d2tmp3 = vtrn_u32(vreinterpret_u32_u8(d6u8), vreinterpret_u32_u8(d18u8));
d2tmp2 = vtrn_u32(vreinterpret_u32_u8(d5u8),
vreinterpret_u32_u8(d17u8));
d2tmp3 = vtrn_u32(vreinterpret_u32_u8(d6u8),
vreinterpret_u32_u8(d18u8));
d2tmp4 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[0]), d2tmp4 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[0]),
vreinterpret_u16_u32(d2tmp2.val[0])); vreinterpret_u16_u32(d2tmp2.val[0]));
@ -395,9 +379,9 @@ void vpx_lpf_vertical_8_neon(
d17u8 = d2tmp11.val[0]; d17u8 = d2tmp11.val[0];
d18u8 = d2tmp11.val[1]; d18u8 = d2tmp11.val[1];
mbloop_filter_neon(dblimit, dlimit, dthresh, mbloop_filter_neon(dblimit, dlimit, dthresh, d3u8, d4u8, d5u8, d6u8, d7u8,
d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8, d16u8, d17u8, d18u8, &d0u8, &d1u8, &d2u8, &d3u8, &d4u8,
&d0u8, &d1u8, &d2u8, &d3u8, &d4u8, &d5u8); &d5u8);
d4Result.val[0] = d0u8; d4Result.val[0] = d0u8;
d4Result.val[1] = d1u8; d4Result.val[1] = d1u8;

View File

@ -14,42 +14,32 @@
#include "./vpx_config.h" #include "./vpx_config.h"
#include "vpx/vpx_integer.h" #include "vpx/vpx_integer.h"
void vpx_lpf_vertical_4_dual_neon(uint8_t *s, int p, void vpx_lpf_vertical_4_dual_neon(uint8_t *s, int p, const uint8_t *blimit0,
const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0,
const uint8_t *limit0, const uint8_t *blimit1, const uint8_t *limit1,
const uint8_t *thresh0,
const uint8_t *blimit1,
const uint8_t *limit1,
const uint8_t *thresh1) { const uint8_t *thresh1) {
vpx_lpf_vertical_4_neon(s, p, blimit0, limit0, thresh0); vpx_lpf_vertical_4_neon(s, p, blimit0, limit0, thresh0);
vpx_lpf_vertical_4_neon(s + 8 * p, p, blimit1, limit1, thresh1); vpx_lpf_vertical_4_neon(s + 8 * p, p, blimit1, limit1, thresh1);
} }
#if HAVE_NEON_ASM #if HAVE_NEON_ASM
void vpx_lpf_horizontal_8_dual_neon(uint8_t *s, int p /* pitch */, void vpx_lpf_horizontal_8_dual_neon(
const uint8_t *blimit0, uint8_t *s, int p /* pitch */, const uint8_t *blimit0,
const uint8_t *limit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1,
const uint8_t *thresh0, const uint8_t *limit1, const uint8_t *thresh1) {
const uint8_t *blimit1,
const uint8_t *limit1,
const uint8_t *thresh1) {
vpx_lpf_horizontal_8_neon(s, p, blimit0, limit0, thresh0); vpx_lpf_horizontal_8_neon(s, p, blimit0, limit0, thresh0);
vpx_lpf_horizontal_8_neon(s + 8, p, blimit1, limit1, thresh1); vpx_lpf_horizontal_8_neon(s + 8, p, blimit1, limit1, thresh1);
} }
void vpx_lpf_vertical_8_dual_neon(uint8_t *s, int p, void vpx_lpf_vertical_8_dual_neon(uint8_t *s, int p, const uint8_t *blimit0,
const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0,
const uint8_t *limit0, const uint8_t *blimit1, const uint8_t *limit1,
const uint8_t *thresh0,
const uint8_t *blimit1,
const uint8_t *limit1,
const uint8_t *thresh1) { const uint8_t *thresh1) {
vpx_lpf_vertical_8_neon(s, p, blimit0, limit0, thresh0); vpx_lpf_vertical_8_neon(s, p, blimit0, limit0, thresh0);
vpx_lpf_vertical_8_neon(s + 8 * p, p, blimit1, limit1, thresh1); vpx_lpf_vertical_8_neon(s + 8 * p, p, blimit1, limit1, thresh1);
} }
void vpx_lpf_vertical_16_dual_neon(uint8_t *s, int p, void vpx_lpf_vertical_16_dual_neon(uint8_t *s, int p, const uint8_t *blimit,
const uint8_t *blimit,
const uint8_t *limit, const uint8_t *limit,
const uint8_t *thresh) { const uint8_t *thresh) {
vpx_lpf_vertical_16_neon(s, p, blimit, limit, thresh); vpx_lpf_vertical_16_neon(s, p, blimit, limit, thresh);

View File

@ -16,10 +16,10 @@
static INLINE unsigned int horizontal_long_add_16x8(const uint16x8_t vec_lo, static INLINE unsigned int horizontal_long_add_16x8(const uint16x8_t vec_lo,
const uint16x8_t vec_hi) { const uint16x8_t vec_hi) {
const uint32x4_t vec_l_lo = vaddl_u16(vget_low_u16(vec_lo), const uint32x4_t vec_l_lo =
vget_high_u16(vec_lo)); vaddl_u16(vget_low_u16(vec_lo), vget_high_u16(vec_lo));
const uint32x4_t vec_l_hi = vaddl_u16(vget_low_u16(vec_hi), const uint32x4_t vec_l_hi =
vget_high_u16(vec_hi)); vaddl_u16(vget_low_u16(vec_hi), vget_high_u16(vec_hi));
const uint32x4_t a = vaddq_u32(vec_l_lo, vec_l_hi); const uint32x4_t a = vaddq_u32(vec_l_lo, vec_l_hi);
const uint64x2_t b = vpaddlq_u32(a); const uint64x2_t b = vpaddlq_u32(a);
const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)), const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)),
@ -33,8 +33,7 @@ static INLINE unsigned int horizontal_long_add_16x8(const uint16x8_t vec_lo,
static void sad_neon_64(const uint8x16_t vec_src_00, static void sad_neon_64(const uint8x16_t vec_src_00,
const uint8x16_t vec_src_16, const uint8x16_t vec_src_16,
const uint8x16_t vec_src_32, const uint8x16_t vec_src_32,
const uint8x16_t vec_src_48, const uint8x16_t vec_src_48, const uint8_t *ref,
const uint8_t *ref,
uint16x8_t *vec_sum_ref_lo, uint16x8_t *vec_sum_ref_lo,
uint16x8_t *vec_sum_ref_hi) { uint16x8_t *vec_sum_ref_hi) {
const uint8x16_t vec_ref_00 = vld1q_u8(ref); const uint8x16_t vec_ref_00 = vld1q_u8(ref);
@ -63,8 +62,7 @@ static void sad_neon_64(const uint8x16_t vec_src_00,
// Calculate the absolute difference of 32 bytes from vec_src_00, vec_src_16, // Calculate the absolute difference of 32 bytes from vec_src_00, vec_src_16,
// and ref. Accumulate partial sums in vec_sum_ref_lo and vec_sum_ref_hi. // and ref. Accumulate partial sums in vec_sum_ref_lo and vec_sum_ref_hi.
static void sad_neon_32(const uint8x16_t vec_src_00, static void sad_neon_32(const uint8x16_t vec_src_00,
const uint8x16_t vec_src_16, const uint8x16_t vec_src_16, const uint8_t *ref,
const uint8_t *ref,
uint16x8_t *vec_sum_ref_lo, uint16x8_t *vec_sum_ref_lo,
uint16x8_t *vec_sum_ref_hi) { uint16x8_t *vec_sum_ref_hi) {
const uint8x16_t vec_ref_00 = vld1q_u8(ref); const uint8x16_t vec_ref_00 = vld1q_u8(ref);
@ -81,7 +79,7 @@ static void sad_neon_32(const uint8x16_t vec_src_00,
} }
void vpx_sad64x64x4d_neon(const uint8_t *src, int src_stride, void vpx_sad64x64x4d_neon(const uint8_t *src, int src_stride,
const uint8_t* const ref[4], int ref_stride, const uint8_t *const ref[4], int ref_stride,
uint32_t *res) { uint32_t *res) {
int i; int i;
uint16x8_t vec_sum_ref0_lo = vdupq_n_u16(0); uint16x8_t vec_sum_ref0_lo = vdupq_n_u16(0);
@ -127,7 +125,7 @@ void vpx_sad64x64x4d_neon(const uint8_t *src, int src_stride,
} }
void vpx_sad32x32x4d_neon(const uint8_t *src, int src_stride, void vpx_sad32x32x4d_neon(const uint8_t *src, int src_stride,
const uint8_t* const ref[4], int ref_stride, const uint8_t *const ref[4], int ref_stride,
uint32_t *res) { uint32_t *res) {
int i; int i;
uint16x8_t vec_sum_ref0_lo = vdupq_n_u16(0); uint16x8_t vec_sum_ref0_lo = vdupq_n_u16(0);
@ -148,14 +146,14 @@ void vpx_sad32x32x4d_neon(const uint8_t *src, int src_stride,
const uint8x16_t vec_src_00 = vld1q_u8(src); const uint8x16_t vec_src_00 = vld1q_u8(src);
const uint8x16_t vec_src_16 = vld1q_u8(src + 16); const uint8x16_t vec_src_16 = vld1q_u8(src + 16);
sad_neon_32(vec_src_00, vec_src_16, ref0, sad_neon_32(vec_src_00, vec_src_16, ref0, &vec_sum_ref0_lo,
&vec_sum_ref0_lo, &vec_sum_ref0_hi); &vec_sum_ref0_hi);
sad_neon_32(vec_src_00, vec_src_16, ref1, sad_neon_32(vec_src_00, vec_src_16, ref1, &vec_sum_ref1_lo,
&vec_sum_ref1_lo, &vec_sum_ref1_hi); &vec_sum_ref1_hi);
sad_neon_32(vec_src_00, vec_src_16, ref2, sad_neon_32(vec_src_00, vec_src_16, ref2, &vec_sum_ref2_lo,
&vec_sum_ref2_lo, &vec_sum_ref2_hi); &vec_sum_ref2_hi);
sad_neon_32(vec_src_00, vec_src_16, ref3, sad_neon_32(vec_src_00, vec_src_16, ref3, &vec_sum_ref3_lo,
&vec_sum_ref3_lo, &vec_sum_ref3_hi); &vec_sum_ref3_hi);
src += src_stride; src += src_stride;
ref0 += ref_stride; ref0 += ref_stride;
@ -171,7 +169,7 @@ void vpx_sad32x32x4d_neon(const uint8_t *src, int src_stride,
} }
void vpx_sad16x16x4d_neon(const uint8_t *src, int src_stride, void vpx_sad16x16x4d_neon(const uint8_t *src, int src_stride,
const uint8_t* const ref[4], int ref_stride, const uint8_t *const ref[4], int ref_stride,
uint32_t *res) { uint32_t *res) {
int i; int i;
uint16x8_t vec_sum_ref0_lo = vdupq_n_u16(0); uint16x8_t vec_sum_ref0_lo = vdupq_n_u16(0);
@ -195,20 +193,20 @@ void vpx_sad16x16x4d_neon(const uint8_t *src, int src_stride,
const uint8x16_t vec_ref2 = vld1q_u8(ref2); const uint8x16_t vec_ref2 = vld1q_u8(ref2);
const uint8x16_t vec_ref3 = vld1q_u8(ref3); const uint8x16_t vec_ref3 = vld1q_u8(ref3);
vec_sum_ref0_lo = vabal_u8(vec_sum_ref0_lo, vget_low_u8(vec_src), vec_sum_ref0_lo =
vget_low_u8(vec_ref0)); vabal_u8(vec_sum_ref0_lo, vget_low_u8(vec_src), vget_low_u8(vec_ref0));
vec_sum_ref0_hi = vabal_u8(vec_sum_ref0_hi, vget_high_u8(vec_src), vec_sum_ref0_hi = vabal_u8(vec_sum_ref0_hi, vget_high_u8(vec_src),
vget_high_u8(vec_ref0)); vget_high_u8(vec_ref0));
vec_sum_ref1_lo = vabal_u8(vec_sum_ref1_lo, vget_low_u8(vec_src), vec_sum_ref1_lo =
vget_low_u8(vec_ref1)); vabal_u8(vec_sum_ref1_lo, vget_low_u8(vec_src), vget_low_u8(vec_ref1));
vec_sum_ref1_hi = vabal_u8(vec_sum_ref1_hi, vget_high_u8(vec_src), vec_sum_ref1_hi = vabal_u8(vec_sum_ref1_hi, vget_high_u8(vec_src),
vget_high_u8(vec_ref1)); vget_high_u8(vec_ref1));
vec_sum_ref2_lo = vabal_u8(vec_sum_ref2_lo, vget_low_u8(vec_src), vec_sum_ref2_lo =
vget_low_u8(vec_ref2)); vabal_u8(vec_sum_ref2_lo, vget_low_u8(vec_src), vget_low_u8(vec_ref2));
vec_sum_ref2_hi = vabal_u8(vec_sum_ref2_hi, vget_high_u8(vec_src), vec_sum_ref2_hi = vabal_u8(vec_sum_ref2_hi, vget_high_u8(vec_src),
vget_high_u8(vec_ref2)); vget_high_u8(vec_ref2));
vec_sum_ref3_lo = vabal_u8(vec_sum_ref3_lo, vget_low_u8(vec_src), vec_sum_ref3_lo =
vget_low_u8(vec_ref3)); vabal_u8(vec_sum_ref3_lo, vget_low_u8(vec_src), vget_low_u8(vec_ref3));
vec_sum_ref3_hi = vabal_u8(vec_sum_ref3_hi, vget_high_u8(vec_src), vec_sum_ref3_hi = vabal_u8(vec_sum_ref3_hi, vget_high_u8(vec_src),
vget_high_u8(vec_ref3)); vget_high_u8(vec_ref3));

View File

@ -14,11 +14,8 @@
#include "vpx/vpx_integer.h" #include "vpx/vpx_integer.h"
unsigned int vpx_sad8x16_neon( unsigned int vpx_sad8x16_neon(unsigned char *src_ptr, int src_stride,
unsigned char *src_ptr, unsigned char *ref_ptr, int ref_stride) {
int src_stride,
unsigned char *ref_ptr,
int ref_stride) {
uint8x8_t d0, d8; uint8x8_t d0, d8;
uint16x8_t q12; uint16x8_t q12;
uint32x4_t q1; uint32x4_t q1;
@ -48,11 +45,8 @@ unsigned int vpx_sad8x16_neon(
return vget_lane_u32(d5, 0); return vget_lane_u32(d5, 0);
} }
unsigned int vpx_sad4x4_neon( unsigned int vpx_sad4x4_neon(unsigned char *src_ptr, int src_stride,
unsigned char *src_ptr, unsigned char *ref_ptr, int ref_stride) {
int src_stride,
unsigned char *ref_ptr,
int ref_stride) {
uint8x8_t d0, d8; uint8x8_t d0, d8;
uint16x8_t q12; uint16x8_t q12;
uint32x2_t d1; uint32x2_t d1;
@ -79,11 +73,8 @@ unsigned int vpx_sad4x4_neon(
return vget_lane_u32(vreinterpret_u32_u64(d3), 0); return vget_lane_u32(vreinterpret_u32_u64(d3), 0);
} }
unsigned int vpx_sad16x8_neon( unsigned int vpx_sad16x8_neon(unsigned char *src_ptr, int src_stride,
unsigned char *src_ptr, unsigned char *ref_ptr, int ref_stride) {
int src_stride,
unsigned char *ref_ptr,
int ref_stride) {
uint8x16_t q0, q4; uint8x16_t q0, q4;
uint16x8_t q12, q13; uint16x8_t q12, q13;
uint32x4_t q1; uint32x4_t q1;
@ -118,10 +109,10 @@ unsigned int vpx_sad16x8_neon(
static INLINE unsigned int horizontal_long_add_16x8(const uint16x8_t vec_lo, static INLINE unsigned int horizontal_long_add_16x8(const uint16x8_t vec_lo,
const uint16x8_t vec_hi) { const uint16x8_t vec_hi) {
const uint32x4_t vec_l_lo = vaddl_u16(vget_low_u16(vec_lo), const uint32x4_t vec_l_lo =
vget_high_u16(vec_lo)); vaddl_u16(vget_low_u16(vec_lo), vget_high_u16(vec_lo));
const uint32x4_t vec_l_hi = vaddl_u16(vget_low_u16(vec_hi), const uint32x4_t vec_l_hi =
vget_high_u16(vec_hi)); vaddl_u16(vget_low_u16(vec_hi), vget_high_u16(vec_hi));
const uint32x4_t a = vaddq_u32(vec_l_lo, vec_l_hi); const uint32x4_t a = vaddq_u32(vec_l_lo, vec_l_hi);
const uint64x2_t b = vpaddlq_u32(a); const uint64x2_t b = vpaddlq_u32(a);
const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)), const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)),
@ -208,10 +199,10 @@ unsigned int vpx_sad16x16_neon(const uint8_t *src, int src_stride,
const uint8x16_t vec_ref = vld1q_u8(ref); const uint8x16_t vec_ref = vld1q_u8(ref);
src += src_stride; src += src_stride;
ref += ref_stride; ref += ref_stride;
vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src), vec_accum_lo =
vget_low_u8(vec_ref)); vabal_u8(vec_accum_lo, vget_low_u8(vec_src), vget_low_u8(vec_ref));
vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src), vec_accum_hi =
vget_high_u8(vec_ref)); vabal_u8(vec_accum_hi, vget_high_u8(vec_src), vget_high_u8(vec_ref));
} }
return horizontal_add_16x8(vaddq_u16(vec_accum_lo, vec_accum_hi)); return horizontal_add_16x8(vaddq_u16(vec_accum_lo, vec_accum_hi));
} }

View File

@ -14,91 +14,66 @@
#include "vpx_ports/mem.h" #include "vpx_ports/mem.h"
#if HAVE_MEDIA #if HAVE_MEDIA
static const int16_t bilinear_filters_media[8][2] = { static const int16_t bilinear_filters_media[8][2] = { { 128, 0 }, { 112, 16 },
{ 128, 0 }, { 96, 32 }, { 80, 48 },
{ 112, 16 }, { 64, 64 }, { 48, 80 },
{ 96, 32 }, { 32, 96 }, { 16, 112 } };
{ 80, 48 },
{ 64, 64 },
{ 48, 80 },
{ 32, 96 },
{ 16, 112 }
};
extern void vpx_filter_block2d_bil_first_pass_media(const uint8_t *src_ptr, extern void vpx_filter_block2d_bil_first_pass_media(
uint16_t *dst_ptr, const uint8_t *src_ptr, uint16_t *dst_ptr, uint32_t src_pitch,
uint32_t src_pitch, uint32_t height, uint32_t width, const int16_t *filter);
uint32_t height,
uint32_t width,
const int16_t *filter);
extern void vpx_filter_block2d_bil_second_pass_media(const uint16_t *src_ptr, extern void vpx_filter_block2d_bil_second_pass_media(
uint8_t *dst_ptr, const uint16_t *src_ptr, uint8_t *dst_ptr, int32_t src_pitch,
int32_t src_pitch, uint32_t height, uint32_t width, const int16_t *filter);
uint32_t height,
uint32_t width,
const int16_t *filter);
unsigned int vpx_sub_pixel_variance8x8_media(
unsigned int vpx_sub_pixel_variance8x8_media(const uint8_t *src_ptr, const uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset,
int src_pixels_per_line, const uint8_t *dst_ptr, int dst_pixels_per_line, unsigned int *sse) {
int xoffset, int yoffset, uint16_t first_pass[10 * 8];
const uint8_t *dst_ptr, uint8_t second_pass[8 * 8];
int dst_pixels_per_line,
unsigned int *sse) {
uint16_t first_pass[10*8];
uint8_t second_pass[8*8];
const int16_t *HFilter, *VFilter; const int16_t *HFilter, *VFilter;
HFilter = bilinear_filters_media[xoffset]; HFilter = bilinear_filters_media[xoffset];
VFilter = bilinear_filters_media[yoffset]; VFilter = bilinear_filters_media[yoffset];
vpx_filter_block2d_bil_first_pass_media(src_ptr, first_pass, vpx_filter_block2d_bil_first_pass_media(src_ptr, first_pass,
src_pixels_per_line, src_pixels_per_line, 9, 8, HFilter);
9, 8, HFilter); vpx_filter_block2d_bil_second_pass_media(first_pass, second_pass, 8, 8, 8,
vpx_filter_block2d_bil_second_pass_media(first_pass, second_pass, VFilter);
8, 8, 8, VFilter);
return vpx_variance8x8_media(second_pass, 8, dst_ptr, return vpx_variance8x8_media(second_pass, 8, dst_ptr, dst_pixels_per_line,
dst_pixels_per_line, sse); sse);
} }
unsigned int vpx_sub_pixel_variance16x16_media(const uint8_t *src_ptr, unsigned int vpx_sub_pixel_variance16x16_media(
int src_pixels_per_line, const uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset,
int xoffset, const uint8_t *dst_ptr, int dst_pixels_per_line, unsigned int *sse) {
int yoffset, uint16_t first_pass[36 * 16];
const uint8_t *dst_ptr, uint8_t second_pass[20 * 16];
int dst_pixels_per_line,
unsigned int *sse) {
uint16_t first_pass[36*16];
uint8_t second_pass[20*16];
const int16_t *HFilter, *VFilter; const int16_t *HFilter, *VFilter;
unsigned int var; unsigned int var;
if (xoffset == 4 && yoffset == 0) { if (xoffset == 4 && yoffset == 0) {
var = vpx_variance_halfpixvar16x16_h_media(src_ptr, src_pixels_per_line, var = vpx_variance_halfpixvar16x16_h_media(
dst_ptr, dst_pixels_per_line, src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, sse);
sse);
} else if (xoffset == 0 && yoffset == 4) { } else if (xoffset == 0 && yoffset == 4) {
var = vpx_variance_halfpixvar16x16_v_media(src_ptr, src_pixels_per_line, var = vpx_variance_halfpixvar16x16_v_media(
dst_ptr, dst_pixels_per_line, src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, sse);
sse);
} else if (xoffset == 4 && yoffset == 4) { } else if (xoffset == 4 && yoffset == 4) {
var = vpx_variance_halfpixvar16x16_hv_media(src_ptr, src_pixels_per_line, var = vpx_variance_halfpixvar16x16_hv_media(
dst_ptr, dst_pixels_per_line, src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, sse);
sse);
} else { } else {
HFilter = bilinear_filters_media[xoffset]; HFilter = bilinear_filters_media[xoffset];
VFilter = bilinear_filters_media[yoffset]; VFilter = bilinear_filters_media[yoffset];
vpx_filter_block2d_bil_first_pass_media(src_ptr, first_pass, vpx_filter_block2d_bil_first_pass_media(
src_pixels_per_line, src_ptr, first_pass, src_pixels_per_line, 17, 16, HFilter);
17, 16, HFilter); vpx_filter_block2d_bil_second_pass_media(first_pass, second_pass, 16, 16,
vpx_filter_block2d_bil_second_pass_media(first_pass, second_pass, 16, VFilter);
16, 16, 16, VFilter);
var = vpx_variance16x16_media(second_pass, 16, dst_ptr, var = vpx_variance16x16_media(second_pass, 16, dst_ptr, dst_pixels_per_line,
dst_pixels_per_line, sse); sse);
} }
return var; return var;
} }

View File

@ -18,14 +18,8 @@
#include "vpx_dsp/variance.h" #include "vpx_dsp/variance.h"
static const uint8_t bilinear_filters[8][2] = { static const uint8_t bilinear_filters[8][2] = {
{ 128, 0, }, { 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 },
{ 112, 16, }, { 64, 64 }, { 48, 80 }, { 32, 96 }, { 16, 112 },
{ 96, 32, },
{ 80, 48, },
{ 64, 64, },
{ 48, 80, },
{ 32, 96, },
{ 16, 112, },
}; };
static void var_filter_block2d_bil_w8(const uint8_t *src_ptr, static void var_filter_block2d_bil_w8(const uint8_t *src_ptr,
@ -79,74 +73,61 @@ static void var_filter_block2d_bil_w16(const uint8_t *src_ptr,
} }
} }
unsigned int vpx_sub_pixel_variance8x8_neon(const uint8_t *src, unsigned int vpx_sub_pixel_variance8x8_neon(const uint8_t *src, int src_stride,
int src_stride, int xoffset, int yoffset,
int xoffset, const uint8_t *dst, int dst_stride,
int yoffset,
const uint8_t *dst,
int dst_stride,
unsigned int *sse) { unsigned int *sse) {
DECLARE_ALIGNED(16, uint8_t, temp2[8 * 8]); DECLARE_ALIGNED(16, uint8_t, temp2[8 * 8]);
DECLARE_ALIGNED(16, uint8_t, fdata3[9 * 8]); DECLARE_ALIGNED(16, uint8_t, fdata3[9 * 8]);
var_filter_block2d_bil_w8(src, fdata3, src_stride, 1, var_filter_block2d_bil_w8(src, fdata3, src_stride, 1, 9, 8,
9, 8,
bilinear_filters[xoffset]); bilinear_filters[xoffset]);
var_filter_block2d_bil_w8(fdata3, temp2, 8, 8, 8, var_filter_block2d_bil_w8(fdata3, temp2, 8, 8, 8, 8,
8, bilinear_filters[yoffset]); bilinear_filters[yoffset]);
return vpx_variance8x8_neon(temp2, 8, dst, dst_stride, sse); return vpx_variance8x8_neon(temp2, 8, dst, dst_stride, sse);
} }
unsigned int vpx_sub_pixel_variance16x16_neon(const uint8_t *src, unsigned int vpx_sub_pixel_variance16x16_neon(const uint8_t *src,
int src_stride, int src_stride, int xoffset,
int xoffset, int yoffset, const uint8_t *dst,
int yoffset,
const uint8_t *dst,
int dst_stride, int dst_stride,
unsigned int *sse) { unsigned int *sse) {
DECLARE_ALIGNED(16, uint8_t, temp2[16 * 16]); DECLARE_ALIGNED(16, uint8_t, temp2[16 * 16]);
DECLARE_ALIGNED(16, uint8_t, fdata3[17 * 16]); DECLARE_ALIGNED(16, uint8_t, fdata3[17 * 16]);
var_filter_block2d_bil_w16(src, fdata3, src_stride, 1, var_filter_block2d_bil_w16(src, fdata3, src_stride, 1, 17, 16,
17, 16,
bilinear_filters[xoffset]); bilinear_filters[xoffset]);
var_filter_block2d_bil_w16(fdata3, temp2, 16, 16, 16, var_filter_block2d_bil_w16(fdata3, temp2, 16, 16, 16, 16,
16, bilinear_filters[yoffset]); bilinear_filters[yoffset]);
return vpx_variance16x16_neon(temp2, 16, dst, dst_stride, sse); return vpx_variance16x16_neon(temp2, 16, dst, dst_stride, sse);
} }
unsigned int vpx_sub_pixel_variance32x32_neon(const uint8_t *src, unsigned int vpx_sub_pixel_variance32x32_neon(const uint8_t *src,
int src_stride, int src_stride, int xoffset,
int xoffset, int yoffset, const uint8_t *dst,
int yoffset,
const uint8_t *dst,
int dst_stride, int dst_stride,
unsigned int *sse) { unsigned int *sse) {
DECLARE_ALIGNED(16, uint8_t, temp2[32 * 32]); DECLARE_ALIGNED(16, uint8_t, temp2[32 * 32]);
DECLARE_ALIGNED(16, uint8_t, fdata3[33 * 32]); DECLARE_ALIGNED(16, uint8_t, fdata3[33 * 32]);
var_filter_block2d_bil_w16(src, fdata3, src_stride, 1, var_filter_block2d_bil_w16(src, fdata3, src_stride, 1, 33, 32,
33, 32,
bilinear_filters[xoffset]); bilinear_filters[xoffset]);
var_filter_block2d_bil_w16(fdata3, temp2, 32, 32, 32, var_filter_block2d_bil_w16(fdata3, temp2, 32, 32, 32, 32,
32, bilinear_filters[yoffset]); bilinear_filters[yoffset]);
return vpx_variance32x32_neon(temp2, 32, dst, dst_stride, sse); return vpx_variance32x32_neon(temp2, 32, dst, dst_stride, sse);
} }
unsigned int vpx_sub_pixel_variance64x64_neon(const uint8_t *src, unsigned int vpx_sub_pixel_variance64x64_neon(const uint8_t *src,
int src_stride, int src_stride, int xoffset,
int xoffset, int yoffset, const uint8_t *dst,
int yoffset,
const uint8_t *dst,
int dst_stride, int dst_stride,
unsigned int *sse) { unsigned int *sse) {
DECLARE_ALIGNED(16, uint8_t, temp2[64 * 64]); DECLARE_ALIGNED(16, uint8_t, temp2[64 * 64]);
DECLARE_ALIGNED(16, uint8_t, fdata3[65 * 64]); DECLARE_ALIGNED(16, uint8_t, fdata3[65 * 64]);
var_filter_block2d_bil_w16(src, fdata3, src_stride, 1, var_filter_block2d_bil_w16(src, fdata3, src_stride, 1, 65, 64,
65, 64,
bilinear_filters[xoffset]); bilinear_filters[xoffset]);
var_filter_block2d_bil_w16(fdata3, temp2, 64, 64, 64, var_filter_block2d_bil_w16(fdata3, temp2, 64, 64, 64, 64,
64, bilinear_filters[yoffset]); bilinear_filters[yoffset]);
return vpx_variance64x64_neon(temp2, 64, dst, dst_stride, sse); return vpx_variance64x64_neon(temp2, 64, dst, dst_stride, sse);
} }

View File

@ -13,10 +13,10 @@
#include "./vpx_config.h" #include "./vpx_config.h"
#include "vpx/vpx_integer.h" #include "vpx/vpx_integer.h"
void vpx_subtract_block_neon(int rows, int cols, void vpx_subtract_block_neon(int rows, int cols, int16_t *diff,
int16_t *diff, ptrdiff_t diff_stride, ptrdiff_t diff_stride, const uint8_t *src,
const uint8_t *src, ptrdiff_t src_stride, ptrdiff_t src_stride, const uint8_t *pred,
const uint8_t *pred, ptrdiff_t pred_stride) { ptrdiff_t pred_stride) {
int r, c; int r, c;
if (cols > 16) { if (cols > 16) {
@ -26,14 +26,14 @@ void vpx_subtract_block_neon(int rows, int cols,
const uint8x16_t v_src_16 = vld1q_u8(&src[c + 16]); const uint8x16_t v_src_16 = vld1q_u8(&src[c + 16]);
const uint8x16_t v_pred_00 = vld1q_u8(&pred[c + 0]); const uint8x16_t v_pred_00 = vld1q_u8(&pred[c + 0]);
const uint8x16_t v_pred_16 = vld1q_u8(&pred[c + 16]); const uint8x16_t v_pred_16 = vld1q_u8(&pred[c + 16]);
const uint16x8_t v_diff_lo_00 = vsubl_u8(vget_low_u8(v_src_00), const uint16x8_t v_diff_lo_00 =
vget_low_u8(v_pred_00)); vsubl_u8(vget_low_u8(v_src_00), vget_low_u8(v_pred_00));
const uint16x8_t v_diff_hi_00 = vsubl_u8(vget_high_u8(v_src_00), const uint16x8_t v_diff_hi_00 =
vget_high_u8(v_pred_00)); vsubl_u8(vget_high_u8(v_src_00), vget_high_u8(v_pred_00));
const uint16x8_t v_diff_lo_16 = vsubl_u8(vget_low_u8(v_src_16), const uint16x8_t v_diff_lo_16 =
vget_low_u8(v_pred_16)); vsubl_u8(vget_low_u8(v_src_16), vget_low_u8(v_pred_16));
const uint16x8_t v_diff_hi_16 = vsubl_u8(vget_high_u8(v_src_16), const uint16x8_t v_diff_hi_16 =
vget_high_u8(v_pred_16)); vsubl_u8(vget_high_u8(v_src_16), vget_high_u8(v_pred_16));
vst1q_s16(&diff[c + 0], vreinterpretq_s16_u16(v_diff_lo_00)); vst1q_s16(&diff[c + 0], vreinterpretq_s16_u16(v_diff_lo_00));
vst1q_s16(&diff[c + 8], vreinterpretq_s16_u16(v_diff_hi_00)); vst1q_s16(&diff[c + 8], vreinterpretq_s16_u16(v_diff_hi_00));
vst1q_s16(&diff[c + 16], vreinterpretq_s16_u16(v_diff_lo_16)); vst1q_s16(&diff[c + 16], vreinterpretq_s16_u16(v_diff_lo_16));
@ -47,10 +47,10 @@ void vpx_subtract_block_neon(int rows, int cols,
for (r = 0; r < rows; ++r) { for (r = 0; r < rows; ++r) {
const uint8x16_t v_src = vld1q_u8(&src[0]); const uint8x16_t v_src = vld1q_u8(&src[0]);
const uint8x16_t v_pred = vld1q_u8(&pred[0]); const uint8x16_t v_pred = vld1q_u8(&pred[0]);
const uint16x8_t v_diff_lo = vsubl_u8(vget_low_u8(v_src), const uint16x8_t v_diff_lo =
vget_low_u8(v_pred)); vsubl_u8(vget_low_u8(v_src), vget_low_u8(v_pred));
const uint16x8_t v_diff_hi = vsubl_u8(vget_high_u8(v_src), const uint16x8_t v_diff_hi =
vget_high_u8(v_pred)); vsubl_u8(vget_high_u8(v_src), vget_high_u8(v_pred));
vst1q_s16(&diff[0], vreinterpretq_s16_u16(v_diff_lo)); vst1q_s16(&diff[0], vreinterpretq_s16_u16(v_diff_lo));
vst1q_s16(&diff[8], vreinterpretq_s16_u16(v_diff_hi)); vst1q_s16(&diff[8], vreinterpretq_s16_u16(v_diff_hi));
diff += diff_stride; diff += diff_stride;
@ -69,8 +69,7 @@ void vpx_subtract_block_neon(int rows, int cols,
} }
} else { } else {
for (r = 0; r < rows; ++r) { for (r = 0; r < rows; ++r) {
for (c = 0; c < cols; ++c) for (c = 0; c < cols; ++c) diff[c] = src[c] - pred[c];
diff[c] = src[c] - pred[c];
diff += diff_stride; diff += diff_stride;
pred += pred_stride; pred += pred_stride;

View File

@ -32,9 +32,9 @@ static INLINE int horizontal_add_s32x4(const int32x4_t v_32x4) {
} }
// w * h must be less than 2048 or local variable v_sum may overflow. // w * h must be less than 2048 or local variable v_sum may overflow.
static void variance_neon_w8(const uint8_t *a, int a_stride, static void variance_neon_w8(const uint8_t *a, int a_stride, const uint8_t *b,
const uint8_t *b, int b_stride, int b_stride, int w, int h, uint32_t *sse,
int w, int h, uint32_t *sse, int *sum) { int *sum) {
int i, j; int i, j;
int16x8_t v_sum = vdupq_n_s16(0); int16x8_t v_sum = vdupq_n_s16(0);
int32x4_t v_sse_lo = vdupq_n_s32(0); int32x4_t v_sse_lo = vdupq_n_s32(0);
@ -47,12 +47,10 @@ static void variance_neon_w8(const uint8_t *a, int a_stride,
const uint16x8_t v_diff = vsubl_u8(v_a, v_b); const uint16x8_t v_diff = vsubl_u8(v_a, v_b);
const int16x8_t sv_diff = vreinterpretq_s16_u16(v_diff); const int16x8_t sv_diff = vreinterpretq_s16_u16(v_diff);
v_sum = vaddq_s16(v_sum, sv_diff); v_sum = vaddq_s16(v_sum, sv_diff);
v_sse_lo = vmlal_s16(v_sse_lo, v_sse_lo =
vget_low_s16(sv_diff), vmlal_s16(v_sse_lo, vget_low_s16(sv_diff), vget_low_s16(sv_diff));
vget_low_s16(sv_diff)); v_sse_hi =
v_sse_hi = vmlal_s16(v_sse_hi, vmlal_s16(v_sse_hi, vget_high_s16(sv_diff), vget_high_s16(sv_diff));
vget_high_s16(sv_diff),
vget_high_s16(sv_diff));
} }
a += a_stride; a += a_stride;
b += b_stride; b += b_stride;
@ -62,15 +60,13 @@ static void variance_neon_w8(const uint8_t *a, int a_stride,
*sse = (unsigned int)horizontal_add_s32x4(vaddq_s32(v_sse_lo, v_sse_hi)); *sse = (unsigned int)horizontal_add_s32x4(vaddq_s32(v_sse_lo, v_sse_hi));
} }
void vpx_get8x8var_neon(const uint8_t *a, int a_stride, void vpx_get8x8var_neon(const uint8_t *a, int a_stride, const uint8_t *b,
const uint8_t *b, int b_stride, int b_stride, unsigned int *sse, int *sum) {
unsigned int *sse, int *sum) {
variance_neon_w8(a, a_stride, b, b_stride, 8, 8, sse, sum); variance_neon_w8(a, a_stride, b, b_stride, 8, 8, sse, sum);
} }
void vpx_get16x16var_neon(const uint8_t *a, int a_stride, void vpx_get16x16var_neon(const uint8_t *a, int a_stride, const uint8_t *b,
const uint8_t *b, int b_stride, int b_stride, unsigned int *sse, int *sum) {
unsigned int *sse, int *sum) {
variance_neon_w8(a, a_stride, b, b_stride, 16, 16, sse, sum); variance_neon_w8(a, a_stride, b, b_stride, 16, 16, sse, sum);
} }
@ -104,9 +100,8 @@ unsigned int vpx_variance32x64_neon(const uint8_t *a, int a_stride,
int sum1, sum2; int sum1, sum2;
uint32_t sse1, sse2; uint32_t sse1, sse2;
variance_neon_w8(a, a_stride, b, b_stride, 32, 32, &sse1, &sum1); variance_neon_w8(a, a_stride, b, b_stride, 32, 32, &sse1, &sum1);
variance_neon_w8(a + (32 * a_stride), a_stride, variance_neon_w8(a + (32 * a_stride), a_stride, b + (32 * b_stride), b_stride,
b + (32 * b_stride), b_stride, 32, 32, 32, 32, &sse2, &sum2);
&sse2, &sum2);
*sse = sse1 + sse2; *sse = sse1 + sse2;
sum1 += sum2; sum1 += sum2;
return *sse - (((int64_t)sum1 * sum1) >> 11); // >> 11 = / 32 * 64 return *sse - (((int64_t)sum1 * sum1) >> 11); // >> 11 = / 32 * 64
@ -118,9 +113,8 @@ unsigned int vpx_variance64x32_neon(const uint8_t *a, int a_stride,
int sum1, sum2; int sum1, sum2;
uint32_t sse1, sse2; uint32_t sse1, sse2;
variance_neon_w8(a, a_stride, b, b_stride, 64, 16, &sse1, &sum1); variance_neon_w8(a, a_stride, b, b_stride, 64, 16, &sse1, &sum1);
variance_neon_w8(a + (16 * a_stride), a_stride, variance_neon_w8(a + (16 * a_stride), a_stride, b + (16 * b_stride), b_stride,
b + (16 * b_stride), b_stride, 64, 16, 64, 16, &sse2, &sum2);
&sse2, &sum2);
*sse = sse1 + sse2; *sse = sse1 + sse2;
sum1 += sum2; sum1 += sum2;
return *sse - (((int64_t)sum1 * sum1) >> 11); // >> 11 = / 32 * 64 return *sse - (((int64_t)sum1 * sum1) >> 11); // >> 11 = / 32 * 64
@ -133,32 +127,27 @@ unsigned int vpx_variance64x64_neon(const uint8_t *a, int a_stride,
uint32_t sse1, sse2; uint32_t sse1, sse2;
variance_neon_w8(a, a_stride, b, b_stride, 64, 16, &sse1, &sum1); variance_neon_w8(a, a_stride, b, b_stride, 64, 16, &sse1, &sum1);
variance_neon_w8(a + (16 * a_stride), a_stride, variance_neon_w8(a + (16 * a_stride), a_stride, b + (16 * b_stride), b_stride,
b + (16 * b_stride), b_stride, 64, 16,
&sse2, &sum2);
sse1 += sse2;
sum1 += sum2;
variance_neon_w8(a + (16 * 2 * a_stride), a_stride,
b + (16 * 2 * b_stride), b_stride,
64, 16, &sse2, &sum2); 64, 16, &sse2, &sum2);
sse1 += sse2; sse1 += sse2;
sum1 += sum2; sum1 += sum2;
variance_neon_w8(a + (16 * 3 * a_stride), a_stride, variance_neon_w8(a + (16 * 2 * a_stride), a_stride, b + (16 * 2 * b_stride),
b + (16 * 3 * b_stride), b_stride, b_stride, 64, 16, &sse2, &sum2);
64, 16, &sse2, &sum2); sse1 += sse2;
sum1 += sum2;
variance_neon_w8(a + (16 * 3 * a_stride), a_stride, b + (16 * 3 * b_stride),
b_stride, 64, 16, &sse2, &sum2);
*sse = sse1 + sse2; *sse = sse1 + sse2;
sum1 += sum2; sum1 += sum2;
return *sse - (((int64_t)sum1 * sum1) >> 12); // >> 12 = / 64 * 64 return *sse - (((int64_t)sum1 * sum1) >> 12); // >> 12 = / 64 * 64
} }
unsigned int vpx_variance16x8_neon( unsigned int vpx_variance16x8_neon(const unsigned char *src_ptr,
const unsigned char *src_ptr,
int source_stride, int source_stride,
const unsigned char *ref_ptr, const unsigned char *ref_ptr,
int recon_stride, int recon_stride, unsigned int *sse) {
unsigned int *sse) {
int i; int i;
int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16; int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
uint32x2_t d0u32, d10u32; uint32x2_t d0u32, d10u32;
@ -222,8 +211,7 @@ unsigned int vpx_variance16x8_neon(
d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64)); d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64));
d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64)); d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64), q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64), vreinterpret_s32_s64(d0s64));
vreinterpret_s32_s64(d0s64));
vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0); vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);
d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 7); d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 7);
@ -232,12 +220,10 @@ unsigned int vpx_variance16x8_neon(
return vget_lane_u32(d0u32, 0); return vget_lane_u32(d0u32, 0);
} }
unsigned int vpx_variance8x16_neon( unsigned int vpx_variance8x16_neon(const unsigned char *src_ptr,
const unsigned char *src_ptr,
int source_stride, int source_stride,
const unsigned char *ref_ptr, const unsigned char *ref_ptr,
int recon_stride, int recon_stride, unsigned int *sse) {
unsigned int *sse) {
int i; int i;
uint8x8_t d0u8, d2u8, d4u8, d6u8; uint8x8_t d0u8, d2u8, d4u8, d6u8;
int16x4_t d22s16, d23s16, d24s16, d25s16; int16x4_t d22s16, d23s16, d24s16, d25s16;
@ -287,8 +273,7 @@ unsigned int vpx_variance8x16_neon(
d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64)); d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64));
d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64)); d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64), q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64), vreinterpret_s32_s64(d0s64));
vreinterpret_s32_s64(d0s64));
vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0); vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);
d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 7); d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 7);
@ -297,11 +282,8 @@ unsigned int vpx_variance8x16_neon(
return vget_lane_u32(d0u32, 0); return vget_lane_u32(d0u32, 0);
} }
unsigned int vpx_mse16x16_neon( unsigned int vpx_mse16x16_neon(const unsigned char *src_ptr, int source_stride,
const unsigned char *src_ptr, const unsigned char *ref_ptr, int recon_stride,
int source_stride,
const unsigned char *ref_ptr,
int recon_stride,
unsigned int *sse) { unsigned int *sse) {
int i; int i;
int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16; int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
@ -363,8 +345,7 @@ unsigned int vpx_mse16x16_neon(
return vget_lane_u32(vreinterpret_u32_s64(d0s64), 0); return vget_lane_u32(vreinterpret_u32_s64(d0s64), 0);
} }
unsigned int vpx_get4x4sse_cs_neon( unsigned int vpx_get4x4sse_cs_neon(const unsigned char *src_ptr,
const unsigned char *src_ptr,
int source_stride, int source_stride,
const unsigned char *ref_ptr, const unsigned char *ref_ptr,
int recon_stride) { int recon_stride) {

View File

@ -16,15 +16,10 @@
#include "vpx/vpx_integer.h" #include "vpx/vpx_integer.h"
#include "vpx_ports/mem.h" #include "vpx_ports/mem.h"
static INLINE int32x4_t MULTIPLY_BY_Q0( static INLINE int32x4_t MULTIPLY_BY_Q0(int16x4_t dsrc0, int16x4_t dsrc1,
int16x4_t dsrc0, int16x4_t dsrc2, int16x4_t dsrc3,
int16x4_t dsrc1, int16x4_t dsrc4, int16x4_t dsrc5,
int16x4_t dsrc2, int16x4_t dsrc6, int16x4_t dsrc7,
int16x4_t dsrc3,
int16x4_t dsrc4,
int16x4_t dsrc5,
int16x4_t dsrc6,
int16x4_t dsrc7,
int16x8_t q0s16) { int16x8_t q0s16) {
int32x4_t qdst; int32x4_t qdst;
int16x4_t d0s16, d1s16; int16x4_t d0s16, d1s16;
@ -43,17 +38,12 @@ static INLINE int32x4_t MULTIPLY_BY_Q0(
return qdst; return qdst;
} }
void vpx_convolve8_avg_horiz_neon( void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
const uint8_t *src, uint8_t *dst, ptrdiff_t dst_stride,
ptrdiff_t src_stride, const int16_t *filter_x, int x_step_q4,
uint8_t *dst,
ptrdiff_t dst_stride,
const int16_t *filter_x,
int x_step_q4,
const int16_t *filter_y, // unused const int16_t *filter_y, // unused
int y_step_q4, // unused int y_step_q4, // unused
int w, int w, int h) {
int h) {
int width; int width;
const uint8_t *s; const uint8_t *s;
uint8_t *d; uint8_t *d;
@ -90,8 +80,8 @@ void vpx_convolve8_avg_horiz_neon(
q12u8 = vcombine_u8(d24u8, d25u8); q12u8 = vcombine_u8(d24u8, d25u8);
q13u8 = vcombine_u8(d26u8, d27u8); q13u8 = vcombine_u8(d26u8, d27u8);
q0x2u16 = vtrnq_u16(vreinterpretq_u16_u8(q12u8), q0x2u16 =
vreinterpretq_u16_u8(q13u8)); vtrnq_u16(vreinterpretq_u16_u8(q12u8), vreinterpretq_u16_u8(q13u8));
d24u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[0])); d24u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[0]));
d25u8 = vreinterpret_u8_u16(vget_high_u16(q0x2u16.val[0])); d25u8 = vreinterpret_u8_u16(vget_high_u16(q0x2u16.val[0]));
d26u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[1])); d26u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[1]));
@ -117,9 +107,7 @@ void vpx_convolve8_avg_horiz_neon(
d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16)); d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16));
d23s16 = vreinterpret_s16_u16(vget_high_u16(q10u16)); // vmov 23 21 d23s16 = vreinterpret_s16_u16(vget_high_u16(q10u16)); // vmov 23 21
for (width = w; for (width = w; width > 0; width -= 4, src += 4, dst += 4) { // loop_horiz
width > 0;
width -= 4, src += 4, dst += 4) { // loop_horiz
s = src; s = src;
d28u32 = vld1_dup_u32((const uint32_t *)s); d28u32 = vld1_dup_u32((const uint32_t *)s);
s += src_stride; s += src_stride;
@ -131,10 +119,10 @@ void vpx_convolve8_avg_horiz_neon(
__builtin_prefetch(src + 64); __builtin_prefetch(src + 64);
d0x2u16 = vtrn_u16(vreinterpret_u16_u32(d28u32), d0x2u16 =
vreinterpret_u16_u32(d31u32)); vtrn_u16(vreinterpret_u16_u32(d28u32), vreinterpret_u16_u32(d31u32));
d1x2u16 = vtrn_u16(vreinterpret_u16_u32(d29u32), d1x2u16 =
vreinterpret_u16_u32(d30u32)); vtrn_u16(vreinterpret_u16_u32(d29u32), vreinterpret_u16_u32(d30u32));
d0x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[0]), // d28 d0x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[0]), // d28
vreinterpret_u8_u16(d1x2u16.val[0])); // d29 vreinterpret_u8_u16(d1x2u16.val[0])); // d29
d1x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[1]), // d31 d1x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[1]), // d31
@ -144,8 +132,8 @@ void vpx_convolve8_avg_horiz_neon(
q14u8 = vcombine_u8(d0x2u8.val[0], d0x2u8.val[1]); q14u8 = vcombine_u8(d0x2u8.val[0], d0x2u8.val[1]);
q15u8 = vcombine_u8(d1x2u8.val[1], d1x2u8.val[0]); q15u8 = vcombine_u8(d1x2u8.val[1], d1x2u8.val[0]);
q0x2u32 = vtrnq_u32(vreinterpretq_u32_u8(q14u8), q0x2u32 =
vreinterpretq_u32_u8(q15u8)); vtrnq_u32(vreinterpretq_u32_u8(q14u8), vreinterpretq_u32_u8(q15u8));
d28u8 = vreinterpret_u8_u32(vget_low_u32(q0x2u32.val[0])); d28u8 = vreinterpret_u8_u32(vget_low_u32(q0x2u32.val[0]));
d29u8 = vreinterpret_u8_u32(vget_high_u32(q0x2u32.val[0])); d29u8 = vreinterpret_u8_u32(vget_high_u32(q0x2u32.val[0]));
@ -173,14 +161,14 @@ void vpx_convolve8_avg_horiz_neon(
d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16)); d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16)); d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
q1s32 = MULTIPLY_BY_Q0(d16s16, d17s16, d20s16, d22s16, q1s32 = MULTIPLY_BY_Q0(d16s16, d17s16, d20s16, d22s16, d18s16, d19s16,
d18s16, d19s16, d23s16, d24s16, q0s16); d23s16, d24s16, q0s16);
q2s32 = MULTIPLY_BY_Q0(d17s16, d20s16, d22s16, d18s16, q2s32 = MULTIPLY_BY_Q0(d17s16, d20s16, d22s16, d18s16, d19s16, d23s16,
d19s16, d23s16, d24s16, d26s16, q0s16); d24s16, d26s16, q0s16);
q14s32 = MULTIPLY_BY_Q0(d20s16, d22s16, d18s16, d19s16, q14s32 = MULTIPLY_BY_Q0(d20s16, d22s16, d18s16, d19s16, d23s16, d24s16,
d23s16, d24s16, d26s16, d27s16, q0s16); d26s16, d27s16, q0s16);
q15s32 = MULTIPLY_BY_Q0(d22s16, d18s16, d19s16, d23s16, q15s32 = MULTIPLY_BY_Q0(d22s16, d18s16, d19s16, d23s16, d24s16, d26s16,
d24s16, d26s16, d27s16, d25s16, q0s16); d27s16, d25s16, q0s16);
__builtin_prefetch(src + 64 + src_stride * 3); __builtin_prefetch(src + 64 + src_stride * 3);
@ -195,8 +183,7 @@ void vpx_convolve8_avg_horiz_neon(
d2u8 = vqmovn_u16(q1u16); d2u8 = vqmovn_u16(q1u16);
d3u8 = vqmovn_u16(q2u16); d3u8 = vqmovn_u16(q2u16);
d0x2u16 = vtrn_u16(vreinterpret_u16_u8(d2u8), d0x2u16 = vtrn_u16(vreinterpret_u16_u8(d2u8), vreinterpret_u16_u8(d3u8));
vreinterpret_u16_u8(d3u8));
d0x2u32 = vtrn_u32(vreinterpret_u32_u16(d0x2u16.val[0]), d0x2u32 = vtrn_u32(vreinterpret_u32_u16(d0x2u16.val[0]),
vreinterpret_u32_u16(d0x2u16.val[1])); vreinterpret_u32_u16(d0x2u16.val[1]));
d0x2u8 = vtrn_u8(vreinterpret_u8_u32(d0x2u32.val[0]), d0x2u8 = vtrn_u8(vreinterpret_u8_u32(d0x2u32.val[0]),
@ -231,16 +218,11 @@ void vpx_convolve8_avg_horiz_neon(
return; return;
} }
void vpx_convolve8_avg_vert_neon( void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
const uint8_t *src, uint8_t *dst, ptrdiff_t dst_stride,
ptrdiff_t src_stride,
uint8_t *dst,
ptrdiff_t dst_stride,
const int16_t *filter_x, // unused const int16_t *filter_x, // unused
int x_step_q4, // unused int x_step_q4, // unused
const int16_t *filter_y, const int16_t *filter_y, int y_step_q4, int w,
int y_step_q4,
int w,
int h) { int h) {
int height; int height;
const uint8_t *s; const uint8_t *s;
@ -319,20 +301,20 @@ void vpx_convolve8_avg_vert_neon(
__builtin_prefetch(s); __builtin_prefetch(s);
__builtin_prefetch(s + src_stride); __builtin_prefetch(s + src_stride);
q1s32 = MULTIPLY_BY_Q0(d16s16, d17s16, d18s16, d19s16, q1s32 = MULTIPLY_BY_Q0(d16s16, d17s16, d18s16, d19s16, d20s16, d21s16,
d20s16, d21s16, d22s16, d24s16, q0s16); d22s16, d24s16, q0s16);
__builtin_prefetch(s + src_stride * 2); __builtin_prefetch(s + src_stride * 2);
__builtin_prefetch(s + src_stride * 3); __builtin_prefetch(s + src_stride * 3);
q2s32 = MULTIPLY_BY_Q0(d17s16, d18s16, d19s16, d20s16, q2s32 = MULTIPLY_BY_Q0(d17s16, d18s16, d19s16, d20s16, d21s16, d22s16,
d21s16, d22s16, d24s16, d26s16, q0s16); d24s16, d26s16, q0s16);
__builtin_prefetch(d); __builtin_prefetch(d);
__builtin_prefetch(d + dst_stride); __builtin_prefetch(d + dst_stride);
q14s32 = MULTIPLY_BY_Q0(d18s16, d19s16, d20s16, d21s16, q14s32 = MULTIPLY_BY_Q0(d18s16, d19s16, d20s16, d21s16, d22s16, d24s16,
d22s16, d24s16, d26s16, d27s16, q0s16); d26s16, d27s16, q0s16);
__builtin_prefetch(d + dst_stride * 2); __builtin_prefetch(d + dst_stride * 2);
__builtin_prefetch(d + dst_stride * 3); __builtin_prefetch(d + dst_stride * 3);
q15s32 = MULTIPLY_BY_Q0(d19s16, d20s16, d21s16, d22s16, q15s32 = MULTIPLY_BY_Q0(d19s16, d20s16, d21s16, d22s16, d24s16, d26s16,
d24s16, d26s16, d27s16, d25s16, q0s16); d27s16, d25s16, q0s16);
d2u16 = vqrshrun_n_s32(q1s32, 7); d2u16 = vqrshrun_n_s32(q1s32, 7);
d3u16 = vqrshrun_n_s32(q2s32, 7); d3u16 = vqrshrun_n_s32(q2s32, 7);

View File

@ -16,15 +16,10 @@
#include "vpx/vpx_integer.h" #include "vpx/vpx_integer.h"
#include "vpx_ports/mem.h" #include "vpx_ports/mem.h"
static INLINE int32x4_t MULTIPLY_BY_Q0( static INLINE int32x4_t MULTIPLY_BY_Q0(int16x4_t dsrc0, int16x4_t dsrc1,
int16x4_t dsrc0, int16x4_t dsrc2, int16x4_t dsrc3,
int16x4_t dsrc1, int16x4_t dsrc4, int16x4_t dsrc5,
int16x4_t dsrc2, int16x4_t dsrc6, int16x4_t dsrc7,
int16x4_t dsrc3,
int16x4_t dsrc4,
int16x4_t dsrc5,
int16x4_t dsrc6,
int16x4_t dsrc7,
int16x8_t q0s16) { int16x8_t q0s16) {
int32x4_t qdst; int32x4_t qdst;
int16x4_t d0s16, d1s16; int16x4_t d0s16, d1s16;
@ -43,17 +38,12 @@ static INLINE int32x4_t MULTIPLY_BY_Q0(
return qdst; return qdst;
} }
void vpx_convolve8_horiz_neon( void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
const uint8_t *src, uint8_t *dst, ptrdiff_t dst_stride,
ptrdiff_t src_stride, const int16_t *filter_x, int x_step_q4,
uint8_t *dst,
ptrdiff_t dst_stride,
const int16_t *filter_x,
int x_step_q4,
const int16_t *filter_y, // unused const int16_t *filter_y, // unused
int y_step_q4, // unused int y_step_q4, // unused
int w, int w, int h) {
int h) {
int width; int width;
const uint8_t *s, *psrc; const uint8_t *s, *psrc;
uint8_t *d, *pdst; uint8_t *d, *pdst;
@ -77,8 +67,7 @@ void vpx_convolve8_horiz_neon(
q0s16 = vld1q_s16(filter_x); q0s16 = vld1q_s16(filter_x);
src -= 3; // adjust for taps src -= 3; // adjust for taps
for (; h > 0; h -= 4, for (; h > 0; h -= 4, src += src_stride * 4,
src += src_stride * 4,
dst += dst_stride * 4) { // loop_horiz_v dst += dst_stride * 4) { // loop_horiz_v
s = src; s = src;
d24u8 = vld1_u8(s); d24u8 = vld1_u8(s);
@ -92,8 +81,8 @@ void vpx_convolve8_horiz_neon(
q12u8 = vcombine_u8(d24u8, d25u8); q12u8 = vcombine_u8(d24u8, d25u8);
q13u8 = vcombine_u8(d26u8, d27u8); q13u8 = vcombine_u8(d26u8, d27u8);
q0x2u16 = vtrnq_u16(vreinterpretq_u16_u8(q12u8), q0x2u16 =
vreinterpretq_u16_u8(q13u8)); vtrnq_u16(vreinterpretq_u16_u8(q12u8), vreinterpretq_u16_u8(q13u8));
d24u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[0])); d24u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[0]));
d25u8 = vreinterpret_u8_u16(vget_high_u16(q0x2u16.val[0])); d25u8 = vreinterpret_u8_u16(vget_high_u16(q0x2u16.val[0]));
d26u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[1])); d26u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[1]));
@ -119,8 +108,7 @@ void vpx_convolve8_horiz_neon(
d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16)); d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16));
d23s16 = vreinterpret_s16_u16(vget_high_u16(q10u16)); // vmov 23 21 d23s16 = vreinterpret_s16_u16(vget_high_u16(q10u16)); // vmov 23 21
for (width = w, psrc = src + 7, pdst = dst; for (width = w, psrc = src + 7, pdst = dst; width > 0;
width > 0;
width -= 4, psrc += 4, pdst += 4) { // loop_horiz width -= 4, psrc += 4, pdst += 4) { // loop_horiz
s = psrc; s = psrc;
d28u32 = vld1_dup_u32((const uint32_t *)s); d28u32 = vld1_dup_u32((const uint32_t *)s);
@ -133,10 +121,10 @@ void vpx_convolve8_horiz_neon(
__builtin_prefetch(psrc + 64); __builtin_prefetch(psrc + 64);
d0x2u16 = vtrn_u16(vreinterpret_u16_u32(d28u32), d0x2u16 =
vreinterpret_u16_u32(d31u32)); vtrn_u16(vreinterpret_u16_u32(d28u32), vreinterpret_u16_u32(d31u32));
d1x2u16 = vtrn_u16(vreinterpret_u16_u32(d29u32), d1x2u16 =
vreinterpret_u16_u32(d30u32)); vtrn_u16(vreinterpret_u16_u32(d29u32), vreinterpret_u16_u32(d30u32));
d0x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[0]), // d28 d0x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[0]), // d28
vreinterpret_u8_u16(d1x2u16.val[0])); // d29 vreinterpret_u8_u16(d1x2u16.val[0])); // d29
d1x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[1]), // d31 d1x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[1]), // d31
@ -146,8 +134,8 @@ void vpx_convolve8_horiz_neon(
q14u8 = vcombine_u8(d0x2u8.val[0], d0x2u8.val[1]); q14u8 = vcombine_u8(d0x2u8.val[0], d0x2u8.val[1]);
q15u8 = vcombine_u8(d1x2u8.val[1], d1x2u8.val[0]); q15u8 = vcombine_u8(d1x2u8.val[1], d1x2u8.val[0]);
q0x2u32 = vtrnq_u32(vreinterpretq_u32_u8(q14u8), q0x2u32 =
vreinterpretq_u32_u8(q15u8)); vtrnq_u32(vreinterpretq_u32_u8(q14u8), vreinterpretq_u32_u8(q15u8));
d28u8 = vreinterpret_u8_u32(vget_low_u32(q0x2u32.val[0])); d28u8 = vreinterpret_u8_u32(vget_low_u32(q0x2u32.val[0]));
d29u8 = vreinterpret_u8_u32(vget_high_u32(q0x2u32.val[0])); d29u8 = vreinterpret_u8_u32(vget_high_u32(q0x2u32.val[0]));
@ -166,14 +154,14 @@ void vpx_convolve8_horiz_neon(
d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16)); d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16)); d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
q1s32 = MULTIPLY_BY_Q0(d16s16, d17s16, d20s16, d22s16, q1s32 = MULTIPLY_BY_Q0(d16s16, d17s16, d20s16, d22s16, d18s16, d19s16,
d18s16, d19s16, d23s16, d24s16, q0s16); d23s16, d24s16, q0s16);
q2s32 = MULTIPLY_BY_Q0(d17s16, d20s16, d22s16, d18s16, q2s32 = MULTIPLY_BY_Q0(d17s16, d20s16, d22s16, d18s16, d19s16, d23s16,
d19s16, d23s16, d24s16, d26s16, q0s16); d24s16, d26s16, q0s16);
q14s32 = MULTIPLY_BY_Q0(d20s16, d22s16, d18s16, d19s16, q14s32 = MULTIPLY_BY_Q0(d20s16, d22s16, d18s16, d19s16, d23s16, d24s16,
d23s16, d24s16, d26s16, d27s16, q0s16); d26s16, d27s16, q0s16);
q15s32 = MULTIPLY_BY_Q0(d22s16, d18s16, d19s16, d23s16, q15s32 = MULTIPLY_BY_Q0(d22s16, d18s16, d19s16, d23s16, d24s16, d26s16,
d24s16, d26s16, d27s16, d25s16, q0s16); d27s16, d25s16, q0s16);
__builtin_prefetch(psrc + 60 + src_stride * 3); __builtin_prefetch(psrc + 60 + src_stride * 3);
@ -188,8 +176,7 @@ void vpx_convolve8_horiz_neon(
d2u8 = vqmovn_u16(q1u16); d2u8 = vqmovn_u16(q1u16);
d3u8 = vqmovn_u16(q2u16); d3u8 = vqmovn_u16(q2u16);
d0x2u16 = vtrn_u16(vreinterpret_u16_u8(d2u8), d0x2u16 = vtrn_u16(vreinterpret_u16_u8(d2u8), vreinterpret_u16_u8(d3u8));
vreinterpret_u16_u8(d3u8));
d0x2u32 = vtrn_u32(vreinterpret_u32_u16(d0x2u16.val[0]), d0x2u32 = vtrn_u32(vreinterpret_u32_u16(d0x2u16.val[0]),
vreinterpret_u32_u16(d0x2u16.val[1])); vreinterpret_u32_u16(d0x2u16.val[1]));
d0x2u8 = vtrn_u8(vreinterpret_u8_u32(d0x2u32.val[0]), d0x2u8 = vtrn_u8(vreinterpret_u8_u32(d0x2u32.val[0]),
@ -217,16 +204,11 @@ void vpx_convolve8_horiz_neon(
return; return;
} }
void vpx_convolve8_vert_neon( void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
const uint8_t *src, uint8_t *dst, ptrdiff_t dst_stride,
ptrdiff_t src_stride,
uint8_t *dst,
ptrdiff_t dst_stride,
const int16_t *filter_x, // unused const int16_t *filter_x, // unused
int x_step_q4, // unused int x_step_q4, // unused
const int16_t *filter_y, const int16_t *filter_y, int y_step_q4, int w,
int y_step_q4,
int w,
int h) { int h) {
int height; int height;
const uint8_t *s; const uint8_t *s;
@ -294,20 +276,20 @@ void vpx_convolve8_vert_neon(
__builtin_prefetch(d); __builtin_prefetch(d);
__builtin_prefetch(d + dst_stride); __builtin_prefetch(d + dst_stride);
q1s32 = MULTIPLY_BY_Q0(d16s16, d17s16, d18s16, d19s16, q1s32 = MULTIPLY_BY_Q0(d16s16, d17s16, d18s16, d19s16, d20s16, d21s16,
d20s16, d21s16, d22s16, d24s16, q0s16); d22s16, d24s16, q0s16);
__builtin_prefetch(d + dst_stride * 2); __builtin_prefetch(d + dst_stride * 2);
__builtin_prefetch(d + dst_stride * 3); __builtin_prefetch(d + dst_stride * 3);
q2s32 = MULTIPLY_BY_Q0(d17s16, d18s16, d19s16, d20s16, q2s32 = MULTIPLY_BY_Q0(d17s16, d18s16, d19s16, d20s16, d21s16, d22s16,
d21s16, d22s16, d24s16, d26s16, q0s16); d24s16, d26s16, q0s16);
__builtin_prefetch(s); __builtin_prefetch(s);
__builtin_prefetch(s + src_stride); __builtin_prefetch(s + src_stride);
q14s32 = MULTIPLY_BY_Q0(d18s16, d19s16, d20s16, d21s16, q14s32 = MULTIPLY_BY_Q0(d18s16, d19s16, d20s16, d21s16, d22s16, d24s16,
d22s16, d24s16, d26s16, d27s16, q0s16); d26s16, d27s16, q0s16);
__builtin_prefetch(s + src_stride * 2); __builtin_prefetch(s + src_stride * 2);
__builtin_prefetch(s + src_stride * 3); __builtin_prefetch(s + src_stride * 3);
q15s32 = MULTIPLY_BY_Q0(d19s16, d20s16, d21s16, d22s16, q15s32 = MULTIPLY_BY_Q0(d19s16, d20s16, d21s16, d22s16, d24s16, d26s16,
d24s16, d26s16, d27s16, d25s16, q0s16); d27s16, d25s16, q0s16);
d2u16 = vqrshrun_n_s32(q1s32, 7); d2u16 = vqrshrun_n_s32(q1s32, 7);
d3u16 = vqrshrun_n_s32(q2s32, 7); d3u16 = vqrshrun_n_s32(q2s32, 7);

View File

@ -13,23 +13,21 @@
#include "./vpx_dsp_rtcd.h" #include "./vpx_dsp_rtcd.h"
#include "vpx/vpx_integer.h" #include "vpx/vpx_integer.h"
void vpx_convolve_avg_neon( void vpx_convolve_avg_neon(const uint8_t *src, // r0
const uint8_t *src, // r0
ptrdiff_t src_stride, // r1 ptrdiff_t src_stride, // r1
uint8_t *dst, // r2 uint8_t *dst, // r2
ptrdiff_t dst_stride, // r3 ptrdiff_t dst_stride, // r3
const int16_t *filter_x, const int16_t *filter_x, int filter_x_stride,
int filter_x_stride, const int16_t *filter_y, int filter_y_stride, int w,
const int16_t *filter_y,
int filter_y_stride,
int w,
int h) { int h) {
uint8_t *d; uint8_t *d;
uint8x8_t d0u8, d1u8, d2u8, d3u8; uint8x8_t d0u8, d1u8, d2u8, d3u8;
uint32x2_t d0u32, d2u32; uint32x2_t d0u32, d2u32;
uint8x16_t q0u8, q1u8, q2u8, q3u8, q8u8, q9u8, q10u8, q11u8; uint8x16_t q0u8, q1u8, q2u8, q3u8, q8u8, q9u8, q10u8, q11u8;
(void)filter_x; (void)filter_x_stride; (void)filter_x;
(void)filter_y; (void)filter_y_stride; (void)filter_x_stride;
(void)filter_y;
(void)filter_y_stride;
d = dst; d = dst;
if (w > 32) { // avg64 if (w > 32) { // avg64
@ -133,8 +131,7 @@ void vpx_convolve_avg_neon(
d2u32 = vld1_lane_u32((const uint32_t *)d, d2u32, 1); d2u32 = vld1_lane_u32((const uint32_t *)d, d2u32, 1);
d += dst_stride; d += dst_stride;
d0u8 = vrhadd_u8(vreinterpret_u8_u32(d0u32), d0u8 = vrhadd_u8(vreinterpret_u8_u32(d0u32), vreinterpret_u8_u32(d2u32));
vreinterpret_u8_u32(d2u32));
d0u32 = vreinterpret_u32_u8(d0u8); d0u32 = vreinterpret_u32_u8(d0u8);
vst1_lane_u32((uint32_t *)dst, d0u32, 0); vst1_lane_u32((uint32_t *)dst, d0u32, 0);

View File

@ -13,21 +13,19 @@
#include "./vpx_dsp_rtcd.h" #include "./vpx_dsp_rtcd.h"
#include "vpx/vpx_integer.h" #include "vpx/vpx_integer.h"
void vpx_convolve_copy_neon( void vpx_convolve_copy_neon(const uint8_t *src, // r0
const uint8_t *src, // r0
ptrdiff_t src_stride, // r1 ptrdiff_t src_stride, // r1
uint8_t *dst, // r2 uint8_t *dst, // r2
ptrdiff_t dst_stride, // r3 ptrdiff_t dst_stride, // r3
const int16_t *filter_x, const int16_t *filter_x, int filter_x_stride,
int filter_x_stride, const int16_t *filter_y, int filter_y_stride, int w,
const int16_t *filter_y,
int filter_y_stride,
int w,
int h) { int h) {
uint8x8_t d0u8, d2u8; uint8x8_t d0u8, d2u8;
uint8x16_t q0u8, q1u8, q2u8, q3u8; uint8x16_t q0u8, q1u8, q2u8, q3u8;
(void)filter_x; (void)filter_x_stride; (void)filter_x;
(void)filter_y; (void)filter_y_stride; (void)filter_x_stride;
(void)filter_y;
(void)filter_y_stride;
if (w > 32) { // copy64 if (w > 32) { // copy64
for (; h > 0; h--) { for (; h > 0; h--) {

View File

@ -14,10 +14,9 @@
#include "vpx_dsp/vpx_dsp_common.h" #include "vpx_dsp/vpx_dsp_common.h"
#include "vpx_ports/mem.h" #include "vpx_ports/mem.h"
void vpx_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride, void vpx_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
uint8_t *dst, ptrdiff_t dst_stride, ptrdiff_t dst_stride, const int16_t *filter_x,
const int16_t *filter_x, int x_step_q4, int x_step_q4, const int16_t *filter_y, int y_step_q4,
const int16_t *filter_y, int y_step_q4,
int w, int h) { int w, int h) {
/* Given our constraints: w <= 64, h <= 64, taps == 8 we can reduce the /* Given our constraints: w <= 64, h <= 64, taps == 8 we can reduce the
* maximum buffer size to 64 * 64 + 7 (+ 1 to make it divisible by 4). * maximum buffer size to 64 * 64 + 7 (+ 1 to make it divisible by 4).
@ -35,23 +34,20 @@ void vpx_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride,
* the temp buffer which has lots of extra room and is subsequently discarded * the temp buffer which has lots of extra room and is subsequently discarded
* this is safe if somewhat less than ideal. * this is safe if somewhat less than ideal.
*/ */
vpx_convolve8_horiz_neon(src - src_stride * 3, src_stride, vpx_convolve8_horiz_neon(src - src_stride * 3, src_stride, temp, 64, filter_x,
temp, 64, x_step_q4, filter_y, y_step_q4, w,
filter_x, x_step_q4, filter_y, y_step_q4, intermediate_height);
w, intermediate_height);
/* Step into the temp buffer 3 lines to get the actual frame data */ /* Step into the temp buffer 3 lines to get the actual frame data */
vpx_convolve8_vert_neon(temp + 64 * 3, 64, vpx_convolve8_vert_neon(temp + 64 * 3, 64, dst, dst_stride, filter_x,
dst, dst_stride, x_step_q4, filter_y, y_step_q4, w, h);
filter_x, x_step_q4, filter_y, y_step_q4,
w, h);
} }
void vpx_convolve8_avg_neon(const uint8_t *src, ptrdiff_t src_stride, void vpx_convolve8_avg_neon(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride, uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4, const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4, const int16_t *filter_y, int y_step_q4, int w,
int w, int h) { int h) {
DECLARE_ALIGNED(8, uint8_t, temp[64 * 72]); DECLARE_ALIGNED(8, uint8_t, temp[64 * 72]);
int intermediate_height = h + 7; int intermediate_height = h + 7;
@ -61,12 +57,9 @@ void vpx_convolve8_avg_neon(const uint8_t *src, ptrdiff_t src_stride,
/* This implementation has the same issues as above. In addition, we only want /* This implementation has the same issues as above. In addition, we only want
* to average the values after both passes. * to average the values after both passes.
*/ */
vpx_convolve8_horiz_neon(src - src_stride * 3, src_stride, vpx_convolve8_horiz_neon(src - src_stride * 3, src_stride, temp, 64, filter_x,
temp, 64, x_step_q4, filter_y, y_step_q4, w,
filter_x, x_step_q4, filter_y, y_step_q4, intermediate_height);
w, intermediate_height); vpx_convolve8_avg_vert_neon(temp + 64 * 3, 64, dst, dst_stride, filter_x,
vpx_convolve8_avg_vert_neon(temp + 64 * 3, x_step_q4, filter_y, y_step_q4, w, h);
64, dst, dst_stride,
filter_x, x_step_q4, filter_y, y_step_q4,
w, h);
} }

View File

@ -15,8 +15,9 @@
unsigned int vpx_avg_8x8_c(const uint8_t *s, int p) { unsigned int vpx_avg_8x8_c(const uint8_t *s, int p) {
int i, j; int i, j;
int sum = 0; int sum = 0;
for (i = 0; i < 8; ++i, s+=p) for (i = 0; i < 8; ++i, s += p)
for (j = 0; j < 8; sum += s[j], ++j) {} for (j = 0; j < 8; sum += s[j], ++j) {
}
return (sum + 32) >> 6; return (sum + 32) >> 6;
} }
@ -24,8 +25,9 @@ unsigned int vpx_avg_8x8_c(const uint8_t *s, int p) {
unsigned int vpx_avg_4x4_c(const uint8_t *s, int p) { unsigned int vpx_avg_4x4_c(const uint8_t *s, int p) {
int i, j; int i, j;
int sum = 0; int sum = 0;
for (i = 0; i < 4; ++i, s+=p) for (i = 0; i < 4; ++i, s += p)
for (j = 0; j < 4; sum += s[j], ++j) {} for (j = 0; j < 4; sum += s[j], ++j) {
}
return (sum + 8) >> 4; return (sum + 8) >> 4;
} }
@ -92,8 +94,8 @@ void vpx_hadamard_16x16_c(const int16_t *src_diff, int src_stride,
int idx; int idx;
for (idx = 0; idx < 4; ++idx) { for (idx = 0; idx < 4; ++idx) {
// src_diff: 9 bit, dynamic range [-255, 255] // src_diff: 9 bit, dynamic range [-255, 255]
const int16_t *src_ptr = src_diff + (idx >> 1) * 8 * src_stride const int16_t *src_ptr =
+ (idx & 0x01) * 8; src_diff + (idx >> 1) * 8 * src_stride + (idx & 0x01) * 8;
vpx_hadamard_8x8_c(src_ptr, src_stride, coeff + idx * 64); vpx_hadamard_8x8_c(src_ptr, src_stride, coeff + idx * 64);
} }
@ -123,8 +125,7 @@ void vpx_hadamard_16x16_c(const int16_t *src_diff, int src_stride,
int vpx_satd_c(const int16_t *coeff, int length) { int vpx_satd_c(const int16_t *coeff, int length) {
int i; int i;
int satd = 0; int satd = 0;
for (i = 0; i < length; ++i) for (i = 0; i < length; ++i) satd += abs(coeff[i]);
satd += abs(coeff[i]);
// satd: 26 bits, dynamic range [-32640 * 1024, 32640 * 1024] // satd: 26 bits, dynamic range [-32640 * 1024, 32640 * 1024]
return satd; return satd;
@ -140,8 +141,7 @@ void vpx_int_pro_row_c(int16_t hbuf[16], const uint8_t *ref,
int i; int i;
hbuf[idx] = 0; hbuf[idx] = 0;
// hbuf[idx]: 14 bit, dynamic range [0, 16320]. // hbuf[idx]: 14 bit, dynamic range [0, 16320].
for (i = 0; i < height; ++i) for (i = 0; i < height; ++i) hbuf[idx] += ref[i * ref_stride];
hbuf[idx] += ref[i * ref_stride];
// hbuf[idx]: 9 bit, dynamic range [0, 510]. // hbuf[idx]: 9 bit, dynamic range [0, 510].
hbuf[idx] /= norm_factor; hbuf[idx] /= norm_factor;
++ref; ++ref;
@ -153,16 +153,14 @@ int16_t vpx_int_pro_col_c(const uint8_t *ref, const int width) {
int idx; int idx;
int16_t sum = 0; int16_t sum = 0;
// sum: 14 bit, dynamic range [0, 16320] // sum: 14 bit, dynamic range [0, 16320]
for (idx = 0; idx < width; ++idx) for (idx = 0; idx < width; ++idx) sum += ref[idx];
sum += ref[idx];
return sum; return sum;
} }
// ref: [0 - 510] // ref: [0 - 510]
// src: [0 - 510] // src: [0 - 510]
// bwl: {2, 3, 4} // bwl: {2, 3, 4}
int vpx_vector_var_c(const int16_t *ref, const int16_t *src, int vpx_vector_var_c(const int16_t *ref, const int16_t *src, const int bwl) {
const int bwl) {
int i; int i;
int width = 4 << bwl; int width = 4 << bwl;
int sse = 0, mean = 0, var; int sse = 0, mean = 0, var;
@ -185,7 +183,7 @@ void vpx_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp,
*max = 0; *max = 0;
for (i = 0; i < 8; ++i, s += p, d += dp) { for (i = 0; i < 8; ++i, s += p, d += dp) {
for (j = 0; j < 8; ++j) { for (j = 0; j < 8; ++j) {
int diff = abs(s[j]-d[j]); int diff = abs(s[j] - d[j]);
*min = diff < *min ? diff : *min; *min = diff < *min ? diff : *min;
*max = diff > *max ? diff : *max; *max = diff > *max ? diff : *max;
} }
@ -196,9 +194,10 @@ void vpx_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp,
unsigned int vpx_highbd_avg_8x8_c(const uint8_t *s8, int p) { unsigned int vpx_highbd_avg_8x8_c(const uint8_t *s8, int p) {
int i, j; int i, j;
int sum = 0; int sum = 0;
const uint16_t* s = CONVERT_TO_SHORTPTR(s8); const uint16_t *s = CONVERT_TO_SHORTPTR(s8);
for (i = 0; i < 8; ++i, s+=p) for (i = 0; i < 8; ++i, s += p)
for (j = 0; j < 8; sum += s[j], ++j) {} for (j = 0; j < 8; sum += s[j], ++j) {
}
return (sum + 32) >> 6; return (sum + 32) >> 6;
} }
@ -206,9 +205,10 @@ unsigned int vpx_highbd_avg_8x8_c(const uint8_t *s8, int p) {
unsigned int vpx_highbd_avg_4x4_c(const uint8_t *s8, int p) { unsigned int vpx_highbd_avg_4x4_c(const uint8_t *s8, int p) {
int i, j; int i, j;
int sum = 0; int sum = 0;
const uint16_t* s = CONVERT_TO_SHORTPTR(s8); const uint16_t *s = CONVERT_TO_SHORTPTR(s8);
for (i = 0; i < 4; ++i, s+=p) for (i = 0; i < 4; ++i, s += p)
for (j = 0; j < 4; sum += s[j], ++j) {} for (j = 0; j < 4; sum += s[j], ++j) {
}
return (sum + 8) >> 4; return (sum + 8) >> 4;
} }
@ -216,18 +216,16 @@ unsigned int vpx_highbd_avg_4x4_c(const uint8_t *s8, int p) {
void vpx_highbd_minmax_8x8_c(const uint8_t *s8, int p, const uint8_t *d8, void vpx_highbd_minmax_8x8_c(const uint8_t *s8, int p, const uint8_t *d8,
int dp, int *min, int *max) { int dp, int *min, int *max) {
int i, j; int i, j;
const uint16_t* s = CONVERT_TO_SHORTPTR(s8); const uint16_t *s = CONVERT_TO_SHORTPTR(s8);
const uint16_t* d = CONVERT_TO_SHORTPTR(d8); const uint16_t *d = CONVERT_TO_SHORTPTR(d8);
*min = 255; *min = 255;
*max = 0; *max = 0;
for (i = 0; i < 8; ++i, s += p, d += dp) { for (i = 0; i < 8; ++i, s += p, d += dp) {
for (j = 0; j < 8; ++j) { for (j = 0; j < 8; ++j) {
int diff = abs(s[j]-d[j]); int diff = abs(s[j] - d[j]);
*min = diff < *min ? diff : *min; *min = diff < *min ? diff : *min;
*max = diff > *max ? diff : *max; *max = diff > *max ? diff : *max;
} }
} }
} }
#endif // CONFIG_VP9_HIGHBITDEPTH #endif // CONFIG_VP9_HIGHBITDEPTH

View File

@ -18,11 +18,8 @@
#include "vpx_mem/vpx_mem.h" #include "vpx_mem/vpx_mem.h"
#include "vpx_util/endian_inl.h" #include "vpx_util/endian_inl.h"
int vpx_reader_init(vpx_reader *r, int vpx_reader_init(vpx_reader *r, const uint8_t *buffer, size_t size,
const uint8_t *buffer, vpx_decrypt_cb decrypt_cb, void *decrypt_state) {
size_t size,
vpx_decrypt_cb decrypt_cb,
void *decrypt_state) {
if (size && !buffer) { if (size && !buffer) {
return 1; return 1;
} else { } else {

View File

@ -45,11 +45,8 @@ typedef struct {
uint8_t clear_buffer[sizeof(BD_VALUE) + 1]; uint8_t clear_buffer[sizeof(BD_VALUE) + 1];
} vpx_reader; } vpx_reader;
int vpx_reader_init(vpx_reader *r, int vpx_reader_init(vpx_reader *r, const uint8_t *buffer, size_t size,
const uint8_t *buffer, vpx_decrypt_cb decrypt_cb, void *decrypt_state);
size_t size,
vpx_decrypt_cb decrypt_cb,
void *decrypt_state);
void vpx_reader_fill(vpx_reader *r); void vpx_reader_fill(vpx_reader *r);
@ -81,8 +78,7 @@ static INLINE int vpx_read(vpx_reader *r, int prob) {
unsigned int range; unsigned int range;
unsigned int split = (r->range * prob + (256 - prob)) >> CHAR_BIT; unsigned int split = (r->range * prob + (256 - prob)) >> CHAR_BIT;
if (r->count < 0) if (r->count < 0) vpx_reader_fill(r);
vpx_reader_fill(r);
value = r->value; value = r->value;
count = r->count; count = r->count;
@ -117,8 +113,7 @@ static INLINE int vpx_read_bit(vpx_reader *r) {
static INLINE int vpx_read_literal(vpx_reader *r, int bits) { static INLINE int vpx_read_literal(vpx_reader *r, int bits) {
int literal = 0, bit; int literal = 0, bit;
for (bit = bits - 1; bit >= 0; bit--) for (bit = bits - 1; bit >= 0; bit--) literal |= vpx_read_bit(r) << bit;
literal |= vpx_read_bit(r) << bit;
return literal; return literal;
} }
@ -127,8 +122,7 @@ static INLINE int vpx_read_tree(vpx_reader *r, const vpx_tree_index *tree,
const vpx_prob *probs) { const vpx_prob *probs) {
vpx_tree_index i = 0; vpx_tree_index i = 0;
while ((i = tree[i + vpx_read(r, probs[i >> 1])]) > 0) while ((i = tree[i + vpx_read(r, probs[i >> 1])]) > 0) continue;
continue;
return -i; return -i;
} }

View File

@ -30,23 +30,20 @@ int vpx_rb_read_bit(struct vpx_read_bit_buffer *rb) {
int vpx_rb_read_literal(struct vpx_read_bit_buffer *rb, int bits) { int vpx_rb_read_literal(struct vpx_read_bit_buffer *rb, int bits) {
int value = 0, bit; int value = 0, bit;
for (bit = bits - 1; bit >= 0; bit--) for (bit = bits - 1; bit >= 0; bit--) value |= vpx_rb_read_bit(rb) << bit;
value |= vpx_rb_read_bit(rb) << bit;
return value; return value;
} }
int vpx_rb_read_signed_literal(struct vpx_read_bit_buffer *rb, int vpx_rb_read_signed_literal(struct vpx_read_bit_buffer *rb, int bits) {
int bits) {
const int value = vpx_rb_read_literal(rb, bits); const int value = vpx_rb_read_literal(rb, bits);
return vpx_rb_read_bit(rb) ? -value : value; return vpx_rb_read_bit(rb) ? -value : value;
} }
int vpx_rb_read_inv_signed_literal(struct vpx_read_bit_buffer *rb, int vpx_rb_read_inv_signed_literal(struct vpx_read_bit_buffer *rb, int bits) {
int bits) {
#if CONFIG_MISC_FIXES #if CONFIG_MISC_FIXES
const int nbits = sizeof(unsigned) * 8 - bits - 1; const int nbits = sizeof(unsigned) * 8 - bits - 1;
const unsigned value = (unsigned)vpx_rb_read_literal(rb, bits + 1) << nbits; const unsigned value = (unsigned)vpx_rb_read_literal(rb, bits + 1) << nbits;
return ((int) value) >> nbits; return ((int)value) >> nbits;
#else #else
return vpx_rb_read_signed_literal(rb, bits); return vpx_rb_read_signed_literal(rb, bits);
#endif #endif

View File

@ -24,11 +24,8 @@ void vpx_start_encode(vpx_writer *br, uint8_t *source) {
void vpx_stop_encode(vpx_writer *br) { void vpx_stop_encode(vpx_writer *br) {
int i; int i;
for (i = 0; i < 32; i++) for (i = 0; i < 32; i++) vpx_write_bit(br, 0);
vpx_write_bit(br, 0);
// Ensure there's no ambigous collision with any index marker bytes // Ensure there's no ambigous collision with any index marker bytes
if ((br->buffer[br->pos - 1] & 0xe0) == 0xc0) if ((br->buffer[br->pos - 1] & 0xe0) == 0xc0) br->buffer[br->pos++] = 0;
br->buffer[br->pos++] = 0;
} }

View File

@ -85,8 +85,7 @@ static INLINE void vpx_write_bit(vpx_writer *w, int bit) {
static INLINE void vpx_write_literal(vpx_writer *w, int data, int bits) { static INLINE void vpx_write_literal(vpx_writer *w, int data, int bits) {
int bit; int bit;
for (bit = bits - 1; bit >= 0; bit--) for (bit = bits - 1; bit >= 0; bit--) vpx_write_bit(w, 1 & (data >> bit));
vpx_write_bit(w, 1 & (data >> bit));
} }
#define vpx_write_prob(w, v) vpx_write_literal((w), (v), 8) #define vpx_write_prob(w, v) vpx_write_literal((w), (v), 8)

View File

@ -22,7 +22,7 @@ void vpx_wb_write_bit(struct vpx_write_bit_buffer *wb, int bit) {
const int off = (int)wb->bit_offset; const int off = (int)wb->bit_offset;
const int p = off / CHAR_BIT; const int p = off / CHAR_BIT;
const int q = CHAR_BIT - 1 - off % CHAR_BIT; const int q = CHAR_BIT - 1 - off % CHAR_BIT;
if (q == CHAR_BIT -1) { if (q == CHAR_BIT - 1) {
wb->bit_buffer[p] = bit << q; wb->bit_buffer[p] = bit << q;
} else { } else {
wb->bit_buffer[p] &= ~(1 << q); wb->bit_buffer[p] &= ~(1 << q);
@ -33,12 +33,11 @@ void vpx_wb_write_bit(struct vpx_write_bit_buffer *wb, int bit) {
void vpx_wb_write_literal(struct vpx_write_bit_buffer *wb, int data, int bits) { void vpx_wb_write_literal(struct vpx_write_bit_buffer *wb, int data, int bits) {
int bit; int bit;
for (bit = bits - 1; bit >= 0; bit--) for (bit = bits - 1; bit >= 0; bit--) vpx_wb_write_bit(wb, (data >> bit) & 1);
vpx_wb_write_bit(wb, (data >> bit) & 1);
} }
void vpx_wb_write_inv_signed_literal(struct vpx_write_bit_buffer *wb, void vpx_wb_write_inv_signed_literal(struct vpx_write_bit_buffer *wb, int data,
int data, int bits) { int bits) {
#if CONFIG_MISC_FIXES #if CONFIG_MISC_FIXES
vpx_wb_write_literal(wb, data, bits + 1); vpx_wb_write_literal(wb, data, bits + 1);
#else #else

View File

@ -10,26 +10,32 @@
#include <stdlib.h> #include <stdlib.h>
#include "vpx/vpx_integer.h" #include "vpx/vpx_integer.h"
const int16_t vpx_rv[] = {8, 5, 2, 2, 8, 12, 4, 9, 8, 3, 0, 3, 9, 0, 0, 0, 8, 3, const int16_t vpx_rv[] = {
14, 4, 10, 1, 11, 14, 1, 14, 9, 6, 12, 11, 8, 6, 10, 0, 0, 8, 9, 0, 3, 14, 8, 5, 2, 2, 8, 12, 4, 9, 8, 3, 0, 3, 9, 0, 0, 0, 8, 3, 14,
8, 11, 13, 4, 2, 9, 0, 3, 9, 6, 1, 2, 3, 14, 13, 1, 8, 2, 9, 7, 3, 3, 1, 13, 4, 10, 1, 11, 14, 1, 14, 9, 6, 12, 11, 8, 6, 10, 0, 0, 8, 9, 0,
13, 6, 6, 5, 2, 7, 11, 9, 11, 8, 7, 3, 2, 0, 13, 13, 14, 4, 12, 5, 12, 10, 3, 14, 8, 11, 13, 4, 2, 9, 0, 3, 9, 6, 1, 2, 3, 14, 13, 1, 8,
8, 10, 13, 10, 4, 14, 4, 10, 0, 8, 11, 1, 13, 7, 7, 14, 6, 14, 13, 2, 13, 5, 2, 9, 7, 3, 3, 1, 13, 13, 6, 6, 5, 2, 7, 11, 9, 11, 8, 7, 3,
4, 4, 0, 10, 0, 5, 13, 2, 12, 7, 11, 13, 8, 0, 4, 10, 7, 2, 7, 2, 2, 5, 3, 2, 0, 13, 13, 14, 4, 12, 5, 12, 10, 8, 10, 13, 10, 4, 14, 4, 10, 0,
4, 7, 3, 3, 14, 14, 5, 9, 13, 3, 14, 3, 6, 3, 0, 11, 8, 13, 1, 13, 1, 12, 0, 8, 11, 1, 13, 7, 7, 14, 6, 14, 13, 2, 13, 5, 4, 4, 0, 10, 0, 5,
10, 9, 7, 6, 2, 8, 5, 2, 13, 7, 1, 13, 14, 7, 6, 7, 9, 6, 10, 11, 7, 8, 7, 13, 2, 12, 7, 11, 13, 8, 0, 4, 10, 7, 2, 7, 2, 2, 5, 3, 4, 7,
5, 14, 8, 4, 4, 0, 8, 7, 10, 0, 8, 14, 11, 3, 12, 5, 7, 14, 3, 14, 5, 2, 6, 3, 3, 14, 14, 5, 9, 13, 3, 14, 3, 6, 3, 0, 11, 8, 13, 1, 13, 1,
11, 12, 12, 8, 0, 11, 13, 1, 2, 0, 5, 10, 14, 7, 8, 0, 4, 11, 0, 8, 0, 3, 12, 0, 10, 9, 7, 6, 2, 8, 5, 2, 13, 7, 1, 13, 14, 7, 6, 7, 9,
10, 5, 8, 0, 11, 6, 7, 8, 10, 7, 13, 9, 2, 5, 1, 5, 10, 2, 4, 3, 5, 6, 10, 6, 10, 11, 7, 8, 7, 5, 14, 8, 4, 4, 0, 8, 7, 10, 0, 8, 14, 11,
8, 9, 4, 11, 14, 0, 10, 0, 5, 13, 2, 12, 7, 11, 13, 8, 0, 4, 10, 7, 2, 7, 2, 3, 12, 5, 7, 14, 3, 14, 5, 2, 6, 11, 12, 12, 8, 0, 11, 13, 1, 2,
2, 5, 3, 4, 7, 3, 3, 14, 14, 5, 9, 13, 3, 14, 3, 6, 3, 0, 11, 8, 13, 1, 13, 0, 5, 10, 14, 7, 8, 0, 4, 11, 0, 8, 0, 3, 10, 5, 8, 0, 11, 6,
1, 12, 0, 10, 9, 7, 6, 2, 8, 5, 2, 13, 7, 1, 13, 14, 7, 6, 7, 9, 6, 10, 11, 7, 8, 10, 7, 13, 9, 2, 5, 1, 5, 10, 2, 4, 3, 5, 6, 10, 8, 9,
7, 8, 7, 5, 14, 8, 4, 4, 0, 8, 7, 10, 0, 8, 14, 11, 3, 12, 5, 7, 14, 3, 14, 4, 11, 14, 0, 10, 0, 5, 13, 2, 12, 7, 11, 13, 8, 0, 4, 10, 7, 2,
5, 2, 6, 11, 12, 12, 8, 0, 11, 13, 1, 2, 0, 5, 10, 14, 7, 8, 0, 4, 11, 0, 8, 7, 2, 2, 5, 3, 4, 7, 3, 3, 14, 14, 5, 9, 13, 3, 14, 3, 6, 3,
0, 3, 10, 5, 8, 0, 11, 6, 7, 8, 10, 7, 13, 9, 2, 5, 1, 5, 10, 2, 4, 3, 5, 6, 0, 11, 8, 13, 1, 13, 1, 12, 0, 10, 9, 7, 6, 2, 8, 5, 2, 13, 7,
10, 8, 9, 4, 11, 14, 3, 8, 3, 7, 8, 5, 11, 4, 12, 3, 11, 9, 14, 8, 14, 13, 1, 13, 14, 7, 6, 7, 9, 6, 10, 11, 7, 8, 7, 5, 14, 8, 4, 4, 0,
4, 3, 1, 2, 14, 6, 5, 4, 4, 11, 4, 6, 2, 1, 5, 8, 8, 12, 13, 5, 14, 10, 12, 8, 7, 10, 0, 8, 14, 11, 3, 12, 5, 7, 14, 3, 14, 5, 2, 6, 11, 12,
13, 0, 9, 5, 5, 11, 10, 13, 9, 10, 13, }; 12, 8, 0, 11, 13, 1, 2, 0, 5, 10, 14, 7, 8, 0, 4, 11, 0, 8, 0,
3, 10, 5, 8, 0, 11, 6, 7, 8, 10, 7, 13, 9, 2, 5, 1, 5, 10, 2,
4, 3, 5, 6, 10, 8, 9, 4, 11, 14, 3, 8, 3, 7, 8, 5, 11, 4, 12,
3, 11, 9, 14, 8, 14, 13, 4, 3, 1, 2, 14, 6, 5, 4, 4, 11, 4, 6,
2, 1, 5, 8, 8, 12, 13, 5, 14, 10, 12, 13, 0, 9, 5, 5, 11, 10, 13,
9, 10, 13,
};
void vpx_post_proc_down_and_across_mb_row_c(unsigned char *src_ptr, void vpx_post_proc_down_and_across_mb_row_c(unsigned char *src_ptr,
unsigned char *dst_ptr, unsigned char *dst_ptr,
@ -55,8 +61,8 @@ void vpx_post_proc_down_and_across_mb_row_c(unsigned char *src_ptr,
v = p_src[col]; v = p_src[col];
if ((abs(v - p_above2) < f[col]) && (abs(v - p_above1) < f[col]) if ((abs(v - p_above2) < f[col]) && (abs(v - p_above1) < f[col]) &&
&& (abs(v - p_below1) < f[col]) && (abs(v - p_below2) < f[col])) { (abs(v - p_below1) < f[col]) && (abs(v - p_below2) < f[col])) {
unsigned char k1, k2, k3; unsigned char k1, k2, k3;
k1 = (p_above2 + p_above1 + 1) >> 1; k1 = (p_above2 + p_above1 + 1) >> 1;
k2 = (p_below2 + p_below1 + 1) >> 1; k2 = (p_below2 + p_below1 + 1) >> 1;
@ -77,10 +83,10 @@ void vpx_post_proc_down_and_across_mb_row_c(unsigned char *src_ptr,
for (col = 0; col < cols; col++) { for (col = 0; col < cols; col++) {
v = p_src[col]; v = p_src[col];
if ((abs(v - p_src[col - 2]) < f[col]) if ((abs(v - p_src[col - 2]) < f[col]) &&
&& (abs(v - p_src[col - 1]) < f[col]) (abs(v - p_src[col - 1]) < f[col]) &&
&& (abs(v - p_src[col + 1]) < f[col]) (abs(v - p_src[col + 1]) < f[col]) &&
&& (abs(v - p_src[col + 2]) < f[col])) { (abs(v - p_src[col + 2]) < f[col])) {
unsigned char k1, k2, k3; unsigned char k1, k2, k3;
k1 = (p_src[col - 2] + p_src[col - 1] + 1) >> 1; k1 = (p_src[col - 2] + p_src[col - 1] + 1) >> 1;
k2 = (p_src[col + 2] + p_src[col + 1] + 1) >> 1; k2 = (p_src[col + 2] + p_src[col + 1] + 1) >> 1;
@ -90,8 +96,7 @@ void vpx_post_proc_down_and_across_mb_row_c(unsigned char *src_ptr,
d[col & 3] = v; d[col & 3] = v;
if (col >= 2) if (col >= 2) p_dst[col - 2] = d[(col - 2) & 3];
p_dst[col - 2] = d[(col - 2) & 3];
} }
/* handle the last two pixels */ /* handle the last two pixels */
@ -115,14 +120,12 @@ void vpx_mbpost_proc_across_ip_c(unsigned char *src, int pitch, int rows,
int sumsq = 0; int sumsq = 0;
int sum = 0; int sum = 0;
for (i = -8; i < 0; i++) for (i = -8; i < 0; i++) s[i] = s[0];
s[i] = s[0];
/* 17 avoids valgrind warning - we buffer values in c in d /* 17 avoids valgrind warning - we buffer values in c in d
* and only write them when we've read 8 ahead... * and only write them when we've read 8 ahead...
*/ */
for (i = 0; i < 17; i++) for (i = 0; i < 17; i++) s[i + cols] = s[cols - 1];
s[i + cols] = s[cols - 1];
for (i = -8; i <= 6; i++) { for (i = -8; i <= 6; i++) {
sumsq += s[i] * s[i]; sumsq += s[i] * s[i];
@ -162,14 +165,12 @@ void vpx_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int cols,
unsigned char d[16]; unsigned char d[16];
const int16_t *rv2 = rv3 + ((c * 17) & 127); const int16_t *rv2 = rv3 + ((c * 17) & 127);
for (i = -8; i < 0; i++) for (i = -8; i < 0; i++) s[i * pitch] = s[0];
s[i * pitch] = s[0];
/* 17 avoids valgrind warning - we buffer values in c in d /* 17 avoids valgrind warning - we buffer values in c in d
* and only write them when we've read 8 ahead... * and only write them when we've read 8 ahead...
*/ */
for (i = 0; i < 17; i++) for (i = 0; i < 17; i++) s[(i + rows) * pitch] = s[(rows - 1) * pitch];
s[(i + rows) * pitch] = s[(rows - 1) * pitch];
for (i = -8; i <= 6; i++) { for (i = -8; i <= 6; i++) {
sumsq += s[i * pitch] * s[i * pitch]; sumsq += s[i * pitch] * s[i * pitch];
@ -184,16 +185,14 @@ void vpx_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int cols,
if (sumsq * 15 - sum * sum < flimit) { if (sumsq * 15 - sum * sum < flimit) {
d[r & 15] = (rv2[r & 127] + sum + s[0]) >> 4; d[r & 15] = (rv2[r & 127] + sum + s[0]) >> 4;
} }
if (r >= 8) if (r >= 8) s[-8 * pitch] = d[(r - 8) & 15];
s[-8 * pitch] = d[(r - 8) & 15];
s += pitch; s += pitch;
} }
} }
} }
#if CONFIG_POSTPROC #if CONFIG_POSTPROC
static void vpx_de_mblock(YV12_BUFFER_CONFIG *post, static void vpx_de_mblock(YV12_BUFFER_CONFIG *post, int q) {
int q) {
vpx_mbpost_proc_across_ip(post->y_buffer, post->y_stride, post->y_height, vpx_mbpost_proc_across_ip(post->y_buffer, post->y_stride, post->y_height,
post->y_width, q2mbl(q)); post->y_width, q2mbl(q));
vpx_mbpost_proc_down(post->y_buffer, post->y_stride, post->y_height, vpx_mbpost_proc_down(post->y_buffer, post->y_stride, post->y_height,

View File

@ -55,12 +55,12 @@ static void fs_ctx_init(fs_ctx *_ctx, int _w, int _h, int _nlevels) {
int l; int l;
lw = (_w + 1) >> 1; lw = (_w + 1) >> 1;
lh = (_h + 1) >> 1; lh = (_h + 1) >> 1;
data_size = _nlevels * sizeof(fs_level) data_size =
+ 2 * (lw + 8) * 8 * sizeof(*_ctx->col_buf); _nlevels * sizeof(fs_level) + 2 * (lw + 8) * 8 * sizeof(*_ctx->col_buf);
for (l = 0; l < _nlevels; l++) { for (l = 0; l < _nlevels; l++) {
size_t im_size; size_t im_size;
size_t level_size; size_t level_size;
im_size = lw * (size_t) lh; im_size = lw * (size_t)lh;
level_size = 2 * im_size * sizeof(*_ctx->level[l].im1); level_size = 2 * im_size * sizeof(*_ctx->level[l].im1);
level_size += sizeof(*_ctx->level[l].ssim) - 1; level_size += sizeof(*_ctx->level[l].ssim) - 1;
level_size /= sizeof(*_ctx->level[l].ssim); level_size /= sizeof(*_ctx->level[l].ssim);
@ -70,8 +70,8 @@ static void fs_ctx_init(fs_ctx *_ctx, int _w, int _h, int _nlevels) {
lw = (lw + 1) >> 1; lw = (lw + 1) >> 1;
lh = (lh + 1) >> 1; lh = (lh + 1) >> 1;
} }
data = (unsigned char *) malloc(data_size); data = (unsigned char *)malloc(data_size);
_ctx->level = (fs_level *) data; _ctx->level = (fs_level *)data;
_ctx->nlevels = _nlevels; _ctx->nlevels = _nlevels;
data += _nlevels * sizeof(*_ctx->level); data += _nlevels * sizeof(*_ctx->level);
lw = (_w + 1) >> 1; lw = (_w + 1) >> 1;
@ -81,7 +81,7 @@ static void fs_ctx_init(fs_ctx *_ctx, int _w, int _h, int _nlevels) {
size_t level_size; size_t level_size;
_ctx->level[l].w = lw; _ctx->level[l].w = lw;
_ctx->level[l].h = lh; _ctx->level[l].h = lh;
im_size = lw * (size_t) lh; im_size = lw * (size_t)lh;
level_size = 2 * im_size * sizeof(*_ctx->level[l].im1); level_size = 2 * im_size * sizeof(*_ctx->level[l].im1);
level_size += sizeof(*_ctx->level[l].ssim) - 1; level_size += sizeof(*_ctx->level[l].ssim) - 1;
level_size /= sizeof(*_ctx->level[l].ssim); level_size /= sizeof(*_ctx->level[l].ssim);
@ -89,17 +89,15 @@ static void fs_ctx_init(fs_ctx *_ctx, int _w, int _h, int _nlevels) {
_ctx->level[l].im1 = (uint32_t *)data; _ctx->level[l].im1 = (uint32_t *)data;
_ctx->level[l].im2 = _ctx->level[l].im1 + im_size; _ctx->level[l].im2 = _ctx->level[l].im1 + im_size;
data += level_size; data += level_size;
_ctx->level[l].ssim = (double *) data; _ctx->level[l].ssim = (double *)data;
data += im_size * sizeof(*_ctx->level[l].ssim); data += im_size * sizeof(*_ctx->level[l].ssim);
lw = (lw + 1) >> 1; lw = (lw + 1) >> 1;
lh = (lh + 1) >> 1; lh = (lh + 1) >> 1;
} }
_ctx->col_buf = (unsigned *) data; _ctx->col_buf = (unsigned *)data;
} }
static void fs_ctx_clear(fs_ctx *_ctx) { static void fs_ctx_clear(fs_ctx *_ctx) { free(_ctx->level); }
free(_ctx->level);
}
static void fs_downsample_level(fs_ctx *_ctx, int _l) { static void fs_downsample_level(fs_ctx *_ctx, int _l) {
const uint32_t *src1; const uint32_t *src1;
@ -130,18 +128,18 @@ static void fs_downsample_level(fs_ctx *_ctx, int _l) {
int i1; int i1;
i0 = 2 * i; i0 = 2 * i;
i1 = FS_MINI(i0 + 1, w2); i1 = FS_MINI(i0 + 1, w2);
dst1[j * w + i] = src1[j0offs + i0] + src1[j0offs + i1] dst1[j * w + i] = src1[j0offs + i0] + src1[j0offs + i1] +
+ src1[j1offs + i0] + src1[j1offs + i1]; src1[j1offs + i0] + src1[j1offs + i1];
dst2[j * w + i] = src2[j0offs + i0] + src2[j0offs + i1] dst2[j * w + i] = src2[j0offs + i0] + src2[j0offs + i1] +
+ src2[j1offs + i0] + src2[j1offs + i1]; src2[j1offs + i0] + src2[j1offs + i1];
} }
} }
} }
static void fs_downsample_level0(fs_ctx *_ctx, const uint8_t *_src1, static void fs_downsample_level0(fs_ctx *_ctx, const uint8_t *_src1,
int _s1ystride, const uint8_t *_src2, int _s1ystride, const uint8_t *_src2,
int _s2ystride, int _w, int _h, int _s2ystride, int _w, int _h, uint32_t bd,
uint32_t bd, uint32_t shift) { uint32_t shift) {
uint32_t *dst1; uint32_t *dst1;
uint32_t *dst2; uint32_t *dst2;
int w; int w;
@ -163,23 +161,23 @@ static void fs_downsample_level0(fs_ctx *_ctx, const uint8_t *_src1,
i0 = 2 * i; i0 = 2 * i;
i1 = FS_MINI(i0 + 1, _w); i1 = FS_MINI(i0 + 1, _w);
if (bd == 8 && shift == 0) { if (bd == 8 && shift == 0) {
dst1[j * w + i] = _src1[j0 * _s1ystride + i0] dst1[j * w + i] =
+ _src1[j0 * _s1ystride + i1] + _src1[j1 * _s1ystride + i0] _src1[j0 * _s1ystride + i0] + _src1[j0 * _s1ystride + i1] +
+ _src1[j1 * _s1ystride + i1]; _src1[j1 * _s1ystride + i0] + _src1[j1 * _s1ystride + i1];
dst2[j * w + i] = _src2[j0 * _s2ystride + i0] dst2[j * w + i] =
+ _src2[j0 * _s2ystride + i1] + _src2[j1 * _s2ystride + i0] _src2[j0 * _s2ystride + i0] + _src2[j0 * _s2ystride + i1] +
+ _src2[j1 * _s2ystride + i1]; _src2[j1 * _s2ystride + i0] + _src2[j1 * _s2ystride + i1];
} else { } else {
uint16_t * src1s = CONVERT_TO_SHORTPTR(_src1); uint16_t *src1s = CONVERT_TO_SHORTPTR(_src1);
uint16_t * src2s = CONVERT_TO_SHORTPTR(_src2); uint16_t *src2s = CONVERT_TO_SHORTPTR(_src2);
dst1[j * w + i] = (src1s[j0 * _s1ystride + i0] >> shift) dst1[j * w + i] = (src1s[j0 * _s1ystride + i0] >> shift) +
+ (src1s[j0 * _s1ystride + i1] >> shift) (src1s[j0 * _s1ystride + i1] >> shift) +
+ (src1s[j1 * _s1ystride + i0] >> shift) (src1s[j1 * _s1ystride + i0] >> shift) +
+ (src1s[j1 * _s1ystride + i1] >> shift); (src1s[j1 * _s1ystride + i1] >> shift);
dst2[j * w + i] = (src2s[j0 * _s2ystride + i0] >> shift) dst2[j * w + i] = (src2s[j0 * _s2ystride + i0] >> shift) +
+ (src2s[j0 * _s2ystride + i1] >> shift) (src2s[j0 * _s2ystride + i1] >> shift) +
+ (src2s[j1 * _s2ystride + i0] >> shift) (src2s[j1 * _s2ystride + i0] >> shift) +
+ (src2s[j1 * _s2ystride + i1] >> shift); (src2s[j1 * _s2ystride + i1] >> shift);
} }
} }
} }
@ -200,10 +198,8 @@ static void fs_apply_luminance(fs_ctx *_ctx, int _l, int bit_depth) {
int j; int j;
double ssim_c1 = SSIM_C1; double ssim_c1 = SSIM_C1;
#if CONFIG_VP9_HIGHBITDEPTH #if CONFIG_VP9_HIGHBITDEPTH
if (bit_depth == 10) if (bit_depth == 10) ssim_c1 = SSIM_C1_10;
ssim_c1 = SSIM_C1_10; if (bit_depth == 12) ssim_c1 = SSIM_C1_12;
if (bit_depth == 12)
ssim_c1 = SSIM_C1_12;
#else #else
assert(bit_depth == 8); assert(bit_depth == 8);
#endif #endif
@ -213,19 +209,15 @@ static void fs_apply_luminance(fs_ctx *_ctx, int _l, int bit_depth) {
col_sums_y = col_sums_x + w; col_sums_y = col_sums_x + w;
im1 = _ctx->level[_l].im1; im1 = _ctx->level[_l].im1;
im2 = _ctx->level[_l].im2; im2 = _ctx->level[_l].im2;
for (i = 0; i < w; i++) for (i = 0; i < w; i++) col_sums_x[i] = 5 * im1[i];
col_sums_x[i] = 5 * im1[i]; for (i = 0; i < w; i++) col_sums_y[i] = 5 * im2[i];
for (i = 0; i < w; i++)
col_sums_y[i] = 5 * im2[i];
for (j = 1; j < 4; j++) { for (j = 1; j < 4; j++) {
j1offs = FS_MINI(j, h - 1) * w; j1offs = FS_MINI(j, h - 1) * w;
for (i = 0; i < w; i++) for (i = 0; i < w; i++) col_sums_x[i] += im1[j1offs + i];
col_sums_x[i] += im1[j1offs + i]; for (i = 0; i < w; i++) col_sums_y[i] += im2[j1offs + i];
for (i = 0; i < w; i++)
col_sums_y[i] += im2[j1offs + i];
} }
ssim = _ctx->level[_l].ssim; ssim = _ctx->level[_l].ssim;
c1 = (double) (ssim_c1 * 4096 * (1 << 4 * _l)); c1 = (double)(ssim_c1 * 4096 * (1 << 4 * _l));
for (j = 0; j < h; j++) { for (j = 0; j < h; j++) {
unsigned mux; unsigned mux;
unsigned muy; unsigned muy;
@ -239,8 +231,8 @@ static void fs_apply_luminance(fs_ctx *_ctx, int _l, int bit_depth) {
muy += col_sums_y[i1]; muy += col_sums_y[i1];
} }
for (i = 0; i < w; i++) { for (i = 0; i < w; i++) {
ssim[j * w + i] *= (2 * mux * (double) muy + c1) ssim[j * w + i] *= (2 * mux * (double)muy + c1) /
/ (mux * (double) mux + muy * (double) muy + c1); (mux * (double)mux + muy * (double)muy + c1);
if (i + 1 < w) { if (i + 1 < w) {
i0 = FS_MAXI(0, i - 4); i0 = FS_MAXI(0, i - 4);
i1 = FS_MINI(i + 4, w - 1); i1 = FS_MINI(i + 4, w - 1);
@ -250,15 +242,11 @@ static void fs_apply_luminance(fs_ctx *_ctx, int _l, int bit_depth) {
} }
if (j + 1 < h) { if (j + 1 < h) {
j0offs = FS_MAXI(0, j - 4) * w; j0offs = FS_MAXI(0, j - 4) * w;
for (i = 0; i < w; i++) for (i = 0; i < w; i++) col_sums_x[i] -= im1[j0offs + i];
col_sums_x[i] -= im1[j0offs + i]; for (i = 0; i < w; i++) col_sums_y[i] -= im2[j0offs + i];
for (i = 0; i < w; i++)
col_sums_y[i] -= im2[j0offs + i];
j1offs = FS_MINI(j + 4, h - 1) * w; j1offs = FS_MINI(j + 4, h - 1) * w;
for (i = 0; i < w; i++) for (i = 0; i < w; i++) col_sums_x[i] += im1[j1offs + i];
col_sums_x[i] += im1[j1offs + i]; for (i = 0; i < w; i++) col_sums_y[i] += im2[j1offs + i];
for (i = 0; i < w; i++)
col_sums_y[i] += im2[j1offs + i];
} }
} }
} }
@ -272,8 +260,7 @@ static void fs_apply_luminance(fs_ctx *_ctx, int _l, int bit_depth) {
col_sums_gx2[(_col)] = gx * (double)gx; \ col_sums_gx2[(_col)] = gx * (double)gx; \
col_sums_gy2[(_col)] = gy * (double)gy; \ col_sums_gy2[(_col)] = gy * (double)gy; \
col_sums_gxgy[(_col)] = gx * (double)gy; \ col_sums_gxgy[(_col)] = gx * (double)gy; \
} \ } while (0)
while (0)
#define FS_COL_ADD(_col, _joffs, _ioffs) \ #define FS_COL_ADD(_col, _joffs, _ioffs) \
do { \ do { \
@ -284,8 +271,7 @@ static void fs_apply_luminance(fs_ctx *_ctx, int _l, int bit_depth) {
col_sums_gx2[(_col)] += gx * (double)gx; \ col_sums_gx2[(_col)] += gx * (double)gx; \
col_sums_gy2[(_col)] += gy * (double)gy; \ col_sums_gy2[(_col)] += gy * (double)gy; \
col_sums_gxgy[(_col)] += gx * (double)gy; \ col_sums_gxgy[(_col)] += gx * (double)gy; \
} \ } while (0)
while (0)
#define FS_COL_SUB(_col, _joffs, _ioffs) \ #define FS_COL_SUB(_col, _joffs, _ioffs) \
do { \ do { \
@ -296,32 +282,28 @@ static void fs_apply_luminance(fs_ctx *_ctx, int _l, int bit_depth) {
col_sums_gx2[(_col)] -= gx * (double)gx; \ col_sums_gx2[(_col)] -= gx * (double)gx; \
col_sums_gy2[(_col)] -= gy * (double)gy; \ col_sums_gy2[(_col)] -= gy * (double)gy; \
col_sums_gxgy[(_col)] -= gx * (double)gy; \ col_sums_gxgy[(_col)] -= gx * (double)gy; \
} \ } while (0)
while (0)
#define FS_COL_COPY(_col1, _col2) \ #define FS_COL_COPY(_col1, _col2) \
do { \ do { \
col_sums_gx2[(_col1)] = col_sums_gx2[(_col2)]; \ col_sums_gx2[(_col1)] = col_sums_gx2[(_col2)]; \
col_sums_gy2[(_col1)] = col_sums_gy2[(_col2)]; \ col_sums_gy2[(_col1)] = col_sums_gy2[(_col2)]; \
col_sums_gxgy[(_col1)] = col_sums_gxgy[(_col2)]; \ col_sums_gxgy[(_col1)] = col_sums_gxgy[(_col2)]; \
} \ } while (0)
while (0)
#define FS_COL_HALVE(_col1, _col2) \ #define FS_COL_HALVE(_col1, _col2) \
do { \ do { \
col_sums_gx2[(_col1)] = col_sums_gx2[(_col2)] * 0.5; \ col_sums_gx2[(_col1)] = col_sums_gx2[(_col2)] * 0.5; \
col_sums_gy2[(_col1)] = col_sums_gy2[(_col2)] * 0.5; \ col_sums_gy2[(_col1)] = col_sums_gy2[(_col2)] * 0.5; \
col_sums_gxgy[(_col1)] = col_sums_gxgy[(_col2)] * 0.5; \ col_sums_gxgy[(_col1)] = col_sums_gxgy[(_col2)] * 0.5; \
} \ } while (0)
while (0)
#define FS_COL_DOUBLE(_col1, _col2) \ #define FS_COL_DOUBLE(_col1, _col2) \
do { \ do { \
col_sums_gx2[(_col1)] = col_sums_gx2[(_col2)] * 2; \ col_sums_gx2[(_col1)] = col_sums_gx2[(_col2)] * 2; \
col_sums_gy2[(_col1)] = col_sums_gy2[(_col2)] * 2; \ col_sums_gy2[(_col1)] = col_sums_gy2[(_col2)] * 2; \
col_sums_gxgy[(_col1)] = col_sums_gxgy[(_col2)] * 2; \ col_sums_gxgy[(_col1)] = col_sums_gxgy[(_col2)] * 2; \
} \ } while (0)
while (0)
static void fs_calc_structure(fs_ctx *_ctx, int _l, int bit_depth) { static void fs_calc_structure(fs_ctx *_ctx, int _l, int bit_depth) {
uint32_t *im1; uint32_t *im1;
@ -340,10 +322,8 @@ static void fs_calc_structure(fs_ctx *_ctx, int _l, int bit_depth) {
int j; int j;
double ssim_c2 = SSIM_C2; double ssim_c2 = SSIM_C2;
#if CONFIG_VP9_HIGHBITDEPTH #if CONFIG_VP9_HIGHBITDEPTH
if (bit_depth == 10) if (bit_depth == 10) ssim_c2 = SSIM_C2_10;
ssim_c2 = SSIM_C2_10; if (bit_depth == 12) ssim_c2 = SSIM_C2_12;
if (bit_depth == 12)
ssim_c2 = SSIM_C2_12;
#else #else
assert(bit_depth == 8); assert(bit_depth == 8);
#endif #endif
@ -398,14 +378,11 @@ static void fs_calc_structure(fs_ctx *_ctx, int _l, int bit_depth) {
double mugy2; double mugy2;
double mugxgy; double mugxgy;
mugx2 = col_sums_gx2[0]; mugx2 = col_sums_gx2[0];
for (k = 1; k < 8; k++) for (k = 1; k < 8; k++) mugx2 += col_sums_gx2[k];
mugx2 += col_sums_gx2[k];
mugy2 = col_sums_gy2[0]; mugy2 = col_sums_gy2[0];
for (k = 1; k < 8; k++) for (k = 1; k < 8; k++) mugy2 += col_sums_gy2[k];
mugy2 += col_sums_gy2[k];
mugxgy = col_sums_gxgy[0]; mugxgy = col_sums_gxgy[0];
for (k = 1; k < 8; k++) for (k = 1; k < 8; k++) mugxgy += col_sums_gxgy[k];
mugxgy += col_sums_gxgy[k];
ssim[(j - 4) * w + i] = (2 * mugxgy + c2) / (mugx2 + mugy2 + c2); ssim[(j - 4) * w + i] = (2 * mugxgy + c2) / (mugx2 + mugy2 + c2);
if (i + 1 < w) { if (i + 1 < w) {
FS_COL_SET(0, -1, 1); FS_COL_SET(0, -1, 1);
@ -440,8 +417,9 @@ static void fs_calc_structure(fs_ctx *_ctx, int _l, int bit_depth) {
Matlab implementation: {0.0448, 0.2856, 0.2363, 0.1333}. Matlab implementation: {0.0448, 0.2856, 0.2363, 0.1333}.
We drop the finest scale and renormalize the rest to sum to 1.*/ We drop the finest scale and renormalize the rest to sum to 1.*/
static const double FS_WEIGHTS[FS_NLEVELS] = {0.2989654541015625, static const double FS_WEIGHTS[FS_NLEVELS] = {
0.3141326904296875, 0.2473602294921875, 0.1395416259765625}; 0.2989654541015625, 0.3141326904296875, 0.2473602294921875, 0.1395416259765625
};
static double fs_average(fs_ctx *_ctx, int _l) { static double fs_average(fs_ctx *_ctx, int _l) {
double *ssim; double *ssim;
@ -455,28 +433,26 @@ static double fs_average(fs_ctx *_ctx, int _l) {
ssim = _ctx->level[_l].ssim; ssim = _ctx->level[_l].ssim;
ret = 0; ret = 0;
for (j = 0; j < h; j++) for (j = 0; j < h; j++)
for (i = 0; i < w; i++) for (i = 0; i < w; i++) ret += ssim[j * w + i];
ret += ssim[j * w + i];
return pow(ret / (w * h), FS_WEIGHTS[_l]); return pow(ret / (w * h), FS_WEIGHTS[_l]);
} }
static double convert_ssim_db(double _ssim, double _weight) { static double convert_ssim_db(double _ssim, double _weight) {
assert(_weight >= _ssim); assert(_weight >= _ssim);
if ((_weight - _ssim) < 1e-10) if ((_weight - _ssim) < 1e-10) return MAX_SSIM_DB;
return MAX_SSIM_DB;
return 10 * (log10(_weight) - log10(_weight - _ssim)); return 10 * (log10(_weight) - log10(_weight - _ssim));
} }
static double calc_ssim(const uint8_t *_src, int _systride, static double calc_ssim(const uint8_t *_src, int _systride, const uint8_t *_dst,
const uint8_t *_dst, int _dystride, int _dystride, int _w, int _h, uint32_t _bd,
int _w, int _h, uint32_t _bd, uint32_t _shift) { uint32_t _shift) {
fs_ctx ctx; fs_ctx ctx;
double ret; double ret;
int l; int l;
ret = 1; ret = 1;
fs_ctx_init(&ctx, _w, _h, FS_NLEVELS); fs_ctx_init(&ctx, _w, _h, FS_NLEVELS);
fs_downsample_level0(&ctx, _src, _systride, _dst, _dystride, fs_downsample_level0(&ctx, _src, _systride, _dst, _dystride, _w, _h, _bd,
_w, _h, _bd, _shift); _shift);
for (l = 0; l < FS_NLEVELS - 1; l++) { for (l = 0; l < FS_NLEVELS - 1; l++) {
fs_calc_structure(&ctx, l, _bd); fs_calc_structure(&ctx, l, _bd);
ret *= fs_average(&ctx, l); ret *= fs_average(&ctx, l);
@ -490,9 +466,9 @@ static double calc_ssim(const uint8_t *_src, int _systride,
} }
double vpx_calc_fastssim(const YV12_BUFFER_CONFIG *source, double vpx_calc_fastssim(const YV12_BUFFER_CONFIG *source,
const YV12_BUFFER_CONFIG *dest, const YV12_BUFFER_CONFIG *dest, double *ssim_y,
double *ssim_y, double *ssim_u, double *ssim_v, double *ssim_u, double *ssim_v, uint32_t bd,
uint32_t bd, uint32_t in_bd) { uint32_t in_bd) {
double ssimv; double ssimv;
uint32_t bd_shift = 0; uint32_t bd_shift = 0;
vpx_clear_system_state(); vpx_clear_system_state();

View File

@ -72,8 +72,7 @@ void vpx_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride) {
{ {
int i, j; int i, j;
for (i = 0; i < 4; ++i) { for (i = 0; i < 4; ++i) {
for (j = 0; j < 4; ++j) for (j = 0; j < 4; ++j) output[j + i * 4] = (output[j + i * 4] + 1) >> 2;
output[j + i * 4] = (output[j + i * 4] + 1) >> 2;
} }
} }
} }
@ -82,8 +81,7 @@ void vpx_fdct4x4_1_c(const int16_t *input, tran_low_t *output, int stride) {
int r, c; int r, c;
tran_low_t sum = 0; tran_low_t sum = 0;
for (r = 0; r < 4; ++r) for (r = 0; r < 4; ++r)
for (c = 0; c < 4; ++c) for (c = 0; c < 4; ++c) sum += input[r * stride + c];
sum += input[r * stride + c];
output[0] = sum << 1; output[0] = sum << 1;
} }
@ -169,8 +167,7 @@ void vpx_fdct8x8_c(const int16_t *input, tran_low_t *final_output, int stride) {
// Rows // Rows
for (i = 0; i < 8; ++i) { for (i = 0; i < 8; ++i) {
for (j = 0; j < 8; ++j) for (j = 0; j < 8; ++j) final_output[j + i * 8] /= 2;
final_output[j + i * 8] /= 2;
} }
} }
@ -178,8 +175,7 @@ void vpx_fdct8x8_1_c(const int16_t *input, tran_low_t *output, int stride) {
int r, c; int r, c;
tran_low_t sum = 0; tran_low_t sum = 0;
for (r = 0; r < 8; ++r) for (r = 0; r < 8; ++r)
for (c = 0; c < 8; ++c) for (c = 0; c < 8; ++c) sum += input[r * stride + c];
sum += input[r * stride + c];
output[0] = sum; output[0] = sum;
} }
@ -214,11 +210,11 @@ void vpx_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride) {
input[3] = (in_pass0[3 * stride] + in_pass0[12 * stride]) * 4; input[3] = (in_pass0[3 * stride] + in_pass0[12 * stride]) * 4;
input[4] = (in_pass0[4 * stride] + in_pass0[11 * stride]) * 4; input[4] = (in_pass0[4 * stride] + in_pass0[11 * stride]) * 4;
input[5] = (in_pass0[5 * stride] + in_pass0[10 * stride]) * 4; input[5] = (in_pass0[5 * stride] + in_pass0[10 * stride]) * 4;
input[6] = (in_pass0[6 * stride] + in_pass0[ 9 * stride]) * 4; input[6] = (in_pass0[6 * stride] + in_pass0[9 * stride]) * 4;
input[7] = (in_pass0[7 * stride] + in_pass0[ 8 * stride]) * 4; input[7] = (in_pass0[7 * stride] + in_pass0[8 * stride]) * 4;
// Calculate input for the next 8 results. // Calculate input for the next 8 results.
step1[0] = (in_pass0[7 * stride] - in_pass0[ 8 * stride]) * 4; step1[0] = (in_pass0[7 * stride] - in_pass0[8 * stride]) * 4;
step1[1] = (in_pass0[6 * stride] - in_pass0[ 9 * stride]) * 4; step1[1] = (in_pass0[6 * stride] - in_pass0[9 * stride]) * 4;
step1[2] = (in_pass0[5 * stride] - in_pass0[10 * stride]) * 4; step1[2] = (in_pass0[5 * stride] - in_pass0[10 * stride]) * 4;
step1[3] = (in_pass0[4 * stride] - in_pass0[11 * stride]) * 4; step1[3] = (in_pass0[4 * stride] - in_pass0[11 * stride]) * 4;
step1[4] = (in_pass0[3 * stride] - in_pass0[12 * stride]) * 4; step1[4] = (in_pass0[3 * stride] - in_pass0[12 * stride]) * 4;
@ -233,11 +229,11 @@ void vpx_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride) {
input[3] = ((in[3 * 16] + 1) >> 2) + ((in[12 * 16] + 1) >> 2); input[3] = ((in[3 * 16] + 1) >> 2) + ((in[12 * 16] + 1) >> 2);
input[4] = ((in[4 * 16] + 1) >> 2) + ((in[11 * 16] + 1) >> 2); input[4] = ((in[4 * 16] + 1) >> 2) + ((in[11 * 16] + 1) >> 2);
input[5] = ((in[5 * 16] + 1) >> 2) + ((in[10 * 16] + 1) >> 2); input[5] = ((in[5 * 16] + 1) >> 2) + ((in[10 * 16] + 1) >> 2);
input[6] = ((in[6 * 16] + 1) >> 2) + ((in[ 9 * 16] + 1) >> 2); input[6] = ((in[6 * 16] + 1) >> 2) + ((in[9 * 16] + 1) >> 2);
input[7] = ((in[7 * 16] + 1) >> 2) + ((in[ 8 * 16] + 1) >> 2); input[7] = ((in[7 * 16] + 1) >> 2) + ((in[8 * 16] + 1) >> 2);
// Calculate input for the next 8 results. // Calculate input for the next 8 results.
step1[0] = ((in[7 * 16] + 1) >> 2) - ((in[ 8 * 16] + 1) >> 2); step1[0] = ((in[7 * 16] + 1) >> 2) - ((in[8 * 16] + 1) >> 2);
step1[1] = ((in[6 * 16] + 1) >> 2) - ((in[ 9 * 16] + 1) >> 2); step1[1] = ((in[6 * 16] + 1) >> 2) - ((in[9 * 16] + 1) >> 2);
step1[2] = ((in[5 * 16] + 1) >> 2) - ((in[10 * 16] + 1) >> 2); step1[2] = ((in[5 * 16] + 1) >> 2) - ((in[10 * 16] + 1) >> 2);
step1[3] = ((in[4 * 16] + 1) >> 2) - ((in[11 * 16] + 1) >> 2); step1[3] = ((in[4 * 16] + 1) >> 2) - ((in[11 * 16] + 1) >> 2);
step1[4] = ((in[3 * 16] + 1) >> 2) - ((in[12 * 16] + 1) >> 2); step1[4] = ((in[3 * 16] + 1) >> 2) - ((in[12 * 16] + 1) >> 2);
@ -368,8 +364,7 @@ void vpx_fdct16x16_1_c(const int16_t *input, tran_low_t *output, int stride) {
int r, c; int r, c;
int sum = 0; int sum = 0;
for (r = 0; r < 16; ++r) for (r = 0; r < 16; ++r)
for (c = 0; c < 16; ++c) for (c = 0; c < 16; ++c) sum += input[r * stride + c];
sum += input[r * stride + c];
output[0] = (tran_low_t)(sum >> 1); output[0] = (tran_low_t)(sum >> 1);
} }
@ -717,8 +712,7 @@ void vpx_fdct32x32_c(const int16_t *input, tran_low_t *out, int stride) {
// Columns // Columns
for (i = 0; i < 32; ++i) { for (i = 0; i < 32; ++i) {
tran_high_t temp_in[32], temp_out[32]; tran_high_t temp_in[32], temp_out[32];
for (j = 0; j < 32; ++j) for (j = 0; j < 32; ++j) temp_in[j] = input[j * stride + i] * 4;
temp_in[j] = input[j * stride + i] * 4;
vpx_fdct32(temp_in, temp_out, 0); vpx_fdct32(temp_in, temp_out, 0);
for (j = 0; j < 32; ++j) for (j = 0; j < 32; ++j)
output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2; output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
@ -727,8 +721,7 @@ void vpx_fdct32x32_c(const int16_t *input, tran_low_t *out, int stride) {
// Rows // Rows
for (i = 0; i < 32; ++i) { for (i = 0; i < 32; ++i) {
tran_high_t temp_in[32], temp_out[32]; tran_high_t temp_in[32], temp_out[32];
for (j = 0; j < 32; ++j) for (j = 0; j < 32; ++j) temp_in[j] = output[j + i * 32];
temp_in[j] = output[j + i * 32];
vpx_fdct32(temp_in, temp_out, 0); vpx_fdct32(temp_in, temp_out, 0);
for (j = 0; j < 32; ++j) for (j = 0; j < 32; ++j)
out[j + i * 32] = out[j + i * 32] =
@ -746,8 +739,7 @@ void vpx_fdct32x32_rd_c(const int16_t *input, tran_low_t *out, int stride) {
// Columns // Columns
for (i = 0; i < 32; ++i) { for (i = 0; i < 32; ++i) {
tran_high_t temp_in[32], temp_out[32]; tran_high_t temp_in[32], temp_out[32];
for (j = 0; j < 32; ++j) for (j = 0; j < 32; ++j) temp_in[j] = input[j * stride + i] * 4;
temp_in[j] = input[j * stride + i] * 4;
vpx_fdct32(temp_in, temp_out, 0); vpx_fdct32(temp_in, temp_out, 0);
for (j = 0; j < 32; ++j) for (j = 0; j < 32; ++j)
// TODO(cd): see quality impact of only doing // TODO(cd): see quality impact of only doing
@ -759,11 +751,9 @@ void vpx_fdct32x32_rd_c(const int16_t *input, tran_low_t *out, int stride) {
// Rows // Rows
for (i = 0; i < 32; ++i) { for (i = 0; i < 32; ++i) {
tran_high_t temp_in[32], temp_out[32]; tran_high_t temp_in[32], temp_out[32];
for (j = 0; j < 32; ++j) for (j = 0; j < 32; ++j) temp_in[j] = output[j + i * 32];
temp_in[j] = output[j + i * 32];
vpx_fdct32(temp_in, temp_out, 1); vpx_fdct32(temp_in, temp_out, 1);
for (j = 0; j < 32; ++j) for (j = 0; j < 32; ++j) out[j + i * 32] = (tran_low_t)temp_out[j];
out[j + i * 32] = (tran_low_t)temp_out[j];
} }
} }
@ -771,8 +761,7 @@ void vpx_fdct32x32_1_c(const int16_t *input, tran_low_t *output, int stride) {
int r, c; int r, c;
int sum = 0; int sum = 0;
for (r = 0; r < 32; ++r) for (r = 0; r < 32; ++r)
for (c = 0; c < 32; ++c) for (c = 0; c < 32; ++c) sum += input[r * stride + c];
sum += input[r * stride + c];
output[0] = (tran_low_t)(sum >> 3); output[0] = (tran_low_t)(sum >> 3);
} }

View File

@ -14,17 +14,16 @@
#include "vpx_dsp/vpx_dsp_common.h" #include "vpx_dsp/vpx_dsp_common.h"
#include "vpx_mem/vpx_mem.h" #include "vpx_mem/vpx_mem.h"
#define DST(x, y) dst[(x) + (y) * stride] #define DST(x, y) dst[(x) + (y)*stride]
#define AVG3(a, b, c) (((a) + 2 * (b) + (c) + 2) >> 2) #define AVG3(a, b, c) (((a) + 2 * (b) + (c) + 2) >> 2)
#define AVG2(a, b) (((a) + (b) + 1) >> 1) #define AVG2(a, b) (((a) + (b) + 1) >> 1)
static INLINE void d207_predictor(uint8_t *dst, ptrdiff_t stride, int bs, static INLINE void d207_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
const uint8_t *above, const uint8_t *left) { const uint8_t *above, const uint8_t *left) {
int r, c; int r, c;
(void) above; (void)above;
// first column // first column
for (r = 0; r < bs - 1; ++r) for (r = 0; r < bs - 1; ++r) dst[r * stride] = AVG2(left[r], left[r + 1]);
dst[r * stride] = AVG2(left[r], left[r + 1]);
dst[(bs - 1) * stride] = left[bs - 1]; dst[(bs - 1) * stride] = left[bs - 1];
dst++; dst++;
@ -36,8 +35,7 @@ static INLINE void d207_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
dst++; dst++;
// rest of last row // rest of last row
for (c = 0; c < bs - 2; ++c) for (c = 0; c < bs - 2; ++c) dst[(bs - 1) * stride + c] = left[bs - 1];
dst[(bs - 1) * stride + c] = left[bs - 1];
for (r = bs - 2; r >= 0; --r) for (r = bs - 2; r >= 0; --r)
for (c = 0; c < bs - 2; ++c) for (c = 0; c < bs - 2; ++c)
@ -48,7 +46,7 @@ static INLINE void d207_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
static INLINE void d207e_predictor(uint8_t *dst, ptrdiff_t stride, int bs, static INLINE void d207e_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
const uint8_t *above, const uint8_t *left) { const uint8_t *above, const uint8_t *left) {
int r, c; int r, c;
(void) above; (void)above;
for (r = 0; r < bs; ++r) { for (r = 0; r < bs; ++r) {
for (c = 0; c < bs; ++c) { for (c = 0; c < bs; ++c) {
@ -82,7 +80,7 @@ static INLINE void d63_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
static INLINE void d63e_predictor(uint8_t *dst, ptrdiff_t stride, int bs, static INLINE void d63e_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
const uint8_t *above, const uint8_t *left) { const uint8_t *above, const uint8_t *left) {
int r, c; int r, c;
(void) left; (void)left;
for (r = 0; r < bs; ++r) { for (r = 0; r < bs; ++r) {
for (c = 0; c < bs; ++c) { for (c = 0; c < bs; ++c) {
dst[c] = r & 1 ? AVG3(above[(r >> 1) + c], above[(r >> 1) + c + 1], dst[c] = r & 1 ? AVG3(above[(r >> 1) + c], above[(r >> 1) + c + 1],
@ -117,7 +115,7 @@ static INLINE void d45_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
static INLINE void d45e_predictor(uint8_t *dst, ptrdiff_t stride, int bs, static INLINE void d45e_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
const uint8_t *above, const uint8_t *left) { const uint8_t *above, const uint8_t *left) {
int r, c; int r, c;
(void) left; (void)left;
for (r = 0; r < bs; ++r) { for (r = 0; r < bs; ++r) {
for (c = 0; c < bs; ++c) { for (c = 0; c < bs; ++c) {
dst[c] = AVG3(above[r + c], above[r + c + 1], dst[c] = AVG3(above[r + c], above[r + c + 1],
@ -133,14 +131,12 @@ static INLINE void d117_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
int r, c; int r, c;
// first row // first row
for (c = 0; c < bs; c++) for (c = 0; c < bs; c++) dst[c] = AVG2(above[c - 1], above[c]);
dst[c] = AVG2(above[c - 1], above[c]);
dst += stride; dst += stride;
// second row // second row
dst[0] = AVG3(left[0], above[-1], above[0]); dst[0] = AVG3(left[0], above[-1], above[0]);
for (c = 1; c < bs; c++) for (c = 1; c < bs; c++) dst[c] = AVG3(above[c - 2], above[c - 1], above[c]);
dst[c] = AVG3(above[c - 2], above[c - 1], above[c]);
dst += stride; dst += stride;
// the rest of first col // the rest of first col
@ -150,8 +146,7 @@ static INLINE void d117_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
// the rest of the block // the rest of the block
for (r = 2; r < bs; ++r) { for (r = 2; r < bs; ++r) {
for (c = 1; c < bs; c++) for (c = 1; c < bs; c++) dst[c] = dst[-2 * stride + c - 1];
dst[c] = dst[-2 * stride + c - 1];
dst += stride; dst += stride;
} }
} }
@ -188,8 +183,7 @@ static INLINE void d153_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
const uint8_t *above, const uint8_t *left) { const uint8_t *above, const uint8_t *left) {
int r, c; int r, c;
dst[0] = AVG2(above[-1], left[0]); dst[0] = AVG2(above[-1], left[0]);
for (r = 1; r < bs; r++) for (r = 1; r < bs; r++) dst[r * stride] = AVG2(left[r - 1], left[r]);
dst[r * stride] = AVG2(left[r - 1], left[r]);
dst++; dst++;
dst[0] = AVG3(left[0], above[-1], above[0]); dst[0] = AVG3(left[0], above[-1], above[0]);
@ -203,8 +197,7 @@ static INLINE void d153_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
dst += stride; dst += stride;
for (r = 1; r < bs; ++r) { for (r = 1; r < bs; ++r) {
for (c = 0; c < bs - 2; c++) for (c = 0; c < bs - 2; c++) dst[c] = dst[-stride + c - 2];
dst[c] = dst[-stride + c - 2];
dst += stride; dst += stride;
} }
} }
@ -212,7 +205,7 @@ static INLINE void d153_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
static INLINE void v_predictor(uint8_t *dst, ptrdiff_t stride, int bs, static INLINE void v_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
const uint8_t *above, const uint8_t *left) { const uint8_t *above, const uint8_t *left) {
int r; int r;
(void) left; (void)left;
for (r = 0; r < bs; r++) { for (r = 0; r < bs; r++) {
memcpy(dst, above, bs); memcpy(dst, above, bs);
@ -223,7 +216,7 @@ static INLINE void v_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
static INLINE void h_predictor(uint8_t *dst, ptrdiff_t stride, int bs, static INLINE void h_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
const uint8_t *above, const uint8_t *left) { const uint8_t *above, const uint8_t *left) {
int r; int r;
(void) above; (void)above;
for (r = 0; r < bs; r++) { for (r = 0; r < bs; r++) {
memset(dst, left[r], bs); memset(dst, left[r], bs);
@ -246,8 +239,8 @@ static INLINE void tm_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
static INLINE void dc_128_predictor(uint8_t *dst, ptrdiff_t stride, int bs, static INLINE void dc_128_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
const uint8_t *above, const uint8_t *left) { const uint8_t *above, const uint8_t *left) {
int r; int r;
(void) above; (void)above;
(void) left; (void)left;
for (r = 0; r < bs; r++) { for (r = 0; r < bs; r++) {
memset(dst, 128, bs); memset(dst, 128, bs);
@ -259,10 +252,9 @@ static INLINE void dc_left_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
const uint8_t *above, const uint8_t *above,
const uint8_t *left) { const uint8_t *left) {
int i, r, expected_dc, sum = 0; int i, r, expected_dc, sum = 0;
(void) above; (void)above;
for (i = 0; i < bs; i++) for (i = 0; i < bs; i++) sum += left[i];
sum += left[i];
expected_dc = (sum + (bs >> 1)) / bs; expected_dc = (sum + (bs >> 1)) / bs;
for (r = 0; r < bs; r++) { for (r = 0; r < bs; r++) {
@ -274,10 +266,9 @@ static INLINE void dc_left_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
static INLINE void dc_top_predictor(uint8_t *dst, ptrdiff_t stride, int bs, static INLINE void dc_top_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
const uint8_t *above, const uint8_t *left) { const uint8_t *above, const uint8_t *left) {
int i, r, expected_dc, sum = 0; int i, r, expected_dc, sum = 0;
(void) left; (void)left;
for (i = 0; i < bs; i++) for (i = 0; i < bs; i++) sum += above[i];
sum += above[i];
expected_dc = (sum + (bs >> 1)) / bs; expected_dc = (sum + (bs >> 1)) / bs;
for (r = 0; r < bs; r++) { for (r = 0; r < bs; r++) {
@ -350,8 +341,7 @@ void vpx_d207_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride,
DST(1, 0) = AVG3(I, J, K); DST(1, 0) = AVG3(I, J, K);
DST(3, 0) = DST(1, 1) = AVG3(J, K, L); DST(3, 0) = DST(1, 1) = AVG3(J, K, L);
DST(3, 1) = DST(1, 2) = AVG3(K, L, L); DST(3, 1) = DST(1, 2) = AVG3(K, L, L);
DST(3, 2) = DST(2, 2) = DST(3, 2) = DST(2, 2) = DST(0, 3) = DST(1, 3) = DST(2, 3) = DST(3, 3) = L;
DST(0, 3) = DST(1, 3) = DST(2, 3) = DST(3, 3) = L;
} }
void vpx_d63_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, void vpx_d63_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride,
@ -516,8 +506,8 @@ static INLINE void highbd_d207_predictor(uint16_t *dst, ptrdiff_t stride,
int bs, const uint16_t *above, int bs, const uint16_t *above,
const uint16_t *left, int bd) { const uint16_t *left, int bd) {
int r, c; int r, c;
(void) above; (void)above;
(void) bd; (void)bd;
// First column. // First column.
for (r = 0; r < bs - 1; ++r) { for (r = 0; r < bs - 1; ++r) {
@ -535,8 +525,7 @@ static INLINE void highbd_d207_predictor(uint16_t *dst, ptrdiff_t stride,
dst++; dst++;
// Rest of last row. // Rest of last row.
for (c = 0; c < bs - 2; ++c) for (c = 0; c < bs - 2; ++c) dst[(bs - 1) * stride + c] = left[bs - 1];
dst[(bs - 1) * stride + c] = left[bs - 1];
for (r = bs - 2; r >= 0; --r) { for (r = bs - 2; r >= 0; --r) {
for (c = 0; c < bs - 2; ++c) for (c = 0; c < bs - 2; ++c)
@ -549,8 +538,8 @@ static INLINE void highbd_d207e_predictor(uint16_t *dst, ptrdiff_t stride,
int bs, const uint16_t *above, int bs, const uint16_t *above,
const uint16_t *left, int bd) { const uint16_t *left, int bd) {
int r, c; int r, c;
(void) above; (void)above;
(void) bd; (void)bd;
for (r = 0; r < bs; ++r) { for (r = 0; r < bs; ++r) {
for (c = 0; c < bs; ++c) { for (c = 0; c < bs; ++c) {
@ -563,12 +552,12 @@ static INLINE void highbd_d207e_predictor(uint16_t *dst, ptrdiff_t stride,
} }
#endif // CONFIG_MISC_FIXES #endif // CONFIG_MISC_FIXES
static INLINE void highbd_d63_predictor(uint16_t *dst, ptrdiff_t stride, static INLINE void highbd_d63_predictor(uint16_t *dst, ptrdiff_t stride, int bs,
int bs, const uint16_t *above, const uint16_t *above,
const uint16_t *left, int bd) { const uint16_t *left, int bd) {
int r, c; int r, c;
(void) left; (void)left;
(void) bd; (void)bd;
for (r = 0; r < bs; ++r) { for (r = 0; r < bs; ++r) {
for (c = 0; c < bs; ++c) { for (c = 0; c < bs; ++c) {
dst[c] = r & 1 ? AVG3(above[(r >> 1) + c], above[(r >> 1) + c + 1], dst[c] = r & 1 ? AVG3(above[(r >> 1) + c], above[(r >> 1) + c + 1],
@ -585,12 +574,12 @@ static INLINE void highbd_d45_predictor(uint16_t *dst, ptrdiff_t stride, int bs,
const uint16_t *above, const uint16_t *above,
const uint16_t *left, int bd) { const uint16_t *left, int bd) {
int r, c; int r, c;
(void) left; (void)left;
(void) bd; (void)bd;
for (r = 0; r < bs; ++r) { for (r = 0; r < bs; ++r) {
for (c = 0; c < bs; ++c) { for (c = 0; c < bs; ++c) {
dst[c] = r + c + 2 < bs * 2 ? AVG3(above[r + c], above[r + c + 1], dst[c] = r + c + 2 < bs * 2
above[r + c + 2]) ? AVG3(above[r + c], above[r + c + 1], above[r + c + 2])
: above[bs * 2 - 1]; : above[bs * 2 - 1];
} }
dst += stride; dst += stride;
@ -602,8 +591,8 @@ static INLINE void highbd_d45e_predictor(uint16_t *dst, ptrdiff_t stride,
int bs, const uint16_t *above, int bs, const uint16_t *above,
const uint16_t *left, int bd) { const uint16_t *left, int bd) {
int r, c; int r, c;
(void) left; (void)left;
(void) bd; (void)bd;
for (r = 0; r < bs; ++r) { for (r = 0; r < bs; ++r) {
for (c = 0; c < bs; ++c) { for (c = 0; c < bs; ++c) {
dst[c] = AVG3(above[r + c], above[r + c + 1], dst[c] = AVG3(above[r + c], above[r + c + 1],
@ -618,17 +607,15 @@ static INLINE void highbd_d117_predictor(uint16_t *dst, ptrdiff_t stride,
int bs, const uint16_t *above, int bs, const uint16_t *above,
const uint16_t *left, int bd) { const uint16_t *left, int bd) {
int r, c; int r, c;
(void) bd; (void)bd;
// first row // first row
for (c = 0; c < bs; c++) for (c = 0; c < bs; c++) dst[c] = AVG2(above[c - 1], above[c]);
dst[c] = AVG2(above[c - 1], above[c]);
dst += stride; dst += stride;
// second row // second row
dst[0] = AVG3(left[0], above[-1], above[0]); dst[0] = AVG3(left[0], above[-1], above[0]);
for (c = 1; c < bs; c++) for (c = 1; c < bs; c++) dst[c] = AVG3(above[c - 2], above[c - 1], above[c]);
dst[c] = AVG3(above[c - 2], above[c - 1], above[c]);
dst += stride; dst += stride;
// the rest of first col // the rest of first col
@ -638,8 +625,7 @@ static INLINE void highbd_d117_predictor(uint16_t *dst, ptrdiff_t stride,
// the rest of the block // the rest of the block
for (r = 2; r < bs; ++r) { for (r = 2; r < bs; ++r) {
for (c = 1; c < bs; c++) for (c = 1; c < bs; c++) dst[c] = dst[-2 * stride + c - 1];
dst[c] = dst[-2 * stride + c - 1];
dst += stride; dst += stride;
} }
} }
@ -648,10 +634,9 @@ static INLINE void highbd_d135_predictor(uint16_t *dst, ptrdiff_t stride,
int bs, const uint16_t *above, int bs, const uint16_t *above,
const uint16_t *left, int bd) { const uint16_t *left, int bd) {
int r, c; int r, c;
(void) bd; (void)bd;
dst[0] = AVG3(left[0], above[-1], above[0]); dst[0] = AVG3(left[0], above[-1], above[0]);
for (c = 1; c < bs; c++) for (c = 1; c < bs; c++) dst[c] = AVG3(above[c - 2], above[c - 1], above[c]);
dst[c] = AVG3(above[c - 2], above[c - 1], above[c]);
dst[stride] = AVG3(above[-1], left[0], left[1]); dst[stride] = AVG3(above[-1], left[0], left[1]);
for (r = 2; r < bs; ++r) for (r = 2; r < bs; ++r)
@ -659,8 +644,7 @@ static INLINE void highbd_d135_predictor(uint16_t *dst, ptrdiff_t stride,
dst += stride; dst += stride;
for (r = 1; r < bs; ++r) { for (r = 1; r < bs; ++r) {
for (c = 1; c < bs; c++) for (c = 1; c < bs; c++) dst[c] = dst[-stride + c - 1];
dst[c] = dst[-stride + c - 1];
dst += stride; dst += stride;
} }
} }
@ -669,10 +653,9 @@ static INLINE void highbd_d153_predictor(uint16_t *dst, ptrdiff_t stride,
int bs, const uint16_t *above, int bs, const uint16_t *above,
const uint16_t *left, int bd) { const uint16_t *left, int bd) {
int r, c; int r, c;
(void) bd; (void)bd;
dst[0] = AVG2(above[-1], left[0]); dst[0] = AVG2(above[-1], left[0]);
for (r = 1; r < bs; r++) for (r = 1; r < bs; r++) dst[r * stride] = AVG2(left[r - 1], left[r]);
dst[r * stride] = AVG2(left[r - 1], left[r]);
dst++; dst++;
dst[0] = AVG3(left[0], above[-1], above[0]); dst[0] = AVG3(left[0], above[-1], above[0]);
@ -686,42 +669,41 @@ static INLINE void highbd_d153_predictor(uint16_t *dst, ptrdiff_t stride,
dst += stride; dst += stride;
for (r = 1; r < bs; ++r) { for (r = 1; r < bs; ++r) {
for (c = 0; c < bs - 2; c++) for (c = 0; c < bs - 2; c++) dst[c] = dst[-stride + c - 2];
dst[c] = dst[-stride + c - 2];
dst += stride; dst += stride;
} }
} }
static INLINE void highbd_v_predictor(uint16_t *dst, ptrdiff_t stride, static INLINE void highbd_v_predictor(uint16_t *dst, ptrdiff_t stride, int bs,
int bs, const uint16_t *above, const uint16_t *above,
const uint16_t *left, int bd) { const uint16_t *left, int bd) {
int r; int r;
(void) left; (void)left;
(void) bd; (void)bd;
for (r = 0; r < bs; r++) { for (r = 0; r < bs; r++) {
memcpy(dst, above, bs * sizeof(uint16_t)); memcpy(dst, above, bs * sizeof(uint16_t));
dst += stride; dst += stride;
} }
} }
static INLINE void highbd_h_predictor(uint16_t *dst, ptrdiff_t stride, static INLINE void highbd_h_predictor(uint16_t *dst, ptrdiff_t stride, int bs,
int bs, const uint16_t *above, const uint16_t *above,
const uint16_t *left, int bd) { const uint16_t *left, int bd) {
int r; int r;
(void) above; (void)above;
(void) bd; (void)bd;
for (r = 0; r < bs; r++) { for (r = 0; r < bs; r++) {
vpx_memset16(dst, left[r], bs); vpx_memset16(dst, left[r], bs);
dst += stride; dst += stride;
} }
} }
static INLINE void highbd_tm_predictor(uint16_t *dst, ptrdiff_t stride, static INLINE void highbd_tm_predictor(uint16_t *dst, ptrdiff_t stride, int bs,
int bs, const uint16_t *above, const uint16_t *above,
const uint16_t *left, int bd) { const uint16_t *left, int bd) {
int r, c; int r, c;
int ytop_left = above[-1]; int ytop_left = above[-1];
(void) bd; (void)bd;
for (r = 0; r < bs; r++) { for (r = 0; r < bs; r++) {
for (c = 0; c < bs; c++) for (c = 0; c < bs; c++)
@ -734,8 +716,8 @@ static INLINE void highbd_dc_128_predictor(uint16_t *dst, ptrdiff_t stride,
int bs, const uint16_t *above, int bs, const uint16_t *above,
const uint16_t *left, int bd) { const uint16_t *left, int bd) {
int r; int r;
(void) above; (void)above;
(void) left; (void)left;
for (r = 0; r < bs; r++) { for (r = 0; r < bs; r++) {
vpx_memset16(dst, 128 << (bd - 8), bs); vpx_memset16(dst, 128 << (bd - 8), bs);
@ -747,11 +729,10 @@ static INLINE void highbd_dc_left_predictor(uint16_t *dst, ptrdiff_t stride,
int bs, const uint16_t *above, int bs, const uint16_t *above,
const uint16_t *left, int bd) { const uint16_t *left, int bd) {
int i, r, expected_dc, sum = 0; int i, r, expected_dc, sum = 0;
(void) above; (void)above;
(void) bd; (void)bd;
for (i = 0; i < bs; i++) for (i = 0; i < bs; i++) sum += left[i];
sum += left[i];
expected_dc = (sum + (bs >> 1)) / bs; expected_dc = (sum + (bs >> 1)) / bs;
for (r = 0; r < bs; r++) { for (r = 0; r < bs; r++) {
@ -764,11 +745,10 @@ static INLINE void highbd_dc_top_predictor(uint16_t *dst, ptrdiff_t stride,
int bs, const uint16_t *above, int bs, const uint16_t *above,
const uint16_t *left, int bd) { const uint16_t *left, int bd) {
int i, r, expected_dc, sum = 0; int i, r, expected_dc, sum = 0;
(void) left; (void)left;
(void) bd; (void)bd;
for (i = 0; i < bs; i++) for (i = 0; i < bs; i++) sum += above[i];
sum += above[i];
expected_dc = (sum + (bs >> 1)) / bs; expected_dc = (sum + (bs >> 1)) / bs;
for (r = 0; r < bs; r++) { for (r = 0; r < bs; r++) {
@ -777,12 +757,12 @@ static INLINE void highbd_dc_top_predictor(uint16_t *dst, ptrdiff_t stride,
} }
} }
static INLINE void highbd_dc_predictor(uint16_t *dst, ptrdiff_t stride, static INLINE void highbd_dc_predictor(uint16_t *dst, ptrdiff_t stride, int bs,
int bs, const uint16_t *above, const uint16_t *above,
const uint16_t *left, int bd) { const uint16_t *left, int bd) {
int i, r, expected_dc, sum = 0; int i, r, expected_dc, sum = 0;
const int count = 2 * bs; const int count = 2 * bs;
(void) bd; (void)bd;
for (i = 0; i < bs; i++) { for (i = 0; i < bs; i++) {
sum += above[i]; sum += above[i];
@ -802,9 +782,8 @@ static INLINE void highbd_dc_predictor(uint16_t *dst, ptrdiff_t stride,
// can be unified and accessed as a pointer array. Note that the boundary // can be unified and accessed as a pointer array. Note that the boundary
// above and left are not necessarily used all the time. // above and left are not necessarily used all the time.
#define intra_pred_sized(type, size) \ #define intra_pred_sized(type, size) \
void vpx_##type##_predictor_##size##x##size##_c(uint8_t *dst, \ void vpx_##type##_predictor_##size##x##size##_c( \
ptrdiff_t stride, \ uint8_t *dst, ptrdiff_t stride, const uint8_t *above, \
const uint8_t *above, \
const uint8_t *left) { \ const uint8_t *left) { \
type##_predictor(dst, stride, size, above, left); \ type##_predictor(dst, stride, size, above, left); \
} }
@ -817,6 +796,7 @@ static INLINE void highbd_dc_predictor(uint16_t *dst, ptrdiff_t stride,
highbd_##type##_predictor(dst, stride, size, above, left, bd); \ highbd_##type##_predictor(dst, stride, size, above, left, bd); \
} }
/* clang-format off */
#define intra_pred_allsizes(type) \ #define intra_pred_allsizes(type) \
intra_pred_sized(type, 4) \ intra_pred_sized(type, 4) \
intra_pred_sized(type, 8) \ intra_pred_sized(type, 8) \
@ -867,4 +847,5 @@ intra_pred_allsizes(dc_128)
intra_pred_allsizes(dc_left) intra_pred_allsizes(dc_left)
intra_pred_allsizes(dc_top) intra_pred_allsizes(dc_top)
intra_pred_allsizes(dc) intra_pred_allsizes(dc)
/* clang-format on */
#undef intra_pred_allsizes #undef intra_pred_allsizes

View File

@ -15,7 +15,7 @@
#include "vpx_dsp/inv_txfm.h" #include "vpx_dsp/inv_txfm.h"
void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) { void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
/* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds, /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
0.5 shifts per pixel. */ 0.5 shifts per pixel. */
int i; int i;
tran_low_t output[16]; tran_low_t output[16];
@ -127,8 +127,7 @@ void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
// Columns // Columns
for (i = 0; i < 4; ++i) { for (i = 0; i < 4; ++i) {
for (j = 0; j < 4; ++j) for (j = 0; j < 4; ++j) temp_in[j] = out[j * 4 + i];
temp_in[j] = out[j * 4 + i];
idct4_c(temp_in, temp_out); idct4_c(temp_in, temp_out);
for (j = 0; j < 4; ++j) { for (j = 0; j < 4; ++j) {
dest[j * stride + i] = clip_pixel_add(dest[j * stride + i], dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
@ -223,8 +222,7 @@ void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
// Then transform columns // Then transform columns
for (i = 0; i < 8; ++i) { for (i = 0; i < 8; ++i) {
for (j = 0; j < 8; ++j) for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];
temp_in[j] = out[j * 8 + i];
idct8_c(temp_in, temp_out); idct8_c(temp_in, temp_out);
for (j = 0; j < 8; ++j) { for (j = 0; j < 8; ++j) {
dest[j * stride + i] = clip_pixel_add(dest[j * stride + i], dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
@ -240,8 +238,7 @@ void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
out = WRAPLOW(dct_const_round_shift(out * cospi_16_64)); out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
a1 = ROUND_POWER_OF_TWO(out, 5); a1 = ROUND_POWER_OF_TWO(out, 5);
for (j = 0; j < 8; ++j) { for (j = 0; j < 8; ++j) {
for (i = 0; i < 8; ++i) for (i = 0; i < 8; ++i) dest[i] = clip_pixel_add(dest[i], a1);
dest[i] = clip_pixel_add(dest[i], a1);
dest += stride; dest += stride;
} }
} }
@ -296,8 +293,8 @@ void iadst8_c(const tran_low_t *input, tran_low_t *output) {
tran_high_t x7 = input[6]; tran_high_t x7 = input[6];
if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) { if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {
output[0] = output[1] = output[2] = output[3] = output[4] output[0] = output[1] = output[2] = output[3] = output[4] = output[5] =
= output[5] = output[6] = output[7] = 0; output[6] = output[7] = 0;
return; return;
} }
@ -376,8 +373,7 @@ void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
// Then transform columns // Then transform columns
for (i = 0; i < 8; ++i) { for (i = 0; i < 8; ++i) {
for (j = 0; j < 8; ++j) for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];
temp_in[j] = out[j * 8 + i];
idct8_c(temp_in, temp_out); idct8_c(temp_in, temp_out);
for (j = 0; j < 8; ++j) { for (j = 0; j < 8; ++j) {
dest[j * stride + i] = clip_pixel_add(dest[j * stride + i], dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
@ -391,22 +387,22 @@ void idct16_c(const tran_low_t *input, tran_low_t *output) {
tran_high_t temp1, temp2; tran_high_t temp1, temp2;
// stage 1 // stage 1
step1[0] = input[0/2]; step1[0] = input[0 / 2];
step1[1] = input[16/2]; step1[1] = input[16 / 2];
step1[2] = input[8/2]; step1[2] = input[8 / 2];
step1[3] = input[24/2]; step1[3] = input[24 / 2];
step1[4] = input[4/2]; step1[4] = input[4 / 2];
step1[5] = input[20/2]; step1[5] = input[20 / 2];
step1[6] = input[12/2]; step1[6] = input[12 / 2];
step1[7] = input[28/2]; step1[7] = input[28 / 2];
step1[8] = input[2/2]; step1[8] = input[2 / 2];
step1[9] = input[18/2]; step1[9] = input[18 / 2];
step1[10] = input[10/2]; step1[10] = input[10 / 2];
step1[11] = input[26/2]; step1[11] = input[26 / 2];
step1[12] = input[6/2]; step1[12] = input[6 / 2];
step1[13] = input[22/2]; step1[13] = input[22 / 2];
step1[14] = input[14/2]; step1[14] = input[14 / 2];
step1[15] = input[30/2]; step1[15] = input[30 / 2];
// stage 2 // stage 2
step2[0] = step1[0]; step2[0] = step1[0];
@ -567,8 +563,7 @@ void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest,
// Then transform columns // Then transform columns
for (i = 0; i < 16; ++i) { for (i = 0; i < 16; ++i) {
for (j = 0; j < 16; ++j) for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
temp_in[j] = out[j * 16 + i];
idct16_c(temp_in, temp_out); idct16_c(temp_in, temp_out);
for (j = 0; j < 16; ++j) { for (j = 0; j < 16; ++j) {
dest[j * stride + i] = clip_pixel_add(dest[j * stride + i], dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
@ -598,12 +593,11 @@ void iadst16_c(const tran_low_t *input, tran_low_t *output) {
tran_high_t x14 = input[1]; tran_high_t x14 = input[1];
tran_high_t x15 = input[14]; tran_high_t x15 = input[14];
if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 | x9 | x10 | x11 | x12 |
| x9 | x10 | x11 | x12 | x13 | x14 | x15)) { x13 | x14 | x15)) {
output[0] = output[1] = output[2] = output[3] = output[4] output[0] = output[1] = output[2] = output[3] = output[4] = output[5] =
= output[5] = output[6] = output[7] = output[8] output[6] = output[7] = output[8] = output[9] = output[10] =
= output[9] = output[10] = output[11] = output[12] output[11] = output[12] = output[13] = output[14] = output[15] = 0;
= output[13] = output[14] = output[15] = 0;
return; return;
} }
@ -655,9 +649,9 @@ void iadst16_c(const tran_low_t *input, tran_low_t *output) {
s9 = x8 * cospi_28_64 - x9 * cospi_4_64; s9 = x8 * cospi_28_64 - x9 * cospi_4_64;
s10 = x10 * cospi_20_64 + x11 * cospi_12_64; s10 = x10 * cospi_20_64 + x11 * cospi_12_64;
s11 = x10 * cospi_12_64 - x11 * cospi_20_64; s11 = x10 * cospi_12_64 - x11 * cospi_20_64;
s12 = - x12 * cospi_28_64 + x13 * cospi_4_64; s12 = -x12 * cospi_28_64 + x13 * cospi_4_64;
s13 = x12 * cospi_4_64 + x13 * cospi_28_64; s13 = x12 * cospi_4_64 + x13 * cospi_28_64;
s14 = - x14 * cospi_12_64 + x15 * cospi_20_64; s14 = -x14 * cospi_12_64 + x15 * cospi_20_64;
s15 = x14 * cospi_20_64 + x15 * cospi_12_64; s15 = x14 * cospi_20_64 + x15 * cospi_12_64;
x0 = WRAPLOW(s0 + s4); x0 = WRAPLOW(s0 + s4);
@ -684,7 +678,7 @@ void iadst16_c(const tran_low_t *input, tran_low_t *output) {
s3 = x3; s3 = x3;
s4 = x4 * cospi_8_64 + x5 * cospi_24_64; s4 = x4 * cospi_8_64 + x5 * cospi_24_64;
s5 = x4 * cospi_24_64 - x5 * cospi_8_64; s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
s6 = - x6 * cospi_24_64 + x7 * cospi_8_64; s6 = -x6 * cospi_24_64 + x7 * cospi_8_64;
s7 = x6 * cospi_8_64 + x7 * cospi_24_64; s7 = x6 * cospi_8_64 + x7 * cospi_24_64;
s8 = x8; s8 = x8;
s9 = x9; s9 = x9;
@ -692,7 +686,7 @@ void iadst16_c(const tran_low_t *input, tran_low_t *output) {
s11 = x11; s11 = x11;
s12 = x12 * cospi_8_64 + x13 * cospi_24_64; s12 = x12 * cospi_8_64 + x13 * cospi_24_64;
s13 = x12 * cospi_24_64 - x13 * cospi_8_64; s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
s14 = - x14 * cospi_24_64 + x15 * cospi_8_64; s14 = -x14 * cospi_24_64 + x15 * cospi_8_64;
s15 = x14 * cospi_8_64 + x15 * cospi_24_64; s15 = x14 * cospi_8_64 + x15 * cospi_24_64;
x0 = WRAPLOW(s0 + s2); x0 = WRAPLOW(s0 + s2);
@ -713,13 +707,13 @@ void iadst16_c(const tran_low_t *input, tran_low_t *output) {
x15 = WRAPLOW(dct_const_round_shift(s13 - s15)); x15 = WRAPLOW(dct_const_round_shift(s13 - s15));
// stage 4 // stage 4
s2 = (- cospi_16_64) * (x2 + x3); s2 = (-cospi_16_64) * (x2 + x3);
s3 = cospi_16_64 * (x2 - x3); s3 = cospi_16_64 * (x2 - x3);
s6 = cospi_16_64 * (x6 + x7); s6 = cospi_16_64 * (x6 + x7);
s7 = cospi_16_64 * (- x6 + x7); s7 = cospi_16_64 * (-x6 + x7);
s10 = cospi_16_64 * (x10 + x11); s10 = cospi_16_64 * (x10 + x11);
s11 = cospi_16_64 * (- x10 + x11); s11 = cospi_16_64 * (-x10 + x11);
s14 = (- cospi_16_64) * (x14 + x15); s14 = (-cospi_16_64) * (x14 + x15);
s15 = cospi_16_64 * (x14 - x15); s15 = cospi_16_64 * (x14 - x15);
x2 = WRAPLOW(dct_const_round_shift(s2)); x2 = WRAPLOW(dct_const_round_shift(s2));
@ -766,8 +760,7 @@ void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest,
// Then transform columns // Then transform columns
for (i = 0; i < 16; ++i) { for (i = 0; i < 16; ++i) {
for (j = 0; j < 16; ++j) for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
temp_in[j] = out[j*16 + i];
idct16_c(temp_in, temp_out); idct16_c(temp_in, temp_out);
for (j = 0; j < 16; ++j) { for (j = 0; j < 16; ++j) {
dest[j * stride + i] = clip_pixel_add(dest[j * stride + i], dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
@ -783,8 +776,7 @@ void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
out = WRAPLOW(dct_const_round_shift(out * cospi_16_64)); out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
a1 = ROUND_POWER_OF_TWO(out, 6); a1 = ROUND_POWER_OF_TWO(out, 6);
for (j = 0; j < 16; ++j) { for (j = 0; j < 16; ++j) {
for (i = 0; i < 16; ++i) for (i = 0; i < 16; ++i) dest[i] = clip_pixel_add(dest[i], a1);
dest[i] = clip_pixel_add(dest[i], a1);
dest += stride; dest += stride;
} }
} }
@ -1166,8 +1158,7 @@ void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest,
// Rows // Rows
for (i = 0; i < 32; ++i) { for (i = 0; i < 32; ++i) {
int16_t zero_coeff[16]; int16_t zero_coeff[16];
for (j = 0; j < 16; ++j) for (j = 0; j < 16; ++j) zero_coeff[j] = input[2 * j] | input[2 * j + 1];
zero_coeff[j] = input[2 * j] | input[2 * j + 1];
for (j = 0; j < 8; ++j) for (j = 0; j < 8; ++j)
zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1]; zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
for (j = 0; j < 4; ++j) for (j = 0; j < 4; ++j)
@ -1185,8 +1176,7 @@ void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest,
// Columns // Columns
for (i = 0; i < 32; ++i) { for (i = 0; i < 32; ++i) {
for (j = 0; j < 32; ++j) for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i];
temp_in[j] = out[j * 32 + i];
idct32_c(temp_in, temp_out); idct32_c(temp_in, temp_out);
for (j = 0; j < 32; ++j) { for (j = 0; j < 32; ++j) {
dest[j * stride + i] = clip_pixel_add(dest[j * stride + i], dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
@ -1197,7 +1187,7 @@ void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest,
void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest,
int stride) { int stride) {
tran_low_t out[32 * 32] = {0}; tran_low_t out[32 * 32] = { 0 };
tran_low_t *outptr = out; tran_low_t *outptr = out;
int i, j; int i, j;
tran_low_t temp_in[32], temp_out[32]; tran_low_t temp_in[32], temp_out[32];
@ -1212,8 +1202,7 @@ void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest,
// Columns // Columns
for (i = 0; i < 32; ++i) { for (i = 0; i < 32; ++i) {
for (j = 0; j < 32; ++j) for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i];
temp_in[j] = out[j * 32 + i];
idct32_c(temp_in, temp_out); idct32_c(temp_in, temp_out);
for (j = 0; j < 32; ++j) { for (j = 0; j < 32; ++j) {
dest[j * stride + i] = clip_pixel_add(dest[j * stride + i], dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
@ -1224,7 +1213,7 @@ void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest,
void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest,
int stride) { int stride) {
tran_low_t out[32 * 32] = {0}; tran_low_t out[32 * 32] = { 0 };
tran_low_t *outptr = out; tran_low_t *outptr = out;
int i, j; int i, j;
tran_low_t temp_in[32], temp_out[32]; tran_low_t temp_in[32], temp_out[32];
@ -1239,8 +1228,7 @@ void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest,
// Columns // Columns
for (i = 0; i < 32; ++i) { for (i = 0; i < 32; ++i) {
for (j = 0; j < 32; ++j) for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i];
temp_in[j] = out[j * 32 + i];
idct32_c(temp_in, temp_out); idct32_c(temp_in, temp_out);
for (j = 0; j < 32; ++j) { for (j = 0; j < 32; ++j) {
dest[j * stride + i] = clip_pixel_add(dest[j * stride + i], dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
@ -1258,8 +1246,7 @@ void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
a1 = ROUND_POWER_OF_TWO(out, 6); a1 = ROUND_POWER_OF_TWO(out, 6);
for (j = 0; j < 32; ++j) { for (j = 0; j < 32; ++j) {
for (i = 0; i < 32; ++i) for (i = 0; i < 32; ++i) dest[i] = clip_pixel_add(dest[i], a1);
dest[i] = clip_pixel_add(dest[i], a1);
dest += stride; dest += stride;
} }
} }
@ -1309,14 +1296,14 @@ void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
c1 = e1 - c1; c1 = e1 - c1;
a1 -= b1; a1 -= b1;
d1 += c1; d1 += c1;
dest[stride * 0] = highbd_clip_pixel_add(dest[stride * 0], dest[stride * 0] =
HIGHBD_WRAPLOW(a1, bd), bd); highbd_clip_pixel_add(dest[stride * 0], HIGHBD_WRAPLOW(a1, bd), bd);
dest[stride * 1] = highbd_clip_pixel_add(dest[stride * 1], dest[stride * 1] =
HIGHBD_WRAPLOW(b1, bd), bd); highbd_clip_pixel_add(dest[stride * 1], HIGHBD_WRAPLOW(b1, bd), bd);
dest[stride * 2] = highbd_clip_pixel_add(dest[stride * 2], dest[stride * 2] =
HIGHBD_WRAPLOW(c1, bd), bd); highbd_clip_pixel_add(dest[stride * 2], HIGHBD_WRAPLOW(c1, bd), bd);
dest[stride * 3] = highbd_clip_pixel_add(dest[stride * 3], dest[stride * 3] =
HIGHBD_WRAPLOW(d1, bd), bd); highbd_clip_pixel_add(dest[stride * 3], HIGHBD_WRAPLOW(d1, bd), bd);
ip++; ip++;
dest++; dest++;
@ -1331,7 +1318,7 @@ void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest8,
const tran_low_t *ip = in; const tran_low_t *ip = in;
tran_low_t *op = tmp; tran_low_t *op = tmp;
uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
(void) bd; (void)bd;
a1 = ip[0] >> UNIT_QUANT_SHIFT; a1 = ip[0] >> UNIT_QUANT_SHIFT;
e1 = a1 >> 1; e1 = a1 >> 1;
@ -1343,14 +1330,14 @@ void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest8,
for (i = 0; i < 4; i++) { for (i = 0; i < 4; i++) {
e1 = ip[0] >> 1; e1 = ip[0] >> 1;
a1 = ip[0] - e1; a1 = ip[0] - e1;
dest[dest_stride * 0] = highbd_clip_pixel_add( dest[dest_stride * 0] =
dest[dest_stride * 0], a1, bd); highbd_clip_pixel_add(dest[dest_stride * 0], a1, bd);
dest[dest_stride * 1] = highbd_clip_pixel_add( dest[dest_stride * 1] =
dest[dest_stride * 1], e1, bd); highbd_clip_pixel_add(dest[dest_stride * 1], e1, bd);
dest[dest_stride * 2] = highbd_clip_pixel_add( dest[dest_stride * 2] =
dest[dest_stride * 2], e1, bd); highbd_clip_pixel_add(dest[dest_stride * 2], e1, bd);
dest[dest_stride * 3] = highbd_clip_pixel_add( dest[dest_stride * 3] =
dest[dest_stride * 3], e1, bd); highbd_clip_pixel_add(dest[dest_stride * 3], e1, bd);
ip++; ip++;
dest++; dest++;
} }
@ -1359,7 +1346,7 @@ void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest8,
void vpx_highbd_idct4_c(const tran_low_t *input, tran_low_t *output, int bd) { void vpx_highbd_idct4_c(const tran_low_t *input, tran_low_t *output, int bd) {
tran_low_t step[4]; tran_low_t step[4];
tran_high_t temp1, temp2; tran_high_t temp1, temp2;
(void) bd; (void)bd;
// stage 1 // stage 1
temp1 = (input[0] + input[2]) * cospi_16_64; temp1 = (input[0] + input[2]) * cospi_16_64;
temp2 = (input[0] - input[2]) * cospi_16_64; temp2 = (input[0] - input[2]) * cospi_16_64;
@ -1394,8 +1381,7 @@ void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
// Columns // Columns
for (i = 0; i < 4; ++i) { for (i = 0; i < 4; ++i) {
for (j = 0; j < 4; ++j) for (j = 0; j < 4; ++j) temp_in[j] = out[j * 4 + i];
temp_in[j] = out[j * 4 + i];
vpx_highbd_idct4_c(temp_in, temp_out, bd); vpx_highbd_idct4_c(temp_in, temp_out, bd);
for (j = 0; j < 4; ++j) { for (j = 0; j < 4; ++j) {
dest[j * stride + i] = highbd_clip_pixel_add( dest[j * stride + i] = highbd_clip_pixel_add(
@ -1408,8 +1394,8 @@ void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest8,
int dest_stride, int bd) { int dest_stride, int bd) {
int i; int i;
tran_high_t a1; tran_high_t a1;
tran_low_t out = HIGHBD_WRAPLOW( tran_low_t out =
highbd_dct_const_round_shift(input[0] * cospi_16_64), bd); HIGHBD_WRAPLOW(highbd_dct_const_round_shift(input[0] * cospi_16_64), bd);
uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
out = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64), bd); out = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64), bd);
@ -1486,8 +1472,7 @@ void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest8,
// Then transform columns. // Then transform columns.
for (i = 0; i < 8; ++i) { for (i = 0; i < 8; ++i) {
for (j = 0; j < 8; ++j) for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];
temp_in[j] = out[j * 8 + i];
vpx_highbd_idct8_c(temp_in, temp_out, bd); vpx_highbd_idct8_c(temp_in, temp_out, bd);
for (j = 0; j < 8; ++j) { for (j = 0; j < 8; ++j) {
dest[j * stride + i] = highbd_clip_pixel_add( dest[j * stride + i] = highbd_clip_pixel_add(
@ -1500,14 +1485,13 @@ void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest8,
int stride, int bd) { int stride, int bd) {
int i, j; int i, j;
tran_high_t a1; tran_high_t a1;
tran_low_t out = HIGHBD_WRAPLOW( tran_low_t out =
highbd_dct_const_round_shift(input[0] * cospi_16_64), bd); HIGHBD_WRAPLOW(highbd_dct_const_round_shift(input[0] * cospi_16_64), bd);
uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
out = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64), bd); out = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64), bd);
a1 = ROUND_POWER_OF_TWO(out, 5); a1 = ROUND_POWER_OF_TWO(out, 5);
for (j = 0; j < 8; ++j) { for (j = 0; j < 8; ++j) {
for (i = 0; i < 8; ++i) for (i = 0; i < 8; ++i) dest[i] = highbd_clip_pixel_add(dest[i], a1, bd);
dest[i] = highbd_clip_pixel_add(dest[i], a1, bd);
dest += stride; dest += stride;
} }
} }
@ -1519,7 +1503,7 @@ void vpx_highbd_iadst4_c(const tran_low_t *input, tran_low_t *output, int bd) {
tran_low_t x1 = input[1]; tran_low_t x1 = input[1];
tran_low_t x2 = input[2]; tran_low_t x2 = input[2];
tran_low_t x3 = input[3]; tran_low_t x3 = input[3];
(void) bd; (void)bd;
if (!(x0 | x1 | x2 | x3)) { if (!(x0 | x1 | x2 | x3)) {
memset(output, 0, 4 * sizeof(*output)); memset(output, 0, 4 * sizeof(*output));
@ -1561,7 +1545,7 @@ void vpx_highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd) {
tran_low_t x5 = input[4]; tran_low_t x5 = input[4];
tran_low_t x6 = input[1]; tran_low_t x6 = input[1];
tran_low_t x7 = input[6]; tran_low_t x7 = input[6];
(void) bd; (void)bd;
if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) { if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {
memset(output, 0, 8 * sizeof(*output)); memset(output, 0, 8 * sizeof(*output));
@ -1644,8 +1628,7 @@ void vpx_highbd_idct8x8_10_add_c(const tran_low_t *input, uint8_t *dest8,
} }
// Then transform columns. // Then transform columns.
for (i = 0; i < 8; ++i) { for (i = 0; i < 8; ++i) {
for (j = 0; j < 8; ++j) for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];
temp_in[j] = out[j * 8 + i];
vpx_highbd_idct8_c(temp_in, temp_out, bd); vpx_highbd_idct8_c(temp_in, temp_out, bd);
for (j = 0; j < 8; ++j) { for (j = 0; j < 8; ++j) {
dest[j * stride + i] = highbd_clip_pixel_add( dest[j * stride + i] = highbd_clip_pixel_add(
@ -1657,25 +1640,25 @@ void vpx_highbd_idct8x8_10_add_c(const tran_low_t *input, uint8_t *dest8,
void vpx_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd) { void vpx_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd) {
tran_low_t step1[16], step2[16]; tran_low_t step1[16], step2[16];
tran_high_t temp1, temp2; tran_high_t temp1, temp2;
(void) bd; (void)bd;
// stage 1 // stage 1
step1[0] = input[0/2]; step1[0] = input[0 / 2];
step1[1] = input[16/2]; step1[1] = input[16 / 2];
step1[2] = input[8/2]; step1[2] = input[8 / 2];
step1[3] = input[24/2]; step1[3] = input[24 / 2];
step1[4] = input[4/2]; step1[4] = input[4 / 2];
step1[5] = input[20/2]; step1[5] = input[20 / 2];
step1[6] = input[12/2]; step1[6] = input[12 / 2];
step1[7] = input[28/2]; step1[7] = input[28 / 2];
step1[8] = input[2/2]; step1[8] = input[2 / 2];
step1[9] = input[18/2]; step1[9] = input[18 / 2];
step1[10] = input[10/2]; step1[10] = input[10 / 2];
step1[11] = input[26/2]; step1[11] = input[26 / 2];
step1[12] = input[6/2]; step1[12] = input[6 / 2];
step1[13] = input[22/2]; step1[13] = input[22 / 2];
step1[14] = input[14/2]; step1[14] = input[14 / 2];
step1[15] = input[30/2]; step1[15] = input[30 / 2];
// stage 2 // stage 2
step2[0] = step1[0]; step2[0] = step1[0];
@ -1837,8 +1820,7 @@ void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest8,
// Then transform columns. // Then transform columns.
for (i = 0; i < 16; ++i) { for (i = 0; i < 16; ++i) {
for (j = 0; j < 16; ++j) for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
temp_in[j] = out[j * 16 + i];
vpx_highbd_idct16_c(temp_in, temp_out, bd); vpx_highbd_idct16_c(temp_in, temp_out, bd);
for (j = 0; j < 16; ++j) { for (j = 0; j < 16; ++j) {
dest[j * stride + i] = highbd_clip_pixel_add( dest[j * stride + i] = highbd_clip_pixel_add(
@ -1867,10 +1849,10 @@ void vpx_highbd_iadst16_c(const tran_low_t *input, tran_low_t *output, int bd) {
tran_low_t x13 = input[12]; tran_low_t x13 = input[12];
tran_low_t x14 = input[1]; tran_low_t x14 = input[1];
tran_low_t x15 = input[14]; tran_low_t x15 = input[14];
(void) bd; (void)bd;
if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 | x9 | x10 | x11 | x12 |
| x9 | x10 | x11 | x12 | x13 | x14 | x15)) { x13 | x14 | x15)) {
memset(output, 0, 16 * sizeof(*output)); memset(output, 0, 16 * sizeof(*output));
return; return;
} }
@ -1981,13 +1963,13 @@ void vpx_highbd_iadst16_c(const tran_low_t *input, tran_low_t *output, int bd) {
x15 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s13 - s15), bd); x15 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s13 - s15), bd);
// stage 4 // stage 4
s2 = (- cospi_16_64) * (x2 + x3); s2 = (-cospi_16_64) * (x2 + x3);
s3 = cospi_16_64 * (x2 - x3); s3 = cospi_16_64 * (x2 - x3);
s6 = cospi_16_64 * (x6 + x7); s6 = cospi_16_64 * (x6 + x7);
s7 = cospi_16_64 * (-x6 + x7); s7 = cospi_16_64 * (-x6 + x7);
s10 = cospi_16_64 * (x10 + x11); s10 = cospi_16_64 * (x10 + x11);
s11 = cospi_16_64 * (-x10 + x11); s11 = cospi_16_64 * (-x10 + x11);
s14 = (- cospi_16_64) * (x14 + x15); s14 = (-cospi_16_64) * (x14 + x15);
s15 = cospi_16_64 * (x14 - x15); s15 = cospi_16_64 * (x14 - x15);
x2 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s2), bd); x2 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s2), bd);
@ -2035,8 +2017,7 @@ void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest8,
// Then transform columns. // Then transform columns.
for (i = 0; i < 16; ++i) { for (i = 0; i < 16; ++i) {
for (j = 0; j < 16; ++j) for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
temp_in[j] = out[j*16 + i];
vpx_highbd_idct16_c(temp_in, temp_out, bd); vpx_highbd_idct16_c(temp_in, temp_out, bd);
for (j = 0; j < 16; ++j) { for (j = 0; j < 16; ++j) {
dest[j * stride + i] = highbd_clip_pixel_add( dest[j * stride + i] = highbd_clip_pixel_add(
@ -2049,24 +2030,23 @@ void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest8,
int stride, int bd) { int stride, int bd) {
int i, j; int i, j;
tran_high_t a1; tran_high_t a1;
tran_low_t out = HIGHBD_WRAPLOW( tran_low_t out =
highbd_dct_const_round_shift(input[0] * cospi_16_64), bd); HIGHBD_WRAPLOW(highbd_dct_const_round_shift(input[0] * cospi_16_64), bd);
uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
out = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64), bd); out = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64), bd);
a1 = ROUND_POWER_OF_TWO(out, 6); a1 = ROUND_POWER_OF_TWO(out, 6);
for (j = 0; j < 16; ++j) { for (j = 0; j < 16; ++j) {
for (i = 0; i < 16; ++i) for (i = 0; i < 16; ++i) dest[i] = highbd_clip_pixel_add(dest[i], a1, bd);
dest[i] = highbd_clip_pixel_add(dest[i], a1, bd);
dest += stride; dest += stride;
} }
} }
static void highbd_idct32_c(const tran_low_t *input, static void highbd_idct32_c(const tran_low_t *input, tran_low_t *output,
tran_low_t *output, int bd) { int bd) {
tran_low_t step1[32], step2[32]; tran_low_t step1[32], step2[32];
tran_high_t temp1, temp2; tran_high_t temp1, temp2;
(void) bd; (void)bd;
// stage 1 // stage 1
step1[0] = input[0]; step1[0] = input[0];
@ -2442,8 +2422,7 @@ void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest8,
// Rows // Rows
for (i = 0; i < 32; ++i) { for (i = 0; i < 32; ++i) {
tran_low_t zero_coeff[16]; tran_low_t zero_coeff[16];
for (j = 0; j < 16; ++j) for (j = 0; j < 16; ++j) zero_coeff[j] = input[2 * j] | input[2 * j + 1];
zero_coeff[j] = input[2 * j] | input[2 * j + 1];
for (j = 0; j < 8; ++j) for (j = 0; j < 8; ++j)
zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1]; zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
for (j = 0; j < 4; ++j) for (j = 0; j < 4; ++j)
@ -2461,8 +2440,7 @@ void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest8,
// Columns // Columns
for (i = 0; i < 32; ++i) { for (i = 0; i < 32; ++i) {
for (j = 0; j < 32; ++j) for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i];
temp_in[j] = out[j * 32 + i];
highbd_idct32_c(temp_in, temp_out, bd); highbd_idct32_c(temp_in, temp_out, bd);
for (j = 0; j < 32; ++j) { for (j = 0; j < 32; ++j) {
dest[j * stride + i] = highbd_clip_pixel_add( dest[j * stride + i] = highbd_clip_pixel_add(
@ -2473,7 +2451,7 @@ void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest8,
void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest8, void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest8,
int stride, int bd) { int stride, int bd) {
tran_low_t out[32 * 32] = {0}; tran_low_t out[32 * 32] = { 0 };
tran_low_t *outptr = out; tran_low_t *outptr = out;
int i, j; int i, j;
tran_low_t temp_in[32], temp_out[32]; tran_low_t temp_in[32], temp_out[32];
@ -2488,8 +2466,7 @@ void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest8,
} }
// Columns // Columns
for (i = 0; i < 32; ++i) { for (i = 0; i < 32; ++i) {
for (j = 0; j < 32; ++j) for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i];
temp_in[j] = out[j * 32 + i];
highbd_idct32_c(temp_in, temp_out, bd); highbd_idct32_c(temp_in, temp_out, bd);
for (j = 0; j < 32; ++j) { for (j = 0; j < 32; ++j) {
dest[j * stride + i] = highbd_clip_pixel_add( dest[j * stride + i] = highbd_clip_pixel_add(
@ -2504,14 +2481,13 @@ void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest8,
int a1; int a1;
uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
tran_low_t out = HIGHBD_WRAPLOW( tran_low_t out =
highbd_dct_const_round_shift(input[0] * cospi_16_64), bd); HIGHBD_WRAPLOW(highbd_dct_const_round_shift(input[0] * cospi_16_64), bd);
out = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64), bd); out = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64), bd);
a1 = ROUND_POWER_OF_TWO(out, 6); a1 = ROUND_POWER_OF_TWO(out, 6);
for (j = 0; j < 32; ++j) { for (j = 0; j < 32; ++j) {
for (i = 0; i < 32; ++i) for (i = 0; i < 32; ++i) dest[i] = highbd_clip_pixel_add(dest[i], a1, bd);
dest[i] = highbd_clip_pixel_add(dest[i], a1, bd);
dest += stride; dest += stride;
} }
} }

View File

@ -41,8 +41,7 @@ static INLINE tran_high_t dct_const_round_shift(tran_high_t input) {
} }
#if CONFIG_VP9_HIGHBITDEPTH #if CONFIG_VP9_HIGHBITDEPTH
static INLINE tran_high_t highbd_check_range(tran_high_t input, static INLINE tran_high_t highbd_check_range(tran_high_t input, int bd) {
int bd) {
#if CONFIG_COEFFICIENT_RANGE_CHECKING #if CONFIG_COEFFICIENT_RANGE_CHECKING
// For valid highbitdepth VP9 streams, intermediate stage coefficients will // For valid highbitdepth VP9 streams, intermediate stage coefficients will
// stay within the ranges: // stay within the ranges:
@ -53,9 +52,9 @@ static INLINE tran_high_t highbd_check_range(tran_high_t input,
const int32_t int_min = -int_max - 1; const int32_t int_min = -int_max - 1;
assert(int_min <= input); assert(int_min <= input);
assert(input <= int_max); assert(input <= int_max);
(void) int_min; (void)int_min;
#endif // CONFIG_COEFFICIENT_RANGE_CHECKING #endif // CONFIG_COEFFICIENT_RANGE_CHECKING
(void) bd; (void)bd;
return input; return input;
} }
@ -93,8 +92,7 @@ static INLINE tran_high_t highbd_dct_const_round_shift(tran_high_t input) {
#define WRAPLOW(x) ((int32_t)check_range(x)) #define WRAPLOW(x) ((int32_t)check_range(x))
#if CONFIG_VP9_HIGHBITDEPTH #if CONFIG_VP9_HIGHBITDEPTH
#define HIGHBD_WRAPLOW(x, bd) \ #define HIGHBD_WRAPLOW(x, bd) ((int32_t)highbd_check_range((x), bd))
((int32_t)highbd_check_range((x), bd))
#endif // CONFIG_VP9_HIGHBITDEPTH #endif // CONFIG_VP9_HIGHBITDEPTH
#endif // CONFIG_EMULATE_HARDWARE #endif // CONFIG_EMULATE_HARDWARE

View File

@ -22,23 +22,18 @@ static INLINE int8_t signed_char_clamp(int t) {
#if CONFIG_VP9_HIGHBITDEPTH #if CONFIG_VP9_HIGHBITDEPTH
static INLINE int16_t signed_char_clamp_high(int t, int bd) { static INLINE int16_t signed_char_clamp_high(int t, int bd) {
switch (bd) { switch (bd) {
case 10: case 10: return (int16_t)clamp(t, -128 * 4, 128 * 4 - 1);
return (int16_t)clamp(t, -128*4, 128*4-1); case 12: return (int16_t)clamp(t, -128 * 16, 128 * 16 - 1);
case 12:
return (int16_t)clamp(t, -128*16, 128*16-1);
case 8: case 8:
default: default: return (int16_t)clamp(t, -128, 128 - 1);
return (int16_t)clamp(t, -128, 128-1);
} }
} }
#endif #endif
// should we apply any filter at all: 11111111 yes, 00000000 no // should we apply any filter at all: 11111111 yes, 00000000 no
static INLINE int8_t filter_mask(uint8_t limit, uint8_t blimit, static INLINE int8_t filter_mask(uint8_t limit, uint8_t blimit, uint8_t p3,
uint8_t p3, uint8_t p2, uint8_t p2, uint8_t p1, uint8_t p0, uint8_t q0,
uint8_t p1, uint8_t p0, uint8_t q1, uint8_t q2, uint8_t q3) {
uint8_t q0, uint8_t q1,
uint8_t q2, uint8_t q3) {
int8_t mask = 0; int8_t mask = 0;
mask |= (abs(p3 - p2) > limit) * -1; mask |= (abs(p3 - p2) > limit) * -1;
mask |= (abs(p2 - p1) > limit) * -1; mask |= (abs(p2 - p1) > limit) * -1;
@ -50,10 +45,8 @@ static INLINE int8_t filter_mask(uint8_t limit, uint8_t blimit,
return ~mask; return ~mask;
} }
static INLINE int8_t flat_mask4(uint8_t thresh, static INLINE int8_t flat_mask4(uint8_t thresh, uint8_t p3, uint8_t p2,
uint8_t p3, uint8_t p2, uint8_t p1, uint8_t p0, uint8_t q0, uint8_t q1,
uint8_t p1, uint8_t p0,
uint8_t q0, uint8_t q1,
uint8_t q2, uint8_t q3) { uint8_t q2, uint8_t q3) {
int8_t mask = 0; int8_t mask = 0;
mask |= (abs(p1 - p0) > thresh) * -1; mask |= (abs(p1 - p0) > thresh) * -1;
@ -65,12 +58,10 @@ static INLINE int8_t flat_mask4(uint8_t thresh,
return ~mask; return ~mask;
} }
static INLINE int8_t flat_mask5(uint8_t thresh, static INLINE int8_t flat_mask5(uint8_t thresh, uint8_t p4, uint8_t p3,
uint8_t p4, uint8_t p3, uint8_t p2, uint8_t p1, uint8_t p0, uint8_t q0,
uint8_t p2, uint8_t p1, uint8_t q1, uint8_t q2, uint8_t q3,
uint8_t p0, uint8_t q0, uint8_t q4) {
uint8_t q1, uint8_t q2,
uint8_t q3, uint8_t q4) {
int8_t mask = ~flat_mask4(thresh, p3, p2, p1, p0, q0, q1, q2, q3); int8_t mask = ~flat_mask4(thresh, p3, p2, p1, p0, q0, q1, q2, q3);
mask |= (abs(p4 - p0) > thresh) * -1; mask |= (abs(p4 - p0) > thresh) * -1;
mask |= (abs(q4 - q0) > thresh) * -1; mask |= (abs(q4 - q0) > thresh) * -1;
@ -90,10 +81,10 @@ static INLINE void filter4(int8_t mask, uint8_t thresh, uint8_t *op1,
uint8_t *op0, uint8_t *oq0, uint8_t *oq1) { uint8_t *op0, uint8_t *oq0, uint8_t *oq1) {
int8_t filter1, filter2; int8_t filter1, filter2;
const int8_t ps1 = (int8_t) *op1 ^ 0x80; const int8_t ps1 = (int8_t)*op1 ^ 0x80;
const int8_t ps0 = (int8_t) *op0 ^ 0x80; const int8_t ps0 = (int8_t)*op0 ^ 0x80;
const int8_t qs0 = (int8_t) *oq0 ^ 0x80; const int8_t qs0 = (int8_t)*oq0 ^ 0x80;
const int8_t qs1 = (int8_t) *oq1 ^ 0x80; const int8_t qs1 = (int8_t)*oq1 ^ 0x80;
const uint8_t hev = hev_mask(thresh, *op1, *op0, *oq0, *oq1); const uint8_t hev = hev_mask(thresh, *op1, *op0, *oq0, *oq1);
// add outer taps if we have high edge variance // add outer taps if we have high edge variance
@ -128,8 +119,8 @@ void vpx_lpf_horizontal_4_c(uint8_t *s, int p /* pitch */,
for (i = 0; i < 8; ++i) { for (i = 0; i < 8; ++i) {
const uint8_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p]; const uint8_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p];
const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p]; const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p];
const int8_t mask = filter_mask(*limit, *blimit, const int8_t mask =
p3, p2, p1, p0, q0, q1, q2, q3); filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3);
filter4(mask, *thresh, s - 2 * p, s - 1 * p, s, s + 1 * p); filter4(mask, *thresh, s - 2 * p, s - 1 * p, s, s + 1 * p);
++s; ++s;
} }
@ -152,8 +143,8 @@ void vpx_lpf_vertical_4_c(uint8_t *s, int pitch, const uint8_t *blimit,
for (i = 0; i < 8; ++i) { for (i = 0; i < 8; ++i) {
const uint8_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1]; const uint8_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1];
const uint8_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3]; const uint8_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3];
const int8_t mask = filter_mask(*limit, *blimit, const int8_t mask =
p3, p2, p1, p0, q0, q1, q2, q3); filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3);
filter4(mask, *thresh, s - 2, s - 1, s, s + 1); filter4(mask, *thresh, s - 2, s - 1, s, s + 1);
s += pitch; s += pitch;
} }
@ -168,9 +159,8 @@ void vpx_lpf_vertical_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0,
} }
static INLINE void filter8(int8_t mask, uint8_t thresh, uint8_t flat, static INLINE void filter8(int8_t mask, uint8_t thresh, uint8_t flat,
uint8_t *op3, uint8_t *op2, uint8_t *op3, uint8_t *op2, uint8_t *op1,
uint8_t *op1, uint8_t *op0, uint8_t *op0, uint8_t *oq0, uint8_t *oq1,
uint8_t *oq0, uint8_t *oq1,
uint8_t *oq2, uint8_t *oq3) { uint8_t *oq2, uint8_t *oq3) {
if (flat && mask) { if (flat && mask) {
const uint8_t p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0; const uint8_t p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0;
@ -198,11 +188,11 @@ void vpx_lpf_horizontal_8_c(uint8_t *s, int p, const uint8_t *blimit,
const uint8_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p]; const uint8_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p];
const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p]; const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p];
const int8_t mask = filter_mask(*limit, *blimit, const int8_t mask =
p3, p2, p1, p0, q0, q1, q2, q3); filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3);
const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3); const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3);
filter8(mask, *thresh, flat, s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p, filter8(mask, *thresh, flat, s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p, s,
s, s + 1 * p, s + 2 * p, s + 3 * p); s + 1 * p, s + 2 * p, s + 3 * p);
++s; ++s;
} }
} }
@ -222,11 +212,11 @@ void vpx_lpf_vertical_8_c(uint8_t *s, int pitch, const uint8_t *blimit,
for (i = 0; i < 8; ++i) { for (i = 0; i < 8; ++i) {
const uint8_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1]; const uint8_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1];
const uint8_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3]; const uint8_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3];
const int8_t mask = filter_mask(*limit, *blimit, const int8_t mask =
p3, p2, p1, p0, q0, q1, q2, q3); filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3);
const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3); const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3);
filter8(mask, *thresh, flat, s - 4, s - 3, s - 2, s - 1, filter8(mask, *thresh, flat, s - 4, s - 3, s - 2, s - 1, s, s + 1, s + 2,
s, s + 1, s + 2, s + 3); s + 3);
s += pitch; s += pitch;
} }
} }
@ -239,52 +229,55 @@ void vpx_lpf_vertical_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0,
vpx_lpf_vertical_8_c(s + 8 * pitch, pitch, blimit1, limit1, thresh1); vpx_lpf_vertical_8_c(s + 8 * pitch, pitch, blimit1, limit1, thresh1);
} }
static INLINE void filter16(int8_t mask, uint8_t thresh, static INLINE void filter16(int8_t mask, uint8_t thresh, uint8_t flat,
uint8_t flat, uint8_t flat2, uint8_t flat2, uint8_t *op7, uint8_t *op6,
uint8_t *op7, uint8_t *op6, uint8_t *op5, uint8_t *op4, uint8_t *op3,
uint8_t *op5, uint8_t *op4, uint8_t *op2, uint8_t *op1, uint8_t *op0,
uint8_t *op3, uint8_t *op2, uint8_t *oq0, uint8_t *oq1, uint8_t *oq2,
uint8_t *op1, uint8_t *op0, uint8_t *oq3, uint8_t *oq4, uint8_t *oq5,
uint8_t *oq0, uint8_t *oq1,
uint8_t *oq2, uint8_t *oq3,
uint8_t *oq4, uint8_t *oq5,
uint8_t *oq6, uint8_t *oq7) { uint8_t *oq6, uint8_t *oq7) {
if (flat2 && flat && mask) { if (flat2 && flat && mask) {
const uint8_t p7 = *op7, p6 = *op6, p5 = *op5, p4 = *op4, const uint8_t p7 = *op7, p6 = *op6, p5 = *op5, p4 = *op4, p3 = *op3,
p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0; p2 = *op2, p1 = *op1, p0 = *op0;
const uint8_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3, const uint8_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3, q4 = *oq4,
q4 = *oq4, q5 = *oq5, q6 = *oq6, q7 = *oq7; q5 = *oq5, q6 = *oq6, q7 = *oq7;
// 15-tap filter [1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1] // 15-tap filter [1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1]
*op6 = ROUND_POWER_OF_TWO(p7 * 7 + p6 * 2 + p5 + p4 + p3 + p2 + p1 + p0 + *op6 = ROUND_POWER_OF_TWO(
q0, 4); p7 * 7 + p6 * 2 + p5 + p4 + p3 + p2 + p1 + p0 + q0, 4);
*op5 = ROUND_POWER_OF_TWO(p7 * 6 + p6 + p5 * 2 + p4 + p3 + p2 + p1 + p0 + *op5 = ROUND_POWER_OF_TWO(
q0 + q1, 4); p7 * 6 + p6 + p5 * 2 + p4 + p3 + p2 + p1 + p0 + q0 + q1, 4);
*op4 = ROUND_POWER_OF_TWO(p7 * 5 + p6 + p5 + p4 * 2 + p3 + p2 + p1 + p0 + *op4 = ROUND_POWER_OF_TWO(
q0 + q1 + q2, 4); p7 * 5 + p6 + p5 + p4 * 2 + p3 + p2 + p1 + p0 + q0 + q1 + q2, 4);
*op3 = ROUND_POWER_OF_TWO(p7 * 4 + p6 + p5 + p4 + p3 * 2 + p2 + p1 + p0 + *op3 = ROUND_POWER_OF_TWO(
q0 + q1 + q2 + q3, 4); p7 * 4 + p6 + p5 + p4 + p3 * 2 + p2 + p1 + p0 + q0 + q1 + q2 + q3, 4);
*op2 = ROUND_POWER_OF_TWO(p7 * 3 + p6 + p5 + p4 + p3 + p2 * 2 + p1 + p0 + *op2 = ROUND_POWER_OF_TWO(
q0 + q1 + q2 + q3 + q4, 4); p7 * 3 + p6 + p5 + p4 + p3 + p2 * 2 + p1 + p0 + q0 + q1 + q2 + q3 + q4,
4);
*op1 = ROUND_POWER_OF_TWO(p7 * 2 + p6 + p5 + p4 + p3 + p2 + p1 * 2 + p0 + *op1 = ROUND_POWER_OF_TWO(p7 * 2 + p6 + p5 + p4 + p3 + p2 + p1 * 2 + p0 +
q0 + q1 + q2 + q3 + q4 + q5, 4); q0 + q1 + q2 + q3 + q4 + q5,
*op0 = ROUND_POWER_OF_TWO(p7 + p6 + p5 + p4 + p3 + p2 + p1 + p0 * 2 + 4);
q0 + q1 + q2 + q3 + q4 + q5 + q6, 4); *op0 = ROUND_POWER_OF_TWO(p7 + p6 + p5 + p4 + p3 + p2 + p1 + p0 * 2 + q0 +
*oq0 = ROUND_POWER_OF_TWO(p6 + p5 + p4 + p3 + p2 + p1 + p0 + q1 + q2 + q3 + q4 + q5 + q6,
q0 * 2 + q1 + q2 + q3 + q4 + q5 + q6 + q7, 4); 4);
*oq1 = ROUND_POWER_OF_TWO(p5 + p4 + p3 + p2 + p1 + p0 + *oq0 = ROUND_POWER_OF_TWO(p6 + p5 + p4 + p3 + p2 + p1 + p0 + q0 * 2 + q1 +
q0 + q1 * 2 + q2 + q3 + q4 + q5 + q6 + q7 * 2, 4); q2 + q3 + q4 + q5 + q6 + q7,
*oq2 = ROUND_POWER_OF_TWO(p4 + p3 + p2 + p1 + p0 + 4);
q0 + q1 + q2 * 2 + q3 + q4 + q5 + q6 + q7 * 3, 4); *oq1 = ROUND_POWER_OF_TWO(p5 + p4 + p3 + p2 + p1 + p0 + q0 + q1 * 2 + q2 +
*oq3 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + p0 + q3 + q4 + q5 + q6 + q7 * 2,
q0 + q1 + q2 + q3 * 2 + q4 + q5 + q6 + q7 * 4, 4); 4);
*oq4 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + *oq2 = ROUND_POWER_OF_TWO(
q0 + q1 + q2 + q3 + q4 * 2 + q5 + q6 + q7 * 5, 4); p4 + p3 + p2 + p1 + p0 + q0 + q1 + q2 * 2 + q3 + q4 + q5 + q6 + q7 * 3,
*oq5 = ROUND_POWER_OF_TWO(p1 + p0 + 4);
q0 + q1 + q2 + q3 + q4 + q5 * 2 + q6 + q7 * 6, 4); *oq3 = ROUND_POWER_OF_TWO(
*oq6 = ROUND_POWER_OF_TWO(p0 + p3 + p2 + p1 + p0 + q0 + q1 + q2 + q3 * 2 + q4 + q5 + q6 + q7 * 4, 4);
q0 + q1 + q2 + q3 + q4 + q5 + q6 * 2 + q7 * 7, 4); *oq4 = ROUND_POWER_OF_TWO(
p2 + p1 + p0 + q0 + q1 + q2 + q3 + q4 * 2 + q5 + q6 + q7 * 5, 4);
*oq5 = ROUND_POWER_OF_TWO(
p1 + p0 + q0 + q1 + q2 + q3 + q4 + q5 * 2 + q6 + q7 * 6, 4);
*oq6 = ROUND_POWER_OF_TWO(
p0 + q0 + q1 + q2 + q3 + q4 + q5 + q6 * 2 + q7 * 7, 4);
} else { } else {
filter8(mask, thresh, flat, op3, op2, op1, op0, oq0, oq1, oq2, oq3); filter8(mask, thresh, flat, op3, op2, op1, op0, oq0, oq1, oq2, oq3);
} }
@ -300,18 +293,17 @@ static void mb_lpf_horizontal_edge_w(uint8_t *s, int p, const uint8_t *blimit,
for (i = 0; i < 8 * count; ++i) { for (i = 0; i < 8 * count; ++i) {
const uint8_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p]; const uint8_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p];
const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p]; const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p];
const int8_t mask = filter_mask(*limit, *blimit, const int8_t mask =
p3, p2, p1, p0, q0, q1, q2, q3); filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3);
const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3); const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3);
const int8_t flat2 = flat_mask5(1, const int8_t flat2 =
s[-8 * p], s[-7 * p], s[-6 * p], s[-5 * p], p0, flat_mask5(1, s[-8 * p], s[-7 * p], s[-6 * p], s[-5 * p], p0, q0,
q0, s[4 * p], s[5 * p], s[6 * p], s[7 * p]); s[4 * p], s[5 * p], s[6 * p], s[7 * p]);
filter16(mask, *thresh, flat, flat2, filter16(mask, *thresh, flat, flat2, s - 8 * p, s - 7 * p, s - 6 * p,
s - 8 * p, s - 7 * p, s - 6 * p, s - 5 * p, s - 5 * p, s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p, s,
s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p, s + 1 * p, s + 2 * p, s + 3 * p, s + 4 * p, s + 5 * p, s + 6 * p,
s, s + 1 * p, s + 2 * p, s + 3 * p, s + 7 * p);
s + 4 * p, s + 5 * p, s + 6 * p, s + 7 * p);
++s; ++s;
} }
} }
@ -326,25 +318,23 @@ void vpx_lpf_horizontal_edge_16_c(uint8_t *s, int p, const uint8_t *blimit,
mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 2); mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 2);
} }
static void mb_lpf_vertical_edge_w(uint8_t *s, int p, static void mb_lpf_vertical_edge_w(uint8_t *s, int p, const uint8_t *blimit,
const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh,
const uint8_t *limit,
const uint8_t *thresh,
int count) { int count) {
int i; int i;
for (i = 0; i < count; ++i) { for (i = 0; i < count; ++i) {
const uint8_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1]; const uint8_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1];
const uint8_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3]; const uint8_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3];
const int8_t mask = filter_mask(*limit, *blimit, const int8_t mask =
p3, p2, p1, p0, q0, q1, q2, q3); filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3);
const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3); const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3);
const int8_t flat2 = flat_mask5(1, s[-8], s[-7], s[-6], s[-5], p0, const int8_t flat2 = flat_mask5(1, s[-8], s[-7], s[-6], s[-5], p0, q0, s[4],
q0, s[4], s[5], s[6], s[7]); s[5], s[6], s[7]);
filter16(mask, *thresh, flat, flat2, filter16(mask, *thresh, flat, flat2, s - 8, s - 7, s - 6, s - 5, s - 4,
s - 8, s - 7, s - 6, s - 5, s - 4, s - 3, s - 2, s - 1, s - 3, s - 2, s - 1, s, s + 1, s + 2, s + 3, s + 4, s + 5, s + 6,
s, s + 1, s + 2, s + 3, s + 4, s + 5, s + 6, s + 7); s + 7);
s += p; s += p;
} }
} }
@ -362,9 +352,8 @@ void vpx_lpf_vertical_16_dual_c(uint8_t *s, int p, const uint8_t *blimit,
#if CONFIG_VP9_HIGHBITDEPTH #if CONFIG_VP9_HIGHBITDEPTH
// Should we apply any filter at all: 11111111 yes, 00000000 no ? // Should we apply any filter at all: 11111111 yes, 00000000 no ?
static INLINE int8_t highbd_filter_mask(uint8_t limit, uint8_t blimit, static INLINE int8_t highbd_filter_mask(uint8_t limit, uint8_t blimit,
uint16_t p3, uint16_t p2, uint16_t p3, uint16_t p2, uint16_t p1,
uint16_t p1, uint16_t p0, uint16_t p0, uint16_t q0, uint16_t q1,
uint16_t q0, uint16_t q1,
uint16_t q2, uint16_t q3, int bd) { uint16_t q2, uint16_t q3, int bd) {
int8_t mask = 0; int8_t mask = 0;
int16_t limit16 = (uint16_t)limit << (bd - 8); int16_t limit16 = (uint16_t)limit << (bd - 8);
@ -379,11 +368,10 @@ static INLINE int8_t highbd_filter_mask(uint8_t limit, uint8_t blimit,
return ~mask; return ~mask;
} }
static INLINE int8_t highbd_flat_mask4(uint8_t thresh, static INLINE int8_t highbd_flat_mask4(uint8_t thresh, uint16_t p3, uint16_t p2,
uint16_t p3, uint16_t p2, uint16_t p1, uint16_t p0, uint16_t q0,
uint16_t p1, uint16_t p0, uint16_t q1, uint16_t q2, uint16_t q3,
uint16_t q0, uint16_t q1, int bd) {
uint16_t q2, uint16_t q3, int bd) {
int8_t mask = 0; int8_t mask = 0;
int16_t thresh16 = (uint16_t)thresh << (bd - 8); int16_t thresh16 = (uint16_t)thresh << (bd - 8);
mask |= (abs(p1 - p0) > thresh16) * -1; mask |= (abs(p1 - p0) > thresh16) * -1;
@ -395,11 +383,9 @@ static INLINE int8_t highbd_flat_mask4(uint8_t thresh,
return ~mask; return ~mask;
} }
static INLINE int8_t highbd_flat_mask5(uint8_t thresh, static INLINE int8_t highbd_flat_mask5(uint8_t thresh, uint16_t p4, uint16_t p3,
uint16_t p4, uint16_t p3, uint16_t p2, uint16_t p1, uint16_t p0,
uint16_t p2, uint16_t p1, uint16_t q0, uint16_t q1, uint16_t q2,
uint16_t p0, uint16_t q0,
uint16_t q1, uint16_t q2,
uint16_t q3, uint16_t q4, int bd) { uint16_t q3, uint16_t q4, int bd) {
int8_t mask = ~highbd_flat_mask4(thresh, p3, p2, p1, p0, q0, q1, q2, q3, bd); int8_t mask = ~highbd_flat_mask4(thresh, p3, p2, p1, p0, q0, q1, q2, q3, bd);
int16_t thresh16 = (uint16_t)thresh << (bd - 8); int16_t thresh16 = (uint16_t)thresh << (bd - 8);
@ -470,21 +456,17 @@ void vpx_highbd_lpf_horizontal_4_c(uint16_t *s, int p /* pitch */,
const uint16_t q1 = s[1 * p]; const uint16_t q1 = s[1 * p];
const uint16_t q2 = s[2 * p]; const uint16_t q2 = s[2 * p];
const uint16_t q3 = s[3 * p]; const uint16_t q3 = s[3 * p];
const int8_t mask = highbd_filter_mask(*limit, *blimit, const int8_t mask =
p3, p2, p1, p0, q0, q1, q2, q3, bd); highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd);
highbd_filter4(mask, *thresh, s - 2 * p, s - 1 * p, s, s + 1 * p, bd); highbd_filter4(mask, *thresh, s - 2 * p, s - 1 * p, s, s + 1 * p, bd);
++s; ++s;
} }
} }
void vpx_highbd_lpf_horizontal_4_dual_c(uint16_t *s, int p, void vpx_highbd_lpf_horizontal_4_dual_c(
const uint8_t *blimit0, uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
const uint8_t *thresh0, const uint8_t *thresh1, int bd) {
const uint8_t *blimit1,
const uint8_t *limit1,
const uint8_t *thresh1,
int bd) {
vpx_highbd_lpf_horizontal_4_c(s, p, blimit0, limit0, thresh0, bd); vpx_highbd_lpf_horizontal_4_c(s, p, blimit0, limit0, thresh0, bd);
vpx_highbd_lpf_horizontal_4_c(s + 8, p, blimit1, limit1, thresh1, bd); vpx_highbd_lpf_horizontal_4_c(s + 8, p, blimit1, limit1, thresh1, bd);
} }
@ -499,30 +481,25 @@ void vpx_highbd_lpf_vertical_4_c(uint16_t *s, int pitch, const uint8_t *blimit,
for (i = 0; i < 8; ++i) { for (i = 0; i < 8; ++i) {
const uint16_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1]; const uint16_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1];
const uint16_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3]; const uint16_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3];
const int8_t mask = highbd_filter_mask(*limit, *blimit, const int8_t mask =
p3, p2, p1, p0, q0, q1, q2, q3, bd); highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd);
highbd_filter4(mask, *thresh, s - 2, s - 1, s, s + 1, bd); highbd_filter4(mask, *thresh, s - 2, s - 1, s, s + 1, bd);
s += pitch; s += pitch;
} }
} }
void vpx_highbd_lpf_vertical_4_dual_c(uint16_t *s, int pitch, void vpx_highbd_lpf_vertical_4_dual_c(
const uint8_t *blimit0, uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
const uint8_t *thresh0, const uint8_t *thresh1, int bd) {
const uint8_t *blimit1,
const uint8_t *limit1,
const uint8_t *thresh1,
int bd) {
vpx_highbd_lpf_vertical_4_c(s, pitch, blimit0, limit0, thresh0, bd); vpx_highbd_lpf_vertical_4_c(s, pitch, blimit0, limit0, thresh0, bd);
vpx_highbd_lpf_vertical_4_c(s + 8 * pitch, pitch, blimit1, limit1, vpx_highbd_lpf_vertical_4_c(s + 8 * pitch, pitch, blimit1, limit1, thresh1,
thresh1, bd); bd);
} }
static INLINE void highbd_filter8(int8_t mask, uint8_t thresh, uint8_t flat, static INLINE void highbd_filter8(int8_t mask, uint8_t thresh, uint8_t flat,
uint16_t *op3, uint16_t *op2, uint16_t *op3, uint16_t *op2, uint16_t *op1,
uint16_t *op1, uint16_t *op0, uint16_t *op0, uint16_t *oq0, uint16_t *oq1,
uint16_t *oq0, uint16_t *oq1,
uint16_t *oq2, uint16_t *oq3, int bd) { uint16_t *oq2, uint16_t *oq3, int bd) {
if (flat && mask) { if (flat && mask) {
const uint16_t p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0; const uint16_t p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0;
@ -551,25 +528,20 @@ void vpx_highbd_lpf_horizontal_8_c(uint16_t *s, int p, const uint8_t *blimit,
const uint16_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p]; const uint16_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p];
const uint16_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p]; const uint16_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p];
const int8_t mask = highbd_filter_mask(*limit, *blimit, const int8_t mask =
p3, p2, p1, p0, q0, q1, q2, q3, bd); highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd);
const int8_t flat = highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, const int8_t flat =
bd); highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, bd);
highbd_filter8(mask, *thresh, flat, highbd_filter8(mask, *thresh, flat, s - 4 * p, s - 3 * p, s - 2 * p,
s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p, s - 1 * p, s, s + 1 * p, s + 2 * p, s + 3 * p, bd);
s, s + 1 * p, s + 2 * p, s + 3 * p, bd);
++s; ++s;
} }
} }
void vpx_highbd_lpf_horizontal_8_dual_c(uint16_t *s, int p, void vpx_highbd_lpf_horizontal_8_dual_c(
const uint8_t *blimit0, uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
const uint8_t *thresh0, const uint8_t *thresh1, int bd) {
const uint8_t *blimit1,
const uint8_t *limit1,
const uint8_t *thresh1,
int bd) {
vpx_highbd_lpf_horizontal_8_c(s, p, blimit0, limit0, thresh0, bd); vpx_highbd_lpf_horizontal_8_c(s, p, blimit0, limit0, thresh0, bd);
vpx_highbd_lpf_horizontal_8_c(s + 8, p, blimit1, limit1, thresh1, bd); vpx_highbd_lpf_horizontal_8_c(s + 8, p, blimit1, limit1, thresh1, bd);
} }
@ -582,40 +554,31 @@ void vpx_highbd_lpf_vertical_8_c(uint16_t *s, int pitch, const uint8_t *blimit,
for (i = 0; i < 8; ++i) { for (i = 0; i < 8; ++i) {
const uint16_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1]; const uint16_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1];
const uint16_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3]; const uint16_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3];
const int8_t mask = highbd_filter_mask(*limit, *blimit, const int8_t mask =
p3, p2, p1, p0, q0, q1, q2, q3, bd); highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd);
const int8_t flat = highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, const int8_t flat =
bd); highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, bd);
highbd_filter8(mask, *thresh, flat, highbd_filter8(mask, *thresh, flat, s - 4, s - 3, s - 2, s - 1, s, s + 1,
s - 4, s - 3, s - 2, s - 1, s + 2, s + 3, bd);
s, s + 1, s + 2, s + 3,
bd);
s += pitch; s += pitch;
} }
} }
void vpx_highbd_lpf_vertical_8_dual_c(uint16_t *s, int pitch, void vpx_highbd_lpf_vertical_8_dual_c(
const uint8_t *blimit0, uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
const uint8_t *thresh0, const uint8_t *thresh1, int bd) {
const uint8_t *blimit1,
const uint8_t *limit1,
const uint8_t *thresh1,
int bd) {
vpx_highbd_lpf_vertical_8_c(s, pitch, blimit0, limit0, thresh0, bd); vpx_highbd_lpf_vertical_8_c(s, pitch, blimit0, limit0, thresh0, bd);
vpx_highbd_lpf_vertical_8_c(s + 8 * pitch, pitch, blimit1, limit1, vpx_highbd_lpf_vertical_8_c(s + 8 * pitch, pitch, blimit1, limit1, thresh1,
thresh1, bd); bd);
} }
static INLINE void highbd_filter16(int8_t mask, uint8_t thresh, static INLINE void highbd_filter16(int8_t mask, uint8_t thresh, uint8_t flat,
uint8_t flat, uint8_t flat2, uint8_t flat2, uint16_t *op7, uint16_t *op6,
uint16_t *op7, uint16_t *op6, uint16_t *op5, uint16_t *op4, uint16_t *op3,
uint16_t *op5, uint16_t *op4, uint16_t *op2, uint16_t *op1, uint16_t *op0,
uint16_t *op3, uint16_t *op2, uint16_t *oq0, uint16_t *oq1, uint16_t *oq2,
uint16_t *op1, uint16_t *op0, uint16_t *oq3, uint16_t *oq4, uint16_t *oq5,
uint16_t *oq0, uint16_t *oq1,
uint16_t *oq2, uint16_t *oq3,
uint16_t *oq4, uint16_t *oq5,
uint16_t *oq6, uint16_t *oq7, int bd) { uint16_t *oq6, uint16_t *oq7, int bd) {
if (flat2 && flat && mask) { if (flat2 && flat && mask) {
const uint16_t p7 = *op7; const uint16_t p7 = *op7;
@ -636,34 +599,40 @@ static INLINE void highbd_filter16(int8_t mask, uint8_t thresh,
const uint16_t q7 = *oq7; const uint16_t q7 = *oq7;
// 15-tap filter [1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1] // 15-tap filter [1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1]
*op6 = ROUND_POWER_OF_TWO(p7 * 7 + p6 * 2 + p5 + p4 + p3 + p2 + p1 + p0 + *op6 = ROUND_POWER_OF_TWO(
q0, 4); p7 * 7 + p6 * 2 + p5 + p4 + p3 + p2 + p1 + p0 + q0, 4);
*op5 = ROUND_POWER_OF_TWO(p7 * 6 + p6 + p5 * 2 + p4 + p3 + p2 + p1 + p0 + *op5 = ROUND_POWER_OF_TWO(
q0 + q1, 4); p7 * 6 + p6 + p5 * 2 + p4 + p3 + p2 + p1 + p0 + q0 + q1, 4);
*op4 = ROUND_POWER_OF_TWO(p7 * 5 + p6 + p5 + p4 * 2 + p3 + p2 + p1 + p0 + *op4 = ROUND_POWER_OF_TWO(
q0 + q1 + q2, 4); p7 * 5 + p6 + p5 + p4 * 2 + p3 + p2 + p1 + p0 + q0 + q1 + q2, 4);
*op3 = ROUND_POWER_OF_TWO(p7 * 4 + p6 + p5 + p4 + p3 * 2 + p2 + p1 + p0 + *op3 = ROUND_POWER_OF_TWO(
q0 + q1 + q2 + q3, 4); p7 * 4 + p6 + p5 + p4 + p3 * 2 + p2 + p1 + p0 + q0 + q1 + q2 + q3, 4);
*op2 = ROUND_POWER_OF_TWO(p7 * 3 + p6 + p5 + p4 + p3 + p2 * 2 + p1 + p0 + *op2 = ROUND_POWER_OF_TWO(
q0 + q1 + q2 + q3 + q4, 4); p7 * 3 + p6 + p5 + p4 + p3 + p2 * 2 + p1 + p0 + q0 + q1 + q2 + q3 + q4,
4);
*op1 = ROUND_POWER_OF_TWO(p7 * 2 + p6 + p5 + p4 + p3 + p2 + p1 * 2 + p0 + *op1 = ROUND_POWER_OF_TWO(p7 * 2 + p6 + p5 + p4 + p3 + p2 + p1 * 2 + p0 +
q0 + q1 + q2 + q3 + q4 + q5, 4); q0 + q1 + q2 + q3 + q4 + q5,
*op0 = ROUND_POWER_OF_TWO(p7 + p6 + p5 + p4 + p3 + p2 + p1 + p0 * 2 + 4);
q0 + q1 + q2 + q3 + q4 + q5 + q6, 4); *op0 = ROUND_POWER_OF_TWO(p7 + p6 + p5 + p4 + p3 + p2 + p1 + p0 * 2 + q0 +
*oq0 = ROUND_POWER_OF_TWO(p6 + p5 + p4 + p3 + p2 + p1 + p0 + q1 + q2 + q3 + q4 + q5 + q6,
q0 * 2 + q1 + q2 + q3 + q4 + q5 + q6 + q7, 4); 4);
*oq1 = ROUND_POWER_OF_TWO(p5 + p4 + p3 + p2 + p1 + p0 + *oq0 = ROUND_POWER_OF_TWO(p6 + p5 + p4 + p3 + p2 + p1 + p0 + q0 * 2 + q1 +
q0 + q1 * 2 + q2 + q3 + q4 + q5 + q6 + q7 * 2, 4); q2 + q3 + q4 + q5 + q6 + q7,
*oq2 = ROUND_POWER_OF_TWO(p4 + p3 + p2 + p1 + p0 + 4);
q0 + q1 + q2 * 2 + q3 + q4 + q5 + q6 + q7 * 3, 4); *oq1 = ROUND_POWER_OF_TWO(p5 + p4 + p3 + p2 + p1 + p0 + q0 + q1 * 2 + q2 +
*oq3 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + p0 + q3 + q4 + q5 + q6 + q7 * 2,
q0 + q1 + q2 + q3 * 2 + q4 + q5 + q6 + q7 * 4, 4); 4);
*oq4 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + *oq2 = ROUND_POWER_OF_TWO(
q0 + q1 + q2 + q3 + q4 * 2 + q5 + q6 + q7 * 5, 4); p4 + p3 + p2 + p1 + p0 + q0 + q1 + q2 * 2 + q3 + q4 + q5 + q6 + q7 * 3,
*oq5 = ROUND_POWER_OF_TWO(p1 + p0 + 4);
q0 + q1 + q2 + q3 + q4 + q5 * 2 + q6 + q7 * 6, 4); *oq3 = ROUND_POWER_OF_TWO(
*oq6 = ROUND_POWER_OF_TWO(p0 + p3 + p2 + p1 + p0 + q0 + q1 + q2 + q3 * 2 + q4 + q5 + q6 + q7 * 4, 4);
q0 + q1 + q2 + q3 + q4 + q5 + q6 * 2 + q7 * 7, 4); *oq4 = ROUND_POWER_OF_TWO(
p2 + p1 + p0 + q0 + q1 + q2 + q3 + q4 * 2 + q5 + q6 + q7 * 5, 4);
*oq5 = ROUND_POWER_OF_TWO(
p1 + p0 + q0 + q1 + q2 + q3 + q4 + q5 * 2 + q6 + q7 * 6, 4);
*oq6 = ROUND_POWER_OF_TWO(
p0 + q0 + q1 + q2 + q3 + q4 + q5 + q6 * 2 + q7 * 7, 4);
} else { } else {
highbd_filter8(mask, thresh, flat, op3, op2, op1, op0, oq0, oq1, oq2, oq3, highbd_filter8(mask, thresh, flat, op3, op2, op1, op0, oq0, oq1, oq2, oq3,
bd); bd);
@ -673,8 +642,8 @@ static INLINE void highbd_filter16(int8_t mask, uint8_t thresh,
static void highbd_mb_lpf_horizontal_edge_w(uint16_t *s, int p, static void highbd_mb_lpf_horizontal_edge_w(uint16_t *s, int p,
const uint8_t *blimit, const uint8_t *blimit,
const uint8_t *limit, const uint8_t *limit,
const uint8_t *thresh, const uint8_t *thresh, int count,
int count, int bd) { int bd) {
int i; int i;
// loop filter designed to work using chars so that we can make maximum use // loop filter designed to work using chars so that we can make maximum use
@ -688,20 +657,18 @@ static void highbd_mb_lpf_horizontal_edge_w(uint16_t *s, int p,
const uint16_t q1 = s[1 * p]; const uint16_t q1 = s[1 * p];
const uint16_t q2 = s[2 * p]; const uint16_t q2 = s[2 * p];
const uint16_t q3 = s[3 * p]; const uint16_t q3 = s[3 * p];
const int8_t mask = highbd_filter_mask(*limit, *blimit, const int8_t mask =
p3, p2, p1, p0, q0, q1, q2, q3, bd); highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd);
const int8_t flat = highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, const int8_t flat =
bd); highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, bd);
const int8_t flat2 = highbd_flat_mask5( const int8_t flat2 =
1, s[-8 * p], s[-7 * p], s[-6 * p], s[-5 * p], p0, highbd_flat_mask5(1, s[-8 * p], s[-7 * p], s[-6 * p], s[-5 * p], p0, q0,
q0, s[4 * p], s[5 * p], s[6 * p], s[7 * p], bd); s[4 * p], s[5 * p], s[6 * p], s[7 * p], bd);
highbd_filter16(mask, *thresh, flat, flat2, highbd_filter16(mask, *thresh, flat, flat2, s - 8 * p, s - 7 * p, s - 6 * p,
s - 8 * p, s - 7 * p, s - 6 * p, s - 5 * p, s - 5 * p, s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p, s,
s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p, s + 1 * p, s + 2 * p, s + 3 * p, s + 4 * p, s + 5 * p,
s, s + 1 * p, s + 2 * p, s + 3 * p, s + 6 * p, s + 7 * p, bd);
s + 4 * p, s + 5 * p, s + 6 * p, s + 7 * p,
bd);
++s; ++s;
} }
} }
@ -723,8 +690,8 @@ void vpx_highbd_lpf_horizontal_edge_16_c(uint16_t *s, int p,
static void highbd_mb_lpf_vertical_edge_w(uint16_t *s, int p, static void highbd_mb_lpf_vertical_edge_w(uint16_t *s, int p,
const uint8_t *blimit, const uint8_t *blimit,
const uint8_t *limit, const uint8_t *limit,
const uint8_t *thresh, const uint8_t *thresh, int count,
int count, int bd) { int bd) {
int i; int i;
for (i = 0; i < count; ++i) { for (i = 0; i < count; ++i) {
@ -736,17 +703,16 @@ static void highbd_mb_lpf_vertical_edge_w(uint16_t *s, int p,
const uint16_t q1 = s[1]; const uint16_t q1 = s[1];
const uint16_t q2 = s[2]; const uint16_t q2 = s[2];
const uint16_t q3 = s[3]; const uint16_t q3 = s[3];
const int8_t mask = highbd_filter_mask(*limit, *blimit, const int8_t mask =
p3, p2, p1, p0, q0, q1, q2, q3, bd); highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd);
const int8_t flat = highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, const int8_t flat =
bd); highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, bd);
const int8_t flat2 = highbd_flat_mask5(1, s[-8], s[-7], s[-6], s[-5], p0, const int8_t flat2 = highbd_flat_mask5(1, s[-8], s[-7], s[-6], s[-5], p0,
q0, s[4], s[5], s[6], s[7], bd); q0, s[4], s[5], s[6], s[7], bd);
highbd_filter16(mask, *thresh, flat, flat2, highbd_filter16(mask, *thresh, flat, flat2, s - 8, s - 7, s - 6, s - 5,
s - 8, s - 7, s - 6, s - 5, s - 4, s - 3, s - 2, s - 1, s - 4, s - 3, s - 2, s - 1, s, s + 1, s + 2, s + 3, s + 4,
s, s + 1, s + 2, s + 3, s + 4, s + 5, s + 6, s + 7, s + 5, s + 6, s + 7, bd);
bd);
s += p; s += p;
} }
} }
@ -760,8 +726,7 @@ void vpx_highbd_lpf_vertical_16_c(uint16_t *s, int p, const uint8_t *blimit,
void vpx_highbd_lpf_vertical_16_dual_c(uint16_t *s, int p, void vpx_highbd_lpf_vertical_16_dual_c(uint16_t *s, int p,
const uint8_t *blimit, const uint8_t *blimit,
const uint8_t *limit, const uint8_t *limit,
const uint8_t *thresh, const uint8_t *thresh, int bd) {
int bd) {
highbd_mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 16, bd); highbd_mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 16, bd);
} }
#endif // CONFIG_VP9_HIGHBITDEPTH #endif // CONFIG_VP9_HIGHBITDEPTH

View File

@ -12,8 +12,8 @@
#include "./macros_msa.h" #include "./macros_msa.h"
void vpx_plane_add_noise_msa(uint8_t *start_ptr, const int8_t *noise, void vpx_plane_add_noise_msa(uint8_t *start_ptr, const int8_t *noise,
int blackclamp, int whiteclamp, int blackclamp, int whiteclamp, int width,
int width, int height, int32_t pitch) { int height, int32_t pitch) {
uint32_t i, j; uint32_t i, j;
for (i = 0; i < height / 2; ++i) { for (i = 0; i < height / 2; ++i) {

View File

@ -24,37 +24,21 @@ extern "C" {
extern uint8_t *vpx_ff_cropTbl; // From "vpx_dsp/mips/intrapred4_dspr2.c" extern uint8_t *vpx_ff_cropTbl; // From "vpx_dsp/mips/intrapred4_dspr2.c"
static INLINE void prefetch_load(const unsigned char *src) { static INLINE void prefetch_load(const unsigned char *src) {
__asm__ __volatile__ ( __asm__ __volatile__("pref 0, 0(%[src]) \n\t" : : [src] "r"(src));
"pref 0, 0(%[src]) \n\t"
:
: [src] "r" (src)
);
} }
/* prefetch data for store */ /* prefetch data for store */
static INLINE void prefetch_store(unsigned char *dst) { static INLINE void prefetch_store(unsigned char *dst) {
__asm__ __volatile__ ( __asm__ __volatile__("pref 1, 0(%[dst]) \n\t" : : [dst] "r"(dst));
"pref 1, 0(%[dst]) \n\t"
:
: [dst] "r" (dst)
);
} }
static INLINE void prefetch_load_streamed(const unsigned char *src) { static INLINE void prefetch_load_streamed(const unsigned char *src) {
__asm__ __volatile__ ( __asm__ __volatile__("pref 4, 0(%[src]) \n\t" : : [src] "r"(src));
"pref 4, 0(%[src]) \n\t"
:
: [src] "r" (src)
);
} }
/* prefetch data for store */ /* prefetch data for store */
static INLINE void prefetch_store_streamed(unsigned char *dst) { static INLINE void prefetch_store_streamed(unsigned char *dst) {
__asm__ __volatile__ ( __asm__ __volatile__("pref 5, 0(%[dst]) \n\t" : : [dst] "r"(dst));
"pref 5, 0(%[dst]) \n\t"
:
: [dst] "r" (dst)
);
} }
#endif // #if HAVE_DSPR2 #endif // #if HAVE_DSPR2
#ifdef __cplusplus #ifdef __cplusplus

View File

@ -18,12 +18,9 @@
#include "vpx_ports/mem.h" #include "vpx_ports/mem.h"
#if HAVE_DSPR2 #if HAVE_DSPR2
static void convolve_bi_avg_vert_4_dspr2(const uint8_t *src, static void convolve_bi_avg_vert_4_dspr2(const uint8_t *src, int32_t src_stride,
int32_t src_stride, uint8_t *dst, int32_t dst_stride,
uint8_t *dst, const int16_t *filter_y, int32_t w,
int32_t dst_stride,
const int16_t *filter_y,
int32_t w,
int32_t h) { int32_t h) {
int32_t x, y; int32_t x, y;
const uint8_t *src_ptr; const uint8_t *src_ptr;
@ -48,7 +45,7 @@ static void convolve_bi_avg_vert_4_dspr2(const uint8_t *src,
src_ptr = src + x; src_ptr = src + x;
dst_ptr = dst + x; dst_ptr = dst + x;
__asm__ __volatile__ ( __asm__ __volatile__(
"ulw %[load1], 0(%[src_ptr]) \n\t" "ulw %[load1], 0(%[src_ptr]) \n\t"
"add %[src_ptr], %[src_ptr], %[src_stride] \n\t" "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
"ulw %[load2], 0(%[src_ptr]) \n\t" "ulw %[load2], 0(%[src_ptr]) \n\t"
@ -105,16 +102,13 @@ static void convolve_bi_avg_vert_4_dspr2(const uint8_t *src,
"sb %[store1], 2(%[dst_ptr]) \n\t" "sb %[store1], 2(%[dst_ptr]) \n\t"
"sb %[store2], 3(%[dst_ptr]) \n\t" "sb %[store2], 3(%[dst_ptr]) \n\t"
: [load1] "=&r" (load1), [load2] "=&r" (load2), : [load1] "=&r"(load1), [load2] "=&r"(load2), [p1] "=&r"(p1),
[p1] "=&r" (p1), [p2] "=&r" (p2), [p2] "=&r"(p2), [scratch1] "=&r"(scratch1),
[scratch1] "=&r" (scratch1), [scratch2] "=&r" (scratch2), [scratch2] "=&r"(scratch2), [Temp1] "=&r"(Temp1),
[Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp2] "=&r"(Temp2), [store1] "=&r"(store1),
[store1] "=&r" (store1), [store2] "=&r" (store2), [store2] "=&r"(store2), [src_ptr] "+r"(src_ptr)
[src_ptr] "+r" (src_ptr) : [filter45] "r"(filter45), [vector4a] "r"(vector4a),
: [filter45] "r" (filter45), [vector4a] "r" (vector4a), [src_stride] "r"(src_stride), [cm] "r"(cm), [dst_ptr] "r"(dst_ptr));
[src_stride] "r" (src_stride), [cm] "r" (cm),
[dst_ptr] "r" (dst_ptr)
);
} }
/* Next row... */ /* Next row... */
@ -124,11 +118,9 @@ static void convolve_bi_avg_vert_4_dspr2(const uint8_t *src,
} }
static void convolve_bi_avg_vert_64_dspr2(const uint8_t *src, static void convolve_bi_avg_vert_64_dspr2(const uint8_t *src,
int32_t src_stride, int32_t src_stride, uint8_t *dst,
uint8_t *dst,
int32_t dst_stride, int32_t dst_stride,
const int16_t *filter_y, const int16_t *filter_y, int32_t h) {
int32_t h) {
int32_t x, y; int32_t x, y;
const uint8_t *src_ptr; const uint8_t *src_ptr;
uint8_t *dst_ptr; uint8_t *dst_ptr;
@ -140,7 +132,7 @@ static void convolve_bi_avg_vert_64_dspr2(const uint8_t *src,
uint32_t store1, store2; uint32_t store1, store2;
int32_t Temp1, Temp2; int32_t Temp1, Temp2;
const int16_t *filter = &filter_y[3]; const int16_t *filter = &filter_y[3];
uint32_t filter45;; uint32_t filter45;
filter45 = ((const int32_t *)filter)[0]; filter45 = ((const int32_t *)filter)[0];
@ -153,7 +145,7 @@ static void convolve_bi_avg_vert_64_dspr2(const uint8_t *src,
src_ptr = src + x; src_ptr = src + x;
dst_ptr = dst + x; dst_ptr = dst + x;
__asm__ __volatile__ ( __asm__ __volatile__(
"ulw %[load1], 0(%[src_ptr]) \n\t" "ulw %[load1], 0(%[src_ptr]) \n\t"
"add %[src_ptr], %[src_ptr], %[src_stride] \n\t" "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
"ulw %[load2], 0(%[src_ptr]) \n\t" "ulw %[load2], 0(%[src_ptr]) \n\t"
@ -210,16 +202,13 @@ static void convolve_bi_avg_vert_64_dspr2(const uint8_t *src,
"sb %[store1], 2(%[dst_ptr]) \n\t" "sb %[store1], 2(%[dst_ptr]) \n\t"
"sb %[store2], 3(%[dst_ptr]) \n\t" "sb %[store2], 3(%[dst_ptr]) \n\t"
: [load1] "=&r" (load1), [load2] "=&r" (load2), : [load1] "=&r"(load1), [load2] "=&r"(load2), [p1] "=&r"(p1),
[p1] "=&r" (p1), [p2] "=&r" (p2), [p2] "=&r"(p2), [scratch1] "=&r"(scratch1),
[scratch1] "=&r" (scratch1), [scratch2] "=&r" (scratch2), [scratch2] "=&r"(scratch2), [Temp1] "=&r"(Temp1),
[Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp2] "=&r"(Temp2), [store1] "=&r"(store1),
[store1] "=&r" (store1), [store2] "=&r" (store2), [store2] "=&r"(store2), [src_ptr] "+r"(src_ptr)
[src_ptr] "+r" (src_ptr) : [filter45] "r"(filter45), [vector4a] "r"(vector4a),
: [filter45] "r" (filter45), [vector4a] "r" (vector4a), [src_stride] "r"(src_stride), [cm] "r"(cm), [dst_ptr] "r"(dst_ptr));
[src_stride] "r" (src_stride), [cm] "r" (cm),
[dst_ptr] "r" (dst_ptr)
);
} }
/* Next row... */ /* Next row... */
@ -231,18 +220,16 @@ static void convolve_bi_avg_vert_64_dspr2(const uint8_t *src,
void vpx_convolve2_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride, void vpx_convolve2_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride, uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4, const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4, const int16_t *filter_y, int y_step_q4, int w,
int w, int h) { int h) {
uint32_t pos = 38; uint32_t pos = 38;
assert(y_step_q4 == 16); assert(y_step_q4 == 16);
/* bit position for extract from acc */ /* bit position for extract from acc */
__asm__ __volatile__ ( __asm__ __volatile__("wrdsp %[pos], 1 \n\t"
"wrdsp %[pos], 1 \n\t"
: :
: [pos] "r" (pos) : [pos] "r"(pos));
);
prefetch_store(dst); prefetch_store(dst);
@ -251,22 +238,17 @@ void vpx_convolve2_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
case 8: case 8:
case 16: case 16:
case 32: case 32:
convolve_bi_avg_vert_4_dspr2(src, src_stride, convolve_bi_avg_vert_4_dspr2(src, src_stride, dst, dst_stride, filter_y,
dst, dst_stride, w, h);
filter_y, w, h);
break; break;
case 64: case 64:
prefetch_store(dst + 32); prefetch_store(dst + 32);
convolve_bi_avg_vert_64_dspr2(src, src_stride, convolve_bi_avg_vert_64_dspr2(src, src_stride, dst, dst_stride, filter_y,
dst, dst_stride, h);
filter_y, h);
break; break;
default: default:
vpx_convolve8_avg_vert_c(src, src_stride, vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter_x,
dst, dst_stride, x_step_q4, filter_y, y_step_q4, w, h);
filter_x, x_step_q4,
filter_y, y_step_q4,
w, h);
break; break;
} }
} }

View File

@ -19,11 +19,9 @@
#if HAVE_DSPR2 #if HAVE_DSPR2
static void convolve_bi_avg_horiz_4_dspr2(const uint8_t *src, static void convolve_bi_avg_horiz_4_dspr2(const uint8_t *src,
int32_t src_stride, int32_t src_stride, uint8_t *dst,
uint8_t *dst,
int32_t dst_stride, int32_t dst_stride,
const int16_t *filter_x0, const int16_t *filter_x0, int32_t h) {
int32_t h) {
int32_t y; int32_t y;
uint8_t *cm = vpx_ff_cropTbl; uint8_t *cm = vpx_ff_cropTbl;
int32_t Temp1, Temp2, Temp3, Temp4; int32_t Temp1, Temp2, Temp3, Temp4;
@ -42,7 +40,7 @@ static void convolve_bi_avg_horiz_4_dspr2(const uint8_t *src,
prefetch_load(src + src_stride + 32); prefetch_load(src + src_stride + 32);
prefetch_store(dst + dst_stride); prefetch_store(dst + dst_stride);
__asm__ __volatile__ ( __asm__ __volatile__(
"ulw %[tp1], 0(%[src]) \n\t" "ulw %[tp1], 0(%[src]) \n\t"
"ulw %[tp2], 4(%[src]) \n\t" "ulw %[tp2], 4(%[src]) \n\t"
@ -98,14 +96,12 @@ static void convolve_bi_avg_horiz_4_dspr2(const uint8_t *src,
"addqh_r.w %[p2], %[p2], %[p3] \n\t" /* average odd 2 */ "addqh_r.w %[p2], %[p2], %[p3] \n\t" /* average odd 2 */
"sb %[p2], 3(%[dst]) \n\t" /* store odd 2 */ "sb %[p2], 3(%[dst]) \n\t" /* store odd 2 */
: [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn1] "=&r"(tn1),
[tn1] "=&r" (tn1), [tn2] "=&r" (tn2), [tn2] "=&r"(tn2), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3),
[p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
[Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp4] "=&r"(Temp4)
[Temp3] "=&r" (Temp3), [Temp4] "=&r" (Temp4) : [filter45] "r"(filter45), [vector4a] "r"(vector4a), [cm] "r"(cm),
: [filter45] "r" (filter45), [vector4a] "r" (vector4a), [dst] "r"(dst), [src] "r"(src));
[cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
);
/* Next row... */ /* Next row... */
src += src_stride; src += src_stride;
@ -114,11 +110,9 @@ static void convolve_bi_avg_horiz_4_dspr2(const uint8_t *src,
} }
static void convolve_bi_avg_horiz_8_dspr2(const uint8_t *src, static void convolve_bi_avg_horiz_8_dspr2(const uint8_t *src,
int32_t src_stride, int32_t src_stride, uint8_t *dst,
uint8_t *dst,
int32_t dst_stride, int32_t dst_stride,
const int16_t *filter_x0, const int16_t *filter_x0, int32_t h) {
int32_t h) {
int32_t y; int32_t y;
uint8_t *cm = vpx_ff_cropTbl; uint8_t *cm = vpx_ff_cropTbl;
uint32_t vector4a = 64; uint32_t vector4a = 64;
@ -127,7 +121,7 @@ static void convolve_bi_avg_horiz_8_dspr2(const uint8_t *src,
uint32_t p1, p2, p3, p4, n1; uint32_t p1, p2, p3, p4, n1;
uint32_t st0, st1; uint32_t st0, st1;
const int16_t *filter = &filter_x0[3]; const int16_t *filter = &filter_x0[3];
uint32_t filter45;; uint32_t filter45;
filter45 = ((const int32_t *)filter)[0]; filter45 = ((const int32_t *)filter)[0];
@ -137,7 +131,7 @@ static void convolve_bi_avg_horiz_8_dspr2(const uint8_t *src,
prefetch_load(src + src_stride + 32); prefetch_load(src + src_stride + 32);
prefetch_store(dst + dst_stride); prefetch_store(dst + dst_stride);
__asm__ __volatile__ ( __asm__ __volatile__(
"ulw %[tp1], 0(%[src]) \n\t" "ulw %[tp1], 0(%[src]) \n\t"
"ulw %[tp2], 4(%[src]) \n\t" "ulw %[tp2], 4(%[src]) \n\t"
@ -246,15 +240,12 @@ static void convolve_bi_avg_horiz_8_dspr2(const uint8_t *src,
"sb %[tp4], 5(%[dst]) \n\t" "sb %[tp4], 5(%[dst]) \n\t"
"sb %[tp1], 7(%[dst]) \n\t" "sb %[tp1], 7(%[dst]) \n\t"
: [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3),
[tp3] "=&r" (tp3), [tp4] "=&r" (tp4), [tp4] "=&r"(tp4), [st0] "=&r"(st0), [st1] "=&r"(st1), [p1] "=&r"(p1),
[st0] "=&r" (st0), [st1] "=&r" (st1), [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4), [n1] "=&r"(n1),
[p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3)
[n1] "=&r" (n1), : [filter45] "r"(filter45), [vector4a] "r"(vector4a), [cm] "r"(cm),
[Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3) [dst] "r"(dst), [src] "r"(src));
: [filter45] "r" (filter45), [vector4a] "r" (vector4a),
[cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
);
/* Next row... */ /* Next row... */
src += src_stride; src += src_stride;
@ -263,11 +254,9 @@ static void convolve_bi_avg_horiz_8_dspr2(const uint8_t *src,
} }
static void convolve_bi_avg_horiz_16_dspr2(const uint8_t *src_ptr, static void convolve_bi_avg_horiz_16_dspr2(const uint8_t *src_ptr,
int32_t src_stride, int32_t src_stride, uint8_t *dst_ptr,
uint8_t *dst_ptr,
int32_t dst_stride, int32_t dst_stride,
const int16_t *filter_x0, const int16_t *filter_x0, int32_t h,
int32_t h,
int32_t count) { int32_t count) {
int32_t y, c; int32_t y, c;
const uint8_t *src; const uint8_t *src;
@ -279,7 +268,7 @@ static void convolve_bi_avg_horiz_16_dspr2(const uint8_t *src_ptr,
uint32_t p1, p2, p3, p4, p5; uint32_t p1, p2, p3, p4, p5;
uint32_t st1, st2, st3; uint32_t st1, st2, st3;
const int16_t *filter = &filter_x0[3]; const int16_t *filter = &filter_x0[3];
uint32_t filter45;; uint32_t filter45;
filter45 = ((const int32_t *)filter)[0]; filter45 = ((const int32_t *)filter)[0];
@ -293,7 +282,7 @@ static void convolve_bi_avg_horiz_16_dspr2(const uint8_t *src_ptr,
prefetch_store(dst_ptr + dst_stride); prefetch_store(dst_ptr + dst_stride);
for (c = 0; c < count; c++) { for (c = 0; c < count; c++) {
__asm__ __volatile__ ( __asm__ __volatile__(
"ulw %[qload1], 0(%[src]) \n\t" "ulw %[qload1], 0(%[src]) \n\t"
"ulw %[qload2], 4(%[src]) \n\t" "ulw %[qload2], 4(%[src]) \n\t"
@ -493,14 +482,13 @@ static void convolve_bi_avg_horiz_16_dspr2(const uint8_t *src_ptr,
"sb %[qload3], 13(%[dst]) \n\t" /* store odd 7 to dst */ "sb %[qload3], 13(%[dst]) \n\t" /* store odd 7 to dst */
"sb %[qload1], 15(%[dst]) \n\t" /* store odd 8 to dst */ "sb %[qload1], 15(%[dst]) \n\t" /* store odd 8 to dst */
: [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), [st1] "=&r"(st1),
[st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3), [st2] "=&r"(st2), [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2),
[p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4), [p3] "=&r"(p3), [p4] "=&r"(p4), [qload3] "=&r"(qload3),
[qload3] "=&r" (qload3), [p5] "=&r" (p5), [p5] "=&r"(p5), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2),
[Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3) [Temp3] "=&r"(Temp3)
: [filter45] "r" (filter45), [vector_64] "r" (vector_64), : [filter45] "r"(filter45), [vector_64] "r"(vector_64), [cm] "r"(cm),
[cm] "r" (cm), [dst] "r" (dst), [src] "r" (src) [dst] "r"(dst), [src] "r"(src));
);
src += 16; src += 16;
dst += 16; dst += 16;
@ -513,8 +501,7 @@ static void convolve_bi_avg_horiz_16_dspr2(const uint8_t *src_ptr,
} }
static void convolve_bi_avg_horiz_64_dspr2(const uint8_t *src_ptr, static void convolve_bi_avg_horiz_64_dspr2(const uint8_t *src_ptr,
int32_t src_stride, int32_t src_stride, uint8_t *dst_ptr,
uint8_t *dst_ptr,
int32_t dst_stride, int32_t dst_stride,
const int16_t *filter_x0, const int16_t *filter_x0,
int32_t h) { int32_t h) {
@ -528,7 +515,7 @@ static void convolve_bi_avg_horiz_64_dspr2(const uint8_t *src_ptr,
uint32_t p1, p2, p3, p4, p5; uint32_t p1, p2, p3, p4, p5;
uint32_t st1, st2, st3; uint32_t st1, st2, st3;
const int16_t *filter = &filter_x0[3]; const int16_t *filter = &filter_x0[3];
uint32_t filter45;; uint32_t filter45;
filter45 = ((const int32_t *)filter)[0]; filter45 = ((const int32_t *)filter)[0];
@ -544,7 +531,7 @@ static void convolve_bi_avg_horiz_64_dspr2(const uint8_t *src_ptr,
prefetch_store(dst_ptr + dst_stride + 32); prefetch_store(dst_ptr + dst_stride + 32);
for (c = 0; c < 4; c++) { for (c = 0; c < 4; c++) {
__asm__ __volatile__ ( __asm__ __volatile__(
"ulw %[qload1], 0(%[src]) \n\t" "ulw %[qload1], 0(%[src]) \n\t"
"ulw %[qload2], 4(%[src]) \n\t" "ulw %[qload2], 4(%[src]) \n\t"
@ -744,14 +731,13 @@ static void convolve_bi_avg_horiz_64_dspr2(const uint8_t *src_ptr,
"sb %[qload3], 13(%[dst]) \n\t" /* store odd 7 to dst */ "sb %[qload3], 13(%[dst]) \n\t" /* store odd 7 to dst */
"sb %[qload1], 15(%[dst]) \n\t" /* store odd 8 to dst */ "sb %[qload1], 15(%[dst]) \n\t" /* store odd 8 to dst */
: [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), [st1] "=&r"(st1),
[st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3), [st2] "=&r"(st2), [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2),
[p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4), [p3] "=&r"(p3), [p4] "=&r"(p4), [qload3] "=&r"(qload3),
[qload3] "=&r" (qload3), [p5] "=&r" (p5), [p5] "=&r"(p5), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2),
[Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3) [Temp3] "=&r"(Temp3)
: [filter45] "r" (filter45), [vector_64] "r" (vector_64), : [filter45] "r"(filter45), [vector_64] "r"(vector_64), [cm] "r"(cm),
[cm] "r" (cm), [dst] "r" (dst), [src] "r" (src) [dst] "r"(dst), [src] "r"(src));
);
src += 16; src += 16;
dst += 16; dst += 16;
@ -773,11 +759,9 @@ void vpx_convolve2_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
assert(x_step_q4 == 16); assert(x_step_q4 == 16);
/* bit position for extract from acc */ /* bit position for extract from acc */
__asm__ __volatile__ ( __asm__ __volatile__("wrdsp %[pos], 1 \n\t"
"wrdsp %[pos], 1 \n\t"
: :
: [pos] "r" (pos) : [pos] "r"(pos));
);
/* prefetch data to cache memory */ /* prefetch data to cache memory */
prefetch_load(src); prefetch_load(src);
@ -786,39 +770,31 @@ void vpx_convolve2_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
switch (w) { switch (w) {
case 4: case 4:
convolve_bi_avg_horiz_4_dspr2(src, src_stride, convolve_bi_avg_horiz_4_dspr2(src, src_stride, dst, dst_stride, filter_x,
dst, dst_stride, h);
filter_x, h);
break; break;
case 8: case 8:
convolve_bi_avg_horiz_8_dspr2(src, src_stride, convolve_bi_avg_horiz_8_dspr2(src, src_stride, dst, dst_stride, filter_x,
dst, dst_stride, h);
filter_x, h);
break; break;
case 16: case 16:
convolve_bi_avg_horiz_16_dspr2(src, src_stride, convolve_bi_avg_horiz_16_dspr2(src, src_stride, dst, dst_stride, filter_x,
dst, dst_stride, h, 1);
filter_x, h, 1);
break; break;
case 32: case 32:
convolve_bi_avg_horiz_16_dspr2(src, src_stride, convolve_bi_avg_horiz_16_dspr2(src, src_stride, dst, dst_stride, filter_x,
dst, dst_stride, h, 2);
filter_x, h, 2);
break; break;
case 64: case 64:
prefetch_load(src + 64); prefetch_load(src + 64);
prefetch_store(dst + 32); prefetch_store(dst + 32);
convolve_bi_avg_horiz_64_dspr2(src, src_stride, convolve_bi_avg_horiz_64_dspr2(src, src_stride, dst, dst_stride, filter_x,
dst, dst_stride, h);
filter_x, h);
break; break;
default: default:
vpx_convolve8_avg_horiz_c(src, src_stride, vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter_x,
dst, dst_stride, x_step_q4, filter_y, y_step_q4, w, h);
filter_x, x_step_q4,
filter_y, y_step_q4,
w, h);
break; break;
} }
} }

File diff suppressed because it is too large Load Diff

View File

@ -18,12 +18,9 @@
#include "vpx_ports/mem.h" #include "vpx_ports/mem.h"
#if HAVE_DSPR2 #if HAVE_DSPR2
static void convolve_bi_horiz_4_dspr2(const uint8_t *src, static void convolve_bi_horiz_4_dspr2(const uint8_t *src, int32_t src_stride,
int32_t src_stride, uint8_t *dst, int32_t dst_stride,
uint8_t *dst, const int16_t *filter_x0, int32_t h) {
int32_t dst_stride,
const int16_t *filter_x0,
int32_t h) {
int32_t y; int32_t y;
uint8_t *cm = vpx_ff_cropTbl; uint8_t *cm = vpx_ff_cropTbl;
int32_t Temp1, Temp2, Temp3, Temp4; int32_t Temp1, Temp2, Temp3, Temp4;
@ -31,7 +28,7 @@ static void convolve_bi_horiz_4_dspr2(const uint8_t *src,
uint32_t tp1, tp2; uint32_t tp1, tp2;
uint32_t p1, p2; uint32_t p1, p2;
const int16_t *filter = &filter_x0[3]; const int16_t *filter = &filter_x0[3];
uint32_t filter45;; uint32_t filter45;
filter45 = ((const int32_t *)filter)[0]; filter45 = ((const int32_t *)filter)[0];
@ -41,7 +38,7 @@ static void convolve_bi_horiz_4_dspr2(const uint8_t *src,
prefetch_load(src + src_stride + 32); prefetch_load(src + src_stride + 32);
prefetch_store(dst + dst_stride); prefetch_store(dst + dst_stride);
__asm__ __volatile__ ( __asm__ __volatile__(
"ulw %[tp1], 0(%[src]) \n\t" "ulw %[tp1], 0(%[src]) \n\t"
"ulw %[tp2], 4(%[src]) \n\t" "ulw %[tp2], 4(%[src]) \n\t"
@ -86,13 +83,11 @@ static void convolve_bi_horiz_4_dspr2(const uint8_t *src,
"sb %[tp2], 2(%[dst]) \n\t" "sb %[tp2], 2(%[dst]) \n\t"
"sb %[p2], 3(%[dst]) \n\t" "sb %[p2], 3(%[dst]) \n\t"
: [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [p1] "=&r"(p1), [p2] "=&r"(p2),
[p1] "=&r" (p1), [p2] "=&r" (p2), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
[Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp4] "=&r"(Temp4)
[Temp3] "=&r" (Temp3), [Temp4] "=&r" (Temp4) : [filter45] "r"(filter45), [vector4a] "r"(vector4a), [cm] "r"(cm),
: [filter45] "r" (filter45), [vector4a] "r" (vector4a), [dst] "r"(dst), [src] "r"(src));
[cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
);
/* Next row... */ /* Next row... */
src += src_stride; src += src_stride;
@ -100,12 +95,9 @@ static void convolve_bi_horiz_4_dspr2(const uint8_t *src,
} }
} }
static void convolve_bi_horiz_8_dspr2(const uint8_t *src, static void convolve_bi_horiz_8_dspr2(const uint8_t *src, int32_t src_stride,
int32_t src_stride, uint8_t *dst, int32_t dst_stride,
uint8_t *dst, const int16_t *filter_x0, int32_t h) {
int32_t dst_stride,
const int16_t *filter_x0,
int32_t h) {
int32_t y; int32_t y;
uint8_t *cm = vpx_ff_cropTbl; uint8_t *cm = vpx_ff_cropTbl;
uint32_t vector4a = 64; uint32_t vector4a = 64;
@ -114,7 +106,7 @@ static void convolve_bi_horiz_8_dspr2(const uint8_t *src,
uint32_t p1, p2, p3, p4; uint32_t p1, p2, p3, p4;
uint32_t st0, st1; uint32_t st0, st1;
const int16_t *filter = &filter_x0[3]; const int16_t *filter = &filter_x0[3];
uint32_t filter45;; uint32_t filter45;
filter45 = ((const int32_t *)filter)[0]; filter45 = ((const int32_t *)filter)[0];
@ -124,7 +116,7 @@ static void convolve_bi_horiz_8_dspr2(const uint8_t *src,
prefetch_load(src + src_stride + 32); prefetch_load(src + src_stride + 32);
prefetch_store(dst + dst_stride); prefetch_store(dst + dst_stride);
__asm__ __volatile__ ( __asm__ __volatile__(
"ulw %[tp1], 0(%[src]) \n\t" "ulw %[tp1], 0(%[src]) \n\t"
"ulw %[tp2], 4(%[src]) \n\t" "ulw %[tp2], 4(%[src]) \n\t"
@ -210,13 +202,12 @@ static void convolve_bi_horiz_8_dspr2(const uint8_t *src,
"sb %[p2], 5(%[dst]) \n\t" "sb %[p2], 5(%[dst]) \n\t"
"sb %[p1], 7(%[dst]) \n\t" "sb %[p1], 7(%[dst]) \n\t"
: [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), [tp3] "=&r" (tp3), : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3),
[st0] "=&r" (st0), [st1] "=&r" (st1), [st0] "=&r"(st0), [st1] "=&r"(st1), [p1] "=&r"(p1), [p2] "=&r"(p2),
[p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4), [p3] "=&r"(p3), [p4] "=&r"(p4), [Temp1] "=&r"(Temp1),
[Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3) [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3)
: [filter45] "r" (filter45), [vector4a] "r" (vector4a), : [filter45] "r"(filter45), [vector4a] "r"(vector4a), [cm] "r"(cm),
[cm] "r" (cm), [dst] "r" (dst), [src] "r" (src) [dst] "r"(dst), [src] "r"(src));
);
/* Next row... */ /* Next row... */
src += src_stride; src += src_stride;
@ -225,11 +216,9 @@ static void convolve_bi_horiz_8_dspr2(const uint8_t *src,
} }
static void convolve_bi_horiz_16_dspr2(const uint8_t *src_ptr, static void convolve_bi_horiz_16_dspr2(const uint8_t *src_ptr,
int32_t src_stride, int32_t src_stride, uint8_t *dst_ptr,
uint8_t *dst_ptr,
int32_t dst_stride, int32_t dst_stride,
const int16_t *filter_x0, const int16_t *filter_x0, int32_t h,
int32_t h,
int32_t count) { int32_t count) {
int32_t y, c; int32_t y, c;
const uint8_t *src; const uint8_t *src;
@ -241,7 +230,7 @@ static void convolve_bi_horiz_16_dspr2(const uint8_t *src_ptr,
uint32_t p1, p2, p3, p4, p5; uint32_t p1, p2, p3, p4, p5;
uint32_t st1, st2, st3; uint32_t st1, st2, st3;
const int16_t *filter = &filter_x0[3]; const int16_t *filter = &filter_x0[3];
uint32_t filter45;; uint32_t filter45;
filter45 = ((const int32_t *)filter)[0]; filter45 = ((const int32_t *)filter)[0];
@ -255,7 +244,7 @@ static void convolve_bi_horiz_16_dspr2(const uint8_t *src_ptr,
prefetch_store(dst_ptr + dst_stride); prefetch_store(dst_ptr + dst_stride);
for (c = 0; c < count; c++) { for (c = 0; c < count; c++) {
__asm__ __volatile__ ( __asm__ __volatile__(
"ulw %[qload1], 0(%[src]) \n\t" "ulw %[qload1], 0(%[src]) \n\t"
"ulw %[qload2], 4(%[src]) \n\t" "ulw %[qload2], 4(%[src]) \n\t"
@ -413,14 +402,13 @@ static void convolve_bi_horiz_16_dspr2(const uint8_t *src_ptr,
"sb %[st3], 13(%[dst]) \n\t" /* odd 7 */ "sb %[st3], 13(%[dst]) \n\t" /* odd 7 */
"sb %[st1], 15(%[dst]) \n\t" /* odd 8 */ "sb %[st1], 15(%[dst]) \n\t" /* odd 8 */
: [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), [qload3] "=&r" (qload3), : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2),
[st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3), [qload3] "=&r"(qload3), [st1] "=&r"(st1), [st2] "=&r"(st2),
[p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4), [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3),
[p5] "=&r" (p5), [p4] "=&r"(p4), [p5] "=&r"(p5), [Temp1] "=&r"(Temp1),
[Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3) [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3)
: [filter45] "r" (filter45), [vector_64] "r" (vector_64), : [filter45] "r"(filter45), [vector_64] "r"(vector_64), [cm] "r"(cm),
[cm] "r" (cm), [dst] "r" (dst), [src] "r" (src) [dst] "r"(dst), [src] "r"(src));
);
src += 16; src += 16;
dst += 16; dst += 16;
@ -433,11 +421,9 @@ static void convolve_bi_horiz_16_dspr2(const uint8_t *src_ptr,
} }
static void convolve_bi_horiz_64_dspr2(const uint8_t *src_ptr, static void convolve_bi_horiz_64_dspr2(const uint8_t *src_ptr,
int32_t src_stride, int32_t src_stride, uint8_t *dst_ptr,
uint8_t *dst_ptr,
int32_t dst_stride, int32_t dst_stride,
const int16_t *filter_x0, const int16_t *filter_x0, int32_t h) {
int32_t h) {
int32_t y, c; int32_t y, c;
const uint8_t *src; const uint8_t *src;
uint8_t *dst; uint8_t *dst;
@ -448,7 +434,7 @@ static void convolve_bi_horiz_64_dspr2(const uint8_t *src_ptr,
uint32_t p1, p2, p3, p4, p5; uint32_t p1, p2, p3, p4, p5;
uint32_t st1, st2, st3; uint32_t st1, st2, st3;
const int16_t *filter = &filter_x0[3]; const int16_t *filter = &filter_x0[3];
uint32_t filter45;; uint32_t filter45;
filter45 = ((const int32_t *)filter)[0]; filter45 = ((const int32_t *)filter)[0];
@ -464,7 +450,7 @@ static void convolve_bi_horiz_64_dspr2(const uint8_t *src_ptr,
prefetch_store(dst_ptr + dst_stride + 32); prefetch_store(dst_ptr + dst_stride + 32);
for (c = 0; c < 4; c++) { for (c = 0; c < 4; c++) {
__asm__ __volatile__ ( __asm__ __volatile__(
"ulw %[qload1], 0(%[src]) \n\t" "ulw %[qload1], 0(%[src]) \n\t"
"ulw %[qload2], 4(%[src]) \n\t" "ulw %[qload2], 4(%[src]) \n\t"
@ -622,14 +608,13 @@ static void convolve_bi_horiz_64_dspr2(const uint8_t *src_ptr,
"sb %[st3], 13(%[dst]) \n\t" /* odd 7 */ "sb %[st3], 13(%[dst]) \n\t" /* odd 7 */
"sb %[st1], 15(%[dst]) \n\t" /* odd 8 */ "sb %[st1], 15(%[dst]) \n\t" /* odd 8 */
: [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), [qload3] "=&r" (qload3), : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2),
[st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3), [qload3] "=&r"(qload3), [st1] "=&r"(st1), [st2] "=&r"(st2),
[p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4), [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3),
[p5] "=&r" (p5), [p4] "=&r"(p4), [p5] "=&r"(p5), [Temp1] "=&r"(Temp1),
[Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3) [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3)
: [filter45] "r" (filter45), [vector_64] "r" (vector_64), : [filter45] "r"(filter45), [vector_64] "r"(vector_64), [cm] "r"(cm),
[cm] "r" (cm), [dst] "r" (dst), [src] "r" (src) [dst] "r"(dst), [src] "r"(src));
);
src += 16; src += 16;
dst += 16; dst += 16;
@ -644,8 +629,8 @@ static void convolve_bi_horiz_64_dspr2(const uint8_t *src_ptr,
void vpx_convolve2_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, void vpx_convolve2_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride, uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4, const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4, const int16_t *filter_y, int y_step_q4, int w,
int w, int h) { int h) {
uint32_t pos = 38; uint32_t pos = 38;
assert(x_step_q4 == 16); assert(x_step_q4 == 16);
@ -653,11 +638,9 @@ void vpx_convolve2_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
prefetch_load((const uint8_t *)filter_x); prefetch_load((const uint8_t *)filter_x);
/* bit position for extract from acc */ /* bit position for extract from acc */
__asm__ __volatile__ ( __asm__ __volatile__("wrdsp %[pos], 1 \n\t"
"wrdsp %[pos], 1 \n\t"
: :
: [pos] "r" (pos) : [pos] "r"(pos));
);
/* prefetch data to cache memory */ /* prefetch data to cache memory */
prefetch_load(src); prefetch_load(src);
@ -666,39 +649,31 @@ void vpx_convolve2_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
switch (w) { switch (w) {
case 4: case 4:
convolve_bi_horiz_4_dspr2(src, (int32_t)src_stride, convolve_bi_horiz_4_dspr2(src, (int32_t)src_stride, dst,
dst, (int32_t)dst_stride, (int32_t)dst_stride, filter_x, (int32_t)h);
filter_x, (int32_t)h);
break; break;
case 8: case 8:
convolve_bi_horiz_8_dspr2(src, (int32_t)src_stride, convolve_bi_horiz_8_dspr2(src, (int32_t)src_stride, dst,
dst, (int32_t)dst_stride, (int32_t)dst_stride, filter_x, (int32_t)h);
filter_x, (int32_t)h);
break; break;
case 16: case 16:
convolve_bi_horiz_16_dspr2(src, (int32_t)src_stride, convolve_bi_horiz_16_dspr2(src, (int32_t)src_stride, dst,
dst, (int32_t)dst_stride, (int32_t)dst_stride, filter_x, (int32_t)h, 1);
filter_x, (int32_t)h, 1);
break; break;
case 32: case 32:
convolve_bi_horiz_16_dspr2(src, (int32_t)src_stride, convolve_bi_horiz_16_dspr2(src, (int32_t)src_stride, dst,
dst, (int32_t)dst_stride, (int32_t)dst_stride, filter_x, (int32_t)h, 2);
filter_x, (int32_t)h, 2);
break; break;
case 64: case 64:
prefetch_load(src + 64); prefetch_load(src + 64);
prefetch_store(dst + 32); prefetch_store(dst + 32);
convolve_bi_horiz_64_dspr2(src, (int32_t)src_stride, convolve_bi_horiz_64_dspr2(src, (int32_t)src_stride, dst,
dst, (int32_t)dst_stride, (int32_t)dst_stride, filter_x, (int32_t)h);
filter_x, (int32_t)h);
break; break;
default: default:
vpx_convolve8_horiz_c(src, src_stride, vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter_x,
dst, dst_stride, x_step_q4, filter_y, y_step_q4, w, h);
filter_x, x_step_q4,
filter_y, y_step_q4,
w, h);
break; break;
} }
} }

View File

@ -18,12 +18,9 @@
#include "vpx_ports/mem.h" #include "vpx_ports/mem.h"
#if HAVE_DSPR2 #if HAVE_DSPR2
static void convolve_bi_vert_4_dspr2(const uint8_t *src, static void convolve_bi_vert_4_dspr2(const uint8_t *src, int32_t src_stride,
int32_t src_stride, uint8_t *dst, int32_t dst_stride,
uint8_t *dst, const int16_t *filter_y, int32_t w,
int32_t dst_stride,
const int16_t *filter_y,
int32_t w,
int32_t h) { int32_t h) {
int32_t x, y; int32_t x, y;
const uint8_t *src_ptr; const uint8_t *src_ptr;
@ -48,7 +45,7 @@ static void convolve_bi_vert_4_dspr2(const uint8_t *src,
src_ptr = src + x; src_ptr = src + x;
dst_ptr = dst + x; dst_ptr = dst + x;
__asm__ __volatile__ ( __asm__ __volatile__(
"ulw %[load1], 0(%[src_ptr]) \n\t" "ulw %[load1], 0(%[src_ptr]) \n\t"
"add %[src_ptr], %[src_ptr], %[src_stride] \n\t" "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
"ulw %[load2], 0(%[src_ptr]) \n\t" "ulw %[load2], 0(%[src_ptr]) \n\t"
@ -98,16 +95,12 @@ static void convolve_bi_vert_4_dspr2(const uint8_t *src,
"sb %[store1], 2(%[dst_ptr]) \n\t" "sb %[store1], 2(%[dst_ptr]) \n\t"
"sb %[store2], 3(%[dst_ptr]) \n\t" "sb %[store2], 3(%[dst_ptr]) \n\t"
: [load1] "=&r" (load1), [load2] "=&r" (load2), : [load1] "=&r"(load1), [load2] "=&r"(load2), [p1] "=&r"(p1),
[p1] "=&r" (p1), [p2] "=&r" (p2), [p2] "=&r"(p2), [scratch1] "=&r"(scratch1), [Temp1] "=&r"(Temp1),
[scratch1] "=&r" (scratch1), [Temp2] "=&r"(Temp2), [store1] "=&r"(store1),
[Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [store2] "=&r"(store2), [src_ptr] "+r"(src_ptr)
[store1] "=&r" (store1), [store2] "=&r" (store2), : [filter45] "r"(filter45), [vector4a] "r"(vector4a),
[src_ptr] "+r" (src_ptr) [src_stride] "r"(src_stride), [cm] "r"(cm), [dst_ptr] "r"(dst_ptr));
: [filter45] "r" (filter45),[vector4a] "r" (vector4a),
[src_stride] "r" (src_stride),
[cm] "r" (cm), [dst_ptr] "r" (dst_ptr)
);
} }
/* Next row... */ /* Next row... */
@ -116,12 +109,9 @@ static void convolve_bi_vert_4_dspr2(const uint8_t *src,
} }
} }
static void convolve_bi_vert_64_dspr2(const uint8_t *src, static void convolve_bi_vert_64_dspr2(const uint8_t *src, int32_t src_stride,
int32_t src_stride, uint8_t *dst, int32_t dst_stride,
uint8_t *dst, const int16_t *filter_y, int32_t h) {
int32_t dst_stride,
const int16_t *filter_y,
int32_t h) {
int32_t x, y; int32_t x, y;
const uint8_t *src_ptr; const uint8_t *src_ptr;
uint8_t *dst_ptr; uint8_t *dst_ptr;
@ -145,7 +135,7 @@ static void convolve_bi_vert_64_dspr2(const uint8_t *src,
src_ptr = src + x; src_ptr = src + x;
dst_ptr = dst + x; dst_ptr = dst + x;
__asm__ __volatile__ ( __asm__ __volatile__(
"ulw %[load1], 0(%[src_ptr]) \n\t" "ulw %[load1], 0(%[src_ptr]) \n\t"
"add %[src_ptr], %[src_ptr], %[src_stride] \n\t" "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
"ulw %[load2], 0(%[src_ptr]) \n\t" "ulw %[load2], 0(%[src_ptr]) \n\t"
@ -195,16 +185,12 @@ static void convolve_bi_vert_64_dspr2(const uint8_t *src,
"sb %[store1], 2(%[dst_ptr]) \n\t" "sb %[store1], 2(%[dst_ptr]) \n\t"
"sb %[store2], 3(%[dst_ptr]) \n\t" "sb %[store2], 3(%[dst_ptr]) \n\t"
: [load1] "=&r" (load1), [load2] "=&r" (load2), : [load1] "=&r"(load1), [load2] "=&r"(load2), [p1] "=&r"(p1),
[p1] "=&r" (p1), [p2] "=&r" (p2), [p2] "=&r"(p2), [scratch1] "=&r"(scratch1), [Temp1] "=&r"(Temp1),
[scratch1] "=&r" (scratch1), [Temp2] "=&r"(Temp2), [store1] "=&r"(store1),
[Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [store2] "=&r"(store2), [src_ptr] "+r"(src_ptr)
[store1] "=&r" (store1), [store2] "=&r" (store2), : [filter45] "r"(filter45), [vector4a] "r"(vector4a),
[src_ptr] "+r" (src_ptr) [src_stride] "r"(src_stride), [cm] "r"(cm), [dst_ptr] "r"(dst_ptr));
: [filter45] "r" (filter45),[vector4a] "r" (vector4a),
[src_stride] "r" (src_stride),
[cm] "r" (cm), [dst_ptr] "r" (dst_ptr)
);
} }
/* Next row... */ /* Next row... */
@ -216,42 +202,34 @@ static void convolve_bi_vert_64_dspr2(const uint8_t *src,
void vpx_convolve2_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride, void vpx_convolve2_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride, uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4, const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4, const int16_t *filter_y, int y_step_q4, int w,
int w, int h) { int h) {
uint32_t pos = 38; uint32_t pos = 38;
assert(y_step_q4 == 16); assert(y_step_q4 == 16);
/* bit position for extract from acc */ /* bit position for extract from acc */
__asm__ __volatile__ ( __asm__ __volatile__("wrdsp %[pos], 1 \n\t"
"wrdsp %[pos], 1 \n\t"
: :
: [pos] "r" (pos) : [pos] "r"(pos));
);
prefetch_store(dst); prefetch_store(dst);
switch (w) { switch (w) {
case 4 : case 4:
case 8 : case 8:
case 16 : case 16:
case 32 : case 32:
convolve_bi_vert_4_dspr2(src, src_stride, convolve_bi_vert_4_dspr2(src, src_stride, dst, dst_stride, filter_y, w,
dst, dst_stride, h);
filter_y, w, h);
break; break;
case 64 : case 64:
prefetch_store(dst + 32); prefetch_store(dst + 32);
convolve_bi_vert_64_dspr2(src, src_stride, convolve_bi_vert_64_dspr2(src, src_stride, dst, dst_stride, filter_y, h);
dst, dst_stride,
filter_y, h);
break; break;
default: default:
vpx_convolve8_vert_c(src, src_stride, vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x,
dst, dst_stride, x_step_q4, filter_y, y_step_q4, w, h);
filter_x, x_step_q4,
filter_y, y_step_q4,
w, h);
break; break;
} }
} }

View File

@ -18,12 +18,9 @@
#include "vpx_ports/mem.h" #include "vpx_ports/mem.h"
#if HAVE_DSPR2 #if HAVE_DSPR2
static void convolve_avg_vert_4_dspr2(const uint8_t *src, static void convolve_avg_vert_4_dspr2(const uint8_t *src, int32_t src_stride,
int32_t src_stride, uint8_t *dst, int32_t dst_stride,
uint8_t *dst, const int16_t *filter_y, int32_t w,
int32_t dst_stride,
const int16_t *filter_y,
int32_t w,
int32_t h) { int32_t h) {
int32_t x, y; int32_t x, y;
const uint8_t *src_ptr; const uint8_t *src_ptr;
@ -53,7 +50,7 @@ static void convolve_avg_vert_4_dspr2(const uint8_t *src,
src_ptr = src + x; src_ptr = src + x;
dst_ptr = dst + x; dst_ptr = dst + x;
__asm__ __volatile__ ( __asm__ __volatile__(
"ulw %[load1], 0(%[src_ptr]) \n\t" "ulw %[load1], 0(%[src_ptr]) \n\t"
"add %[src_ptr], %[src_ptr], %[src_stride] \n\t" "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
"ulw %[load2], 0(%[src_ptr]) \n\t" "ulw %[load2], 0(%[src_ptr]) \n\t"
@ -160,18 +157,16 @@ static void convolve_avg_vert_4_dspr2(const uint8_t *src,
"sb %[store1], 2(%[dst_ptr]) \n\t" "sb %[store1], 2(%[dst_ptr]) \n\t"
"sb %[store2], 3(%[dst_ptr]) \n\t" "sb %[store2], 3(%[dst_ptr]) \n\t"
: [load1] "=&r" (load1), [load2] "=&r" (load2), : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
[load3] "=&r" (load3), [load4] "=&r" (load4), [load4] "=&r"(load4), [p1] "=&r"(p1), [p2] "=&r"(p2),
[p1] "=&r" (p1), [p2] "=&r" (p2), [n1] "=&r" (n1), [n2] "=&r" (n2), [n1] "=&r"(n1), [n2] "=&r"(n2), [scratch1] "=&r"(scratch1),
[scratch1] "=&r" (scratch1), [scratch2] "=&r" (scratch2), [scratch2] "=&r"(scratch2), [Temp1] "=&r"(Temp1),
[Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp2] "=&r"(Temp2), [store1] "=&r"(store1),
[store1] "=&r" (store1), [store2] "=&r" (store2), [store2] "=&r"(store2), [src_ptr] "+r"(src_ptr)
[src_ptr] "+r" (src_ptr) : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
: [vector1b] "r" (vector1b), [vector2b] "r" (vector2b), [vector3b] "r"(vector3b), [vector4b] "r"(vector4b),
[vector3b] "r" (vector3b), [vector4b] "r" (vector4b), [vector4a] "r"(vector4a), [src_stride] "r"(src_stride),
[vector4a] "r" (vector4a), [cm] "r"(cm), [dst_ptr] "r"(dst_ptr));
[src_stride] "r" (src_stride), [cm] "r" (cm), [dst_ptr] "r" (dst_ptr)
);
} }
/* Next row... */ /* Next row... */
@ -180,12 +175,9 @@ static void convolve_avg_vert_4_dspr2(const uint8_t *src,
} }
} }
static void convolve_avg_vert_64_dspr2(const uint8_t *src, static void convolve_avg_vert_64_dspr2(const uint8_t *src, int32_t src_stride,
int32_t src_stride, uint8_t *dst, int32_t dst_stride,
uint8_t *dst, const int16_t *filter_y, int32_t h) {
int32_t dst_stride,
const int16_t *filter_y,
int32_t h) {
int32_t x, y; int32_t x, y;
const uint8_t *src_ptr; const uint8_t *src_ptr;
uint8_t *dst_ptr; uint8_t *dst_ptr;
@ -215,7 +207,7 @@ static void convolve_avg_vert_64_dspr2(const uint8_t *src,
src_ptr = src + x; src_ptr = src + x;
dst_ptr = dst + x; dst_ptr = dst + x;
__asm__ __volatile__ ( __asm__ __volatile__(
"ulw %[load1], 0(%[src_ptr]) \n\t" "ulw %[load1], 0(%[src_ptr]) \n\t"
"add %[src_ptr], %[src_ptr], %[src_stride] \n\t" "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
"ulw %[load2], 0(%[src_ptr]) \n\t" "ulw %[load2], 0(%[src_ptr]) \n\t"
@ -322,18 +314,16 @@ static void convolve_avg_vert_64_dspr2(const uint8_t *src,
"sb %[store1], 2(%[dst_ptr]) \n\t" "sb %[store1], 2(%[dst_ptr]) \n\t"
"sb %[store2], 3(%[dst_ptr]) \n\t" "sb %[store2], 3(%[dst_ptr]) \n\t"
: [load1] "=&r" (load1), [load2] "=&r" (load2), : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
[load3] "=&r" (load3), [load4] "=&r" (load4), [load4] "=&r"(load4), [p1] "=&r"(p1), [p2] "=&r"(p2),
[p1] "=&r" (p1), [p2] "=&r" (p2), [n1] "=&r" (n1), [n2] "=&r" (n2), [n1] "=&r"(n1), [n2] "=&r"(n2), [scratch1] "=&r"(scratch1),
[scratch1] "=&r" (scratch1), [scratch2] "=&r" (scratch2), [scratch2] "=&r"(scratch2), [Temp1] "=&r"(Temp1),
[Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp2] "=&r"(Temp2), [store1] "=&r"(store1),
[store1] "=&r" (store1), [store2] "=&r" (store2), [store2] "=&r"(store2), [src_ptr] "+r"(src_ptr)
[src_ptr] "+r" (src_ptr) : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
: [vector1b] "r" (vector1b), [vector2b] "r" (vector2b), [vector3b] "r"(vector3b), [vector4b] "r"(vector4b),
[vector3b] "r" (vector3b), [vector4b] "r" (vector4b), [vector4a] "r"(vector4a), [src_stride] "r"(src_stride),
[vector4a] "r" (vector4a), [cm] "r"(cm), [dst_ptr] "r"(dst_ptr));
[src_stride] "r" (src_stride), [cm] "r" (cm), [dst_ptr] "r" (dst_ptr)
);
} }
/* Next row... */ /* Next row... */
@ -345,26 +335,21 @@ static void convolve_avg_vert_64_dspr2(const uint8_t *src,
void vpx_convolve8_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride, void vpx_convolve8_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride, uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4, const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4, const int16_t *filter_y, int y_step_q4, int w,
int w, int h) { int h) {
assert(y_step_q4 == 16); assert(y_step_q4 == 16);
assert(((const int32_t *)filter_y)[1] != 0x800000); assert(((const int32_t *)filter_y)[1] != 0x800000);
if (((const int32_t *)filter_y)[0] == 0) { if (((const int32_t *)filter_y)[0] == 0) {
vpx_convolve2_avg_vert_dspr2(src, src_stride, vpx_convolve2_avg_vert_dspr2(src, src_stride, dst, dst_stride, filter_x,
dst, dst_stride, x_step_q4, filter_y, y_step_q4, w, h);
filter_x, x_step_q4,
filter_y, y_step_q4,
w, h);
} else { } else {
uint32_t pos = 38; uint32_t pos = 38;
/* bit position for extract from acc */ /* bit position for extract from acc */
__asm__ __volatile__ ( __asm__ __volatile__("wrdsp %[pos], 1 \n\t"
"wrdsp %[pos], 1 \n\t"
: :
: [pos] "r" (pos) : [pos] "r"(pos));
);
prefetch_store(dst); prefetch_store(dst);
@ -373,22 +358,17 @@ void vpx_convolve8_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
case 8: case 8:
case 16: case 16:
case 32: case 32:
convolve_avg_vert_4_dspr2(src, src_stride, convolve_avg_vert_4_dspr2(src, src_stride, dst, dst_stride, filter_y, w,
dst, dst_stride, h);
filter_y, w, h);
break; break;
case 64: case 64:
prefetch_store(dst + 32); prefetch_store(dst + 32);
convolve_avg_vert_64_dspr2(src, src_stride, convolve_avg_vert_64_dspr2(src, src_stride, dst, dst_stride, filter_y,
dst, dst_stride, h);
filter_y, h);
break; break;
default: default:
vpx_convolve8_avg_vert_c(src, src_stride, vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter_x,
dst, dst_stride, x_step_q4, filter_y, y_step_q4, w, h);
filter_x, x_step_q4,
filter_y, y_step_q4,
w, h);
break; break;
} }
} }
@ -397,8 +377,8 @@ void vpx_convolve8_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
void vpx_convolve8_avg_dspr2(const uint8_t *src, ptrdiff_t src_stride, void vpx_convolve8_avg_dspr2(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride, uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4, const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4, const int16_t *filter_y, int y_step_q4, int w,
int w, int h) { int h) {
/* Fixed size intermediate buffer places limits on parameters. */ /* Fixed size intermediate buffer places limits on parameters. */
DECLARE_ALIGNED(32, uint8_t, temp[64 * 135]); DECLARE_ALIGNED(32, uint8_t, temp[64 * 135]);
int32_t intermediate_height = ((h * y_step_q4) >> 4) + 7; int32_t intermediate_height = ((h * y_step_q4) >> 4) + 7;
@ -408,27 +388,20 @@ void vpx_convolve8_avg_dspr2(const uint8_t *src, ptrdiff_t src_stride,
assert(x_step_q4 == 16); assert(x_step_q4 == 16);
assert(y_step_q4 == 16); assert(y_step_q4 == 16);
if (intermediate_height < h) if (intermediate_height < h) intermediate_height = h;
intermediate_height = h;
vpx_convolve8_horiz(src - (src_stride * 3), src_stride, vpx_convolve8_horiz(src - (src_stride * 3), src_stride, temp, 64, filter_x,
temp, 64, x_step_q4, filter_y, y_step_q4, w, intermediate_height);
filter_x, x_step_q4,
filter_y, y_step_q4,
w, intermediate_height);
vpx_convolve8_avg_vert(temp + 64 * 3, 64, vpx_convolve8_avg_vert(temp + 64 * 3, 64, dst, dst_stride, filter_x,
dst, dst_stride, x_step_q4, filter_y, y_step_q4, w, h);
filter_x, x_step_q4,
filter_y, y_step_q4,
w, h);
} }
void vpx_convolve_avg_dspr2(const uint8_t *src, ptrdiff_t src_stride, void vpx_convolve_avg_dspr2(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride, uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int filter_x_stride, const int16_t *filter_x, int filter_x_stride,
const int16_t *filter_y, int filter_y_stride, const int16_t *filter_y, int filter_y_stride, int w,
int w, int h) { int h) {
int x, y; int x, y;
uint32_t tp1, tp2, tn1; uint32_t tp1, tp2, tn1;
uint32_t tp3, tp4, tn2; uint32_t tp3, tp4, tn2;
@ -441,21 +414,19 @@ void vpx_convolve_avg_dspr2(const uint8_t *src, ptrdiff_t src_stride,
switch (w) { switch (w) {
case 4: case 4:
/* 1 word storage */ /* 1 word storage */
for (y = h; y--; ) { for (y = h; y--;) {
prefetch_load(src + src_stride); prefetch_load(src + src_stride);
prefetch_load(src + src_stride + 32); prefetch_load(src + src_stride + 32);
prefetch_store(dst + dst_stride); prefetch_store(dst + dst_stride);
__asm__ __volatile__ ( __asm__ __volatile__(
"ulw %[tp1], 0(%[src]) \n\t" "ulw %[tp1], 0(%[src]) \n\t"
"ulw %[tp2], 0(%[dst]) \n\t" "ulw %[tp2], 0(%[dst]) \n\t"
"adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */
"sw %[tn1], 0(%[dst]) \n\t" /* store */ "sw %[tn1], 0(%[dst]) \n\t" /* store */
: [tn1] "=&r" (tn1), [tp1] "=&r" (tp1), : [tn1] "=&r"(tn1), [tp1] "=&r"(tp1), [tp2] "=&r"(tp2)
[tp2] "=&r" (tp2) : [src] "r"(src), [dst] "r"(dst));
: [src] "r" (src), [dst] "r" (dst)
);
src += src_stride; src += src_stride;
dst += dst_stride; dst += dst_stride;
@ -463,12 +434,12 @@ void vpx_convolve_avg_dspr2(const uint8_t *src, ptrdiff_t src_stride,
break; break;
case 8: case 8:
/* 2 word storage */ /* 2 word storage */
for (y = h; y--; ) { for (y = h; y--;) {
prefetch_load(src + src_stride); prefetch_load(src + src_stride);
prefetch_load(src + src_stride + 32); prefetch_load(src + src_stride + 32);
prefetch_store(dst + dst_stride); prefetch_store(dst + dst_stride);
__asm__ __volatile__ ( __asm__ __volatile__(
"ulw %[tp1], 0(%[src]) \n\t" "ulw %[tp1], 0(%[src]) \n\t"
"ulw %[tp2], 0(%[dst]) \n\t" "ulw %[tp2], 0(%[dst]) \n\t"
"ulw %[tp3], 4(%[src]) \n\t" "ulw %[tp3], 4(%[src]) \n\t"
@ -478,11 +449,9 @@ void vpx_convolve_avg_dspr2(const uint8_t *src, ptrdiff_t src_stride,
"adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */
"sw %[tn2], 4(%[dst]) \n\t" /* store */ "sw %[tn2], 4(%[dst]) \n\t" /* store */
: [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3),
[tp3] "=&r" (tp3), [tp4] "=&r" (tp4), [tp4] "=&r"(tp4), [tn1] "=&r"(tn1), [tn2] "=&r"(tn2)
[tn1] "=&r" (tn1), [tn2] "=&r" (tn2) : [src] "r"(src), [dst] "r"(dst));
: [src] "r" (src), [dst] "r" (dst)
);
src += src_stride; src += src_stride;
dst += dst_stride; dst += dst_stride;
@ -490,12 +459,12 @@ void vpx_convolve_avg_dspr2(const uint8_t *src, ptrdiff_t src_stride,
break; break;
case 16: case 16:
/* 4 word storage */ /* 4 word storage */
for (y = h; y--; ) { for (y = h; y--;) {
prefetch_load(src + src_stride); prefetch_load(src + src_stride);
prefetch_load(src + src_stride + 32); prefetch_load(src + src_stride + 32);
prefetch_store(dst + dst_stride); prefetch_store(dst + dst_stride);
__asm__ __volatile__ ( __asm__ __volatile__(
"ulw %[tp1], 0(%[src]) \n\t" "ulw %[tp1], 0(%[src]) \n\t"
"ulw %[tp2], 0(%[dst]) \n\t" "ulw %[tp2], 0(%[dst]) \n\t"
"ulw %[tp3], 4(%[src]) \n\t" "ulw %[tp3], 4(%[src]) \n\t"
@ -513,11 +482,9 @@ void vpx_convolve_avg_dspr2(const uint8_t *src, ptrdiff_t src_stride,
"adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */
"sw %[tn2], 12(%[dst]) \n\t" /* store */ "sw %[tn2], 12(%[dst]) \n\t" /* store */
: [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3),
[tp3] "=&r" (tp3), [tp4] "=&r" (tp4), [tp4] "=&r"(tp4), [tn1] "=&r"(tn1), [tn2] "=&r"(tn2)
[tn1] "=&r" (tn1), [tn2] "=&r" (tn2) : [src] "r"(src), [dst] "r"(dst));
: [src] "r" (src), [dst] "r" (dst)
);
src += src_stride; src += src_stride;
dst += dst_stride; dst += dst_stride;
@ -525,12 +492,12 @@ void vpx_convolve_avg_dspr2(const uint8_t *src, ptrdiff_t src_stride,
break; break;
case 32: case 32:
/* 8 word storage */ /* 8 word storage */
for (y = h; y--; ) { for (y = h; y--;) {
prefetch_load(src + src_stride); prefetch_load(src + src_stride);
prefetch_load(src + src_stride + 32); prefetch_load(src + src_stride + 32);
prefetch_store(dst + dst_stride); prefetch_store(dst + dst_stride);
__asm__ __volatile__ ( __asm__ __volatile__(
"ulw %[tp1], 0(%[src]) \n\t" "ulw %[tp1], 0(%[src]) \n\t"
"ulw %[tp2], 0(%[dst]) \n\t" "ulw %[tp2], 0(%[dst]) \n\t"
"ulw %[tp3], 4(%[src]) \n\t" "ulw %[tp3], 4(%[src]) \n\t"
@ -564,11 +531,9 @@ void vpx_convolve_avg_dspr2(const uint8_t *src, ptrdiff_t src_stride,
"adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */
"sw %[tn2], 28(%[dst]) \n\t" /* store */ "sw %[tn2], 28(%[dst]) \n\t" /* store */
: [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3),
[tp3] "=&r" (tp3), [tp4] "=&r" (tp4), [tp4] "=&r"(tp4), [tn1] "=&r"(tn1), [tn2] "=&r"(tn2)
[tn1] "=&r" (tn1), [tn2] "=&r" (tn2) : [src] "r"(src), [dst] "r"(dst));
: [src] "r" (src), [dst] "r" (dst)
);
src += src_stride; src += src_stride;
dst += dst_stride; dst += dst_stride;
@ -579,14 +544,14 @@ void vpx_convolve_avg_dspr2(const uint8_t *src, ptrdiff_t src_stride,
prefetch_store(dst + 32); prefetch_store(dst + 32);
/* 16 word storage */ /* 16 word storage */
for (y = h; y--; ) { for (y = h; y--;) {
prefetch_load(src + src_stride); prefetch_load(src + src_stride);
prefetch_load(src + src_stride + 32); prefetch_load(src + src_stride + 32);
prefetch_load(src + src_stride + 64); prefetch_load(src + src_stride + 64);
prefetch_store(dst + dst_stride); prefetch_store(dst + dst_stride);
prefetch_store(dst + dst_stride + 32); prefetch_store(dst + dst_stride + 32);
__asm__ __volatile__ ( __asm__ __volatile__(
"ulw %[tp1], 0(%[src]) \n\t" "ulw %[tp1], 0(%[src]) \n\t"
"ulw %[tp2], 0(%[dst]) \n\t" "ulw %[tp2], 0(%[dst]) \n\t"
"ulw %[tp3], 4(%[src]) \n\t" "ulw %[tp3], 4(%[src]) \n\t"
@ -652,11 +617,9 @@ void vpx_convolve_avg_dspr2(const uint8_t *src, ptrdiff_t src_stride,
"adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */
"sw %[tn2], 60(%[dst]) \n\t" /* store */ "sw %[tn2], 60(%[dst]) \n\t" /* store */
: [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3),
[tp3] "=&r" (tp3), [tp4] "=&r" (tp4), [tp4] "=&r"(tp4), [tn1] "=&r"(tn1), [tn2] "=&r"(tn2)
[tn1] "=&r" (tn1), [tn2] "=&r" (tn2) : [src] "r"(src), [dst] "r"(dst));
: [src] "r" (src), [dst] "r" (dst)
);
src += src_stride; src += src_stride;
dst += dst_stride; dst += dst_stride;

View File

@ -18,12 +18,9 @@
#include "vpx_ports/mem.h" #include "vpx_ports/mem.h"
#if HAVE_DSPR2 #if HAVE_DSPR2
static void convolve_avg_horiz_4_dspr2(const uint8_t *src, static void convolve_avg_horiz_4_dspr2(const uint8_t *src, int32_t src_stride,
int32_t src_stride, uint8_t *dst, int32_t dst_stride,
uint8_t *dst, const int16_t *filter_x0, int32_t h) {
int32_t dst_stride,
const int16_t *filter_x0,
int32_t h) {
int32_t y; int32_t y;
uint8_t *cm = vpx_ff_cropTbl; uint8_t *cm = vpx_ff_cropTbl;
int32_t vector1b, vector2b, vector3b, vector4b; int32_t vector1b, vector2b, vector3b, vector4b;
@ -45,7 +42,7 @@ static void convolve_avg_horiz_4_dspr2(const uint8_t *src,
prefetch_load(src + src_stride + 32); prefetch_load(src + src_stride + 32);
prefetch_store(dst + dst_stride); prefetch_store(dst + dst_stride);
__asm__ __volatile__ ( __asm__ __volatile__(
"ulw %[tp1], 0(%[src]) \n\t" "ulw %[tp1], 0(%[src]) \n\t"
"ulw %[tp2], 4(%[src]) \n\t" "ulw %[tp2], 4(%[src]) \n\t"
@ -122,17 +119,15 @@ static void convolve_avg_horiz_4_dspr2(const uint8_t *src,
"addqh_r.w %[p2], %[p2], %[n2] \n\t" /* average odd 2 */ "addqh_r.w %[p2], %[p2], %[n2] \n\t" /* average odd 2 */
"sb %[p2], 3(%[dst]) \n\t" /* store odd 2 */ "sb %[p2], 3(%[dst]) \n\t" /* store odd 2 */
: [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn1] "=&r"(tn1),
[tn1] "=&r" (tn1), [tn2] "=&r" (tn2), [tn2] "=&r"(tn2), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3),
[p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4), [p4] "=&r"(p4), [n1] "=&r"(n1), [n2] "=&r"(n2), [n3] "=&r"(n3),
[n1] "=&r" (n1), [n2] "=&r" (n2), [n3] "=&r" (n3), [n4] "=&r" (n4), [n4] "=&r"(n4), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2),
[Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r"(Temp3), [Temp4] "=&r"(Temp4)
[Temp3] "=&r" (Temp3), [Temp4] "=&r" (Temp4) : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
: [vector1b] "r" (vector1b), [vector2b] "r" (vector2b), [vector3b] "r"(vector3b), [vector4b] "r"(vector4b),
[vector3b] "r" (vector3b), [vector4b] "r" (vector4b), [vector4a] "r"(vector4a), [cm] "r"(cm), [dst] "r"(dst),
[vector4a] "r" (vector4a), [src] "r"(src));
[cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
);
/* Next row... */ /* Next row... */
src += src_stride; src += src_stride;
@ -140,12 +135,9 @@ static void convolve_avg_horiz_4_dspr2(const uint8_t *src,
} }
} }
static void convolve_avg_horiz_8_dspr2(const uint8_t *src, static void convolve_avg_horiz_8_dspr2(const uint8_t *src, int32_t src_stride,
int32_t src_stride, uint8_t *dst, int32_t dst_stride,
uint8_t *dst, const int16_t *filter_x0, int32_t h) {
int32_t dst_stride,
const int16_t *filter_x0,
int32_t h) {
int32_t y; int32_t y;
uint8_t *cm = vpx_ff_cropTbl; uint8_t *cm = vpx_ff_cropTbl;
uint32_t vector4a = 64; uint32_t vector4a = 64;
@ -167,7 +159,7 @@ static void convolve_avg_horiz_8_dspr2(const uint8_t *src,
prefetch_load(src + src_stride + 32); prefetch_load(src + src_stride + 32);
prefetch_store(dst + dst_stride); prefetch_store(dst + dst_stride);
__asm__ __volatile__ ( __asm__ __volatile__(
"ulw %[tp1], 0(%[src]) \n\t" "ulw %[tp1], 0(%[src]) \n\t"
"ulw %[tp2], 4(%[src]) \n\t" "ulw %[tp2], 4(%[src]) \n\t"
@ -309,17 +301,15 @@ static void convolve_avg_horiz_8_dspr2(const uint8_t *src,
"sb %[tn3], 5(%[dst]) \n\t" "sb %[tn3], 5(%[dst]) \n\t"
"sb %[tn1], 7(%[dst]) \n\t" "sb %[tn1], 7(%[dst]) \n\t"
: [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn1] "=&r"(tn1),
[tn1] "=&r" (tn1), [tn2] "=&r" (tn2), [tn3] "=&r" (tn3), [tn2] "=&r"(tn2), [tn3] "=&r"(tn3), [st0] "=&r"(st0),
[st0] "=&r" (st0), [st1] "=&r" (st1), [st1] "=&r"(st1), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3),
[p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4), [p4] "=&r"(p4), [n1] "=&r"(n1), [Temp1] "=&r"(Temp1),
[n1] "=&r" (n1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3)
[Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3) : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
: [vector1b] "r" (vector1b), [vector2b] "r" (vector2b), [vector3b] "r"(vector3b), [vector4b] "r"(vector4b),
[vector3b] "r" (vector3b), [vector4b] "r" (vector4b), [vector4a] "r"(vector4a), [cm] "r"(cm), [dst] "r"(dst),
[vector4a] "r" (vector4a), [src] "r"(src));
[cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
);
/* Next row... */ /* Next row... */
src += src_stride; src += src_stride;
@ -328,11 +318,9 @@ static void convolve_avg_horiz_8_dspr2(const uint8_t *src,
} }
static void convolve_avg_horiz_16_dspr2(const uint8_t *src_ptr, static void convolve_avg_horiz_16_dspr2(const uint8_t *src_ptr,
int32_t src_stride, int32_t src_stride, uint8_t *dst_ptr,
uint8_t *dst_ptr,
int32_t dst_stride, int32_t dst_stride,
const int16_t *filter_x0, const int16_t *filter_x0, int32_t h,
int32_t h,
int32_t count) { int32_t count) {
int32_t y, c; int32_t y, c;
const uint8_t *src; const uint8_t *src;
@ -360,7 +348,7 @@ static void convolve_avg_horiz_16_dspr2(const uint8_t *src_ptr,
prefetch_store(dst_ptr + dst_stride); prefetch_store(dst_ptr + dst_stride);
for (c = 0; c < count; c++) { for (c = 0; c < count; c++) {
__asm__ __volatile__ ( __asm__ __volatile__(
"ulw %[qload1], 0(%[src]) \n\t" "ulw %[qload1], 0(%[src]) \n\t"
"ulw %[qload2], 4(%[src]) \n\t" "ulw %[qload2], 4(%[src]) \n\t"
@ -618,16 +606,15 @@ static void convolve_avg_horiz_16_dspr2(const uint8_t *src_ptr,
"sb %[qload3], 13(%[dst]) \n\t" /* store odd 7 to dst */ "sb %[qload3], 13(%[dst]) \n\t" /* store odd 7 to dst */
"sb %[qload1], 15(%[dst]) \n\t" /* store odd 8 to dst */ "sb %[qload1], 15(%[dst]) \n\t" /* store odd 8 to dst */
: [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), [st1] "=&r"(st1),
[st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3), [st2] "=&r"(st2), [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2),
[p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4), [p3] "=&r"(p3), [p4] "=&r"(p4), [qload3] "=&r"(qload3),
[qload3] "=&r" (qload3), [p5] "=&r" (p5), [p5] "=&r"(p5), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2),
[Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3) [Temp3] "=&r"(Temp3)
: [filter12] "r" (filter12), [filter34] "r" (filter34), : [filter12] "r"(filter12), [filter34] "r"(filter34),
[filter56] "r" (filter56), [filter78] "r" (filter78), [filter56] "r"(filter56), [filter78] "r"(filter78),
[vector_64] "r" (vector_64), [vector_64] "r"(vector_64), [cm] "r"(cm), [dst] "r"(dst),
[cm] "r" (cm), [dst] "r" (dst), [src] "r" (src) [src] "r"(src));
);
src += 16; src += 16;
dst += 16; dst += 16;
@ -640,11 +627,9 @@ static void convolve_avg_horiz_16_dspr2(const uint8_t *src_ptr,
} }
static void convolve_avg_horiz_64_dspr2(const uint8_t *src_ptr, static void convolve_avg_horiz_64_dspr2(const uint8_t *src_ptr,
int32_t src_stride, int32_t src_stride, uint8_t *dst_ptr,
uint8_t *dst_ptr,
int32_t dst_stride, int32_t dst_stride,
const int16_t *filter_x0, const int16_t *filter_x0, int32_t h) {
int32_t h) {
int32_t y, c; int32_t y, c;
const uint8_t *src; const uint8_t *src;
uint8_t *dst; uint8_t *dst;
@ -673,7 +658,7 @@ static void convolve_avg_horiz_64_dspr2(const uint8_t *src_ptr,
prefetch_store(dst_ptr + dst_stride + 32); prefetch_store(dst_ptr + dst_stride + 32);
for (c = 0; c < 4; c++) { for (c = 0; c < 4; c++) {
__asm__ __volatile__ ( __asm__ __volatile__(
"ulw %[qload1], 0(%[src]) \n\t" "ulw %[qload1], 0(%[src]) \n\t"
"ulw %[qload2], 4(%[src]) \n\t" "ulw %[qload2], 4(%[src]) \n\t"
@ -931,16 +916,15 @@ static void convolve_avg_horiz_64_dspr2(const uint8_t *src_ptr,
"sb %[qload3], 13(%[dst]) \n\t" /* store odd 7 to dst */ "sb %[qload3], 13(%[dst]) \n\t" /* store odd 7 to dst */
"sb %[qload1], 15(%[dst]) \n\t" /* store odd 8 to dst */ "sb %[qload1], 15(%[dst]) \n\t" /* store odd 8 to dst */
: [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), [st1] "=&r"(st1),
[st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3), [st2] "=&r"(st2), [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2),
[p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4), [p3] "=&r"(p3), [p4] "=&r"(p4), [qload3] "=&r"(qload3),
[qload3] "=&r" (qload3), [p5] "=&r" (p5), [p5] "=&r"(p5), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2),
[Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3) [Temp3] "=&r"(Temp3)
: [filter12] "r" (filter12), [filter34] "r" (filter34), : [filter12] "r"(filter12), [filter34] "r"(filter34),
[filter56] "r" (filter56), [filter78] "r" (filter78), [filter56] "r"(filter56), [filter78] "r"(filter78),
[vector_64] "r" (vector_64), [vector_64] "r"(vector_64), [cm] "r"(cm), [dst] "r"(dst),
[cm] "r" (cm), [dst] "r" (dst), [src] "r" (src) [src] "r"(src));
);
src += 16; src += 16;
dst += 16; dst += 16;
@ -961,22 +945,17 @@ void vpx_convolve8_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
assert(((const int32_t *)filter_x)[1] != 0x800000); assert(((const int32_t *)filter_x)[1] != 0x800000);
if (((const int32_t *)filter_x)[0] == 0) { if (((const int32_t *)filter_x)[0] == 0) {
vpx_convolve2_avg_horiz_dspr2(src, src_stride, vpx_convolve2_avg_horiz_dspr2(src, src_stride, dst, dst_stride, filter_x,
dst, dst_stride, x_step_q4, filter_y, y_step_q4, w, h);
filter_x, x_step_q4,
filter_y, y_step_q4,
w, h);
} else { } else {
uint32_t pos = 38; uint32_t pos = 38;
src -= 3; src -= 3;
/* bit position for extract from acc */ /* bit position for extract from acc */
__asm__ __volatile__ ( __asm__ __volatile__("wrdsp %[pos], 1 \n\t"
"wrdsp %[pos], 1 \n\t"
: :
: [pos] "r" (pos) : [pos] "r"(pos));
);
/* prefetch data to cache memory */ /* prefetch data to cache memory */
prefetch_load(src); prefetch_load(src);
@ -985,39 +964,32 @@ void vpx_convolve8_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
switch (w) { switch (w) {
case 4: case 4:
convolve_avg_horiz_4_dspr2(src, src_stride, convolve_avg_horiz_4_dspr2(src, src_stride, dst, dst_stride, filter_x,
dst, dst_stride, h);
filter_x, h);
break; break;
case 8: case 8:
convolve_avg_horiz_8_dspr2(src, src_stride, convolve_avg_horiz_8_dspr2(src, src_stride, dst, dst_stride, filter_x,
dst, dst_stride, h);
filter_x, h);
break; break;
case 16: case 16:
convolve_avg_horiz_16_dspr2(src, src_stride, convolve_avg_horiz_16_dspr2(src, src_stride, dst, dst_stride, filter_x,
dst, dst_stride, h, 1);
filter_x, h, 1);
break; break;
case 32: case 32:
convolve_avg_horiz_16_dspr2(src, src_stride, convolve_avg_horiz_16_dspr2(src, src_stride, dst, dst_stride, filter_x,
dst, dst_stride, h, 2);
filter_x, h, 2);
break; break;
case 64: case 64:
prefetch_load(src + 64); prefetch_load(src + 64);
prefetch_store(dst + 32); prefetch_store(dst + 32);
convolve_avg_horiz_64_dspr2(src, src_stride, convolve_avg_horiz_64_dspr2(src, src_stride, dst, dst_stride, filter_x,
dst, dst_stride, h);
filter_x, h);
break; break;
default: default:
vpx_convolve8_avg_horiz_c(src + 3, src_stride, vpx_convolve8_avg_horiz_c(src + 3, src_stride, dst, dst_stride,
dst, dst_stride, filter_x, x_step_q4, filter_y, y_step_q4, w,
filter_x, x_step_q4, h);
filter_y, y_step_q4,
w, h);
break; break;
} }
} }

File diff suppressed because it is too large Load Diff

View File

@ -18,12 +18,9 @@
#include "vpx_ports/mem.h" #include "vpx_ports/mem.h"
#if HAVE_DSPR2 #if HAVE_DSPR2
static void convolve_horiz_4_dspr2(const uint8_t *src, static void convolve_horiz_4_dspr2(const uint8_t *src, int32_t src_stride,
int32_t src_stride, uint8_t *dst, int32_t dst_stride,
uint8_t *dst, const int16_t *filter_x0, int32_t h) {
int32_t dst_stride,
const int16_t *filter_x0,
int32_t h) {
int32_t y; int32_t y;
uint8_t *cm = vpx_ff_cropTbl; uint8_t *cm = vpx_ff_cropTbl;
int32_t vector1b, vector2b, vector3b, vector4b; int32_t vector1b, vector2b, vector3b, vector4b;
@ -45,7 +42,7 @@ static void convolve_horiz_4_dspr2(const uint8_t *src,
prefetch_load(src + src_stride + 32); prefetch_load(src + src_stride + 32);
prefetch_store(dst + dst_stride); prefetch_store(dst + dst_stride);
__asm__ __volatile__ ( __asm__ __volatile__(
"ulw %[tp1], 0(%[src]) \n\t" "ulw %[tp1], 0(%[src]) \n\t"
"ulw %[tp2], 4(%[src]) \n\t" "ulw %[tp2], 4(%[src]) \n\t"
@ -111,17 +108,15 @@ static void convolve_horiz_4_dspr2(const uint8_t *src,
"sb %[tp2], 2(%[dst]) \n\t" "sb %[tp2], 2(%[dst]) \n\t"
"sb %[n2], 3(%[dst]) \n\t" "sb %[n2], 3(%[dst]) \n\t"
: [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn1] "=&r"(tn1),
[tn1] "=&r" (tn1), [tn2] "=&r" (tn2), [tn2] "=&r"(tn2), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3),
[p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4), [p4] "=&r"(p4), [n1] "=&r"(n1), [n2] "=&r"(n2), [n3] "=&r"(n3),
[n1] "=&r" (n1), [n2] "=&r" (n2), [n3] "=&r" (n3), [n4] "=&r" (n4), [n4] "=&r"(n4), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2),
[Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r"(Temp3), [Temp4] "=&r"(Temp4)
[Temp3] "=&r" (Temp3), [Temp4] "=&r" (Temp4) : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
: [vector1b] "r" (vector1b), [vector2b] "r" (vector2b), [vector3b] "r"(vector3b), [vector4b] "r"(vector4b),
[vector3b] "r" (vector3b), [vector4b] "r" (vector4b), [vector4a] "r"(vector4a), [cm] "r"(cm), [dst] "r"(dst),
[vector4a] "r" (vector4a), [src] "r"(src));
[cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
);
/* Next row... */ /* Next row... */
src += src_stride; src += src_stride;
@ -129,12 +124,9 @@ static void convolve_horiz_4_dspr2(const uint8_t *src,
} }
} }
static void convolve_horiz_8_dspr2(const uint8_t *src, static void convolve_horiz_8_dspr2(const uint8_t *src, int32_t src_stride,
int32_t src_stride, uint8_t *dst, int32_t dst_stride,
uint8_t *dst, const int16_t *filter_x0, int32_t h) {
int32_t dst_stride,
const int16_t *filter_x0,
int32_t h) {
int32_t y; int32_t y;
uint8_t *cm = vpx_ff_cropTbl; uint8_t *cm = vpx_ff_cropTbl;
uint32_t vector4a = 64; uint32_t vector4a = 64;
@ -156,7 +148,7 @@ static void convolve_horiz_8_dspr2(const uint8_t *src,
prefetch_load(src + src_stride + 32); prefetch_load(src + src_stride + 32);
prefetch_store(dst + dst_stride); prefetch_store(dst + dst_stride);
__asm__ __volatile__ ( __asm__ __volatile__(
"ulw %[tp1], 0(%[src]) \n\t" "ulw %[tp1], 0(%[src]) \n\t"
"ulw %[tp2], 4(%[src]) \n\t" "ulw %[tp2], 4(%[src]) \n\t"
@ -275,17 +267,15 @@ static void convolve_horiz_8_dspr2(const uint8_t *src,
"sb %[p2], 5(%[dst]) \n\t" "sb %[p2], 5(%[dst]) \n\t"
"sb %[n1], 7(%[dst]) \n\t" "sb %[n1], 7(%[dst]) \n\t"
: [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn1] "=&r"(tn1),
[tn1] "=&r" (tn1), [tn2] "=&r" (tn2), [tn3] "=&r" (tn3), [tn2] "=&r"(tn2), [tn3] "=&r"(tn3), [st0] "=&r"(st0),
[st0] "=&r" (st0), [st1] "=&r" (st1), [st1] "=&r"(st1), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3),
[p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4), [p4] "=&r"(p4), [n1] "=&r"(n1), [Temp1] "=&r"(Temp1),
[n1] "=&r" (n1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3)
[Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3) : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
: [vector1b] "r" (vector1b), [vector2b] "r" (vector2b), [vector3b] "r"(vector3b), [vector4b] "r"(vector4b),
[vector3b] "r" (vector3b), [vector4b] "r" (vector4b), [vector4a] "r"(vector4a), [cm] "r"(cm), [dst] "r"(dst),
[vector4a] "r" (vector4a), [src] "r"(src));
[cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
);
/* Next row... */ /* Next row... */
src += src_stride; src += src_stride;
@ -293,12 +283,9 @@ static void convolve_horiz_8_dspr2(const uint8_t *src,
} }
} }
static void convolve_horiz_16_dspr2(const uint8_t *src_ptr, static void convolve_horiz_16_dspr2(const uint8_t *src_ptr, int32_t src_stride,
int32_t src_stride, uint8_t *dst_ptr, int32_t dst_stride,
uint8_t *dst_ptr, const int16_t *filter_x0, int32_t h,
int32_t dst_stride,
const int16_t *filter_x0,
int32_t h,
int32_t count) { int32_t count) {
int32_t y, c; int32_t y, c;
const uint8_t *src; const uint8_t *src;
@ -326,7 +313,7 @@ static void convolve_horiz_16_dspr2(const uint8_t *src_ptr,
prefetch_store(dst_ptr + dst_stride); prefetch_store(dst_ptr + dst_stride);
for (c = 0; c < count; c++) { for (c = 0; c < count; c++) {
__asm__ __volatile__ ( __asm__ __volatile__(
"ulw %[qload1], 0(%[src]) \n\t" "ulw %[qload1], 0(%[src]) \n\t"
"ulw %[qload2], 4(%[src]) \n\t" "ulw %[qload2], 4(%[src]) \n\t"
@ -542,17 +529,15 @@ static void convolve_horiz_16_dspr2(const uint8_t *src_ptr,
"sb %[st3], 13(%[dst]) \n\t" /* odd 7 */ "sb %[st3], 13(%[dst]) \n\t" /* odd 7 */
"sb %[st1], 15(%[dst]) \n\t" /* odd 8 */ "sb %[st1], 15(%[dst]) \n\t" /* odd 8 */
: [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), [qload3] "=&r" (qload3), : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2),
[st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3), [qload3] "=&r"(qload3), [st1] "=&r"(st1), [st2] "=&r"(st2),
[p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4), [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3),
[p5] "=&r" (p5), [p4] "=&r"(p4), [p5] "=&r"(p5), [Temp1] "=&r"(Temp1),
[Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3) [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3)
: [filter12] "r" (filter12), [filter34] "r" (filter34), : [filter12] "r"(filter12), [filter34] "r"(filter34),
[filter56] "r" (filter56), [filter78] "r" (filter78), [filter56] "r"(filter56), [filter78] "r"(filter78),
[vector_64] "r" (vector_64), [vector_64] "r"(vector_64), [cm] "r"(cm), [dst] "r"(dst),
[cm] "r" (cm), [dst] "r" (dst), [src] "r"(src));
[src] "r" (src)
);
src += 16; src += 16;
dst += 16; dst += 16;
@ -564,12 +549,9 @@ static void convolve_horiz_16_dspr2(const uint8_t *src_ptr,
} }
} }
static void convolve_horiz_64_dspr2(const uint8_t *src_ptr, static void convolve_horiz_64_dspr2(const uint8_t *src_ptr, int32_t src_stride,
int32_t src_stride, uint8_t *dst_ptr, int32_t dst_stride,
uint8_t *dst_ptr, const int16_t *filter_x0, int32_t h) {
int32_t dst_stride,
const int16_t *filter_x0,
int32_t h) {
int32_t y, c; int32_t y, c;
const uint8_t *src; const uint8_t *src;
uint8_t *dst; uint8_t *dst;
@ -598,7 +580,7 @@ static void convolve_horiz_64_dspr2(const uint8_t *src_ptr,
prefetch_store(dst_ptr + dst_stride + 32); prefetch_store(dst_ptr + dst_stride + 32);
for (c = 0; c < 4; c++) { for (c = 0; c < 4; c++) {
__asm__ __volatile__ ( __asm__ __volatile__(
"ulw %[qload1], 0(%[src]) \n\t" "ulw %[qload1], 0(%[src]) \n\t"
"ulw %[qload2], 4(%[src]) \n\t" "ulw %[qload2], 4(%[src]) \n\t"
@ -814,17 +796,15 @@ static void convolve_horiz_64_dspr2(const uint8_t *src_ptr,
"sb %[st3], 13(%[dst]) \n\t" /* odd 7 */ "sb %[st3], 13(%[dst]) \n\t" /* odd 7 */
"sb %[st1], 15(%[dst]) \n\t" /* odd 8 */ "sb %[st1], 15(%[dst]) \n\t" /* odd 8 */
: [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), [qload3] "=&r" (qload3), : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2),
[st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3), [qload3] "=&r"(qload3), [st1] "=&r"(st1), [st2] "=&r"(st2),
[p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4), [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3),
[p5] "=&r" (p5), [p4] "=&r"(p4), [p5] "=&r"(p5), [Temp1] "=&r"(Temp1),
[Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3) [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3)
: [filter12] "r" (filter12), [filter34] "r" (filter34), : [filter12] "r"(filter12), [filter34] "r"(filter34),
[filter56] "r" (filter56), [filter78] "r" (filter78), [filter56] "r"(filter56), [filter78] "r"(filter78),
[vector_64] "r" (vector_64), [vector_64] "r"(vector_64), [cm] "r"(cm), [dst] "r"(dst),
[cm] "r" (cm), [dst] "r" (dst), [src] "r"(src));
[src] "r" (src)
);
src += 16; src += 16;
dst += 16; dst += 16;
@ -839,17 +819,14 @@ static void convolve_horiz_64_dspr2(const uint8_t *src_ptr,
void vpx_convolve8_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, void vpx_convolve8_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride, uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4, const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4, const int16_t *filter_y, int y_step_q4, int w,
int w, int h) { int h) {
assert(x_step_q4 == 16); assert(x_step_q4 == 16);
assert(((const int32_t *)filter_x)[1] != 0x800000); assert(((const int32_t *)filter_x)[1] != 0x800000);
if (((const int32_t *)filter_x)[0] == 0) { if (((const int32_t *)filter_x)[0] == 0) {
vpx_convolve2_horiz_dspr2(src, src_stride, vpx_convolve2_horiz_dspr2(src, src_stride, dst, dst_stride, filter_x,
dst, dst_stride, x_step_q4, filter_y, y_step_q4, w, h);
filter_x, x_step_q4,
filter_y, y_step_q4,
w, h);
} else { } else {
uint32_t pos = 38; uint32_t pos = 38;
@ -857,11 +834,9 @@ void vpx_convolve8_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
src -= 3; src -= 3;
/* bit positon for extract from acc */ /* bit positon for extract from acc */
__asm__ __volatile__ ( __asm__ __volatile__("wrdsp %[pos], 1 \n\t"
"wrdsp %[pos], 1 \n\t"
: :
: [pos] "r" (pos) : [pos] "r"(pos));
);
/* prefetch data to cache memory */ /* prefetch data to cache memory */
prefetch_load(src); prefetch_load(src);
@ -870,39 +845,31 @@ void vpx_convolve8_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
switch (w) { switch (w) {
case 4: case 4:
convolve_horiz_4_dspr2(src, (int32_t)src_stride, convolve_horiz_4_dspr2(src, (int32_t)src_stride, dst,
dst, (int32_t)dst_stride, (int32_t)dst_stride, filter_x, (int32_t)h);
filter_x, (int32_t)h);
break; break;
case 8: case 8:
convolve_horiz_8_dspr2(src, (int32_t)src_stride, convolve_horiz_8_dspr2(src, (int32_t)src_stride, dst,
dst, (int32_t)dst_stride, (int32_t)dst_stride, filter_x, (int32_t)h);
filter_x, (int32_t)h);
break; break;
case 16: case 16:
convolve_horiz_16_dspr2(src, (int32_t)src_stride, convolve_horiz_16_dspr2(src, (int32_t)src_stride, dst,
dst, (int32_t)dst_stride, (int32_t)dst_stride, filter_x, (int32_t)h, 1);
filter_x, (int32_t)h, 1);
break; break;
case 32: case 32:
convolve_horiz_16_dspr2(src, (int32_t)src_stride, convolve_horiz_16_dspr2(src, (int32_t)src_stride, dst,
dst, (int32_t)dst_stride, (int32_t)dst_stride, filter_x, (int32_t)h, 2);
filter_x, (int32_t)h, 2);
break; break;
case 64: case 64:
prefetch_load(src + 64); prefetch_load(src + 64);
prefetch_store(dst + 32); prefetch_store(dst + 32);
convolve_horiz_64_dspr2(src, (int32_t)src_stride, convolve_horiz_64_dspr2(src, (int32_t)src_stride, dst,
dst, (int32_t)dst_stride, (int32_t)dst_stride, filter_x, (int32_t)h);
filter_x, (int32_t)h);
break; break;
default: default:
vpx_convolve8_horiz_c(src + 3, src_stride, vpx_convolve8_horiz_c(src + 3, src_stride, dst, dst_stride, filter_x,
dst, dst_stride, x_step_q4, filter_y, y_step_q4, w, h);
filter_x, x_step_q4,
filter_y, y_step_q4,
w, h);
break; break;
} }
} }

View File

@ -18,12 +18,9 @@
#include "vpx_ports/mem.h" #include "vpx_ports/mem.h"
#if HAVE_DSPR2 #if HAVE_DSPR2
static void convolve_vert_4_dspr2(const uint8_t *src, static void convolve_vert_4_dspr2(const uint8_t *src, int32_t src_stride,
int32_t src_stride, uint8_t *dst, int32_t dst_stride,
uint8_t *dst, const int16_t *filter_y, int32_t w,
int32_t dst_stride,
const int16_t *filter_y,
int32_t w,
int32_t h) { int32_t h) {
int32_t x, y; int32_t x, y;
const uint8_t *src_ptr; const uint8_t *src_ptr;
@ -53,7 +50,7 @@ static void convolve_vert_4_dspr2(const uint8_t *src,
src_ptr = src + x; src_ptr = src + x;
dst_ptr = dst + x; dst_ptr = dst + x;
__asm__ __volatile__ ( __asm__ __volatile__(
"ulw %[load1], 0(%[src_ptr]) \n\t" "ulw %[load1], 0(%[src_ptr]) \n\t"
"add %[src_ptr], %[src_ptr], %[src_stride] \n\t" "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
"ulw %[load2], 0(%[src_ptr]) \n\t" "ulw %[load2], 0(%[src_ptr]) \n\t"
@ -152,19 +149,16 @@ static void convolve_vert_4_dspr2(const uint8_t *src,
"sb %[store1], 2(%[dst_ptr]) \n\t" "sb %[store1], 2(%[dst_ptr]) \n\t"
"sb %[store2], 3(%[dst_ptr]) \n\t" "sb %[store2], 3(%[dst_ptr]) \n\t"
: [load1] "=&r" (load1), [load2] "=&r" (load2), : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
[load3] "=&r" (load3), [load4] "=&r" (load4), [load4] "=&r"(load4), [p1] "=&r"(p1), [p2] "=&r"(p2),
[p1] "=&r" (p1), [p2] "=&r" (p2), [n1] "=&r"(n1), [n2] "=&r"(n2), [scratch1] "=&r"(scratch1),
[n1] "=&r" (n1), [n2] "=&r" (n2), [scratch2] "=&r"(scratch2), [Temp1] "=&r"(Temp1),
[scratch1] "=&r" (scratch1), [scratch2] "=&r" (scratch2), [Temp2] "=&r"(Temp2), [store1] "=&r"(store1),
[Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [store2] "=&r"(store2), [src_ptr] "+r"(src_ptr)
[store1] "=&r" (store1), [store2] "=&r" (store2), : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
[src_ptr] "+r" (src_ptr) [vector3b] "r"(vector3b), [vector4b] "r"(vector4b),
: [vector1b] "r" (vector1b), [vector2b] "r" (vector2b), [vector4a] "r"(vector4a), [src_stride] "r"(src_stride),
[vector3b] "r" (vector3b), [vector4b] "r" (vector4b), [cm] "r"(cm), [dst_ptr] "r"(dst_ptr));
[vector4a] "r" (vector4a), [src_stride] "r" (src_stride),
[cm] "r" (cm), [dst_ptr] "r" (dst_ptr)
);
} }
/* Next row... */ /* Next row... */
@ -173,12 +167,9 @@ static void convolve_vert_4_dspr2(const uint8_t *src,
} }
} }
static void convolve_vert_64_dspr2(const uint8_t *src, static void convolve_vert_64_dspr2(const uint8_t *src, int32_t src_stride,
int32_t src_stride, uint8_t *dst, int32_t dst_stride,
uint8_t *dst, const int16_t *filter_y, int32_t h) {
int32_t dst_stride,
const int16_t *filter_y,
int32_t h) {
int32_t x, y; int32_t x, y;
const uint8_t *src_ptr; const uint8_t *src_ptr;
uint8_t *dst_ptr; uint8_t *dst_ptr;
@ -208,7 +199,7 @@ static void convolve_vert_64_dspr2(const uint8_t *src,
src_ptr = src + x; src_ptr = src + x;
dst_ptr = dst + x; dst_ptr = dst + x;
__asm__ __volatile__ ( __asm__ __volatile__(
"ulw %[load1], 0(%[src_ptr]) \n\t" "ulw %[load1], 0(%[src_ptr]) \n\t"
"add %[src_ptr], %[src_ptr], %[src_stride] \n\t" "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
"ulw %[load2], 0(%[src_ptr]) \n\t" "ulw %[load2], 0(%[src_ptr]) \n\t"
@ -307,19 +298,16 @@ static void convolve_vert_64_dspr2(const uint8_t *src,
"sb %[store1], 2(%[dst_ptr]) \n\t" "sb %[store1], 2(%[dst_ptr]) \n\t"
"sb %[store2], 3(%[dst_ptr]) \n\t" "sb %[store2], 3(%[dst_ptr]) \n\t"
: [load1] "=&r" (load1), [load2] "=&r" (load2), : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
[load3] "=&r" (load3), [load4] "=&r" (load4), [load4] "=&r"(load4), [p1] "=&r"(p1), [p2] "=&r"(p2),
[p1] "=&r" (p1), [p2] "=&r" (p2), [n1] "=&r"(n1), [n2] "=&r"(n2), [scratch1] "=&r"(scratch1),
[n1] "=&r" (n1), [n2] "=&r" (n2), [scratch2] "=&r"(scratch2), [Temp1] "=&r"(Temp1),
[scratch1] "=&r" (scratch1), [scratch2] "=&r" (scratch2), [Temp2] "=&r"(Temp2), [store1] "=&r"(store1),
[Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [store2] "=&r"(store2), [src_ptr] "+r"(src_ptr)
[store1] "=&r" (store1), [store2] "=&r" (store2), : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
[src_ptr] "+r" (src_ptr) [vector3b] "r"(vector3b), [vector4b] "r"(vector4b),
: [vector1b] "r" (vector1b), [vector2b] "r" (vector2b), [vector4a] "r"(vector4a), [src_stride] "r"(src_stride),
[vector3b] "r" (vector3b), [vector4b] "r" (vector4b), [cm] "r"(cm), [dst_ptr] "r"(dst_ptr));
[vector4a] "r" (vector4a), [src_stride] "r" (src_stride),
[cm] "r" (cm), [dst_ptr] "r" (dst_ptr)
);
} }
/* Next row... */ /* Next row... */
@ -331,50 +319,38 @@ static void convolve_vert_64_dspr2(const uint8_t *src,
void vpx_convolve8_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride, void vpx_convolve8_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride, uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4, const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4, const int16_t *filter_y, int y_step_q4, int w,
int w, int h) { int h) {
assert(y_step_q4 == 16); assert(y_step_q4 == 16);
assert(((const int32_t *)filter_y)[1] != 0x800000); assert(((const int32_t *)filter_y)[1] != 0x800000);
if (((const int32_t *)filter_y)[0] == 0) { if (((const int32_t *)filter_y)[0] == 0) {
vpx_convolve2_vert_dspr2(src, src_stride, vpx_convolve2_vert_dspr2(src, src_stride, dst, dst_stride, filter_x,
dst, dst_stride, x_step_q4, filter_y, y_step_q4, w, h);
filter_x, x_step_q4,
filter_y, y_step_q4,
w, h);
} else { } else {
uint32_t pos = 38; uint32_t pos = 38;
/* bit positon for extract from acc */ /* bit positon for extract from acc */
__asm__ __volatile__ ( __asm__ __volatile__("wrdsp %[pos], 1 \n\t"
"wrdsp %[pos], 1 \n\t"
: :
: [pos] "r" (pos) : [pos] "r"(pos));
);
prefetch_store(dst); prefetch_store(dst);
switch (w) { switch (w) {
case 4 : case 4:
case 8 : case 8:
case 16 : case 16:
case 32 : case 32:
convolve_vert_4_dspr2(src, src_stride, convolve_vert_4_dspr2(src, src_stride, dst, dst_stride, filter_y, w, h);
dst, dst_stride,
filter_y, w, h);
break; break;
case 64 : case 64:
prefetch_store(dst + 32); prefetch_store(dst + 32);
convolve_vert_64_dspr2(src, src_stride, convolve_vert_64_dspr2(src, src_stride, dst, dst_stride, filter_y, h);
dst, dst_stride,
filter_y, h);
break; break;
default: default:
vpx_convolve8_vert_c(src, src_stride, vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x,
dst, dst_stride, x_step_q4, filter_y, y_step_q4, w, h);
filter_x, x_step_q4,
filter_y, y_step_q4,
w, h);
break; break;
} }
} }

View File

@ -25,8 +25,8 @@ extern "C" {
void vpx_convolve2_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, void vpx_convolve2_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride, uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4, const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4, const int16_t *filter_y, int y_step_q4, int w,
int w, int h); int h);
void vpx_convolve2_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, void vpx_convolve2_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride, uint8_t *dst, ptrdiff_t dst_stride,
@ -37,19 +37,18 @@ void vpx_convolve2_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
void vpx_convolve2_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride, void vpx_convolve2_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride, uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4, const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4, const int16_t *filter_y, int y_step_q4, int w,
int w, int h); int h);
void vpx_convolve2_dspr2(const uint8_t *src, ptrdiff_t src_stride, void vpx_convolve2_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
uint8_t *dst, ptrdiff_t dst_stride, ptrdiff_t dst_stride, const int16_t *filter, int w,
const int16_t *filter, int h);
int w, int h);
void vpx_convolve2_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride, void vpx_convolve2_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride, uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4, const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4, const int16_t *filter_y, int y_step_q4, int w,
int w, int h); int h);
#endif // #if HAVE_DSPR2 #endif // #if HAVE_DSPR2
#ifdef __cplusplus #ifdef __cplusplus

View File

@ -13,23 +13,22 @@
extern const int16_t vpx_rv[]; extern const int16_t vpx_rv[];
#define VPX_TRANSPOSE8x16_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7, \ #define VPX_TRANSPOSE8x16_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7, out0, \
out0, out1, out2, out3, \ out1, out2, out3, out4, out5, out6, out7, \
out4, out5, out6, out7, \ out8, out9, out10, out11, out12, out13, out14, \
out8, out9, out10, out11, \ out15) \
out12, out13, out14, out15) \ { \
{ \
v8i16 temp0, temp1, temp2, temp3, temp4; \ v8i16 temp0, temp1, temp2, temp3, temp4; \
v8i16 temp5, temp6, temp7, temp8, temp9; \ v8i16 temp5, temp6, temp7, temp8, temp9; \
\ \
ILVR_B4_SH(in1, in0, in3, in2, in5, in4, in7, in6, \ ILVR_B4_SH(in1, in0, in3, in2, in5, in4, in7, in6, temp0, temp1, temp2, \
temp0, temp1, temp2, temp3); \ temp3); \
ILVR_H2_SH(temp1, temp0, temp3, temp2, temp4, temp5); \ ILVR_H2_SH(temp1, temp0, temp3, temp2, temp4, temp5); \
ILVRL_W2_SH(temp5, temp4, temp6, temp7); \ ILVRL_W2_SH(temp5, temp4, temp6, temp7); \
ILVL_H2_SH(temp1, temp0, temp3, temp2, temp4, temp5); \ ILVL_H2_SH(temp1, temp0, temp3, temp2, temp4, temp5); \
ILVRL_W2_SH(temp5, temp4, temp8, temp9); \ ILVRL_W2_SH(temp5, temp4, temp8, temp9); \
ILVL_B4_SH(in1, in0, in3, in2, in5, in4, in7, in6, \ ILVL_B4_SH(in1, in0, in3, in2, in5, in4, in7, in6, temp0, temp1, temp2, \
temp0, temp1, temp2, temp3); \ temp3); \
ILVR_H2_SH(temp1, temp0, temp3, temp2, temp4, temp5); \ ILVR_H2_SH(temp1, temp0, temp3, temp2, temp4, temp5); \
ILVRL_W2_UB(temp5, temp4, out8, out10); \ ILVRL_W2_UB(temp5, temp4, out8, out10); \
ILVL_H2_SH(temp1, temp0, temp3, temp2, temp4, temp5); \ ILVL_H2_SH(temp1, temp0, temp3, temp2, temp4, temp5); \
@ -46,11 +45,11 @@ extern const int16_t vpx_rv[];
out3 = (v16u8)__msa_ilvl_d((v2i64)out2, (v2i64)out2); \ out3 = (v16u8)__msa_ilvl_d((v2i64)out2, (v2i64)out2); \
out5 = (v16u8)__msa_ilvl_d((v2i64)out4, (v2i64)out4); \ out5 = (v16u8)__msa_ilvl_d((v2i64)out4, (v2i64)out4); \
out7 = (v16u8)__msa_ilvl_d((v2i64)out6, (v2i64)out6); \ out7 = (v16u8)__msa_ilvl_d((v2i64)out6, (v2i64)out6); \
} }
#define VPX_AVER_IF_RETAIN(above2_in, above1_in, src_in, \ #define VPX_AVER_IF_RETAIN(above2_in, above1_in, src_in, below1_in, below2_in, \
below1_in, below2_in, ref, out) \ ref, out) \
{ \ { \
v16u8 temp0, temp1; \ v16u8 temp0, temp1; \
\ \
temp1 = __msa_aver_u_b(above2_in, above1_in); \ temp1 = __msa_aver_u_b(above2_in, above1_in); \
@ -69,11 +68,11 @@ extern const int16_t vpx_rv[];
temp1 = (temp1 < ref); \ temp1 = (temp1 < ref); \
temp0 = temp0 & temp1; \ temp0 = temp0 & temp1; \
out = __msa_bmz_v(out, src_in, temp0); \ out = __msa_bmz_v(out, src_in, temp0); \
} }
#define TRANSPOSE12x16_B(in0, in1, in2, in3, in4, in5, in6, in7, \ #define TRANSPOSE12x16_B(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, \
in8, in9, in10, in11, in12, in13, in14, in15) \ in10, in11, in12, in13, in14, in15) \
{ \ { \
v8i16 temp0, temp1, temp2, temp3, temp4; \ v8i16 temp0, temp1, temp2, temp3, temp4; \
v8i16 temp5, temp6, temp7, temp8, temp9; \ v8i16 temp5, temp6, temp7, temp8, temp9; \
\ \
@ -98,21 +97,21 @@ extern const int16_t vpx_rv[];
ILVR_D2_UB(temp6, temp2, temp7, temp3, in4, in6); \ ILVR_D2_UB(temp6, temp2, temp7, temp3, in4, in6); \
in5 = (v16u8)__msa_ilvl_d((v2i64)temp6, (v2i64)temp2); \ in5 = (v16u8)__msa_ilvl_d((v2i64)temp6, (v2i64)temp2); \
in7 = (v16u8)__msa_ilvl_d((v2i64)temp7, (v2i64)temp3); \ in7 = (v16u8)__msa_ilvl_d((v2i64)temp7, (v2i64)temp3); \
ILVL_B4_SH(in9, in8, in11, in10, in13, in12, in15, in14, \ ILVL_B4_SH(in9, in8, in11, in10, in13, in12, in15, in14, temp2, temp3, \
temp2, temp3, temp4, temp5); \ temp4, temp5); \
ILVR_H4_SH(temp9, temp8, temp1, temp0, temp3, temp2, temp5, temp4, \ ILVR_H4_SH(temp9, temp8, temp1, temp0, temp3, temp2, temp5, temp4, temp6, \
temp6, temp7, temp8, temp9); \ temp7, temp8, temp9); \
ILVR_W2_SH(temp7, temp6, temp9, temp8, temp0, temp1); \ ILVR_W2_SH(temp7, temp6, temp9, temp8, temp0, temp1); \
in8 = (v16u8)__msa_ilvr_d((v2i64)temp1, (v2i64)temp0); \ in8 = (v16u8)__msa_ilvr_d((v2i64)temp1, (v2i64)temp0); \
in9 = (v16u8)__msa_ilvl_d((v2i64)temp1, (v2i64)temp0); \ in9 = (v16u8)__msa_ilvl_d((v2i64)temp1, (v2i64)temp0); \
ILVL_W2_SH(temp7, temp6, temp9, temp8, temp2, temp3); \ ILVL_W2_SH(temp7, temp6, temp9, temp8, temp2, temp3); \
in10 = (v16u8)__msa_ilvr_d((v2i64)temp3, (v2i64)temp2); \ in10 = (v16u8)__msa_ilvr_d((v2i64)temp3, (v2i64)temp2); \
in11 = (v16u8)__msa_ilvl_d((v2i64)temp3, (v2i64)temp2); \ in11 = (v16u8)__msa_ilvl_d((v2i64)temp3, (v2i64)temp2); \
} }
#define VPX_TRANSPOSE12x8_UB_UB(in0, in1, in2, in3, in4, in5, \ #define VPX_TRANSPOSE12x8_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7, in8, \
in6, in7, in8, in9, in10, in11) \ in9, in10, in11) \
{ \ { \
v8i16 temp0, temp1, temp2, temp3; \ v8i16 temp0, temp1, temp2, temp3; \
v8i16 temp4, temp5, temp6, temp7; \ v8i16 temp4, temp5, temp6, temp7; \
\ \
@ -139,7 +138,7 @@ extern const int16_t vpx_rv[];
in7 = (v16u8)__msa_ilvl_d((v2i64)temp3, (v2i64)temp3); \ in7 = (v16u8)__msa_ilvl_d((v2i64)temp3, (v2i64)temp3); \
in9 = (v16u8)__msa_ilvl_d((v2i64)temp6, (v2i64)temp6); \ in9 = (v16u8)__msa_ilvl_d((v2i64)temp6, (v2i64)temp6); \
in11 = (v16u8)__msa_ilvl_d((v2i64)temp7, (v2i64)temp7); \ in11 = (v16u8)__msa_ilvl_d((v2i64)temp7, (v2i64)temp7); \
} }
static void postproc_down_across_chroma_msa(uint8_t *src_ptr, uint8_t *dst_ptr, static void postproc_down_across_chroma_msa(uint8_t *src_ptr, uint8_t *dst_ptr,
int32_t src_stride, int32_t src_stride,
@ -203,16 +202,16 @@ static void postproc_down_across_chroma_msa(uint8_t *src_ptr, uint8_t *dst_ptr,
VPX_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter6); VPX_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter6);
above1 = LD_UB(p_src + 9 * src_stride); above1 = LD_UB(p_src + 9 * src_stride);
VPX_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter7); VPX_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter7);
out0 = __msa_copy_u_d((v2i64) inter0, 0); out0 = __msa_copy_u_d((v2i64)inter0, 0);
out1 = __msa_copy_u_d((v2i64) inter1, 0); out1 = __msa_copy_u_d((v2i64)inter1, 0);
out2 = __msa_copy_u_d((v2i64) inter2, 0); out2 = __msa_copy_u_d((v2i64)inter2, 0);
out3 = __msa_copy_u_d((v2i64) inter3, 0); out3 = __msa_copy_u_d((v2i64)inter3, 0);
SD4(out0, out1, out2, out3, p_dst, dst_stride); SD4(out0, out1, out2, out3, p_dst, dst_stride);
out0 = __msa_copy_u_d((v2i64) inter4, 0); out0 = __msa_copy_u_d((v2i64)inter4, 0);
out1 = __msa_copy_u_d((v2i64) inter5, 0); out1 = __msa_copy_u_d((v2i64)inter5, 0);
out2 = __msa_copy_u_d((v2i64) inter6, 0); out2 = __msa_copy_u_d((v2i64)inter6, 0);
out3 = __msa_copy_u_d((v2i64) inter7, 0); out3 = __msa_copy_u_d((v2i64)inter7, 0);
SD4(out0, out1, out2, out3, p_dst + 4 * dst_stride, dst_stride); SD4(out0, out1, out2, out3, p_dst + 4 * dst_stride, dst_stride);
} }
@ -236,36 +235,36 @@ static void postproc_down_across_chroma_msa(uint8_t *src_ptr, uint8_t *dst_ptr,
src = inter2; src = inter2;
below1 = inter3; below1 = inter3;
below2 = inter4; below2 = inter4;
ref_temp = (v16u8) __msa_splati_b((v16i8) ref, 0); ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 0);
VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref_temp, inter2); VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref_temp, inter2);
above2 = inter5; above2 = inter5;
ref_temp = (v16u8) __msa_splati_b((v16i8) ref, 1); ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 1);
VPX_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref_temp, inter3); VPX_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref_temp, inter3);
above1 = inter6; above1 = inter6;
ref_temp = (v16u8) __msa_splati_b((v16i8) ref, 2); ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 2);
VPX_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref_temp, inter4); VPX_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref_temp, inter4);
src = inter7; src = inter7;
ref_temp = (v16u8) __msa_splati_b((v16i8) ref, 3); ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 3);
VPX_AVER_IF_RETAIN(below1, below2, above2, above1, src, ref_temp, inter5); VPX_AVER_IF_RETAIN(below1, below2, above2, above1, src, ref_temp, inter5);
below1 = inter8; below1 = inter8;
ref_temp = (v16u8) __msa_splati_b((v16i8) ref, 4); ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 4);
VPX_AVER_IF_RETAIN(below2, above2, above1, src, below1, ref_temp, inter6); VPX_AVER_IF_RETAIN(below2, above2, above1, src, below1, ref_temp, inter6);
below2 = inter9; below2 = inter9;
ref_temp = (v16u8) __msa_splati_b((v16i8) ref, 5); ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 5);
VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref_temp, inter7); VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref_temp, inter7);
if (col == (cols / 8 - 1)) { if (col == (cols / 8 - 1)) {
above2 = inter9; above2 = inter9;
} else { } else {
above2 = inter10; above2 = inter10;
} }
ref_temp = (v16u8) __msa_splati_b((v16i8) ref, 6); ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 6);
VPX_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref_temp, inter8); VPX_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref_temp, inter8);
if (col == (cols / 8 - 1)) { if (col == (cols / 8 - 1)) {
above1 = inter9; above1 = inter9;
} else { } else {
above1 = inter11; above1 = inter11;
} }
ref_temp = (v16u8) __msa_splati_b((v16i8) ref, 7); ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 7);
VPX_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref_temp, inter9); VPX_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref_temp, inter9);
TRANSPOSE8x8_UB_UB(inter2, inter3, inter4, inter5, inter6, inter7, inter8, TRANSPOSE8x8_UB_UB(inter2, inter3, inter4, inter5, inter6, inter7, inter8,
inter9, inter2, inter3, inter4, inter5, inter6, inter7, inter9, inter2, inter3, inter4, inter5, inter6, inter7,
@ -371,36 +370,36 @@ static void postproc_down_across_luma_msa(uint8_t *src_ptr, uint8_t *dst_ptr,
src = inter2; src = inter2;
below1 = inter3; below1 = inter3;
below2 = inter4; below2 = inter4;
ref_temp = (v16u8) __msa_splati_b((v16i8) ref, 0); ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 0);
VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref_temp, inter2); VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref_temp, inter2);
above2 = inter5; above2 = inter5;
ref_temp = (v16u8) __msa_splati_b((v16i8) ref, 1); ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 1);
VPX_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref_temp, inter3); VPX_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref_temp, inter3);
above1 = inter6; above1 = inter6;
ref_temp = (v16u8) __msa_splati_b((v16i8) ref, 2); ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 2);
VPX_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref_temp, inter4); VPX_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref_temp, inter4);
src = inter7; src = inter7;
ref_temp = (v16u8) __msa_splati_b((v16i8) ref, 3); ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 3);
VPX_AVER_IF_RETAIN(below1, below2, above2, above1, src, ref_temp, inter5); VPX_AVER_IF_RETAIN(below1, below2, above2, above1, src, ref_temp, inter5);
below1 = inter8; below1 = inter8;
ref_temp = (v16u8) __msa_splati_b((v16i8) ref, 4); ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 4);
VPX_AVER_IF_RETAIN(below2, above2, above1, src, below1, ref_temp, inter6); VPX_AVER_IF_RETAIN(below2, above2, above1, src, below1, ref_temp, inter6);
below2 = inter9; below2 = inter9;
ref_temp = (v16u8) __msa_splati_b((v16i8) ref, 5); ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 5);
VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref_temp, inter7); VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref_temp, inter7);
if (col == (cols / 8 - 1)) { if (col == (cols / 8 - 1)) {
above2 = inter9; above2 = inter9;
} else { } else {
above2 = inter10; above2 = inter10;
} }
ref_temp = (v16u8) __msa_splati_b((v16i8) ref, 6); ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 6);
VPX_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref_temp, inter8); VPX_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref_temp, inter8);
if (col == (cols / 8 - 1)) { if (col == (cols / 8 - 1)) {
above1 = inter9; above1 = inter9;
} else { } else {
above1 = inter11; above1 = inter11;
} }
ref_temp = (v16u8) __msa_splati_b((v16i8) ref, 7); ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 7);
VPX_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref_temp, inter9); VPX_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref_temp, inter9);
VPX_TRANSPOSE8x16_UB_UB(inter2, inter3, inter4, inter5, inter6, inter7, VPX_TRANSPOSE8x16_UB_UB(inter2, inter3, inter4, inter5, inter6, inter7,
inter8, inter9, inter2, inter3, inter4, inter5, inter8, inter9, inter2, inter3, inter4, inter5,
@ -452,8 +451,8 @@ void vpx_mbpost_proc_across_ip_msa(uint8_t *src_ptr, int32_t pitch,
int32_t row, col, cnt; int32_t row, col, cnt;
uint8_t *src_dup = src_ptr; uint8_t *src_dup = src_ptr;
v16u8 src0, src, tmp_orig; v16u8 src0, src, tmp_orig;
v16u8 tmp = {0}; v16u8 tmp = { 0 };
v16i8 zero = {0}; v16i8 zero = { 0 };
v8u16 sum_h, src_r_h, src_l_h; v8u16 sum_h, src_r_h, src_l_h;
v4u32 src_r_w, src_l_w; v4u32 src_r_w, src_l_w;
v4i32 flimit_vec; v4i32 flimit_vec;
@ -462,13 +461,13 @@ void vpx_mbpost_proc_across_ip_msa(uint8_t *src_ptr, int32_t pitch,
for (row = rows; row--;) { for (row = rows; row--;) {
int32_t sum_sq = 0; int32_t sum_sq = 0;
int32_t sum = 0; int32_t sum = 0;
src0 = (v16u8) __msa_fill_b(src_dup[0]); src0 = (v16u8)__msa_fill_b(src_dup[0]);
ST8x1_UB(src0, (src_dup - 8)); ST8x1_UB(src0, (src_dup - 8));
src0 = (v16u8) __msa_fill_b(src_dup[cols - 1]); src0 = (v16u8)__msa_fill_b(src_dup[cols - 1]);
ST_UB(src0, src_dup + cols); ST_UB(src0, src_dup + cols);
src_dup[cols + 16] = src_dup[cols - 1]; src_dup[cols + 16] = src_dup[cols - 1];
tmp_orig = (v16u8) __msa_ldi_b(0); tmp_orig = (v16u8)__msa_ldi_b(0);
tmp_orig[15] = tmp[15]; tmp_orig[15] = tmp[15];
src = LD_UB(src_dup - 8); src = LD_UB(src_dup - 8);
src[15] = 0; src[15] = 0;
@ -508,9 +507,9 @@ void vpx_mbpost_proc_across_ip_msa(uint8_t *src_ptr, int32_t pitch,
sum = sum_l[7]; sum = sum_l[7];
src = LD_UB(src_dup + 16 * col); src = LD_UB(src_dup + 16 * col);
ILVRL_B2_UH(zero, src, src_r_h, src_l_h); ILVRL_B2_UH(zero, src, src_r_h, src_l_h);
src7 = (v16u8)((const8 + sum_r + (v8i16) src_r_h) >> 4); src7 = (v16u8)((const8 + sum_r + (v8i16)src_r_h) >> 4);
src8 = (v16u8)((const8 + sum_l + (v8i16) src_l_h) >> 4); src8 = (v16u8)((const8 + sum_l + (v8i16)src_l_h) >> 4);
tmp = (v16u8) __msa_pckev_b((v16i8) src8, (v16i8) src7); tmp = (v16u8)__msa_pckev_b((v16i8)src8, (v16i8)src7);
HADD_UB2_UH(src_r, src_l, add_r, add_l); HADD_UB2_UH(src_r, src_l, add_r, add_l);
UNPCK_SH_SW(sub_r, sub0, sub1); UNPCK_SH_SW(sub_r, sub0, sub1);
@ -552,13 +551,13 @@ void vpx_mbpost_proc_across_ip_msa(uint8_t *src_ptr, int32_t pitch,
total2 = (total2 < flimit_vec); total2 = (total2 < flimit_vec);
total3 = (total3 < flimit_vec); total3 = (total3 < flimit_vec);
PCKEV_H2_SH(total1, total0, total3, total2, mask0, mask1); PCKEV_H2_SH(total1, total0, total3, total2, mask0, mask1);
mask = __msa_pckev_b((v16i8) mask1, (v16i8) mask0); mask = __msa_pckev_b((v16i8)mask1, (v16i8)mask0);
tmp = __msa_bmz_v(tmp, src, (v16u8) mask); tmp = __msa_bmz_v(tmp, src, (v16u8)mask);
if (col == 0) { if (col == 0) {
uint64_t src_d; uint64_t src_d;
src_d = __msa_copy_u_d((v2i64) tmp_orig, 1); src_d = __msa_copy_u_d((v2i64)tmp_orig, 1);
SD(src_d, (src_dup - 8)); SD(src_d, (src_dup - 8));
} }
@ -588,15 +587,15 @@ void vpx_mbpost_proc_down_msa(uint8_t *dst_ptr, int32_t pitch, int32_t rows,
for (col = 0; col < (cols >> 4); ++col) { for (col = 0; col < (cols >> 4); ++col) {
uint8_t *dst_tmp = &dst_ptr[col << 4]; uint8_t *dst_tmp = &dst_ptr[col << 4];
v16u8 dst; v16u8 dst;
v16i8 zero = {0}; v16i8 zero = { 0 };
v16u8 tmp[16]; v16u8 tmp[16];
v8i16 mult0, mult1, rv2_0, rv2_1; v8i16 mult0, mult1, rv2_0, rv2_1;
v8i16 sum0_h = {0}; v8i16 sum0_h = { 0 };
v8i16 sum1_h = {0}; v8i16 sum1_h = { 0 };
v4i32 mul0 = {0}; v4i32 mul0 = { 0 };
v4i32 mul1 = {0}; v4i32 mul1 = { 0 };
v4i32 mul2 = {0}; v4i32 mul2 = { 0 };
v4i32 mul3 = {0}; v4i32 mul3 = { 0 };
v4i32 sum0_w, sum1_w, sum2_w, sum3_w; v4i32 sum0_w, sum1_w, sum2_w, sum3_w;
v4i32 add0, add1, add2, add3; v4i32 add0, add1, add2, add3;
const int16_t *rv2[16]; const int16_t *rv2[16];
@ -618,10 +617,10 @@ void vpx_mbpost_proc_down_msa(uint8_t *dst_ptr, int32_t pitch, int32_t rows,
dst = LD_UB(dst_tmp + (cnt * pitch)); dst = LD_UB(dst_tmp + (cnt * pitch));
UNPCK_UB_SH(dst, dst_r_h, dst_l_h); UNPCK_UB_SH(dst, dst_r_h, dst_l_h);
MUL2(dst_r_h, dst_r_h, dst_l_h, dst_l_h, mult0, mult1); MUL2(dst_r_h, dst_r_h, dst_l_h, dst_l_h, mult0, mult1);
mul0 += (v4i32) __msa_ilvr_h((v8i16) zero, (v8i16) mult0); mul0 += (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)mult0);
mul1 += (v4i32) __msa_ilvl_h((v8i16) zero, (v8i16) mult0); mul1 += (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)mult0);
mul2 += (v4i32) __msa_ilvr_h((v8i16) zero, (v8i16) mult1); mul2 += (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)mult1);
mul3 += (v4i32) __msa_ilvl_h((v8i16) zero, (v8i16) mult1); mul3 += (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)mult1);
ADD2(sum0_h, dst_r_h, sum1_h, dst_l_h, sum0_h, sum1_h); ADD2(sum0_h, dst_r_h, sum1_h, dst_l_h, sum0_h, sum1_h);
} }
@ -652,7 +651,7 @@ void vpx_mbpost_proc_down_msa(uint8_t *dst_ptr, int32_t pitch, int32_t rows,
ILVRL_B2_SH(zero, dst, dst_r_h, dst_l_h); ILVRL_B2_SH(zero, dst, dst_r_h, dst_l_h);
dst7 = (v16u8)((rv2_0 + sum0_h + dst_r_h) >> 4); dst7 = (v16u8)((rv2_0 + sum0_h + dst_r_h) >> 4);
dst8 = (v16u8)((rv2_1 + sum1_h + dst_l_h) >> 4); dst8 = (v16u8)((rv2_1 + sum1_h + dst_l_h) >> 4);
tmp[row & 15] = (v16u8) __msa_pckev_b((v16i8) dst8, (v16i8) dst7); tmp[row & 15] = (v16u8)__msa_pckev_b((v16i8)dst8, (v16i8)dst7);
UNPCK_SH_SW(sum0_h, sum0_w, sum1_w); UNPCK_SH_SW(sum0_h, sum0_w, sum1_w);
UNPCK_SH_SW(sum1_h, sum2_w, sum3_w); UNPCK_SH_SW(sum1_h, sum2_w, sum3_w);
@ -669,8 +668,8 @@ void vpx_mbpost_proc_down_msa(uint8_t *dst_ptr, int32_t pitch, int32_t rows,
total2 = (total2 < flimit_vec); total2 = (total2 < flimit_vec);
total3 = (total3 < flimit_vec); total3 = (total3 < flimit_vec);
PCKEV_H2_SH(total1, total0, total3, total2, mask0, mask1); PCKEV_H2_SH(total1, total0, total3, total2, mask0, mask1);
mask = __msa_pckev_b((v16i8) mask1, (v16i8) mask0); mask = __msa_pckev_b((v16i8)mask1, (v16i8)mask0);
tmp[row & 15] = __msa_bmz_v(tmp[row & 15], dst, (v16u8) mask); tmp[row & 15] = __msa_bmz_v(tmp[row & 15], dst, (v16u8)mask);
if (row >= 8) { if (row >= 8) {
ST_UB(tmp[(row - 8) & 15], (dst_tmp - 8 * pitch)); ST_UB(tmp[(row - 8) & 15], (dst_tmp - 8 * pitch));

View File

@ -27,10 +27,10 @@ static void fdct8x32_1d_column_load_butterfly(const int16_t *input,
SLLI_4V(in4, in5, in6, in7, 2); SLLI_4V(in4, in5, in6, in7, 2);
SLLI_4V(in0_1, in1_1, in2_1, in3_1, 2); SLLI_4V(in0_1, in1_1, in2_1, in3_1, 2);
SLLI_4V(in4_1, in5_1, in6_1, in7_1, 2); SLLI_4V(in4_1, in5_1, in6_1, in7_1, 2);
BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7, BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7, step0, step1, step2,
step0, step1, step2, step3, in4, in5, in6, in7); step3, in4, in5, in6, in7);
BUTTERFLY_8(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1, BUTTERFLY_8(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1, step0_1,
step0_1, step1_1, step2_1, step3_1, in4_1, in5_1, in6_1, in7_1); step1_1, step2_1, step3_1, in4_1, in5_1, in6_1, in7_1);
ST_SH4(step0, step1, step2, step3, temp_buff, 8); ST_SH4(step0, step1, step2, step3, temp_buff, 8);
ST_SH4(in4, in5, in6, in7, temp_buff + (28 * 8), 8); ST_SH4(in4, in5, in6, in7, temp_buff + (28 * 8), 8);
ST_SH4(step0_1, step1_1, step2_1, step3_1, temp_buff + (4 * 8), 8); ST_SH4(step0_1, step1_1, step2_1, step3_1, temp_buff + (4 * 8), 8);
@ -45,10 +45,10 @@ static void fdct8x32_1d_column_load_butterfly(const int16_t *input,
SLLI_4V(in4, in5, in6, in7, 2); SLLI_4V(in4, in5, in6, in7, 2);
SLLI_4V(in0_1, in1_1, in2_1, in3_1, 2); SLLI_4V(in0_1, in1_1, in2_1, in3_1, 2);
SLLI_4V(in4_1, in5_1, in6_1, in7_1, 2); SLLI_4V(in4_1, in5_1, in6_1, in7_1, 2);
BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7, BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7, step0, step1, step2,
step0, step1, step2, step3, in4, in5, in6, in7); step3, in4, in5, in6, in7);
BUTTERFLY_8(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1, BUTTERFLY_8(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1, step0_1,
step0_1, step1_1, step2_1, step3_1, in4_1, in5_1, in6_1, in7_1); step1_1, step2_1, step3_1, in4_1, in5_1, in6_1, in7_1);
ST_SH4(step0, step1, step2, step3, temp_buff + (8 * 8), 8); ST_SH4(step0, step1, step2, step3, temp_buff + (8 * 8), 8);
ST_SH4(in4, in5, in6, in7, temp_buff + (20 * 8), 8); ST_SH4(in4, in5, in6, in7, temp_buff + (20 * 8), 8);
ST_SH4(step0_1, step1_1, step2_1, step3_1, temp_buff + (12 * 8), 8); ST_SH4(step0_1, step1_1, step2_1, step3_1, temp_buff + (12 * 8), 8);
@ -64,12 +64,12 @@ static void fdct8x32_1d_column_even_store(int16_t *input, int16_t *temp) {
/* fdct even */ /* fdct even */
LD_SH4(input, 8, in0, in1, in2, in3); LD_SH4(input, 8, in0, in1, in2, in3);
LD_SH4(input + 96, 8, in12, in13, in14, in15); LD_SH4(input + 96, 8, in12, in13, in14, in15);
BUTTERFLY_8(in0, in1, in2, in3, in12, in13, in14, in15, BUTTERFLY_8(in0, in1, in2, in3, in12, in13, in14, in15, vec0, vec1, vec2,
vec0, vec1, vec2, vec3, in12, in13, in14, in15); vec3, in12, in13, in14, in15);
LD_SH4(input + 32, 8, in4, in5, in6, in7); LD_SH4(input + 32, 8, in4, in5, in6, in7);
LD_SH4(input + 64, 8, in8, in9, in10, in11); LD_SH4(input + 64, 8, in8, in9, in10, in11);
BUTTERFLY_8(in4, in5, in6, in7, in8, in9, in10, in11, BUTTERFLY_8(in4, in5, in6, in7, in8, in9, in10, in11, vec4, vec5, vec6, vec7,
vec4, vec5, vec6, vec7, in8, in9, in10, in11); in8, in9, in10, in11);
/* Stage 3 */ /* Stage 3 */
ADD4(vec0, vec7, vec1, vec6, vec2, vec5, vec3, vec4, in0, in1, in2, in3); ADD4(vec0, vec7, vec1, vec6, vec2, vec5, vec3, vec4, in0, in1, in2, in3);
@ -258,28 +258,26 @@ static void fdct8x32_1d_row_load_butterfly(int16_t *temp_buff,
LD_SH8(temp_buff, 32, in0, in1, in2, in3, in4, in5, in6, in7); LD_SH8(temp_buff, 32, in0, in1, in2, in3, in4, in5, in6, in7);
LD_SH8(temp_buff + 24, 32, in8, in9, in10, in11, in12, in13, in14, in15); LD_SH8(temp_buff + 24, 32, in8, in9, in10, in11, in12, in13, in14, in15);
TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
in0, in1, in2, in3, in4, in5, in6, in7); in4, in5, in6, in7);
TRANSPOSE8x8_SH_SH(in8, in9, in10, in11, in12, in13, in14, in15, TRANSPOSE8x8_SH_SH(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9,
in8, in9, in10, in11, in12, in13, in14, in15); in10, in11, in12, in13, in14, in15);
BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11,
in8, in9, in10, in11, in12, in13, in14, in15, in12, in13, in14, in15, step0, step1, step2, step3, step4, step5,
step0, step1, step2, step3, step4, step5, step6, step7, step6, step7, in8, in9, in10, in11, in12, in13, in14, in15);
in8, in9, in10, in11, in12, in13, in14, in15);
ST_SH8(step0, step1, step2, step3, step4, step5, step6, step7, output, 8); ST_SH8(step0, step1, step2, step3, step4, step5, step6, step7, output, 8);
ST_SH8(in8, in9, in10, in11, in12, in13, in14, in15, (output + 24 * 8), 8); ST_SH8(in8, in9, in10, in11, in12, in13, in14, in15, (output + 24 * 8), 8);
/* 2nd set */ /* 2nd set */
LD_SH8(temp_buff + 8, 32, in0, in1, in2, in3, in4, in5, in6, in7); LD_SH8(temp_buff + 8, 32, in0, in1, in2, in3, in4, in5, in6, in7);
LD_SH8(temp_buff + 16, 32, in8, in9, in10, in11, in12, in13, in14, in15); LD_SH8(temp_buff + 16, 32, in8, in9, in10, in11, in12, in13, in14, in15);
TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
in0, in1, in2, in3, in4, in5, in6, in7); in4, in5, in6, in7);
TRANSPOSE8x8_SH_SH(in8, in9, in10, in11, in12, in13, in14, in15, TRANSPOSE8x8_SH_SH(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9,
in8, in9, in10, in11, in12, in13, in14, in15); in10, in11, in12, in13, in14, in15);
BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11,
in8, in9, in10, in11, in12, in13, in14, in15, in12, in13, in14, in15, step0, step1, step2, step3, step4, step5,
step0, step1, step2, step3, step4, step5, step6, step7, step6, step7, in8, in9, in10, in11, in12, in13, in14, in15);
in8, in9, in10, in11, in12, in13, in14, in15);
ST_SH8(step0, step1, step2, step3, step4, step5, step6, step7, ST_SH8(step0, step1, step2, step3, step4, step5, step6, step7,
(output + 8 * 8), 8); (output + 8 * 8), 8);
ST_SH8(in8, in9, in10, in11, in12, in13, in14, in15, (output + 16 * 8), 8); ST_SH8(in8, in9, in10, in11, in12, in13, in14, in15, (output + 16 * 8), 8);
@ -299,10 +297,9 @@ static void fdct8x32_1d_row_even_4x(int16_t *input, int16_t *interm_ptr,
LD_SH8(input, 8, in0, in1, in2, in3, in4, in5, in6, in7); LD_SH8(input, 8, in0, in1, in2, in3, in4, in5, in6, in7);
LD_SH8(input + 64, 8, in8, in9, in10, in11, in12, in13, in14, in15); LD_SH8(input + 64, 8, in8, in9, in10, in11, in12, in13, in14, in15);
BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11,
in8, in9, in10, in11, in12, in13, in14, in15, in12, in13, in14, in15, vec0, vec1, vec2, vec3, vec4, vec5, vec6,
vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec7, in8, in9, in10, in11, in12, in13, in14, in15);
in8, in9, in10, in11, in12, in13, in14, in15);
ST_SH8(vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, interm_ptr, 8); ST_SH8(vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, interm_ptr, 8);
ST_SH8(in8, in9, in10, in11, in12, in13, in14, in15, interm_ptr + 64, 8); ST_SH8(in8, in9, in10, in11, in12, in13, in14, in15, interm_ptr + 64, 8);
@ -315,19 +312,19 @@ static void fdct8x32_1d_row_even_4x(int16_t *input, int16_t *interm_ptr,
UNPCK_SH_SW(vec5, vec5_l, vec5_r); UNPCK_SH_SW(vec5, vec5_l, vec5_r);
UNPCK_SH_SW(vec6, vec6_l, vec6_r); UNPCK_SH_SW(vec6, vec6_l, vec6_r);
UNPCK_SH_SW(vec7, vec7_l, vec7_r); UNPCK_SH_SW(vec7, vec7_l, vec7_r);
ADD4(vec0_r, vec7_r, vec1_r, vec6_r, vec2_r, vec5_r, vec3_r, vec4_r, ADD4(vec0_r, vec7_r, vec1_r, vec6_r, vec2_r, vec5_r, vec3_r, vec4_r, tmp0_w,
tmp0_w, tmp1_w, tmp2_w, tmp3_w); tmp1_w, tmp2_w, tmp3_w);
BUTTERFLY_4(tmp0_w, tmp1_w, tmp2_w, tmp3_w, vec4_r, vec6_r, vec7_r, vec5_r); BUTTERFLY_4(tmp0_w, tmp1_w, tmp2_w, tmp3_w, vec4_r, vec6_r, vec7_r, vec5_r);
ADD4(vec0_l, vec7_l, vec1_l, vec6_l, vec2_l, vec5_l, vec3_l, vec4_l, ADD4(vec0_l, vec7_l, vec1_l, vec6_l, vec2_l, vec5_l, vec3_l, vec4_l, vec0_r,
vec0_r, vec1_r, vec2_r, vec3_r); vec1_r, vec2_r, vec3_r);
tmp3_w = vec0_r + vec3_r; tmp3_w = vec0_r + vec3_r;
vec0_r = vec0_r - vec3_r; vec0_r = vec0_r - vec3_r;
vec3_r = vec1_r + vec2_r; vec3_r = vec1_r + vec2_r;
vec1_r = vec1_r - vec2_r; vec1_r = vec1_r - vec2_r;
DOTP_CONST_PAIR_W(vec4_r, vec6_r, tmp3_w, vec3_r, cospi_16_64, DOTP_CONST_PAIR_W(vec4_r, vec6_r, tmp3_w, vec3_r, cospi_16_64, cospi_16_64,
cospi_16_64, vec4_r, tmp3_w, vec6_r, vec3_r); vec4_r, tmp3_w, vec6_r, vec3_r);
FDCT32_POSTPROC_NEG_W(vec4_r); FDCT32_POSTPROC_NEG_W(vec4_r);
FDCT32_POSTPROC_NEG_W(tmp3_w); FDCT32_POSTPROC_NEG_W(tmp3_w);
FDCT32_POSTPROC_NEG_W(vec6_r); FDCT32_POSTPROC_NEG_W(vec6_r);
@ -335,8 +332,8 @@ static void fdct8x32_1d_row_even_4x(int16_t *input, int16_t *interm_ptr,
PCKEV_H2_SH(vec4_r, tmp3_w, vec6_r, vec3_r, vec4, vec5); PCKEV_H2_SH(vec4_r, tmp3_w, vec6_r, vec3_r, vec4, vec5);
ST_SH2(vec5, vec4, out, 8); ST_SH2(vec5, vec4, out, 8);
DOTP_CONST_PAIR_W(vec5_r, vec7_r, vec0_r, vec1_r, cospi_24_64, DOTP_CONST_PAIR_W(vec5_r, vec7_r, vec0_r, vec1_r, cospi_24_64, cospi_8_64,
cospi_8_64, vec4_r, tmp3_w, vec6_r, vec3_r); vec4_r, tmp3_w, vec6_r, vec3_r);
FDCT32_POSTPROC_NEG_W(vec4_r); FDCT32_POSTPROC_NEG_W(vec4_r);
FDCT32_POSTPROC_NEG_W(tmp3_w); FDCT32_POSTPROC_NEG_W(tmp3_w);
FDCT32_POSTPROC_NEG_W(vec6_r); FDCT32_POSTPROC_NEG_W(vec6_r);
@ -401,10 +398,9 @@ static void fdct8x32_1d_row_even(int16_t *temp, int16_t *out) {
LD_SH8(temp, 8, in0, in1, in2, in3, in4, in5, in6, in7); LD_SH8(temp, 8, in0, in1, in2, in3, in4, in5, in6, in7);
LD_SH8(temp + 64, 8, in8, in9, in10, in11, in12, in13, in14, in15); LD_SH8(temp + 64, 8, in8, in9, in10, in11, in12, in13, in14, in15);
BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11,
in8, in9, in10, in11, in12, in13, in14, in15, in12, in13, in14, in15, vec0, vec1, vec2, vec3, vec4, vec5, vec6,
vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec7, in8, in9, in10, in11, in12, in13, in14, in15);
in8, in9, in10, in11, in12, in13, in14, in15);
/* Stage 3 */ /* Stage 3 */
ADD4(vec0, vec7, vec1, vec6, vec2, vec5, vec3, vec4, in0, in1, in2, in3); ADD4(vec0, vec7, vec1, vec6, vec2, vec5, vec3, vec4, in0, in1, in2, in3);
@ -610,8 +606,8 @@ static void fdct8x32_1d_row_transpose_store(int16_t *temp, int16_t *output) {
in3 = LD_SH(temp + 192); in3 = LD_SH(temp + 192);
in5 = LD_SH(temp + 216); in5 = LD_SH(temp + 216);
TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
in0, in1, in2, in3, in4, in5, in6, in7); in4, in5, in6, in7);
/* 2nd set */ /* 2nd set */
in0_1 = LD_SH(temp + 16); in0_1 = LD_SH(temp + 16);
@ -637,10 +633,10 @@ static void fdct8x32_1d_row_transpose_store(int16_t *temp, int16_t *output) {
in6 = LD_SH(temp + 104); in6 = LD_SH(temp + 104);
in7 = LD_SH(temp + 144); in7 = LD_SH(temp + 144);
ST_SH8(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1, ST_SH8(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1, output + 8,
output + 8, 32); 32);
TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
in0, in1, in2, in3, in4, in5, in6, in7); in4, in5, in6, in7);
ST_SH8(in0, in1, in2, in3, in4, in5, in6, in7, output + 16, 32); ST_SH8(in0, in1, in2, in3, in4, in5, in6, in7, output + 16, 32);
/* 4th set */ /* 4th set */
@ -655,12 +651,11 @@ static void fdct8x32_1d_row_transpose_store(int16_t *temp, int16_t *output) {
TRANSPOSE8x8_SH_SH(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1, TRANSPOSE8x8_SH_SH(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1,
in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1); in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1);
ST_SH8(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1, ST_SH8(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1, output + 24,
output + 24, 32); 32);
} }
static void fdct32x8_1d_row(int16_t *temp, int16_t *temp_buf, static void fdct32x8_1d_row(int16_t *temp, int16_t *temp_buf, int16_t *output) {
int16_t *output) {
fdct8x32_1d_row_load_butterfly(temp, temp_buf); fdct8x32_1d_row_load_butterfly(temp, temp_buf);
fdct8x32_1d_row_even(temp_buf, temp_buf); fdct8x32_1d_row_even(temp_buf, temp_buf);
fdct8x32_1d_row_odd(temp_buf + 128, temp, temp_buf + 128); fdct8x32_1d_row_odd(temp_buf + 128, temp, temp_buf + 128);
@ -706,10 +701,9 @@ static void fdct8x32_1d_row_even_rd(int16_t *temp, int16_t *out) {
LD_SH8(temp, 8, in0, in1, in2, in3, in4, in5, in6, in7); LD_SH8(temp, 8, in0, in1, in2, in3, in4, in5, in6, in7);
LD_SH8(temp + 64, 8, in8, in9, in10, in11, in12, in13, in14, in15); LD_SH8(temp + 64, 8, in8, in9, in10, in11, in12, in13, in14, in15);
BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11,
in8, in9, in10, in11, in12, in13, in14, in15, in12, in13, in14, in15, vec0, vec1, vec2, vec3, vec4, vec5, vec6,
vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec7, in8, in9, in10, in11, in12, in13, in14, in15);
in8, in9, in10, in11, in12, in13, in14, in15);
FDCT_POSTPROC_2V_NEG_H(vec0, vec1); FDCT_POSTPROC_2V_NEG_H(vec0, vec1);
FDCT_POSTPROC_2V_NEG_H(vec2, vec3); FDCT_POSTPROC_2V_NEG_H(vec2, vec3);
FDCT_POSTPROC_2V_NEG_H(vec4, vec5); FDCT_POSTPROC_2V_NEG_H(vec4, vec5);

View File

@ -22,20 +22,20 @@ void fdct8x16_1d_column(const int16_t *input, int16_t *tmp_ptr,
-cospi_8_64, -cospi_24_64, cospi_12_64, cospi_20_64 }; -cospi_8_64, -cospi_24_64, cospi_12_64, cospi_20_64 };
v8i16 coeff1 = { cospi_2_64, cospi_30_64, cospi_14_64, cospi_18_64, v8i16 coeff1 = { cospi_2_64, cospi_30_64, cospi_14_64, cospi_18_64,
cospi_10_64, cospi_22_64, cospi_6_64, cospi_26_64 }; cospi_10_64, cospi_22_64, cospi_6_64, cospi_26_64 };
v8i16 coeff2 = { -cospi_2_64, -cospi_10_64, -cospi_18_64, -cospi_26_64, v8i16 coeff2 = {
0, 0, 0, 0 }; -cospi_2_64, -cospi_10_64, -cospi_18_64, -cospi_26_64, 0, 0, 0, 0
};
LD_SH16(input, src_stride, LD_SH16(input, src_stride, in0, in1, in2, in3, in4, in5, in6, in7, in8, in9,
in0, in1, in2, in3, in4, in5, in6, in7, in10, in11, in12, in13, in14, in15);
in8, in9, in10, in11, in12, in13, in14, in15);
SLLI_4V(in0, in1, in2, in3, 2); SLLI_4V(in0, in1, in2, in3, 2);
SLLI_4V(in4, in5, in6, in7, 2); SLLI_4V(in4, in5, in6, in7, 2);
SLLI_4V(in8, in9, in10, in11, 2); SLLI_4V(in8, in9, in10, in11, 2);
SLLI_4V(in12, in13, in14, in15, 2); SLLI_4V(in12, in13, in14, in15, 2);
ADD4(in0, in15, in1, in14, in2, in13, in3, in12, tmp0, tmp1, tmp2, tmp3); ADD4(in0, in15, in1, in14, in2, in13, in3, in12, tmp0, tmp1, tmp2, tmp3);
ADD4(in4, in11, in5, in10, in6, in9, in7, in8, tmp4, tmp5, tmp6, tmp7); ADD4(in4, in11, in5, in10, in6, in9, in7, in8, tmp4, tmp5, tmp6, tmp7);
FDCT8x16_EVEN(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, FDCT8x16_EVEN(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp0, tmp1,
tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
ST_SH8(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp_ptr, 32); ST_SH8(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp_ptr, 32);
SUB4(in0, in15, in1, in14, in2, in13, in3, in12, in15, in14, in13, in12); SUB4(in0, in15, in1, in14, in2, in13, in3, in12, in15, in14, in13, in12);
SUB4(in4, in11, in5, in10, in6, in9, in7, in8, in11, in10, in9, in8); SUB4(in4, in11, in5, in10, in6, in9, in7, in8, in11, in10, in9, in8);
@ -137,10 +137,10 @@ void fdct16x8_1d_row(int16_t *input, int16_t *output) {
LD_SH8(input, 16, in0, in1, in2, in3, in4, in5, in6, in7); LD_SH8(input, 16, in0, in1, in2, in3, in4, in5, in6, in7);
LD_SH8((input + 8), 16, in8, in9, in10, in11, in12, in13, in14, in15); LD_SH8((input + 8), 16, in8, in9, in10, in11, in12, in13, in14, in15);
TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
in0, in1, in2, in3, in4, in5, in6, in7); in4, in5, in6, in7);
TRANSPOSE8x8_SH_SH(in8, in9, in10, in11, in12, in13, in14, in15, TRANSPOSE8x8_SH_SH(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9,
in8, in9, in10, in11, in12, in13, in14, in15); in10, in11, in12, in13, in14, in15);
ADD4(in0, 1, in1, 1, in2, 1, in3, 1, in0, in1, in2, in3); ADD4(in0, 1, in1, 1, in2, 1, in3, 1, in0, in1, in2, in3);
ADD4(in4, 1, in5, 1, in6, 1, in7, 1, in4, in5, in6, in7); ADD4(in4, 1, in5, 1, in6, 1, in7, 1, in4, in5, in6, in7);
ADD4(in8, 1, in9, 1, in10, 1, in11, 1, in8, in9, in10, in11); ADD4(in8, 1, in9, 1, in10, 1, in11, 1, in8, in9, in10, in11);
@ -150,19 +150,19 @@ void fdct16x8_1d_row(int16_t *input, int16_t *output) {
SRA_4V(in8, in9, in10, in11, 2); SRA_4V(in8, in9, in10, in11, 2);
SRA_4V(in12, in13, in14, in15, 2); SRA_4V(in12, in13, in14, in15, 2);
BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11,
in12, in13, in14, in15, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, in12, in13, in14, in15, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6,
tmp6, tmp7, in8, in9, in10, in11, in12, in13, in14, in15); tmp7, in8, in9, in10, in11, in12, in13, in14, in15);
ST_SH8(in8, in9, in10, in11, in12, in13, in14, in15, input, 16); ST_SH8(in8, in9, in10, in11, in12, in13, in14, in15, input, 16);
FDCT8x16_EVEN(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, FDCT8x16_EVEN(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp0, tmp1,
tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
LD_SH8(input, 16, in8, in9, in10, in11, in12, in13, in14, in15); LD_SH8(input, 16, in8, in9, in10, in11, in12, in13, in14, in15);
FDCT8x16_ODD(in8, in9, in10, in11, in12, in13, in14, in15, FDCT8x16_ODD(in8, in9, in10, in11, in12, in13, in14, in15, in0, in1, in2, in3,
in0, in1, in2, in3, in4, in5, in6, in7); in4, in5, in6, in7);
TRANSPOSE8x8_SH_SH(tmp0, in0, tmp1, in1, tmp2, in2, tmp3, in3, TRANSPOSE8x8_SH_SH(tmp0, in0, tmp1, in1, tmp2, in2, tmp3, in3, tmp0, in0,
tmp0, in0, tmp1, in1, tmp2, in2, tmp3, in3); tmp1, in1, tmp2, in2, tmp3, in3);
ST_SH8(tmp0, in0, tmp1, in1, tmp2, in2, tmp3, in3, output, 16); ST_SH8(tmp0, in0, tmp1, in1, tmp2, in2, tmp3, in3, output, 16);
TRANSPOSE8x8_SH_SH(tmp4, in4, tmp5, in5, tmp6, in6, tmp7, in7, TRANSPOSE8x8_SH_SH(tmp4, in4, tmp5, in5, tmp6, in6, tmp7, in7, tmp4, in4,
tmp4, in4, tmp5, in5, tmp6, in6, tmp7, in7); tmp5, in5, tmp6, in6, tmp7, in7);
ST_SH8(tmp4, in4, tmp5, in5, tmp6, in6, tmp7, in7, output + 8, 16); ST_SH8(tmp4, in4, tmp5, in5, tmp6, in6, tmp7, in7, output + 8, 16);
} }
@ -203,14 +203,14 @@ void vpx_fdct8x8_msa(const int16_t *input, int16_t *output,
LD_SH8(input, src_stride, in0, in1, in2, in3, in4, in5, in6, in7); LD_SH8(input, src_stride, in0, in1, in2, in3, in4, in5, in6, in7);
SLLI_4V(in0, in1, in2, in3, 2); SLLI_4V(in0, in1, in2, in3, 2);
SLLI_4V(in4, in5, in6, in7, 2); SLLI_4V(in4, in5, in6, in7, 2);
VP9_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, VP9_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4,
in0, in1, in2, in3, in4, in5, in6, in7); in5, in6, in7);
TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
in0, in1, in2, in3, in4, in5, in6, in7); in4, in5, in6, in7);
VP9_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, VP9_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4,
in0, in1, in2, in3, in4, in5, in6, in7); in5, in6, in7);
TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
in0, in1, in2, in3, in4, in5, in6, in7); in4, in5, in6, in7);
SRLI_AVE_S_4V_H(in0, in1, in2, in3, in4, in5, in6, in7); SRLI_AVE_S_4V_H(in0, in1, in2, in3, in4, in5, in6, in7);
ST_SH8(in0, in1, in2, in3, in4, in5, in6, in7, output, 8); ST_SH8(in0, in1, in2, in3, in4, in5, in6, in7, output, 8);
} }

View File

@ -14,27 +14,30 @@
#include "vpx_dsp/mips/txfm_macros_msa.h" #include "vpx_dsp/mips/txfm_macros_msa.h"
#include "vpx_dsp/txfm_common.h" #include "vpx_dsp/txfm_common.h"
#define LD_HADD(psrc, stride) ({ \ #define LD_HADD(psrc, stride) \
({ \
v8i16 in0_m, in1_m, in2_m, in3_m, in4_m, in5_m, in6_m, in7_m; \ v8i16 in0_m, in1_m, in2_m, in3_m, in4_m, in5_m, in6_m, in7_m; \
v4i32 vec_w_m; \ v4i32 vec_w_m; \
\ \
LD_SH4((psrc), stride, in0_m, in1_m, in2_m, in3_m); \ LD_SH4((psrc), stride, in0_m, in1_m, in2_m, in3_m); \
ADD2(in0_m, in1_m, in2_m, in3_m, in0_m, in2_m); \ ADD2(in0_m, in1_m, in2_m, in3_m, in0_m, in2_m); \
LD_SH4(((psrc) + 4 * stride), stride, in4_m, in5_m, in6_m, in7_m); \ LD_SH4(((psrc) + 4 * stride), stride, in4_m, in5_m, in6_m, in7_m); \
ADD4(in4_m, in5_m, in6_m, in7_m, in0_m, in2_m, in4_m, in6_m, \ ADD4(in4_m, in5_m, in6_m, in7_m, in0_m, in2_m, in4_m, in6_m, in4_m, in6_m, \
in4_m, in6_m, in0_m, in4_m); \ in0_m, in4_m); \
in0_m += in4_m; \ in0_m += in4_m; \
\ \
vec_w_m = __msa_hadd_s_w(in0_m, in0_m); \ vec_w_m = __msa_hadd_s_w(in0_m, in0_m); \
HADD_SW_S32(vec_w_m); \ HADD_SW_S32(vec_w_m); \
}) })
#define VP9_FDCT4(in0, in1, in2, in3, out0, out1, out2, out3) { \ #define VP9_FDCT4(in0, in1, in2, in3, out0, out1, out2, out3) \
{ \
v8i16 cnst0_m, cnst1_m, cnst2_m, cnst3_m; \ v8i16 cnst0_m, cnst1_m, cnst2_m, cnst3_m; \
v8i16 vec0_m, vec1_m, vec2_m, vec3_m; \ v8i16 vec0_m, vec1_m, vec2_m, vec3_m; \
v4i32 vec4_m, vec5_m, vec6_m, vec7_m; \ v4i32 vec4_m, vec5_m, vec6_m, vec7_m; \
v8i16 coeff_m = { cospi_16_64, -cospi_16_64, cospi_8_64, \ v8i16 coeff_m = { \
cospi_24_64, -cospi_8_64, 0, 0, 0 }; \ cospi_16_64, -cospi_16_64, cospi_8_64, cospi_24_64, -cospi_8_64, 0, 0, 0 \
}; \
\ \
BUTTERFLY_4(in0, in1, in2, in3, vec0_m, vec1_m, vec2_m, vec3_m); \ BUTTERFLY_4(in0, in1, in2, in3, vec0_m, vec1_m, vec2_m, vec3_m); \
ILVR_H2_SH(vec1_m, vec0_m, vec3_m, vec2_m, vec0_m, vec2_m); \ ILVR_H2_SH(vec1_m, vec0_m, vec3_m, vec2_m, vec0_m, vec2_m); \
@ -52,32 +55,33 @@
vec6_m = __msa_dotp_s_w(vec2_m, cnst2_m); \ vec6_m = __msa_dotp_s_w(vec2_m, cnst2_m); \
\ \
SRARI_W4_SW(vec4_m, vec5_m, vec6_m, vec7_m, DCT_CONST_BITS); \ SRARI_W4_SW(vec4_m, vec5_m, vec6_m, vec7_m, DCT_CONST_BITS); \
PCKEV_H4_SH(vec4_m, vec4_m, vec5_m, vec5_m, vec6_m, vec6_m, \ PCKEV_H4_SH(vec4_m, vec4_m, vec5_m, vec5_m, vec6_m, vec6_m, vec7_m, \
vec7_m, vec7_m, out0, out2, out1, out3); \ vec7_m, out0, out2, out1, out3); \
} }
#define SRLI_AVE_S_4V_H(in0, in1, in2, in3, in4, in5, in6, in7) { \ #define SRLI_AVE_S_4V_H(in0, in1, in2, in3, in4, in5, in6, in7) \
{ \
v8i16 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \ v8i16 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \
\ \
SRLI_H4_SH(in0, in1, in2, in3, vec0_m, vec1_m, vec2_m, vec3_m, 15); \ SRLI_H4_SH(in0, in1, in2, in3, vec0_m, vec1_m, vec2_m, vec3_m, 15); \
SRLI_H4_SH(in4, in5, in6, in7, vec4_m, vec5_m, vec6_m, vec7_m, 15); \ SRLI_H4_SH(in4, in5, in6, in7, vec4_m, vec5_m, vec6_m, vec7_m, 15); \
AVE_SH4_SH(vec0_m, in0, vec1_m, in1, vec2_m, in2, vec3_m, in3, \ AVE_SH4_SH(vec0_m, in0, vec1_m, in1, vec2_m, in2, vec3_m, in3, in0, in1, \
in0, in1, in2, in3); \ in2, in3); \
AVE_SH4_SH(vec4_m, in4, vec5_m, in5, vec6_m, in6, vec7_m, in7, \ AVE_SH4_SH(vec4_m, in4, vec5_m, in5, vec6_m, in6, vec7_m, in7, in4, in5, \
in4, in5, in6, in7); \ in6, in7); \
} }
#define VP9_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, \ #define VP9_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, \
out0, out1, out2, out3, out4, out5, out6, out7) { \ out3, out4, out5, out6, out7) \
{ \
v8i16 s0_m, s1_m, s2_m, s3_m, s4_m, s5_m, s6_m; \ v8i16 s0_m, s1_m, s2_m, s3_m, s4_m, s5_m, s6_m; \
v8i16 s7_m, x0_m, x1_m, x2_m, x3_m; \ v8i16 s7_m, x0_m, x1_m, x2_m, x3_m; \
v8i16 coeff_m = { cospi_16_64, -cospi_16_64, cospi_8_64, \ v8i16 coeff_m = { cospi_16_64, -cospi_16_64, cospi_8_64, cospi_24_64, \
cospi_24_64, cospi_4_64, cospi_28_64, \ cospi_4_64, cospi_28_64, cospi_12_64, cospi_20_64 }; \
cospi_12_64, cospi_20_64 }; \
\ \
/* FDCT stage1 */ \ /* FDCT stage1 */ \
BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7, \ BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7, s0_m, s1_m, s2_m, \
s0_m, s1_m, s2_m, s3_m, s4_m, s5_m, s6_m, s7_m); \ s3_m, s4_m, s5_m, s6_m, s7_m); \
BUTTERFLY_4(s0_m, s1_m, s2_m, s3_m, x0_m, x1_m, x2_m, x3_m); \ BUTTERFLY_4(s0_m, s1_m, s2_m, s3_m, x0_m, x1_m, x2_m, x3_m); \
ILVL_H2_SH(x1_m, x0_m, x3_m, x2_m, s0_m, s2_m); \ ILVL_H2_SH(x1_m, x0_m, x3_m, x2_m, s0_m, s2_m); \
ILVR_H2_SH(x1_m, x0_m, x3_m, x2_m, s1_m, s3_m); \ ILVR_H2_SH(x1_m, x0_m, x3_m, x2_m, s1_m, s3_m); \
@ -125,18 +129,19 @@
x3_m = -x3_m; \ x3_m = -x3_m; \
x2_m = __msa_ilvev_h(x2_m, x3_m); \ x2_m = __msa_ilvev_h(x2_m, x3_m); \
out3 = DOT_SHIFT_RIGHT_PCK_H(s6_m, s7_m, x2_m); \ out3 = DOT_SHIFT_RIGHT_PCK_H(s6_m, s7_m, x2_m); \
} }
#define FDCT8x16_EVEN(in0, in1, in2, in3, in4, in5, in6, in7, \ #define FDCT8x16_EVEN(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
out0, out1, out2, out3, out4, out5, out6, out7) { \ out2, out3, out4, out5, out6, out7) \
{ \
v8i16 s0_m, s1_m, s2_m, s3_m, s4_m, s5_m, s6_m, s7_m; \ v8i16 s0_m, s1_m, s2_m, s3_m, s4_m, s5_m, s6_m, s7_m; \
v8i16 x0_m, x1_m, x2_m, x3_m; \ v8i16 x0_m, x1_m, x2_m, x3_m; \
v8i16 coeff_m = { cospi_16_64, -cospi_16_64, cospi_8_64, cospi_24_64, \ v8i16 coeff_m = { cospi_16_64, -cospi_16_64, cospi_8_64, cospi_24_64, \
cospi_4_64, cospi_28_64, cospi_12_64, cospi_20_64 }; \ cospi_4_64, cospi_28_64, cospi_12_64, cospi_20_64 }; \
\ \
/* FDCT stage1 */ \ /* FDCT stage1 */ \
BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7, \ BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7, s0_m, s1_m, s2_m, \
s0_m, s1_m, s2_m, s3_m, s4_m, s5_m, s6_m, s7_m); \ s3_m, s4_m, s5_m, s6_m, s7_m); \
BUTTERFLY_4(s0_m, s1_m, s2_m, s3_m, x0_m, x1_m, x2_m, x3_m); \ BUTTERFLY_4(s0_m, s1_m, s2_m, s3_m, x0_m, x1_m, x2_m, x3_m); \
ILVL_H2_SH(x1_m, x0_m, x3_m, x2_m, s0_m, s2_m); \ ILVL_H2_SH(x1_m, x0_m, x3_m, x2_m, s0_m, s2_m); \
ILVR_H2_SH(x1_m, x0_m, x3_m, x2_m, s1_m, s3_m); \ ILVR_H2_SH(x1_m, x0_m, x3_m, x2_m, s1_m, s3_m); \
@ -184,25 +189,24 @@
x3_m = -x3_m; \ x3_m = -x3_m; \
x2_m = __msa_ilvev_h(x2_m, x3_m); \ x2_m = __msa_ilvev_h(x2_m, x3_m); \
out3 = DOT_SHIFT_RIGHT_PCK_H(s6_m, s7_m, x2_m); \ out3 = DOT_SHIFT_RIGHT_PCK_H(s6_m, s7_m, x2_m); \
} }
#define FDCT8x16_ODD(input0, input1, input2, input3, \ #define FDCT8x16_ODD(input0, input1, input2, input3, input4, input5, input6, \
input4, input5, input6, input7, \ input7, out1, out3, out5, out7, out9, out11, out13, \
out1, out3, out5, out7, \ out15) \
out9, out11, out13, out15) { \ { \
v8i16 stp21_m, stp22_m, stp23_m, stp24_m, stp25_m, stp26_m; \ v8i16 stp21_m, stp22_m, stp23_m, stp24_m, stp25_m, stp26_m; \
v8i16 stp30_m, stp31_m, stp32_m, stp33_m, stp34_m, stp35_m; \ v8i16 stp30_m, stp31_m, stp32_m, stp33_m, stp34_m, stp35_m; \
v8i16 stp36_m, stp37_m, vec0_m, vec1_m; \ v8i16 stp36_m, stp37_m, vec0_m, vec1_m; \
v8i16 vec2_m, vec3_m, vec4_m, vec5_m, vec6_m; \ v8i16 vec2_m, vec3_m, vec4_m, vec5_m, vec6_m; \
v8i16 cnst0_m, cnst1_m, cnst4_m, cnst5_m; \ v8i16 cnst0_m, cnst1_m, cnst4_m, cnst5_m; \
v8i16 coeff_m = { cospi_16_64, -cospi_16_64, cospi_8_64, \ v8i16 coeff_m = { cospi_16_64, -cospi_16_64, cospi_8_64, cospi_24_64, \
cospi_24_64, -cospi_8_64, -cospi_24_64, \ -cospi_8_64, -cospi_24_64, cospi_12_64, cospi_20_64 }; \
cospi_12_64, cospi_20_64 }; \ v8i16 coeff1_m = { cospi_2_64, cospi_30_64, cospi_14_64, cospi_18_64, \
v8i16 coeff1_m = { cospi_2_64, cospi_30_64, cospi_14_64, \ cospi_10_64, cospi_22_64, cospi_6_64, cospi_26_64 }; \
cospi_18_64, cospi_10_64, cospi_22_64, \ v8i16 coeff2_m = { \
cospi_6_64, cospi_26_64 }; \ -cospi_2_64, -cospi_10_64, -cospi_18_64, -cospi_26_64, 0, 0, 0, 0 \
v8i16 coeff2_m = { -cospi_2_64, -cospi_10_64, -cospi_18_64, \ }; \
-cospi_26_64, 0, 0, 0, 0 }; \
\ \
/* stp 1 */ \ /* stp 1 */ \
ILVL_H2_SH(input2, input5, input3, input4, vec2_m, vec4_m); \ ILVL_H2_SH(input2, input5, input3, input4, vec2_m, vec4_m); \
@ -218,10 +222,10 @@
stp23_m = DOT_SHIFT_RIGHT_PCK_H(vec4_m, vec5_m, cnst5_m); \ stp23_m = DOT_SHIFT_RIGHT_PCK_H(vec4_m, vec5_m, cnst5_m); \
\ \
/* stp2 */ \ /* stp2 */ \
BUTTERFLY_4(input0, input1, stp22_m, stp23_m, \ BUTTERFLY_4(input0, input1, stp22_m, stp23_m, stp30_m, stp31_m, stp32_m, \
stp30_m, stp31_m, stp32_m, stp33_m); \ stp33_m); \
BUTTERFLY_4(input7, input6, stp25_m, stp24_m, \ BUTTERFLY_4(input7, input6, stp25_m, stp24_m, stp37_m, stp36_m, stp35_m, \
stp37_m, stp36_m, stp35_m, stp34_m); \ stp34_m); \
\ \
ILVL_H2_SH(stp36_m, stp31_m, stp35_m, stp32_m, vec2_m, vec4_m); \ ILVL_H2_SH(stp36_m, stp31_m, stp35_m, stp32_m, vec2_m, vec4_m); \
ILVR_H2_SH(stp36_m, stp31_m, stp35_m, stp32_m, vec3_m, vec5_m); \ ILVR_H2_SH(stp36_m, stp31_m, stp35_m, stp32_m, vec3_m, vec5_m); \
@ -243,10 +247,10 @@
stp22_m = DOT_SHIFT_RIGHT_PCK_H(vec4_m, vec5_m, cnst1_m); \ stp22_m = DOT_SHIFT_RIGHT_PCK_H(vec4_m, vec5_m, cnst1_m); \
\ \
/* stp4 */ \ /* stp4 */ \
BUTTERFLY_4(stp30_m, stp37_m, stp26_m, stp21_m, \ BUTTERFLY_4(stp30_m, stp37_m, stp26_m, stp21_m, vec6_m, vec2_m, vec4_m, \
vec6_m, vec2_m, vec4_m, vec5_m); \ vec5_m); \
BUTTERFLY_4(stp33_m, stp34_m, stp25_m, stp22_m, \ BUTTERFLY_4(stp33_m, stp34_m, stp25_m, stp22_m, stp21_m, stp23_m, stp24_m, \
stp21_m, stp23_m, stp24_m, stp31_m); \ stp31_m); \
\ \
ILVRL_H2_SH(vec2_m, vec6_m, vec1_m, vec0_m); \ ILVRL_H2_SH(vec2_m, vec6_m, vec1_m, vec0_m); \
SPLATI_H2_SH(coeff1_m, 0, 1, cnst0_m, cnst1_m); \ SPLATI_H2_SH(coeff1_m, 0, 1, cnst0_m, cnst1_m); \
@ -286,9 +290,10 @@
cnst1_m = __msa_splati_h(coeff2_m, 3); \ cnst1_m = __msa_splati_h(coeff2_m, 3); \
cnst0_m = __msa_ilvev_h(cnst0_m, cnst1_m); \ cnst0_m = __msa_ilvev_h(cnst0_m, cnst1_m); \
out3 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m); \ out3 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m); \
} }
#define FDCT_POSTPROC_2V_NEG_H(vec0, vec1) { \ #define FDCT_POSTPROC_2V_NEG_H(vec0, vec1) \
{ \
v8i16 tp0_m, tp1_m; \ v8i16 tp0_m, tp1_m; \
v8i16 one_m = __msa_ldi_h(1); \ v8i16 one_m = __msa_ldi_h(1); \
\ \
@ -302,9 +307,10 @@
vec1 += tp1_m; \ vec1 += tp1_m; \
vec0 >>= 2; \ vec0 >>= 2; \
vec1 >>= 2; \ vec1 >>= 2; \
} }
#define FDCT32_POSTPROC_NEG_W(vec) { \ #define FDCT32_POSTPROC_NEG_W(vec) \
{ \
v4i32 temp_m; \ v4i32 temp_m; \
v4i32 one_m = __msa_ldi_w(1); \ v4i32 one_m = __msa_ldi_w(1); \
\ \
@ -313,9 +319,10 @@
temp_m = one_m & temp_m; \ temp_m = one_m & temp_m; \
vec += temp_m; \ vec += temp_m; \
vec >>= 2; \ vec >>= 2; \
} }
#define FDCT32_POSTPROC_2V_POS_H(vec0, vec1) { \ #define FDCT32_POSTPROC_2V_POS_H(vec0, vec1) \
{ \
v8i16 tp0_m, tp1_m; \ v8i16 tp0_m, tp1_m; \
v8i16 one = __msa_ldi_h(1); \ v8i16 one = __msa_ldi_h(1); \
\ \
@ -331,16 +338,16 @@
vec1 += tp1_m; \ vec1 += tp1_m; \
vec0 >>= 2; \ vec0 >>= 2; \
vec1 >>= 2; \ vec1 >>= 2; \
} }
#define DOTP_CONST_PAIR_W(reg0_left, reg1_left, reg0_right, \ #define DOTP_CONST_PAIR_W(reg0_left, reg1_left, reg0_right, reg1_right, \
reg1_right, const0, const1, \ const0, const1, out0, out1, out2, out3) \
out0, out1, out2, out3) { \ { \
v4i32 s0_m, s1_m, s2_m, s3_m, s4_m, s5_m, s6_m, s7_m; \ v4i32 s0_m, s1_m, s2_m, s3_m, s4_m, s5_m, s6_m, s7_m; \
v2i64 tp0_m, tp1_m, tp2_m, tp3_m; \ v2i64 tp0_m, tp1_m, tp2_m, tp3_m; \
v4i32 k0_m = __msa_fill_w((int32_t) const0); \ v4i32 k0_m = __msa_fill_w((int32_t)const0); \
\ \
s0_m = __msa_fill_w((int32_t) const1); \ s0_m = __msa_fill_w((int32_t)const1); \
k0_m = __msa_ilvev_w(s0_m, k0_m); \ k0_m = __msa_ilvev_w(s0_m, k0_m); \
\ \
ILVRL_W2_SW(-reg1_left, reg0_left, s1_m, s0_m); \ ILVRL_W2_SW(-reg1_left, reg0_left, s1_m, s0_m); \
@ -365,7 +372,7 @@
tp3_m = __msa_srari_d(tp3_m, DCT_CONST_BITS); \ tp3_m = __msa_srari_d(tp3_m, DCT_CONST_BITS); \
out2 = __msa_pckev_w((v4i32)tp0_m, (v4i32)tp1_m); \ out2 = __msa_pckev_w((v4i32)tp0_m, (v4i32)tp1_m); \
out3 = __msa_pckev_w((v4i32)tp2_m, (v4i32)tp3_m); \ out3 = __msa_pckev_w((v4i32)tp2_m, (v4i32)tp3_m); \
} }
void fdct8x16_1d_column(const int16_t *input, int16_t *tmp_ptr, void fdct8x16_1d_column(const int16_t *input, int16_t *tmp_ptr,
int32_t src_stride); int32_t src_stride);

View File

@ -20,10 +20,10 @@ void vpx_idct16_1d_rows_msa(const int16_t *input, int16_t *output) {
input += 8; input += 8;
LD_SH8(input, 16, reg8, reg9, reg10, reg11, reg12, reg13, reg14, reg15); LD_SH8(input, 16, reg8, reg9, reg10, reg11, reg12, reg13, reg14, reg15);
TRANSPOSE8x8_SH_SH(reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7, TRANSPOSE8x8_SH_SH(reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg0, reg1,
reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7); reg2, reg3, reg4, reg5, reg6, reg7);
TRANSPOSE8x8_SH_SH(reg8, reg9, reg10, reg11, reg12, reg13, reg14, reg15, TRANSPOSE8x8_SH_SH(reg8, reg9, reg10, reg11, reg12, reg13, reg14, reg15, reg8,
reg8, reg9, reg10, reg11, reg12, reg13, reg14, reg15); reg9, reg10, reg11, reg12, reg13, reg14, reg15);
DOTP_CONST_PAIR(reg2, reg14, cospi_28_64, cospi_4_64, reg2, reg14); DOTP_CONST_PAIR(reg2, reg14, cospi_28_64, cospi_4_64, reg2, reg14);
DOTP_CONST_PAIR(reg10, reg6, cospi_12_64, cospi_20_64, reg10, reg6); DOTP_CONST_PAIR(reg10, reg6, cospi_12_64, cospi_20_64, reg10, reg6);
BUTTERFLY_4(reg2, reg14, reg6, reg10, loc0, loc1, reg14, reg2); BUTTERFLY_4(reg2, reg14, reg6, reg10, loc0, loc1, reg14, reg2);
@ -93,13 +93,13 @@ void vpx_idct16_1d_rows_msa(const int16_t *input, int16_t *output) {
reg3 = tmp7; reg3 = tmp7;
/* transpose block */ /* transpose block */
TRANSPOSE8x8_SH_SH(reg0, reg2, reg4, reg6, reg8, reg10, reg12, reg14, TRANSPOSE8x8_SH_SH(reg0, reg2, reg4, reg6, reg8, reg10, reg12, reg14, reg0,
reg0, reg2, reg4, reg6, reg8, reg10, reg12, reg14); reg2, reg4, reg6, reg8, reg10, reg12, reg14);
ST_SH8(reg0, reg2, reg4, reg6, reg8, reg10, reg12, reg14, output, 16); ST_SH8(reg0, reg2, reg4, reg6, reg8, reg10, reg12, reg14, output, 16);
/* transpose block */ /* transpose block */
TRANSPOSE8x8_SH_SH(reg3, reg13, reg11, reg5, reg7, reg9, reg1, reg15, TRANSPOSE8x8_SH_SH(reg3, reg13, reg11, reg5, reg7, reg9, reg1, reg15, reg3,
reg3, reg13, reg11, reg5, reg7, reg9, reg1, reg15); reg13, reg11, reg5, reg7, reg9, reg1, reg15);
ST_SH8(reg3, reg13, reg11, reg5, reg7, reg9, reg1, reg15, (output + 8), 16); ST_SH8(reg3, reg13, reg11, reg5, reg7, reg9, reg1, reg15, (output + 8), 16);
} }
@ -233,7 +233,7 @@ void vpx_idct16x16_10_add_msa(const int16_t *input, uint8_t *dst,
/* short case just considers top 4 rows as valid output */ /* short case just considers top 4 rows as valid output */
out += 4 * 16; out += 4 * 16;
for (i = 12; i--;) { for (i = 12; i--;) {
__asm__ __volatile__ ( __asm__ __volatile__(
"sw $zero, 0(%[out]) \n\t" "sw $zero, 0(%[out]) \n\t"
"sw $zero, 4(%[out]) \n\t" "sw $zero, 4(%[out]) \n\t"
"sw $zero, 8(%[out]) \n\t" "sw $zero, 8(%[out]) \n\t"
@ -244,8 +244,7 @@ void vpx_idct16x16_10_add_msa(const int16_t *input, uint8_t *dst,
"sw $zero, 28(%[out]) \n\t" "sw $zero, 28(%[out]) \n\t"
: :
: [out] "r" (out) : [out] "r"(out));
);
out += 16; out += 16;
} }
@ -283,8 +282,8 @@ void vpx_idct16x16_1_add_msa(const int16_t *input, uint8_t *dst,
ADD4(res4, vec, res5, vec, res6, vec, res7, vec, res4, res5, res6, res7); ADD4(res4, vec, res5, vec, res6, vec, res7, vec, res4, res5, res6, res7);
CLIP_SH4_0_255(res0, res1, res2, res3); CLIP_SH4_0_255(res0, res1, res2, res3);
CLIP_SH4_0_255(res4, res5, res6, res7); CLIP_SH4_0_255(res4, res5, res6, res7);
PCKEV_B4_UB(res4, res0, res5, res1, res6, res2, res7, res3, PCKEV_B4_UB(res4, res0, res5, res1, res6, res2, res7, res3, tmp0, tmp1,
tmp0, tmp1, tmp2, tmp3); tmp2, tmp3);
ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride); ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
dst += (4 * dst_stride); dst += (4 * dst_stride);
} }
@ -295,29 +294,28 @@ void vpx_iadst16_1d_rows_msa(const int16_t *input, int16_t *output) {
v8i16 l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, l10, l11, l12, l13, l14, l15; v8i16 l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, l10, l11, l12, l13, l14, l15;
/* load input data */ /* load input data */
LD_SH16(input, 8, LD_SH16(input, 8, l0, l8, l1, l9, l2, l10, l3, l11, l4, l12, l5, l13, l6, l14,
l0, l8, l1, l9, l2, l10, l3, l11, l4, l12, l5, l13, l6, l14, l7, l15); l7, l15);
TRANSPOSE8x8_SH_SH(l0, l1, l2, l3, l4, l5, l6, l7, TRANSPOSE8x8_SH_SH(l0, l1, l2, l3, l4, l5, l6, l7, l0, l1, l2, l3, l4, l5, l6,
l0, l1, l2, l3, l4, l5, l6, l7); l7);
TRANSPOSE8x8_SH_SH(l8, l9, l10, l11, l12, l13, l14, l15, TRANSPOSE8x8_SH_SH(l8, l9, l10, l11, l12, l13, l14, l15, l8, l9, l10, l11,
l8, l9, l10, l11, l12, l13, l14, l15); l12, l13, l14, l15);
/* ADST in horizontal */ /* ADST in horizontal */
VP9_IADST8x16_1D(l0, l1, l2, l3, l4, l5, l6, l7, VP9_IADST8x16_1D(l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, l10, l11, l12, l13,
l8, l9, l10, l11, l12, l13, l14, l15, l14, l15, r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11,
r0, r1, r2, r3, r4, r5, r6, r7, r12, r13, r14, r15);
r8, r9, r10, r11, r12, r13, r14, r15);
l1 = -r8; l1 = -r8;
l3 = -r4; l3 = -r4;
l13 = -r13; l13 = -r13;
l15 = -r1; l15 = -r1;
TRANSPOSE8x8_SH_SH(r0, l1, r12, l3, r6, r14, r10, r2, TRANSPOSE8x8_SH_SH(r0, l1, r12, l3, r6, r14, r10, r2, l0, l1, l2, l3, l4, l5,
l0, l1, l2, l3, l4, l5, l6, l7); l6, l7);
ST_SH8(l0, l1, l2, l3, l4, l5, l6, l7, output, 16); ST_SH8(l0, l1, l2, l3, l4, l5, l6, l7, output, 16);
TRANSPOSE8x8_SH_SH(r3, r11, r15, r7, r5, l13, r9, l15, TRANSPOSE8x8_SH_SH(r3, r11, r15, r7, r5, l13, r9, l15, l8, l9, l10, l11, l12,
l8, l9, l10, l11, l12, l13, l14, l15); l13, l14, l15);
ST_SH8(l8, l9, l10, l11, l12, l13, l14, l15, (output + 8), 16); ST_SH8(l8, l9, l10, l11, l12, l13, l14, l15, (output + 8), 16);
} }

View File

@ -17,10 +17,10 @@ static void idct32x8_row_transpose_store(const int16_t *input,
/* 1st & 2nd 8x8 */ /* 1st & 2nd 8x8 */
LD_SH8(input, 32, m0, n0, m1, n1, m2, n2, m3, n3); LD_SH8(input, 32, m0, n0, m1, n1, m2, n2, m3, n3);
LD_SH8((input + 8), 32, m4, n4, m5, n5, m6, n6, m7, n7); LD_SH8((input + 8), 32, m4, n4, m5, n5, m6, n6, m7, n7);
TRANSPOSE8x8_SH_SH(m0, n0, m1, n1, m2, n2, m3, n3, TRANSPOSE8x8_SH_SH(m0, n0, m1, n1, m2, n2, m3, n3, m0, n0, m1, n1, m2, n2, m3,
m0, n0, m1, n1, m2, n2, m3, n3); n3);
TRANSPOSE8x8_SH_SH(m4, n4, m5, n5, m6, n6, m7, n7, TRANSPOSE8x8_SH_SH(m4, n4, m5, n5, m6, n6, m7, n7, m4, n4, m5, n5, m6, n6, m7,
m4, n4, m5, n5, m6, n6, m7, n7); n7);
ST_SH8(m0, n0, m1, n1, m2, n2, m3, n3, (tmp_buf), 8); ST_SH8(m0, n0, m1, n1, m2, n2, m3, n3, (tmp_buf), 8);
ST_SH4(m4, n4, m5, n5, (tmp_buf + 8 * 8), 8); ST_SH4(m4, n4, m5, n5, (tmp_buf + 8 * 8), 8);
ST_SH4(m6, n6, m7, n7, (tmp_buf + 12 * 8), 8); ST_SH4(m6, n6, m7, n7, (tmp_buf + 12 * 8), 8);
@ -28,10 +28,10 @@ static void idct32x8_row_transpose_store(const int16_t *input,
/* 3rd & 4th 8x8 */ /* 3rd & 4th 8x8 */
LD_SH8((input + 16), 32, m0, n0, m1, n1, m2, n2, m3, n3); LD_SH8((input + 16), 32, m0, n0, m1, n1, m2, n2, m3, n3);
LD_SH8((input + 24), 32, m4, n4, m5, n5, m6, n6, m7, n7); LD_SH8((input + 24), 32, m4, n4, m5, n5, m6, n6, m7, n7);
TRANSPOSE8x8_SH_SH(m0, n0, m1, n1, m2, n2, m3, n3, TRANSPOSE8x8_SH_SH(m0, n0, m1, n1, m2, n2, m3, n3, m0, n0, m1, n1, m2, n2, m3,
m0, n0, m1, n1, m2, n2, m3, n3); n3);
TRANSPOSE8x8_SH_SH(m4, n4, m5, n5, m6, n6, m7, n7, TRANSPOSE8x8_SH_SH(m4, n4, m5, n5, m6, n6, m7, n7, m4, n4, m5, n5, m6, n6, m7,
m4, n4, m5, n5, m6, n6, m7, n7); n7);
ST_SH4(m0, n0, m1, n1, (tmp_buf + 16 * 8), 8); ST_SH4(m0, n0, m1, n1, (tmp_buf + 16 * 8), 8);
ST_SH4(m2, n2, m3, n3, (tmp_buf + 20 * 8), 8); ST_SH4(m2, n2, m3, n3, (tmp_buf + 20 * 8), 8);
ST_SH4(m4, n4, m5, n5, (tmp_buf + 24 * 8), 8); ST_SH4(m4, n4, m5, n5, (tmp_buf + 24 * 8), 8);
@ -186,8 +186,7 @@ static void idct32x8_row_odd_process_store(int16_t *tmp_buf,
DOTP_CONST_PAIR(reg7, reg0, cospi_3_64, cospi_29_64, reg0, reg7); DOTP_CONST_PAIR(reg7, reg0, cospi_3_64, cospi_29_64, reg0, reg7);
/* 4 Stores */ /* 4 Stores */
SUB4(reg1, reg2, reg6, reg5, reg0, reg3, reg7, reg4, SUB4(reg1, reg2, reg6, reg5, reg0, reg3, reg7, reg4, vec0, vec1, vec2, vec3);
vec0, vec1, vec2, vec3);
DOTP_CONST_PAIR(vec1, vec0, cospi_12_64, cospi_20_64, loc0, loc1); DOTP_CONST_PAIR(vec1, vec0, cospi_12_64, cospi_20_64, loc0, loc1);
DOTP_CONST_PAIR(vec3, vec2, -cospi_20_64, cospi_12_64, loc2, loc3); DOTP_CONST_PAIR(vec3, vec2, -cospi_20_64, cospi_12_64, loc2, loc3);
@ -198,8 +197,7 @@ static void idct32x8_row_odd_process_store(int16_t *tmp_buf,
ST_SH2(vec0, vec1, (tmp_odd_buf + 10 * 8), 8); ST_SH2(vec0, vec1, (tmp_odd_buf + 10 * 8), 8);
/* 4 Stores */ /* 4 Stores */
ADD4(reg1, reg2, reg6, reg5, reg0, reg3, reg7, reg4, ADD4(reg1, reg2, reg6, reg5, reg0, reg3, reg7, reg4, vec1, vec2, vec0, vec3);
vec1, vec2, vec0, vec3);
BUTTERFLY_4(vec0, vec3, vec2, vec1, reg0, reg1, reg3, reg2); BUTTERFLY_4(vec0, vec3, vec2, vec1, reg0, reg1, reg3, reg2);
ST_SH(reg0, (tmp_odd_buf + 13 * 8)); ST_SH(reg0, (tmp_odd_buf + 13 * 8));
ST_SH(reg1, (tmp_odd_buf + 14 * 8)); ST_SH(reg1, (tmp_odd_buf + 14 * 8));
@ -213,8 +211,7 @@ static void idct32x8_row_odd_process_store(int16_t *tmp_buf,
LD_SH4(tmp_odd_buf, 8, reg0, reg1, reg2, reg3); LD_SH4(tmp_odd_buf, 8, reg0, reg1, reg2, reg3);
LD_SH4((tmp_odd_buf + 8 * 8), 8, reg4, reg5, reg6, reg7); LD_SH4((tmp_odd_buf + 8 * 8), 8, reg4, reg5, reg6, reg7);
ADD4(reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7, ADD4(reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7, loc0, loc1, loc2, loc3);
loc0, loc1, loc2, loc3);
ST_SH4(loc0, loc1, loc2, loc3, tmp_odd_buf, 8); ST_SH4(loc0, loc1, loc2, loc3, tmp_odd_buf, 8);
SUB2(reg0, reg4, reg1, reg5, vec0, vec1); SUB2(reg0, reg4, reg1, reg5, vec0, vec1);
@ -228,8 +225,7 @@ static void idct32x8_row_odd_process_store(int16_t *tmp_buf,
LD_SH4((tmp_odd_buf + 4 * 8), 8, reg1, reg2, reg0, reg3); LD_SH4((tmp_odd_buf + 4 * 8), 8, reg1, reg2, reg0, reg3);
LD_SH4((tmp_odd_buf + 12 * 8), 8, reg4, reg5, reg6, reg7); LD_SH4((tmp_odd_buf + 12 * 8), 8, reg4, reg5, reg6, reg7);
ADD4(reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7, ADD4(reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7, loc0, loc1, loc2, loc3);
loc0, loc1, loc2, loc3);
ST_SH4(loc0, loc1, loc2, loc3, (tmp_odd_buf + 4 * 8), 8); ST_SH4(loc0, loc1, loc2, loc3, (tmp_odd_buf + 4 * 8), 8);
SUB2(reg0, reg4, reg3, reg7, vec0, vec1); SUB2(reg0, reg4, reg3, reg7, vec0, vec1);
@ -242,8 +238,7 @@ static void idct32x8_row_odd_process_store(int16_t *tmp_buf,
static void idct_butterfly_transpose_store(int16_t *tmp_buf, static void idct_butterfly_transpose_store(int16_t *tmp_buf,
int16_t *tmp_eve_buf, int16_t *tmp_eve_buf,
int16_t *tmp_odd_buf, int16_t *tmp_odd_buf, int16_t *dst) {
int16_t *dst) {
v8i16 vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3; v8i16 vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3;
v8i16 m0, m1, m2, m3, m4, m5, m6, m7, n0, n1, n2, n3, n4, n5, n6, n7; v8i16 m0, m1, m2, m3, m4, m5, m6, m7, n0, n1, n2, n3, n4, n5, n6, n7;
@ -317,26 +312,26 @@ static void idct_butterfly_transpose_store(int16_t *tmp_buf,
/* Transpose : 16 vectors */ /* Transpose : 16 vectors */
/* 1st & 2nd 8x8 */ /* 1st & 2nd 8x8 */
TRANSPOSE8x8_SH_SH(m0, n0, m1, n1, m2, n2, m3, n3, TRANSPOSE8x8_SH_SH(m0, n0, m1, n1, m2, n2, m3, n3, m0, n0, m1, n1, m2, n2, m3,
m0, n0, m1, n1, m2, n2, m3, n3); n3);
ST_SH4(m0, n0, m1, n1, (dst + 0), 32); ST_SH4(m0, n0, m1, n1, (dst + 0), 32);
ST_SH4(m2, n2, m3, n3, (dst + 4 * 32), 32); ST_SH4(m2, n2, m3, n3, (dst + 4 * 32), 32);
TRANSPOSE8x8_SH_SH(m4, n4, m5, n5, m6, n6, m7, n7, TRANSPOSE8x8_SH_SH(m4, n4, m5, n5, m6, n6, m7, n7, m4, n4, m5, n5, m6, n6, m7,
m4, n4, m5, n5, m6, n6, m7, n7); n7);
ST_SH4(m4, n4, m5, n5, (dst + 8), 32); ST_SH4(m4, n4, m5, n5, (dst + 8), 32);
ST_SH4(m6, n6, m7, n7, (dst + 8 + 4 * 32), 32); ST_SH4(m6, n6, m7, n7, (dst + 8 + 4 * 32), 32);
/* 3rd & 4th 8x8 */ /* 3rd & 4th 8x8 */
LD_SH8((tmp_buf + 8 * 16), 8, m0, n0, m1, n1, m2, n2, m3, n3); LD_SH8((tmp_buf + 8 * 16), 8, m0, n0, m1, n1, m2, n2, m3, n3);
LD_SH8((tmp_buf + 12 * 16), 8, m4, n4, m5, n5, m6, n6, m7, n7); LD_SH8((tmp_buf + 12 * 16), 8, m4, n4, m5, n5, m6, n6, m7, n7);
TRANSPOSE8x8_SH_SH(m0, n0, m1, n1, m2, n2, m3, n3, TRANSPOSE8x8_SH_SH(m0, n0, m1, n1, m2, n2, m3, n3, m0, n0, m1, n1, m2, n2, m3,
m0, n0, m1, n1, m2, n2, m3, n3); n3);
ST_SH4(m0, n0, m1, n1, (dst + 16), 32); ST_SH4(m0, n0, m1, n1, (dst + 16), 32);
ST_SH4(m2, n2, m3, n3, (dst + 16 + 4 * 32), 32); ST_SH4(m2, n2, m3, n3, (dst + 16 + 4 * 32), 32);
TRANSPOSE8x8_SH_SH(m4, n4, m5, n5, m6, n6, m7, n7, TRANSPOSE8x8_SH_SH(m4, n4, m5, n5, m6, n6, m7, n7, m4, n4, m5, n5, m6, n6, m7,
m4, n4, m5, n5, m6, n6, m7, n7); n7);
ST_SH4(m4, n4, m5, n5, (dst + 24), 32); ST_SH4(m4, n4, m5, n5, (dst + 24), 32);
ST_SH4(m6, n6, m7, n7, (dst + 24 + 4 * 32), 32); ST_SH4(m6, n6, m7, n7, (dst + 24 + 4 * 32), 32);
} }
@ -349,8 +344,8 @@ static void idct32x8_1d_rows_msa(const int16_t *input, int16_t *output) {
idct32x8_row_transpose_store(input, &tmp_buf[0]); idct32x8_row_transpose_store(input, &tmp_buf[0]);
idct32x8_row_even_process_store(&tmp_buf[0], &tmp_eve_buf[0]); idct32x8_row_even_process_store(&tmp_buf[0], &tmp_eve_buf[0]);
idct32x8_row_odd_process_store(&tmp_buf[0], &tmp_odd_buf[0]); idct32x8_row_odd_process_store(&tmp_buf[0], &tmp_odd_buf[0]);
idct_butterfly_transpose_store(&tmp_buf[0], &tmp_eve_buf[0], idct_butterfly_transpose_store(&tmp_buf[0], &tmp_eve_buf[0], &tmp_odd_buf[0],
&tmp_odd_buf[0], output); output);
} }
static void idct8x32_column_even_process_store(int16_t *tmp_buf, static void idct8x32_column_even_process_store(int16_t *tmp_buf,
@ -541,8 +536,7 @@ static void idct8x32_column_odd_process_store(int16_t *tmp_buf,
} }
static void idct8x32_column_butterfly_addblk(int16_t *tmp_eve_buf, static void idct8x32_column_butterfly_addblk(int16_t *tmp_eve_buf,
int16_t *tmp_odd_buf, int16_t *tmp_odd_buf, uint8_t *dst,
uint8_t *dst,
int32_t dst_stride) { int32_t dst_stride) {
v8i16 vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3; v8i16 vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3;
v8i16 m0, m1, m2, m3, m4, m5, m6, m7, n0, n1, n2, n3, n4, n5, n6, n7; v8i16 m0, m1, m2, m3, m4, m5, m6, m7, n0, n1, n2, n3, n4, n5, n6, n7;
@ -563,8 +557,8 @@ static void idct8x32_column_butterfly_addblk(int16_t *tmp_eve_buf,
SUB4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m6, m2, m4, m0); SUB4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m6, m2, m4, m0);
SRARI_H4_SH(m0, m2, m4, m6, 6); SRARI_H4_SH(m0, m2, m4, m6, 6);
VP9_ADDBLK_ST8x4_UB((dst + 19 * dst_stride), (4 * dst_stride), VP9_ADDBLK_ST8x4_UB((dst + 19 * dst_stride), (4 * dst_stride), m0, m2, m4,
m0, m2, m4, m6); m6);
/* Load 8 & Store 8 */ /* Load 8 & Store 8 */
vec0 = LD_SH(tmp_odd_buf + 4 * 8); vec0 = LD_SH(tmp_odd_buf + 4 * 8);
@ -578,13 +572,12 @@ static void idct8x32_column_butterfly_addblk(int16_t *tmp_eve_buf,
ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m1, m5, m3, m7); ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m1, m5, m3, m7);
SRARI_H4_SH(m1, m3, m5, m7, 6); SRARI_H4_SH(m1, m3, m5, m7, 6);
VP9_ADDBLK_ST8x4_UB((dst + 2 * dst_stride), (4 * dst_stride), VP9_ADDBLK_ST8x4_UB((dst + 2 * dst_stride), (4 * dst_stride), m1, m3, m5, m7);
m1, m3, m5, m7);
SUB4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m7, m3, m5, m1); SUB4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m7, m3, m5, m1);
SRARI_H4_SH(m1, m3, m5, m7, 6); SRARI_H4_SH(m1, m3, m5, m7, 6);
VP9_ADDBLK_ST8x4_UB((dst + 17 * dst_stride), (4 * dst_stride), VP9_ADDBLK_ST8x4_UB((dst + 17 * dst_stride), (4 * dst_stride), m1, m3, m5,
m1, m3, m5, m7); m7);
/* Load 8 & Store 8 */ /* Load 8 & Store 8 */
vec0 = LD_SH(tmp_odd_buf + 2 * 8); vec0 = LD_SH(tmp_odd_buf + 2 * 8);
@ -598,13 +591,12 @@ static void idct8x32_column_butterfly_addblk(int16_t *tmp_eve_buf,
ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n0, n4, n2, n6); ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n0, n4, n2, n6);
SRARI_H4_SH(n0, n2, n4, n6, 6); SRARI_H4_SH(n0, n2, n4, n6, 6);
VP9_ADDBLK_ST8x4_UB((dst + 1 * dst_stride), (4 * dst_stride), VP9_ADDBLK_ST8x4_UB((dst + 1 * dst_stride), (4 * dst_stride), n0, n2, n4, n6);
n0, n2, n4, n6);
SUB4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n6, n2, n4, n0); SUB4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n6, n2, n4, n0);
SRARI_H4_SH(n0, n2, n4, n6, 6); SRARI_H4_SH(n0, n2, n4, n6, 6);
VP9_ADDBLK_ST8x4_UB((dst + 18 * dst_stride), (4 * dst_stride), VP9_ADDBLK_ST8x4_UB((dst + 18 * dst_stride), (4 * dst_stride), n0, n2, n4,
n0, n2, n4, n6); n6);
/* Load 8 & Store 8 */ /* Load 8 & Store 8 */
vec0 = LD_SH(tmp_odd_buf + 5 * 8); vec0 = LD_SH(tmp_odd_buf + 5 * 8);
@ -618,13 +610,12 @@ static void idct8x32_column_butterfly_addblk(int16_t *tmp_eve_buf,
ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n1, n5, n3, n7); ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n1, n5, n3, n7);
SRARI_H4_SH(n1, n3, n5, n7, 6); SRARI_H4_SH(n1, n3, n5, n7, 6);
VP9_ADDBLK_ST8x4_UB((dst + 3 * dst_stride), (4 * dst_stride), VP9_ADDBLK_ST8x4_UB((dst + 3 * dst_stride), (4 * dst_stride), n1, n3, n5, n7);
n1, n3, n5, n7);
SUB4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n7, n3, n5, n1); SUB4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n7, n3, n5, n1);
SRARI_H4_SH(n1, n3, n5, n7, 6); SRARI_H4_SH(n1, n3, n5, n7, 6);
VP9_ADDBLK_ST8x4_UB((dst + 16 * dst_stride), (4 * dst_stride), VP9_ADDBLK_ST8x4_UB((dst + 16 * dst_stride), (4 * dst_stride), n1, n3, n5,
n1, n3, n5, n7); n7);
} }
static void idct8x32_1d_columns_addblk_msa(int16_t *input, uint8_t *dst, static void idct8x32_1d_columns_addblk_msa(int16_t *input, uint8_t *dst,
@ -634,8 +625,8 @@ static void idct8x32_1d_columns_addblk_msa(int16_t *input, uint8_t *dst,
idct8x32_column_even_process_store(input, &tmp_eve_buf[0]); idct8x32_column_even_process_store(input, &tmp_eve_buf[0]);
idct8x32_column_odd_process_store(input, &tmp_odd_buf[0]); idct8x32_column_odd_process_store(input, &tmp_odd_buf[0]);
idct8x32_column_butterfly_addblk(&tmp_eve_buf[0], &tmp_odd_buf[0], idct8x32_column_butterfly_addblk(&tmp_eve_buf[0], &tmp_odd_buf[0], dst,
dst, dst_stride); dst_stride);
} }
void vpx_idct32x32_1024_add_msa(const int16_t *input, uint8_t *dst, void vpx_idct32x32_1024_add_msa(const int16_t *input, uint8_t *dst,
@ -665,7 +656,7 @@ void vpx_idct32x32_34_add_msa(const int16_t *input, uint8_t *dst,
int16_t *out_ptr = out_arr; int16_t *out_ptr = out_arr;
for (i = 32; i--;) { for (i = 32; i--;) {
__asm__ __volatile__ ( __asm__ __volatile__(
"sw $zero, 0(%[out_ptr]) \n\t" "sw $zero, 0(%[out_ptr]) \n\t"
"sw $zero, 4(%[out_ptr]) \n\t" "sw $zero, 4(%[out_ptr]) \n\t"
"sw $zero, 8(%[out_ptr]) \n\t" "sw $zero, 8(%[out_ptr]) \n\t"
@ -684,8 +675,7 @@ void vpx_idct32x32_34_add_msa(const int16_t *input, uint8_t *dst,
"sw $zero, 60(%[out_ptr]) \n\t" "sw $zero, 60(%[out_ptr]) \n\t"
: :
: [out_ptr] "r" (out_ptr) : [out_ptr] "r"(out_ptr));
);
out_ptr += 32; out_ptr += 32;
} }
@ -728,8 +718,8 @@ void vpx_idct32x32_1_add_msa(const int16_t *input, uint8_t *dst,
ADD4(res4, vec, res5, vec, res6, vec, res7, vec, res4, res5, res6, res7); ADD4(res4, vec, res5, vec, res6, vec, res7, vec, res4, res5, res6, res7);
CLIP_SH4_0_255(res0, res1, res2, res3); CLIP_SH4_0_255(res0, res1, res2, res3);
CLIP_SH4_0_255(res4, res5, res6, res7); CLIP_SH4_0_255(res4, res5, res6, res7);
PCKEV_B4_UB(res4, res0, res5, res1, res6, res2, res7, res3, PCKEV_B4_UB(res4, res0, res5, res1, res6, res2, res7, res3, tmp0, tmp1,
tmp0, tmp1, tmp2, tmp3); tmp2, tmp3);
ST_UB2(tmp0, tmp1, dst, 16); ST_UB2(tmp0, tmp1, dst, 16);
dst += dst_stride; dst += dst_stride;

View File

@ -42,8 +42,8 @@ void vpx_iwht4x4_16_add_msa(const int16_t *input, uint8_t *dst,
in0_r -= in3_r; in0_r -= in3_r;
in2_r += in1_r; in2_r += in1_r;
PCKEV_H4_SH(in0_r, in0_r, in1_r, in1_r, in2_r, in2_r, in3_r, in3_r, PCKEV_H4_SH(in0_r, in0_r, in1_r, in1_r, in2_r, in2_r, in3_r, in3_r, in0, in1,
in0, in1, in2, in3); in2, in3);
ADDBLK_ST4x4_UB(in0, in3, in1, in2, dst, dst_stride); ADDBLK_ST4x4_UB(in0, in3, in1, in2, dst, dst_stride);
} }

View File

@ -18,17 +18,17 @@ void vpx_idct8x8_64_add_msa(const int16_t *input, uint8_t *dst,
LD_SH8(input, 8, in0, in1, in2, in3, in4, in5, in6, in7); LD_SH8(input, 8, in0, in1, in2, in3, in4, in5, in6, in7);
/* rows transform */ /* rows transform */
TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
in0, in1, in2, in3, in4, in5, in6, in7); in4, in5, in6, in7);
/* 1D idct8x8 */ /* 1D idct8x8 */
VP9_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7, VP9_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
in0, in1, in2, in3, in4, in5, in6, in7); in4, in5, in6, in7);
/* columns transform */ /* columns transform */
TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
in0, in1, in2, in3, in4, in5, in6, in7); in4, in5, in6, in7);
/* 1D idct8x8 */ /* 1D idct8x8 */
VP9_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7, VP9_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
in0, in1, in2, in3, in4, in5, in6, in7); in4, in5, in6, in7);
/* final rounding (add 2^4, divide by 2^5) and shift */ /* final rounding (add 2^4, divide by 2^5) and shift */
SRARI_H4_SH(in0, in1, in2, in3, 5); SRARI_H4_SH(in0, in1, in2, in3, 5);
SRARI_H4_SH(in4, in5, in6, in7, 5); SRARI_H4_SH(in4, in5, in6, in7, 5);
@ -82,12 +82,12 @@ void vpx_idct8x8_12_add_msa(const int16_t *input, uint8_t *dst,
PCKEV_H2_SH(zero, tmp0, zero, tmp1, s2, s3); PCKEV_H2_SH(zero, tmp0, zero, tmp1, s2, s3);
/* stage4 */ /* stage4 */
BUTTERFLY_8(m0, m1, m2, m3, s4, s2, s3, s7, BUTTERFLY_8(m0, m1, m2, m3, s4, s2, s3, s7, in0, in1, in2, in3, in4, in5, in6,
in0, in1, in2, in3, in4, in5, in6, in7); in7);
TRANSPOSE4X8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, TRANSPOSE4X8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
in0, in1, in2, in3, in4, in5, in6, in7); in4, in5, in6, in7);
VP9_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7, VP9_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
in0, in1, in2, in3, in4, in5, in6, in7); in4, in5, in6, in7);
/* final rounding (add 2^4, divide by 2^5) and shift */ /* final rounding (add 2^4, divide by 2^5) and shift */
SRARI_H4_SH(in0, in1, in2, in3, 5); SRARI_H4_SH(in0, in1, in2, in3, 5);

View File

@ -16,7 +16,7 @@ void vpx_h_predictor_16x16_dspr2(uint8_t *dst, ptrdiff_t stride,
int32_t tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8; int32_t tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8;
int32_t tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16; int32_t tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16;
__asm__ __volatile__ ( __asm__ __volatile__(
"lb %[tmp1], (%[left]) \n\t" "lb %[tmp1], (%[left]) \n\t"
"lb %[tmp2], 1(%[left]) \n\t" "lb %[tmp2], 1(%[left]) \n\t"
"lb %[tmp3], 2(%[left]) \n\t" "lb %[tmp3], 2(%[left]) \n\t"
@ -146,16 +146,13 @@ void vpx_h_predictor_16x16_dspr2(uint8_t *dst, ptrdiff_t stride,
"sw %[tmp16], 8(%[dst]) \n\t" "sw %[tmp16], 8(%[dst]) \n\t"
"sw %[tmp16], 12(%[dst]) \n\t" "sw %[tmp16], 12(%[dst]) \n\t"
: [tmp1] "=&r" (tmp1), [tmp2] "=&r" (tmp2), : [tmp1] "=&r"(tmp1), [tmp2] "=&r"(tmp2), [tmp3] "=&r"(tmp3),
[tmp3] "=&r" (tmp3), [tmp4] "=&r" (tmp4), [tmp4] "=&r"(tmp4), [tmp5] "=&r"(tmp5), [tmp7] "=&r"(tmp7),
[tmp5] "=&r" (tmp5), [tmp7] "=&r" (tmp7), [tmp6] "=&r"(tmp6), [tmp8] "=&r"(tmp8), [tmp9] "=&r"(tmp9),
[tmp6] "=&r" (tmp6), [tmp8] "=&r" (tmp8), [tmp10] "=&r"(tmp10), [tmp11] "=&r"(tmp11), [tmp12] "=&r"(tmp12),
[tmp9] "=&r" (tmp9), [tmp10] "=&r" (tmp10), [tmp13] "=&r"(tmp13), [tmp14] "=&r"(tmp14), [tmp15] "=&r"(tmp15),
[tmp11] "=&r" (tmp11), [tmp12] "=&r" (tmp12), [tmp16] "=&r"(tmp16)
[tmp13] "=&r" (tmp13), [tmp14] "=&r" (tmp14), : [left] "r"(left), [dst] "r"(dst), [stride] "r"(stride));
[tmp15] "=&r" (tmp15), [tmp16] "=&r" (tmp16)
: [left] "r" (left), [dst] "r" (dst), [stride] "r" (stride)
);
} }
void vpx_dc_predictor_16x16_dspr2(uint8_t *dst, ptrdiff_t stride, void vpx_dc_predictor_16x16_dspr2(uint8_t *dst, ptrdiff_t stride,
@ -165,7 +162,7 @@ void vpx_dc_predictor_16x16_dspr2(uint8_t *dst, ptrdiff_t stride,
int32_t tmp, above1, above_l1, above_r1, left1, left_r1, left_l1; int32_t tmp, above1, above_l1, above_r1, left1, left_r1, left_l1;
int32_t above2, left2; int32_t above2, left2;
__asm__ __volatile__ ( __asm__ __volatile__(
"lw %[above1], (%[above]) \n\t" "lw %[above1], (%[above]) \n\t"
"lw %[above2], 4(%[above]) \n\t" "lw %[above2], 4(%[above]) \n\t"
"lw %[left1], (%[left]) \n\t" "lw %[left1], (%[left]) \n\t"
@ -316,14 +313,12 @@ void vpx_dc_predictor_16x16_dspr2(uint8_t *dst, ptrdiff_t stride,
"sw %[expected_dc], 8(%[dst]) \n\t" "sw %[expected_dc], 8(%[dst]) \n\t"
"sw %[expected_dc], 12(%[dst]) \n\t" "sw %[expected_dc], 12(%[dst]) \n\t"
: [left1] "=&r" (left1), [above1] "=&r" (above1), : [left1] "=&r"(left1), [above1] "=&r"(above1), [left_l1] "=&r"(left_l1),
[left_l1] "=&r" (left_l1), [above_l1] "=&r" (above_l1), [above_l1] "=&r"(above_l1), [left_r1] "=&r"(left_r1),
[left_r1] "=&r" (left_r1), [above_r1] "=&r" (above_r1), [above_r1] "=&r"(above_r1), [above2] "=&r"(above2),
[above2] "=&r" (above2), [left2] "=&r" (left2), [left2] "=&r"(left2), [average] "=&r"(average), [tmp] "=&r"(tmp),
[average] "=&r" (average), [tmp] "=&r" (tmp), [expected_dc] "=&r"(expected_dc)
[expected_dc] "=&r" (expected_dc) : [above] "r"(above), [left] "r"(left), [dst] "r"(dst),
: [above] "r" (above), [left] "r" (left), [stride] "r"(stride));
[dst] "r" (dst), [stride] "r" (stride)
);
} }
#endif // #if HAVE_DSPR2 #endif // #if HAVE_DSPR2

View File

@ -15,7 +15,7 @@ void vpx_h_predictor_4x4_dspr2(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above, const uint8_t *left) { const uint8_t *above, const uint8_t *left) {
int32_t tmp1, tmp2, tmp3, tmp4; int32_t tmp1, tmp2, tmp3, tmp4;
__asm__ __volatile__ ( __asm__ __volatile__(
"lb %[tmp1], (%[left]) \n\t" "lb %[tmp1], (%[left]) \n\t"
"lb %[tmp2], 1(%[left]) \n\t" "lb %[tmp2], 1(%[left]) \n\t"
"lb %[tmp3], 2(%[left]) \n\t" "lb %[tmp3], 2(%[left]) \n\t"
@ -32,10 +32,9 @@ void vpx_h_predictor_4x4_dspr2(uint8_t *dst, ptrdiff_t stride,
"add %[dst], %[dst], %[stride] \n\t" "add %[dst], %[dst], %[stride] \n\t"
"sw %[tmp4], (%[dst]) \n\t" "sw %[tmp4], (%[dst]) \n\t"
: [tmp1] "=&r" (tmp1), [tmp2] "=&r" (tmp2), : [tmp1] "=&r"(tmp1), [tmp2] "=&r"(tmp2), [tmp3] "=&r"(tmp3),
[tmp3] "=&r" (tmp3), [tmp4] "=&r" (tmp4) [tmp4] "=&r"(tmp4)
: [left] "r" (left), [dst] "r" (dst), [stride] "r" (stride) : [left] "r"(left), [dst] "r"(dst), [stride] "r"(stride));
);
} }
void vpx_dc_predictor_4x4_dspr2(uint8_t *dst, ptrdiff_t stride, void vpx_dc_predictor_4x4_dspr2(uint8_t *dst, ptrdiff_t stride,
@ -44,7 +43,7 @@ void vpx_dc_predictor_4x4_dspr2(uint8_t *dst, ptrdiff_t stride,
int32_t average; int32_t average;
int32_t tmp, above_c, above_l, above_r, left_c, left_r, left_l; int32_t tmp, above_c, above_l, above_r, left_c, left_r, left_l;
__asm__ __volatile__ ( __asm__ __volatile__(
"lw %[above_c], (%[above]) \n\t" "lw %[above_c], (%[above]) \n\t"
"lw %[left_c], (%[left]) \n\t" "lw %[left_c], (%[left]) \n\t"
@ -70,14 +69,13 @@ void vpx_dc_predictor_4x4_dspr2(uint8_t *dst, ptrdiff_t stride,
"add %[dst], %[dst], %[stride] \n\t" "add %[dst], %[dst], %[stride] \n\t"
"sw %[expected_dc], (%[dst]) \n\t" "sw %[expected_dc], (%[dst]) \n\t"
: [above_c] "=&r" (above_c), [above_l] "=&r" (above_l), : [above_c] "=&r"(above_c), [above_l] "=&r"(above_l),
[above_r] "=&r" (above_r), [left_c] "=&r" (left_c), [above_r] "=&r"(above_r), [left_c] "=&r"(left_c),
[left_l] "=&r" (left_l), [left_r] "=&r" (left_r), [left_l] "=&r"(left_l), [left_r] "=&r"(left_r),
[average] "=&r" (average), [tmp] "=&r" (tmp), [average] "=&r"(average), [tmp] "=&r"(tmp),
[expected_dc] "=&r" (expected_dc) [expected_dc] "=&r"(expected_dc)
: [above] "r" (above), [left] "r" (left), : [above] "r"(above), [left] "r"(left), [dst] "r"(dst),
[dst] "r" (dst), [stride] "r" (stride) [stride] "r"(stride));
);
} }
void vpx_tm_predictor_4x4_dspr2(uint8_t *dst, ptrdiff_t stride, void vpx_tm_predictor_4x4_dspr2(uint8_t *dst, ptrdiff_t stride,
@ -90,7 +88,7 @@ void vpx_tm_predictor_4x4_dspr2(uint8_t *dst, ptrdiff_t stride,
int32_t top_left; int32_t top_left;
uint8_t *cm = vpx_ff_cropTbl; uint8_t *cm = vpx_ff_cropTbl;
__asm__ __volatile__ ( __asm__ __volatile__(
"ulw %[resl], (%[above]) \n\t" "ulw %[resl], (%[above]) \n\t"
"lbu %[left0], (%[left]) \n\t" "lbu %[left0], (%[left]) \n\t"
@ -174,7 +172,6 @@ void vpx_tm_predictor_4x4_dspr2(uint8_t *dst, ptrdiff_t stride,
"sra %[res0], %[res0], 16 \n\t" "sra %[res0], %[res0], 16 \n\t"
"lbux %[res0], %[res0](%[cm]) \n\t" "lbux %[res0], %[res0](%[cm]) \n\t"
"sra %[res1], %[resr], 16 \n\t" "sra %[res1], %[resr], 16 \n\t"
"lbux %[res1], %[res1](%[cm]) \n\t" "lbux %[res1], %[res1](%[cm]) \n\t"
"sb %[res0], (%[dst]) \n\t" "sb %[res0], (%[dst]) \n\t"
@ -183,7 +180,6 @@ void vpx_tm_predictor_4x4_dspr2(uint8_t *dst, ptrdiff_t stride,
"sra %[res0], %[res0], 16 \n\t" "sra %[res0], %[res0], 16 \n\t"
"lbux %[res0], %[res0](%[cm]) \n\t" "lbux %[res0], %[res0](%[cm]) \n\t"
"sb %[res1], 1(%[dst]) \n\t" "sb %[res1], 1(%[dst]) \n\t"
"sra %[res1], %[resl], 16 \n\t" "sra %[res1], %[resl], 16 \n\t"
"lbux %[res1], %[res1](%[cm]) \n\t" "lbux %[res1], %[res1](%[cm]) \n\t"
@ -218,12 +214,11 @@ void vpx_tm_predictor_4x4_dspr2(uint8_t *dst, ptrdiff_t stride,
"sb %[res0], 2(%[dst]) \n\t" "sb %[res0], 2(%[dst]) \n\t"
"sb %[res1], 3(%[dst]) \n\t" "sb %[res1], 3(%[dst]) \n\t"
: [abovel] "=&r" (abovel), [abover] "=&r" (abover), : [abovel] "=&r"(abovel), [abover] "=&r"(abover), [left0] "=&r"(left0),
[left0] "=&r" (left0), [left1] "=&r" (left1), [left2] "=&r" (left2), [left1] "=&r"(left1), [left2] "=&r"(left2), [res0] "=&r"(res0),
[res0] "=&r" (res0), [res1] "=&r" (res1), [left3] "=&r" (left3), [res1] "=&r"(res1), [left3] "=&r"(left3), [resl] "=&r"(resl),
[resl] "=&r" (resl), [resr] "=&r" (resr), [top_left] "=&r" (top_left) [resr] "=&r"(resr), [top_left] "=&r"(top_left)
: [above] "r" (above), [left] "r" (left), : [above] "r"(above), [left] "r"(left), [dst] "r"(dst),
[dst] "r" (dst), [stride] "r" (stride), [cm] "r" (cm) [stride] "r"(stride), [cm] "r"(cm));
);
} }
#endif // #if HAVE_DSPR2 #endif // #if HAVE_DSPR2

View File

@ -15,7 +15,7 @@ void vpx_h_predictor_8x8_dspr2(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above, const uint8_t *left) { const uint8_t *above, const uint8_t *left) {
int32_t tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8; int32_t tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8;
__asm__ __volatile__ ( __asm__ __volatile__(
"lb %[tmp1], (%[left]) \n\t" "lb %[tmp1], (%[left]) \n\t"
"lb %[tmp2], 1(%[left]) \n\t" "lb %[tmp2], 1(%[left]) \n\t"
"lb %[tmp3], 2(%[left]) \n\t" "lb %[tmp3], 2(%[left]) \n\t"
@ -58,13 +58,10 @@ void vpx_h_predictor_8x8_dspr2(uint8_t *dst, ptrdiff_t stride,
"sw %[tmp8], (%[dst]) \n\t" "sw %[tmp8], (%[dst]) \n\t"
"sw %[tmp8], 4(%[dst]) \n\t" "sw %[tmp8], 4(%[dst]) \n\t"
: [tmp1] "=&r" (tmp1), [tmp2] "=&r" (tmp2), : [tmp1] "=&r"(tmp1), [tmp2] "=&r"(tmp2), [tmp3] "=&r"(tmp3),
[tmp3] "=&r" (tmp3), [tmp4] "=&r" (tmp4), [tmp4] "=&r"(tmp4), [tmp5] "=&r"(tmp5), [tmp7] "=&r"(tmp7),
[tmp5] "=&r" (tmp5), [tmp7] "=&r" (tmp7), [tmp6] "=&r"(tmp6), [tmp8] "=&r"(tmp8)
[tmp6] "=&r" (tmp6), [tmp8] "=&r" (tmp8) : [left] "r"(left), [dst] "r"(dst), [stride] "r"(stride));
: [left] "r" (left), [dst] "r" (dst),
[stride] "r" (stride)
);
} }
void vpx_dc_predictor_8x8_dspr2(uint8_t *dst, ptrdiff_t stride, void vpx_dc_predictor_8x8_dspr2(uint8_t *dst, ptrdiff_t stride,
@ -74,7 +71,7 @@ void vpx_dc_predictor_8x8_dspr2(uint8_t *dst, ptrdiff_t stride,
int32_t tmp, above1, above_l1, above_r1, left1, left_r1, left_l1; int32_t tmp, above1, above_l1, above_r1, left1, left_r1, left_l1;
int32_t above2, above_l2, above_r2, left2, left_r2, left_l2; int32_t above2, above_l2, above_r2, left2, left_r2, left_l2;
__asm__ __volatile__ ( __asm__ __volatile__(
"lw %[above1], (%[above]) \n\t" "lw %[above1], (%[above]) \n\t"
"lw %[above2], 4(%[above]) \n\t" "lw %[above2], 4(%[above]) \n\t"
"lw %[left1], (%[left]) \n\t" "lw %[left1], (%[left]) \n\t"
@ -137,17 +134,16 @@ void vpx_dc_predictor_8x8_dspr2(uint8_t *dst, ptrdiff_t stride,
"sw %[expected_dc], (%[dst]) \n\t" "sw %[expected_dc], (%[dst]) \n\t"
"sw %[expected_dc], 4(%[dst]) \n\t" "sw %[expected_dc], 4(%[dst]) \n\t"
: [above1] "=&r" (above1), [above_l1] "=&r" (above_l1), : [above1] "=&r"(above1), [above_l1] "=&r"(above_l1),
[above_r1] "=&r" (above_r1), [left1] "=&r" (left1), [above_r1] "=&r"(above_r1), [left1] "=&r"(left1),
[left_l1] "=&r" (left_l1), [left_r1] "=&r" (left_r1), [left_l1] "=&r"(left_l1), [left_r1] "=&r"(left_r1),
[above2] "=&r" (above2), [above_l2] "=&r" (above_l2), [above2] "=&r"(above2), [above_l2] "=&r"(above_l2),
[above_r2] "=&r" (above_r2), [left2] "=&r" (left2), [above_r2] "=&r"(above_r2), [left2] "=&r"(left2),
[left_l2] "=&r" (left_l2), [left_r2] "=&r" (left_r2), [left_l2] "=&r"(left_l2), [left_r2] "=&r"(left_r2),
[average] "=&r" (average), [tmp] "=&r" (tmp), [average] "=&r"(average), [tmp] "=&r"(tmp),
[expected_dc] "=&r" (expected_dc) [expected_dc] "=&r"(expected_dc)
: [above] "r" (above), [left] "r" (left), [dst] "r" (dst), : [above] "r"(above), [left] "r"(left), [dst] "r"(dst),
[stride] "r" (stride) [stride] "r"(stride));
);
} }
void vpx_tm_predictor_8x8_dspr2(uint8_t *dst, ptrdiff_t stride, void vpx_tm_predictor_8x8_dspr2(uint8_t *dst, ptrdiff_t stride,
@ -160,7 +156,7 @@ void vpx_tm_predictor_8x8_dspr2(uint8_t *dst, ptrdiff_t stride,
int32_t top_left; int32_t top_left;
uint8_t *cm = vpx_ff_cropTbl; uint8_t *cm = vpx_ff_cropTbl;
__asm__ __volatile__ ( __asm__ __volatile__(
"ulw %[reshw], (%[above]) \n\t" "ulw %[reshw], (%[above]) \n\t"
"ulw %[top_left], 4(%[above]) \n\t" "ulw %[top_left], 4(%[above]) \n\t"
@ -595,13 +591,12 @@ void vpx_tm_predictor_8x8_dspr2(uint8_t *dst, ptrdiff_t stride,
"sb %[res2], 6(%[dst]) \n\t" "sb %[res2], 6(%[dst]) \n\t"
"sb %[res3], 7(%[dst]) \n\t" "sb %[res3], 7(%[dst]) \n\t"
: [abovel] "=&r" (abovel), [abover] "=&r" (abover), : [abovel] "=&r"(abovel), [abover] "=&r"(abover),
[abovel_1] "=&r" (abovel_1), [abover_1] "=&r" (abover_1), [abovel_1] "=&r"(abovel_1), [abover_1] "=&r"(abover_1),
[left0] "=&r" (left0), [res2] "=&r" (res2), [res3] "=&r" (res3), [left0] "=&r"(left0), [res2] "=&r"(res2), [res3] "=&r"(res3),
[res0] "=&r" (res0), [res1] "=&r" (res1), [res0] "=&r"(res0), [res1] "=&r"(res1), [reshw] "=&r"(reshw),
[reshw] "=&r" (reshw), [top_left] "=&r" (top_left) [top_left] "=&r"(top_left)
: [above] "r" (above), [left] "r" (left), : [above] "r"(above), [left] "r"(left), [dst] "r"(dst),
[dst] "r" (dst), [stride] "r" (stride), [cm] "r" (cm) [stride] "r"(stride), [cm] "r"(cm));
);
} }
#endif // #if HAVE_DSPR2 #endif // #if HAVE_DSPR2

View File

@ -11,10 +11,11 @@
#include "./vpx_dsp_rtcd.h" #include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/mips/macros_msa.h" #include "vpx_dsp/mips/macros_msa.h"
#define IPRED_SUBS_UH2_UH(in0, in1, out0, out1) { \ #define IPRED_SUBS_UH2_UH(in0, in1, out0, out1) \
{ \
out0 = __msa_subs_u_h(out0, in0); \ out0 = __msa_subs_u_h(out0, in0); \
out1 = __msa_subs_u_h(out1, in1); \ out1 = __msa_subs_u_h(out1, in1); \
} }
static void intra_predict_vert_4x4_msa(const uint8_t *src, uint8_t *dst, static void intra_predict_vert_4x4_msa(const uint8_t *src, uint8_t *dst,
int32_t dst_stride) { int32_t dst_stride) {
@ -150,8 +151,8 @@ static void intra_predict_horiz_32x32_msa(const uint8_t *src, uint8_t *dst,
} }
static void intra_predict_dc_4x4_msa(const uint8_t *src_top, static void intra_predict_dc_4x4_msa(const uint8_t *src_top,
const uint8_t *src_left, const uint8_t *src_left, uint8_t *dst,
uint8_t *dst, int32_t dst_stride) { int32_t dst_stride) {
uint32_t val0, val1; uint32_t val0, val1;
v16i8 store, src = { 0 }; v16i8 store, src = { 0 };
v8u16 sum_h; v8u16 sum_h;
@ -199,8 +200,8 @@ static void intra_predict_128dc_4x4_msa(uint8_t *dst, int32_t dst_stride) {
} }
static void intra_predict_dc_8x8_msa(const uint8_t *src_top, static void intra_predict_dc_8x8_msa(const uint8_t *src_top,
const uint8_t *src_left, const uint8_t *src_left, uint8_t *dst,
uint8_t *dst, int32_t dst_stride) { int32_t dst_stride) {
uint64_t val0, val1; uint64_t val0, val1;
v16i8 store; v16i8 store;
v16u8 src = { 0 }; v16u8 src = { 0 };
@ -260,8 +261,8 @@ static void intra_predict_128dc_8x8_msa(uint8_t *dst, int32_t dst_stride) {
} }
static void intra_predict_dc_16x16_msa(const uint8_t *src_top, static void intra_predict_dc_16x16_msa(const uint8_t *src_top,
const uint8_t *src_left, const uint8_t *src_left, uint8_t *dst,
uint8_t *dst, int32_t dst_stride) { int32_t dst_stride) {
v16u8 top, left, out; v16u8 top, left, out;
v8u16 sum_h, sum_top, sum_left; v8u16 sum_h, sum_top, sum_left;
v4u32 sum_w; v4u32 sum_w;
@ -313,8 +314,8 @@ static void intra_predict_128dc_16x16_msa(uint8_t *dst, int32_t dst_stride) {
} }
static void intra_predict_dc_32x32_msa(const uint8_t *src_top, static void intra_predict_dc_32x32_msa(const uint8_t *src_top,
const uint8_t *src_left, const uint8_t *src_left, uint8_t *dst,
uint8_t *dst, int32_t dst_stride) { int32_t dst_stride) {
uint32_t row; uint32_t row;
v16u8 top0, top1, left0, left1, out; v16u8 top0, top1, left0, left1, out;
v8u16 sum_h, sum_top0, sum_top1, sum_left0, sum_left1; v8u16 sum_h, sum_top0, sum_top1, sum_left0, sum_left1;
@ -381,8 +382,8 @@ static void intra_predict_128dc_32x32_msa(uint8_t *dst, int32_t dst_stride) {
} }
static void intra_predict_tm_4x4_msa(const uint8_t *src_top_ptr, static void intra_predict_tm_4x4_msa(const uint8_t *src_top_ptr,
const uint8_t *src_left, const uint8_t *src_left, uint8_t *dst,
uint8_t *dst, int32_t dst_stride) { int32_t dst_stride) {
uint32_t val; uint32_t val;
uint8_t top_left = src_top_ptr[-1]; uint8_t top_left = src_top_ptr[-1];
v16i8 src_left0, src_left1, src_left2, src_left3, tmp0, tmp1, src_top = { 0 }; v16i8 src_left0, src_left1, src_left2, src_left3, tmp0, tmp1, src_top = { 0 };
@ -409,8 +410,8 @@ static void intra_predict_tm_4x4_msa(const uint8_t *src_top_ptr,
} }
static void intra_predict_tm_8x8_msa(const uint8_t *src_top_ptr, static void intra_predict_tm_8x8_msa(const uint8_t *src_top_ptr,
const uint8_t *src_left, const uint8_t *src_left, uint8_t *dst,
uint8_t *dst, int32_t dst_stride) { int32_t dst_stride) {
uint64_t val; uint64_t val;
uint8_t top_left = src_top_ptr[-1]; uint8_t top_left = src_top_ptr[-1];
uint32_t loop_cnt; uint32_t loop_cnt;
@ -442,8 +443,8 @@ static void intra_predict_tm_8x8_msa(const uint8_t *src_top_ptr,
} }
static void intra_predict_tm_16x16_msa(const uint8_t *src_top_ptr, static void intra_predict_tm_16x16_msa(const uint8_t *src_top_ptr,
const uint8_t *src_left, const uint8_t *src_left, uint8_t *dst,
uint8_t *dst, int32_t dst_stride) { int32_t dst_stride) {
uint8_t top_left = src_top_ptr[-1]; uint8_t top_left = src_top_ptr[-1];
uint32_t loop_cnt; uint32_t loop_cnt;
v16i8 src_top, src_left0, src_left1, src_left2, src_left3; v16i8 src_top, src_left0, src_left1, src_left2, src_left3;
@ -491,8 +492,8 @@ static void intra_predict_tm_16x16_msa(const uint8_t *src_top_ptr,
} }
static void intra_predict_tm_32x32_msa(const uint8_t *src_top, static void intra_predict_tm_32x32_msa(const uint8_t *src_top,
const uint8_t *src_left, const uint8_t *src_left, uint8_t *dst,
uint8_t *dst, int32_t dst_stride) { int32_t dst_stride) {
uint8_t top_left = src_top[-1]; uint8_t top_left = src_top[-1];
uint32_t loop_cnt; uint32_t loop_cnt;
v16i8 src_top0, src_top1, src_left0, src_left1, src_left2, src_left3; v16i8 src_top0, src_top1, src_left0, src_left1, src_left2, src_left3;

View File

@ -23,31 +23,39 @@ extern "C" {
#endif #endif
#if HAVE_DSPR2 #if HAVE_DSPR2
#define DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input) ({ \ #define DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input) \
({ \
\ \
int32_t tmp, out; \ int32_t tmp, out; \
int dct_cost_rounding = DCT_CONST_ROUNDING; \ int dct_cost_rounding = DCT_CONST_ROUNDING; \
int in = input; \ int in = input; \
\ \
__asm__ __volatile__ ( \ __asm__ __volatile__(/* out = dct_const_round_shift(dc * cospi_16_64); */ \
/* out = dct_const_round_shift(input_dc * cospi_16_64); */ \ "mtlo %[dct_cost_rounding], $ac1 " \
"mtlo %[dct_cost_rounding], $ac1 \n\t"\ " \n\t" \
"mthi $zero, $ac1 \n\t"\ "mthi $zero, $ac1 " \
"madd $ac1, %[in], %[cospi_16_64] \n\t"\ " \n\t" \
"extp %[tmp], $ac1, 31 \n\t"\ "madd $ac1, %[in], " \
"%[cospi_16_64] \n\t" \
"extp %[tmp], $ac1, " \
"31 \n\t" \
\ \
/* out = dct_const_round_shift(out * cospi_16_64); */ \ /* out = dct_const_round_shift(out * cospi_16_64); */ \
"mtlo %[dct_cost_rounding], $ac2 \n\t"\ "mtlo %[dct_cost_rounding], $ac2 " \
"mthi $zero, $ac2 \n\t"\ " \n\t" \
"madd $ac2, %[tmp], %[cospi_16_64] \n\t"\ "mthi $zero, $ac2 " \
"extp %[out], $ac2, 31 \n\t"\ " \n\t" \
"madd $ac2, %[tmp], " \
"%[cospi_16_64] \n\t" \
"extp %[out], $ac2, " \
"31 \n\t" \
\ \
: [tmp] "=&r" (tmp), [out] "=r" (out) \ : [tmp] "=&r"(tmp), [out] "=r"(out) \
: [in] "r" (in), \ : [in] "r"(in), \
[dct_cost_rounding] "r" (dct_cost_rounding), \ [dct_cost_rounding] "r"(dct_cost_rounding), \
[cospi_16_64] "r" (cospi_16_64) \ [cospi_16_64] "r"(cospi_16_64)); \
); \ out; \
out; }) })
void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
int dest_stride); int dest_stride);
@ -59,10 +67,8 @@ void idct8_rows_dspr2(const int16_t *input, int16_t *output, uint32_t no_rows);
void idct8_columns_add_blk_dspr2(int16_t *input, uint8_t *dest, void idct8_columns_add_blk_dspr2(int16_t *input, uint8_t *dest,
int dest_stride); int dest_stride);
void iadst8_dspr2(const int16_t *input, int16_t *output); void iadst8_dspr2(const int16_t *input, int16_t *output);
void idct16_rows_dspr2(const int16_t *input, int16_t *output, void idct16_rows_dspr2(const int16_t *input, int16_t *output, uint32_t no_rows);
uint32_t no_rows); void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, int dest_stride);
void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
int dest_stride);
void iadst16_dspr2(const int16_t *input, int16_t *output); void iadst16_dspr2(const int16_t *input, int16_t *output);
#endif // #if HAVE_DSPR2 #endif // #if HAVE_DSPR2

View File

@ -15,14 +15,15 @@
#include "vpx_dsp/mips/txfm_macros_msa.h" #include "vpx_dsp/mips/txfm_macros_msa.h"
#include "vpx_dsp/txfm_common.h" #include "vpx_dsp/txfm_common.h"
#define VP9_ADST8(in0, in1, in2, in3, in4, in5, in6, in7, \ #define VP9_ADST8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, \
out0, out1, out2, out3, out4, out5, out6, out7) { \ out3, out4, out5, out6, out7) \
{ \
v8i16 cnst0_m, cnst1_m, cnst2_m, cnst3_m, cnst4_m; \ v8i16 cnst0_m, cnst1_m, cnst2_m, cnst3_m, cnst4_m; \
v8i16 vec0_m, vec1_m, vec2_m, vec3_m, s0_m, s1_m; \ v8i16 vec0_m, vec1_m, vec2_m, vec3_m, s0_m, s1_m; \
v8i16 coeff0_m = { cospi_2_64, cospi_6_64, cospi_10_64, cospi_14_64, \ v8i16 coeff0_m = { cospi_2_64, cospi_6_64, cospi_10_64, cospi_14_64, \
cospi_18_64, cospi_22_64, cospi_26_64, cospi_30_64 }; \ cospi_18_64, cospi_22_64, cospi_26_64, cospi_30_64 }; \
v8i16 coeff1_m = { cospi_8_64, -cospi_8_64, cospi_16_64, \ v8i16 coeff1_m = { cospi_8_64, -cospi_8_64, cospi_16_64, -cospi_16_64, \
-cospi_16_64, cospi_24_64, -cospi_24_64, 0, 0 }; \ cospi_24_64, -cospi_24_64, 0, 0 }; \
\ \
SPLATI_H2_SH(coeff0_m, 0, 7, cnst0_m, cnst1_m); \ SPLATI_H2_SH(coeff0_m, 0, 7, cnst0_m, cnst1_m); \
cnst2_m = -cnst0_m; \ cnst2_m = -cnst0_m; \
@ -33,9 +34,8 @@
\ \
ILVRL_H2_SH(in0, in7, vec1_m, vec0_m); \ ILVRL_H2_SH(in0, in7, vec1_m, vec0_m); \
ILVRL_H2_SH(in4, in3, vec3_m, vec2_m); \ ILVRL_H2_SH(in4, in3, vec3_m, vec2_m); \
DOT_ADD_SUB_SRARI_PCK(vec0_m, vec1_m, vec2_m, vec3_m, cnst0_m, \ DOT_ADD_SUB_SRARI_PCK(vec0_m, vec1_m, vec2_m, vec3_m, cnst0_m, cnst1_m, \
cnst1_m, cnst2_m, cnst3_m, in7, in0, \ cnst2_m, cnst3_m, in7, in0, in4, in3); \
in4, in3); \
\ \
SPLATI_H2_SH(coeff0_m, 2, 5, cnst0_m, cnst1_m); \ SPLATI_H2_SH(coeff0_m, 2, 5, cnst0_m, cnst1_m); \
cnst2_m = -cnst0_m; \ cnst2_m = -cnst0_m; \
@ -47,15 +47,13 @@
ILVRL_H2_SH(in2, in5, vec1_m, vec0_m); \ ILVRL_H2_SH(in2, in5, vec1_m, vec0_m); \
ILVRL_H2_SH(in6, in1, vec3_m, vec2_m); \ ILVRL_H2_SH(in6, in1, vec3_m, vec2_m); \
\ \
DOT_ADD_SUB_SRARI_PCK(vec0_m, vec1_m, vec2_m, vec3_m, cnst0_m, \ DOT_ADD_SUB_SRARI_PCK(vec0_m, vec1_m, vec2_m, vec3_m, cnst0_m, cnst1_m, \
cnst1_m, cnst2_m, cnst3_m, in5, in2, \ cnst2_m, cnst3_m, in5, in2, in6, in1); \
in6, in1); \
BUTTERFLY_4(in7, in0, in2, in5, s1_m, s0_m, in2, in5); \ BUTTERFLY_4(in7, in0, in2, in5, s1_m, s0_m, in2, in5); \
out7 = -s0_m; \ out7 = -s0_m; \
out0 = s1_m; \ out0 = s1_m; \
\ \
SPLATI_H4_SH(coeff1_m, 0, 4, 1, 5, \ SPLATI_H4_SH(coeff1_m, 0, 4, 1, 5, cnst0_m, cnst1_m, cnst2_m, cnst3_m); \
cnst0_m, cnst1_m, cnst2_m, cnst3_m); \
\ \
ILVEV_H2_SH(cnst3_m, cnst0_m, cnst1_m, cnst2_m, cnst3_m, cnst2_m); \ ILVEV_H2_SH(cnst3_m, cnst0_m, cnst1_m, cnst2_m, cnst3_m, cnst2_m); \
cnst0_m = __msa_ilvev_h(cnst1_m, cnst0_m); \ cnst0_m = __msa_ilvev_h(cnst1_m, cnst0_m); \
@ -63,9 +61,8 @@
\ \
ILVRL_H2_SH(in4, in3, vec1_m, vec0_m); \ ILVRL_H2_SH(in4, in3, vec1_m, vec0_m); \
ILVRL_H2_SH(in6, in1, vec3_m, vec2_m); \ ILVRL_H2_SH(in6, in1, vec3_m, vec2_m); \
DOT_ADD_SUB_SRARI_PCK(vec0_m, vec1_m, vec2_m, vec3_m, cnst0_m, \ DOT_ADD_SUB_SRARI_PCK(vec0_m, vec1_m, vec2_m, vec3_m, cnst0_m, cnst2_m, \
cnst2_m, cnst3_m, cnst1_m, out1, out6, \ cnst3_m, cnst1_m, out1, out6, s0_m, s1_m); \
s0_m, s1_m); \
\ \
SPLATI_H2_SH(coeff1_m, 2, 3, cnst0_m, cnst1_m); \ SPLATI_H2_SH(coeff1_m, 2, 3, cnst0_m, cnst1_m); \
cnst1_m = __msa_ilvev_h(cnst1_m, cnst0_m); \ cnst1_m = __msa_ilvev_h(cnst1_m, cnst0_m); \
@ -80,9 +77,10 @@
out1 = -out1; \ out1 = -out1; \
out3 = -out3; \ out3 = -out3; \
out5 = -out5; \ out5 = -out5; \
} }
#define VP9_SET_COSPI_PAIR(c0_h, c1_h) ({ \ #define VP9_SET_COSPI_PAIR(c0_h, c1_h) \
({ \
v8i16 out0_m, r0_m, r1_m; \ v8i16 out0_m, r0_m, r1_m; \
\ \
r0_m = __msa_fill_h(c0_h); \ r0_m = __msa_fill_h(c0_h); \
@ -90,26 +88,28 @@
out0_m = __msa_ilvev_h(r1_m, r0_m); \ out0_m = __msa_ilvev_h(r1_m, r0_m); \
\ \
out0_m; \ out0_m; \
}) })
#define VP9_ADDBLK_ST8x4_UB(dst, dst_stride, in0, in1, in2, in3) { \ #define VP9_ADDBLK_ST8x4_UB(dst, dst_stride, in0, in1, in2, in3) \
uint8_t *dst_m = (uint8_t *) (dst); \ { \
uint8_t *dst_m = (uint8_t *)(dst); \
v16u8 dst0_m, dst1_m, dst2_m, dst3_m; \ v16u8 dst0_m, dst1_m, dst2_m, dst3_m; \
v16i8 tmp0_m, tmp1_m; \ v16i8 tmp0_m, tmp1_m; \
v16i8 zero_m = { 0 }; \ v16i8 zero_m = { 0 }; \
v8i16 res0_m, res1_m, res2_m, res3_m; \ v8i16 res0_m, res1_m, res2_m, res3_m; \
\ \
LD_UB4(dst_m, dst_stride, dst0_m, dst1_m, dst2_m, dst3_m); \ LD_UB4(dst_m, dst_stride, dst0_m, dst1_m, dst2_m, dst3_m); \
ILVR_B4_SH(zero_m, dst0_m, zero_m, dst1_m, zero_m, dst2_m, \ ILVR_B4_SH(zero_m, dst0_m, zero_m, dst1_m, zero_m, dst2_m, zero_m, dst3_m, \
zero_m, dst3_m, res0_m, res1_m, res2_m, res3_m); \
ADD4(res0_m, in0, res1_m, in1, res2_m, in2, res3_m, in3, \
res0_m, res1_m, res2_m, res3_m); \ res0_m, res1_m, res2_m, res3_m); \
ADD4(res0_m, in0, res1_m, in1, res2_m, in2, res3_m, in3, res0_m, res1_m, \
res2_m, res3_m); \
CLIP_SH4_0_255(res0_m, res1_m, res2_m, res3_m); \ CLIP_SH4_0_255(res0_m, res1_m, res2_m, res3_m); \
PCKEV_B2_SB(res1_m, res0_m, res3_m, res2_m, tmp0_m, tmp1_m); \ PCKEV_B2_SB(res1_m, res0_m, res3_m, res2_m, tmp0_m, tmp1_m); \
ST8x4_UB(tmp0_m, tmp1_m, dst_m, dst_stride); \ ST8x4_UB(tmp0_m, tmp1_m, dst_m, dst_stride); \
} }
#define VP9_IDCT4x4(in0, in1, in2, in3, out0, out1, out2, out3) { \ #define VP9_IDCT4x4(in0, in1, in2, in3, out0, out1, out2, out3) \
{ \
v8i16 c0_m, c1_m, c2_m, c3_m; \ v8i16 c0_m, c1_m, c2_m, c3_m; \
v8i16 step0_m, step1_m; \ v8i16 step0_m, step1_m; \
v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
@ -127,20 +127,19 @@
\ \
PCKEV_H2_SW(tmp1_m, tmp0_m, tmp3_m, tmp2_m, tmp0_m, tmp2_m); \ PCKEV_H2_SW(tmp1_m, tmp0_m, tmp3_m, tmp2_m, tmp0_m, tmp2_m); \
SLDI_B2_0_SW(tmp0_m, tmp2_m, tmp1_m, tmp3_m, 8); \ SLDI_B2_0_SW(tmp0_m, tmp2_m, tmp1_m, tmp3_m, 8); \
BUTTERFLY_4((v8i16)tmp0_m, (v8i16)tmp1_m, \ BUTTERFLY_4((v8i16)tmp0_m, (v8i16)tmp1_m, (v8i16)tmp2_m, (v8i16)tmp3_m, \
(v8i16)tmp2_m, (v8i16)tmp3_m, \
out0, out1, out2, out3); \ out0, out1, out2, out3); \
} }
#define VP9_IADST4x4(in0, in1, in2, in3, out0, out1, out2, out3) { \ #define VP9_IADST4x4(in0, in1, in2, in3, out0, out1, out2, out3) \
{ \
v8i16 res0_m, res1_m, c0_m, c1_m; \ v8i16 res0_m, res1_m, c0_m, c1_m; \
v8i16 k1_m, k2_m, k3_m, k4_m; \ v8i16 k1_m, k2_m, k3_m, k4_m; \
v8i16 zero_m = { 0 }; \ v8i16 zero_m = { 0 }; \
v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
v4i32 int0_m, int1_m, int2_m, int3_m; \ v4i32 int0_m, int1_m, int2_m, int3_m; \
v8i16 mask_m = { sinpi_1_9, sinpi_2_9, sinpi_3_9, \ v8i16 mask_m = { sinpi_1_9, sinpi_2_9, sinpi_3_9, sinpi_4_9, \
sinpi_4_9, -sinpi_1_9, -sinpi_2_9, -sinpi_3_9, \ -sinpi_1_9, -sinpi_2_9, -sinpi_3_9, -sinpi_4_9 }; \
-sinpi_4_9 }; \
\ \
SPLATI_H4_SH(mask_m, 3, 0, 1, 2, c0_m, c1_m, k1_m, k2_m); \ SPLATI_H4_SH(mask_m, 3, 0, 1, 2, c0_m, c1_m, k1_m, k2_m); \
ILVEV_H2_SH(c0_m, c1_m, k1_m, k2_m, c0_m, c1_m); \ ILVEV_H2_SH(c0_m, c1_m, k1_m, k2_m, c0_m, c1_m); \
@ -180,38 +179,41 @@
SRARI_W4_SW(int0_m, int1_m, int2_m, int3_m, DCT_CONST_BITS); \ SRARI_W4_SW(int0_m, int1_m, int2_m, int3_m, DCT_CONST_BITS); \
PCKEV_H2_SH(int0_m, int0_m, int1_m, int1_m, out0, out1); \ PCKEV_H2_SH(int0_m, int0_m, int1_m, int1_m, out0, out1); \
PCKEV_H2_SH(int2_m, int2_m, int3_m, int3_m, out2, out3); \ PCKEV_H2_SH(int2_m, int2_m, int3_m, int3_m, out2, out3); \
} }
#define VP9_SET_CONST_PAIR(mask_h, idx1_h, idx2_h) ({ \ #define VP9_SET_CONST_PAIR(mask_h, idx1_h, idx2_h) \
({ \
v8i16 c0_m, c1_m; \ v8i16 c0_m, c1_m; \
\ \
SPLATI_H2_SH(mask_h, idx1_h, idx2_h, c0_m, c1_m); \ SPLATI_H2_SH(mask_h, idx1_h, idx2_h, c0_m, c1_m); \
c0_m = __msa_ilvev_h(c1_m, c0_m); \ c0_m = __msa_ilvev_h(c1_m, c0_m); \
\ \
c0_m; \ c0_m; \
}) })
/* multiply and add macro */ /* multiply and add macro */
#define VP9_MADD(inp0, inp1, inp2, inp3, cst0, cst1, cst2, cst3, \ #define VP9_MADD(inp0, inp1, inp2, inp3, cst0, cst1, cst2, cst3, out0, out1, \
out0, out1, out2, out3) { \ out2, out3) \
{ \
v8i16 madd_s0_m, madd_s1_m, madd_s2_m, madd_s3_m; \ v8i16 madd_s0_m, madd_s1_m, madd_s2_m, madd_s3_m; \
v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
\ \
ILVRL_H2_SH(inp1, inp0, madd_s1_m, madd_s0_m); \ ILVRL_H2_SH(inp1, inp0, madd_s1_m, madd_s0_m); \
ILVRL_H2_SH(inp3, inp2, madd_s3_m, madd_s2_m); \ ILVRL_H2_SH(inp3, inp2, madd_s3_m, madd_s2_m); \
DOTP_SH4_SW(madd_s1_m, madd_s0_m, madd_s1_m, madd_s0_m, \ DOTP_SH4_SW(madd_s1_m, madd_s0_m, madd_s1_m, madd_s0_m, cst0, cst0, cst1, \
cst0, cst0, cst1, cst1, tmp0_m, tmp1_m, tmp2_m, tmp3_m); \ cst1, tmp0_m, tmp1_m, tmp2_m, tmp3_m); \
SRARI_W4_SW(tmp0_m, tmp1_m, tmp2_m, tmp3_m, DCT_CONST_BITS); \ SRARI_W4_SW(tmp0_m, tmp1_m, tmp2_m, tmp3_m, DCT_CONST_BITS); \
PCKEV_H2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out0, out1); \ PCKEV_H2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out0, out1); \
DOTP_SH4_SW(madd_s3_m, madd_s2_m, madd_s3_m, madd_s2_m, \ DOTP_SH4_SW(madd_s3_m, madd_s2_m, madd_s3_m, madd_s2_m, cst2, cst2, cst3, \
cst2, cst2, cst3, cst3, tmp0_m, tmp1_m, tmp2_m, tmp3_m); \ cst3, tmp0_m, tmp1_m, tmp2_m, tmp3_m); \
SRARI_W4_SW(tmp0_m, tmp1_m, tmp2_m, tmp3_m, DCT_CONST_BITS); \ SRARI_W4_SW(tmp0_m, tmp1_m, tmp2_m, tmp3_m, DCT_CONST_BITS); \
PCKEV_H2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out2, out3); \ PCKEV_H2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out2, out3); \
} }
/* idct 8x8 macro */ /* idct 8x8 macro */
#define VP9_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7, \ #define VP9_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
out0, out1, out2, out3, out4, out5, out6, out7) { \ out2, out3, out4, out5, out6, out7) \
{ \
v8i16 tp0_m, tp1_m, tp2_m, tp3_m, tp4_m, tp5_m, tp6_m, tp7_m; \ v8i16 tp0_m, tp1_m, tp2_m, tp3_m, tp4_m, tp5_m, tp6_m, tp7_m; \
v8i16 k0_m, k1_m, k2_m, k3_m, res0_m, res1_m, res2_m, res3_m; \ v8i16 k0_m, k1_m, k2_m, k3_m, res0_m, res1_m, res2_m, res3_m; \
v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
@ -236,59 +238,60 @@
tp7_m = in7 + in5; \ tp7_m = in7 + in5; \
k2_m = VP9_SET_COSPI_PAIR(cospi_24_64, -cospi_8_64); \ k2_m = VP9_SET_COSPI_PAIR(cospi_24_64, -cospi_8_64); \
k3_m = VP9_SET_COSPI_PAIR(cospi_8_64, cospi_24_64); \ k3_m = VP9_SET_COSPI_PAIR(cospi_8_64, cospi_24_64); \
VP9_MADD(in0, in4, in2, in6, k1_m, k0_m, k2_m, k3_m, \ VP9_MADD(in0, in4, in2, in6, k1_m, k0_m, k2_m, k3_m, in0, in4, in2, in6); \
in0, in4, in2, in6); \
BUTTERFLY_4(in0, in4, in2, in6, tp0_m, tp1_m, tp2_m, tp3_m); \ BUTTERFLY_4(in0, in4, in2, in6, tp0_m, tp1_m, tp2_m, tp3_m); \
BUTTERFLY_8(tp0_m, tp1_m, tp2_m, tp3_m, tp4_m, tp5_m, tp6_m, tp7_m, \ BUTTERFLY_8(tp0_m, tp1_m, tp2_m, tp3_m, tp4_m, tp5_m, tp6_m, tp7_m, out0, \
out0, out1, out2, out3, out4, out5, out6, out7); \ out1, out2, out3, out4, out5, out6, out7); \
} }
#define VP9_IADST8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7, \ #define VP9_IADST8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
out0, out1, out2, out3, out4, out5, out6, out7) { \ out2, out3, out4, out5, out6, out7) \
{ \
v4i32 r0_m, r1_m, r2_m, r3_m, r4_m, r5_m, r6_m, r7_m; \ v4i32 r0_m, r1_m, r2_m, r3_m, r4_m, r5_m, r6_m, r7_m; \
v4i32 m0_m, m1_m, m2_m, m3_m, t0_m, t1_m; \ v4i32 m0_m, m1_m, m2_m, m3_m, t0_m, t1_m; \
v8i16 res0_m, res1_m, res2_m, res3_m, k0_m, k1_m, in_s0, in_s1; \ v8i16 res0_m, res1_m, res2_m, res3_m, k0_m, k1_m, in_s0, in_s1; \
v8i16 mask1_m = { cospi_2_64, cospi_30_64, -cospi_2_64, \ v8i16 mask1_m = { cospi_2_64, cospi_30_64, -cospi_2_64, cospi_10_64, \
cospi_10_64, cospi_22_64, -cospi_10_64, cospi_18_64, cospi_14_64 }; \ cospi_22_64, -cospi_10_64, cospi_18_64, cospi_14_64 }; \
v8i16 mask2_m = { cospi_14_64, -cospi_18_64, cospi_26_64, \ v8i16 mask2_m = { cospi_14_64, -cospi_18_64, cospi_26_64, cospi_6_64, \
cospi_6_64, -cospi_26_64, cospi_8_64, cospi_24_64, -cospi_8_64 }; \ -cospi_26_64, cospi_8_64, cospi_24_64, -cospi_8_64 }; \
v8i16 mask3_m = { -cospi_24_64, cospi_8_64, cospi_16_64, \ v8i16 mask3_m = { \
-cospi_16_64, 0, 0, 0, 0 }; \ -cospi_24_64, cospi_8_64, cospi_16_64, -cospi_16_64, 0, 0, 0, 0 \
}; \
\ \
k0_m = VP9_SET_CONST_PAIR(mask1_m, 0, 1); \ k0_m = VP9_SET_CONST_PAIR(mask1_m, 0, 1); \
k1_m = VP9_SET_CONST_PAIR(mask1_m, 1, 2); \ k1_m = VP9_SET_CONST_PAIR(mask1_m, 1, 2); \
ILVRL_H2_SH(in1, in0, in_s1, in_s0); \ ILVRL_H2_SH(in1, in0, in_s1, in_s0); \
DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, \ DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, r0_m, \
r0_m, r1_m, r2_m, r3_m); \ r1_m, r2_m, r3_m); \
k0_m = VP9_SET_CONST_PAIR(mask1_m, 6, 7); \ k0_m = VP9_SET_CONST_PAIR(mask1_m, 6, 7); \
k1_m = VP9_SET_CONST_PAIR(mask2_m, 0, 1); \ k1_m = VP9_SET_CONST_PAIR(mask2_m, 0, 1); \
ILVRL_H2_SH(in5, in4, in_s1, in_s0); \ ILVRL_H2_SH(in5, in4, in_s1, in_s0); \
DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, \ DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, r4_m, \
r4_m, r5_m, r6_m, r7_m); \ r5_m, r6_m, r7_m); \
ADD4(r0_m, r4_m, r1_m, r5_m, r2_m, r6_m, r3_m, r7_m, \ ADD4(r0_m, r4_m, r1_m, r5_m, r2_m, r6_m, r3_m, r7_m, m0_m, m1_m, m2_m, \
m0_m, m1_m, m2_m, m3_m); \ m3_m); \
SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \ SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \
PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, res0_m, res1_m); \ PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, res0_m, res1_m); \
SUB4(r0_m, r4_m, r1_m, r5_m, r2_m, r6_m, r3_m, r7_m, \ SUB4(r0_m, r4_m, r1_m, r5_m, r2_m, r6_m, r3_m, r7_m, m0_m, m1_m, m2_m, \
m0_m, m1_m, m2_m, m3_m); \ m3_m); \
SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \ SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \
PCKEV_H2_SW(m1_m, m0_m, m3_m, m2_m, t0_m, t1_m); \ PCKEV_H2_SW(m1_m, m0_m, m3_m, m2_m, t0_m, t1_m); \
k0_m = VP9_SET_CONST_PAIR(mask1_m, 3, 4); \ k0_m = VP9_SET_CONST_PAIR(mask1_m, 3, 4); \
k1_m = VP9_SET_CONST_PAIR(mask1_m, 4, 5); \ k1_m = VP9_SET_CONST_PAIR(mask1_m, 4, 5); \
ILVRL_H2_SH(in3, in2, in_s1, in_s0); \ ILVRL_H2_SH(in3, in2, in_s1, in_s0); \
DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, \ DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, r0_m, \
r0_m, r1_m, r2_m, r3_m); \ r1_m, r2_m, r3_m); \
k0_m = VP9_SET_CONST_PAIR(mask2_m, 2, 3); \ k0_m = VP9_SET_CONST_PAIR(mask2_m, 2, 3); \
k1_m = VP9_SET_CONST_PAIR(mask2_m, 3, 4); \ k1_m = VP9_SET_CONST_PAIR(mask2_m, 3, 4); \
ILVRL_H2_SH(in7, in6, in_s1, in_s0); \ ILVRL_H2_SH(in7, in6, in_s1, in_s0); \
DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, \ DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, r4_m, \
r4_m, r5_m, r6_m, r7_m); \ r5_m, r6_m, r7_m); \
ADD4(r0_m, r4_m, r1_m, r5_m, r2_m, r6_m, r3_m, r7_m, \ ADD4(r0_m, r4_m, r1_m, r5_m, r2_m, r6_m, r3_m, r7_m, m0_m, m1_m, m2_m, \
m0_m, m1_m, m2_m, m3_m); \ m3_m); \
SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \ SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \
PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, res2_m, res3_m); \ PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, res2_m, res3_m); \
SUB4(r0_m, r4_m, r1_m, r5_m, r2_m, r6_m, r3_m, r7_m, \ SUB4(r0_m, r4_m, r1_m, r5_m, r2_m, r6_m, r3_m, r7_m, m0_m, m1_m, m2_m, \
m0_m, m1_m, m2_m, m3_m); \ m3_m); \
SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \ SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \
PCKEV_H2_SW(m1_m, m0_m, m3_m, m2_m, r2_m, r3_m); \ PCKEV_H2_SW(m1_m, m0_m, m3_m, m2_m, r2_m, r3_m); \
ILVRL_H2_SW(r3_m, r2_m, m2_m, m3_m); \ ILVRL_H2_SW(r3_m, r2_m, m2_m, m3_m); \
@ -296,29 +299,29 @@
k0_m = VP9_SET_CONST_PAIR(mask2_m, 5, 6); \ k0_m = VP9_SET_CONST_PAIR(mask2_m, 5, 6); \
k1_m = VP9_SET_CONST_PAIR(mask2_m, 6, 7); \ k1_m = VP9_SET_CONST_PAIR(mask2_m, 6, 7); \
ILVRL_H2_SH(t1_m, t0_m, in_s1, in_s0); \ ILVRL_H2_SH(t1_m, t0_m, in_s1, in_s0); \
DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, \ DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, r0_m, \
r0_m, r1_m, r2_m, r3_m); \ r1_m, r2_m, r3_m); \
k1_m = VP9_SET_CONST_PAIR(mask3_m, 0, 1); \ k1_m = VP9_SET_CONST_PAIR(mask3_m, 0, 1); \
DOTP_SH4_SW(m2_m, m3_m, m2_m, m3_m, k0_m, k0_m, k1_m, k1_m, \ DOTP_SH4_SW(m2_m, m3_m, m2_m, m3_m, k0_m, k0_m, k1_m, k1_m, r4_m, r5_m, \
r4_m, r5_m, r6_m, r7_m); \ r6_m, r7_m); \
ADD4(r0_m, r6_m, r1_m, r7_m, r2_m, r4_m, r3_m, r5_m, \ ADD4(r0_m, r6_m, r1_m, r7_m, r2_m, r4_m, r3_m, r5_m, m0_m, m1_m, m2_m, \
m0_m, m1_m, m2_m, m3_m); \ m3_m); \
SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \ SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \
PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, in1, out6); \ PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, in1, out6); \
SUB4(r0_m, r6_m, r1_m, r7_m, r2_m, r4_m, r3_m, r5_m, \ SUB4(r0_m, r6_m, r1_m, r7_m, r2_m, r4_m, r3_m, r5_m, m0_m, m1_m, m2_m, \
m0_m, m1_m, m2_m, m3_m); \ m3_m); \
SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \ SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \
PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, in2, in5); \ PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, in2, in5); \
k0_m = VP9_SET_CONST_PAIR(mask3_m, 2, 2); \ k0_m = VP9_SET_CONST_PAIR(mask3_m, 2, 2); \
k1_m = VP9_SET_CONST_PAIR(mask3_m, 2, 3); \ k1_m = VP9_SET_CONST_PAIR(mask3_m, 2, 3); \
ILVRL_H2_SH(in4, in3, in_s1, in_s0); \ ILVRL_H2_SH(in4, in3, in_s1, in_s0); \
DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, \ DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, m0_m, \
m0_m, m1_m, m2_m, m3_m); \ m1_m, m2_m, m3_m); \
SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \ SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \
PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, in3, out4); \ PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, in3, out4); \
ILVRL_H2_SW(in5, in2, m2_m, m3_m); \ ILVRL_H2_SW(in5, in2, m2_m, m3_m); \
DOTP_SH4_SW(m2_m, m3_m, m2_m, m3_m, k0_m, k0_m, k1_m, k1_m, \ DOTP_SH4_SW(m2_m, m3_m, m2_m, m3_m, k0_m, k0_m, k1_m, k1_m, m0_m, m1_m, \
m0_m, m1_m, m2_m, m3_m); \ m2_m, m3_m); \
SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \ SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \
PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, out2, in5); \ PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, out2, in5); \
\ \
@ -326,13 +329,13 @@
out3 = -in3; \ out3 = -in3; \
out5 = -in5; \ out5 = -in5; \
out7 = -in7; \ out7 = -in7; \
} }
#define VP9_IADST8x16_1D(r0, r1, r2, r3, r4, r5, r6, r7, r8, \ #define VP9_IADST8x16_1D(r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, \
r9, r10, r11, r12, r13, r14, r15, \ r12, r13, r14, r15, out0, out1, out2, out3, out4, \
out0, out1, out2, out3, out4, out5, \ out5, out6, out7, out8, out9, out10, out11, out12, \
out6, out7, out8, out9, out10, out11, \ out13, out14, out15) \
out12, out13, out14, out15) { \ { \
v8i16 g0_m, g1_m, g2_m, g3_m, g4_m, g5_m, g6_m, g7_m; \ v8i16 g0_m, g1_m, g2_m, g3_m, g4_m, g5_m, g6_m, g7_m; \
v8i16 g8_m, g9_m, g10_m, g11_m, g12_m, g13_m, g14_m, g15_m; \ v8i16 g8_m, g9_m, g10_m, g11_m, g12_m, g13_m, g14_m, g15_m; \
v8i16 h0_m, h1_m, h2_m, h3_m, h4_m, h5_m, h6_m, h7_m; \ v8i16 h0_m, h1_m, h2_m, h3_m, h4_m, h5_m, h6_m, h7_m; \
@ -344,51 +347,49 @@
k1_m = VP9_SET_COSPI_PAIR(cospi_31_64, -cospi_1_64); \ k1_m = VP9_SET_COSPI_PAIR(cospi_31_64, -cospi_1_64); \
k2_m = VP9_SET_COSPI_PAIR(cospi_17_64, cospi_15_64); \ k2_m = VP9_SET_COSPI_PAIR(cospi_17_64, cospi_15_64); \
k3_m = VP9_SET_COSPI_PAIR(cospi_15_64, -cospi_17_64); \ k3_m = VP9_SET_COSPI_PAIR(cospi_15_64, -cospi_17_64); \
MADD_BF(r15, r0, r7, r8, k0_m, k1_m, k2_m, k3_m, \ MADD_BF(r15, r0, r7, r8, k0_m, k1_m, k2_m, k3_m, g0_m, g1_m, g2_m, g3_m); \
g0_m, g1_m, g2_m, g3_m); \
k0_m = VP9_SET_COSPI_PAIR(cospi_5_64, cospi_27_64); \ k0_m = VP9_SET_COSPI_PAIR(cospi_5_64, cospi_27_64); \
k1_m = VP9_SET_COSPI_PAIR(cospi_27_64, -cospi_5_64); \ k1_m = VP9_SET_COSPI_PAIR(cospi_27_64, -cospi_5_64); \
k2_m = VP9_SET_COSPI_PAIR(cospi_21_64, cospi_11_64); \ k2_m = VP9_SET_COSPI_PAIR(cospi_21_64, cospi_11_64); \
k3_m = VP9_SET_COSPI_PAIR(cospi_11_64, -cospi_21_64); \ k3_m = VP9_SET_COSPI_PAIR(cospi_11_64, -cospi_21_64); \
MADD_BF(r13, r2, r5, r10, k0_m, k1_m, k2_m, k3_m, \ MADD_BF(r13, r2, r5, r10, k0_m, k1_m, k2_m, k3_m, g4_m, g5_m, g6_m, g7_m); \
g4_m, g5_m, g6_m, g7_m); \
k0_m = VP9_SET_COSPI_PAIR(cospi_9_64, cospi_23_64); \ k0_m = VP9_SET_COSPI_PAIR(cospi_9_64, cospi_23_64); \
k1_m = VP9_SET_COSPI_PAIR(cospi_23_64, -cospi_9_64); \ k1_m = VP9_SET_COSPI_PAIR(cospi_23_64, -cospi_9_64); \
k2_m = VP9_SET_COSPI_PAIR(cospi_25_64, cospi_7_64); \ k2_m = VP9_SET_COSPI_PAIR(cospi_25_64, cospi_7_64); \
k3_m = VP9_SET_COSPI_PAIR(cospi_7_64, -cospi_25_64); \ k3_m = VP9_SET_COSPI_PAIR(cospi_7_64, -cospi_25_64); \
MADD_BF(r11, r4, r3, r12, k0_m, k1_m, k2_m, k3_m, \ MADD_BF(r11, r4, r3, r12, k0_m, k1_m, k2_m, k3_m, g8_m, g9_m, g10_m, \
g8_m, g9_m, g10_m, g11_m); \ g11_m); \
k0_m = VP9_SET_COSPI_PAIR(cospi_13_64, cospi_19_64); \ k0_m = VP9_SET_COSPI_PAIR(cospi_13_64, cospi_19_64); \
k1_m = VP9_SET_COSPI_PAIR(cospi_19_64, -cospi_13_64); \ k1_m = VP9_SET_COSPI_PAIR(cospi_19_64, -cospi_13_64); \
k2_m = VP9_SET_COSPI_PAIR(cospi_29_64, cospi_3_64); \ k2_m = VP9_SET_COSPI_PAIR(cospi_29_64, cospi_3_64); \
k3_m = VP9_SET_COSPI_PAIR(cospi_3_64, -cospi_29_64); \ k3_m = VP9_SET_COSPI_PAIR(cospi_3_64, -cospi_29_64); \
MADD_BF(r9, r6, r1, r14, k0_m, k1_m, k2_m, k3_m, \ MADD_BF(r9, r6, r1, r14, k0_m, k1_m, k2_m, k3_m, g12_m, g13_m, g14_m, \
g12_m, g13_m, g14_m, g15_m); \ g15_m); \
\ \
/* stage 2 */ \ /* stage 2 */ \
k0_m = VP9_SET_COSPI_PAIR(cospi_4_64, cospi_28_64); \ k0_m = VP9_SET_COSPI_PAIR(cospi_4_64, cospi_28_64); \
k1_m = VP9_SET_COSPI_PAIR(cospi_28_64, -cospi_4_64); \ k1_m = VP9_SET_COSPI_PAIR(cospi_28_64, -cospi_4_64); \
k2_m = VP9_SET_COSPI_PAIR(-cospi_28_64, cospi_4_64); \ k2_m = VP9_SET_COSPI_PAIR(-cospi_28_64, cospi_4_64); \
MADD_BF(g1_m, g3_m, g9_m, g11_m, k0_m, k1_m, k2_m, k0_m, \ MADD_BF(g1_m, g3_m, g9_m, g11_m, k0_m, k1_m, k2_m, k0_m, h0_m, h1_m, h2_m, \
h0_m, h1_m, h2_m, h3_m); \ h3_m); \
k0_m = VP9_SET_COSPI_PAIR(cospi_12_64, cospi_20_64); \ k0_m = VP9_SET_COSPI_PAIR(cospi_12_64, cospi_20_64); \
k1_m = VP9_SET_COSPI_PAIR(-cospi_20_64, cospi_12_64); \ k1_m = VP9_SET_COSPI_PAIR(-cospi_20_64, cospi_12_64); \
k2_m = VP9_SET_COSPI_PAIR(cospi_20_64, -cospi_12_64); \ k2_m = VP9_SET_COSPI_PAIR(cospi_20_64, -cospi_12_64); \
MADD_BF(g7_m, g5_m, g15_m, g13_m, k0_m, k1_m, k2_m, k0_m, \ MADD_BF(g7_m, g5_m, g15_m, g13_m, k0_m, k1_m, k2_m, k0_m, h4_m, h5_m, \
h4_m, h5_m, h6_m, h7_m); \ h6_m, h7_m); \
BUTTERFLY_4(h0_m, h2_m, h6_m, h4_m, out8, out9, out11, out10); \ BUTTERFLY_4(h0_m, h2_m, h6_m, h4_m, out8, out9, out11, out10); \
BUTTERFLY_8(g0_m, g2_m, g4_m, g6_m, g14_m, g12_m, g10_m, g8_m, \ BUTTERFLY_8(g0_m, g2_m, g4_m, g6_m, g14_m, g12_m, g10_m, g8_m, h8_m, h9_m, \
h8_m, h9_m, h10_m, h11_m, h6_m, h4_m, h2_m, h0_m); \ h10_m, h11_m, h6_m, h4_m, h2_m, h0_m); \
\ \
/* stage 3 */ \ /* stage 3 */ \
BUTTERFLY_4(h8_m, h9_m, h11_m, h10_m, out0, out1, h11_m, h10_m); \ BUTTERFLY_4(h8_m, h9_m, h11_m, h10_m, out0, out1, h11_m, h10_m); \
k0_m = VP9_SET_COSPI_PAIR(cospi_8_64, cospi_24_64); \ k0_m = VP9_SET_COSPI_PAIR(cospi_8_64, cospi_24_64); \
k1_m = VP9_SET_COSPI_PAIR(cospi_24_64, -cospi_8_64); \ k1_m = VP9_SET_COSPI_PAIR(cospi_24_64, -cospi_8_64); \
k2_m = VP9_SET_COSPI_PAIR(-cospi_24_64, cospi_8_64); \ k2_m = VP9_SET_COSPI_PAIR(-cospi_24_64, cospi_8_64); \
MADD_BF(h0_m, h2_m, h4_m, h6_m, k0_m, k1_m, k2_m, k0_m, \ MADD_BF(h0_m, h2_m, h4_m, h6_m, k0_m, k1_m, k2_m, k0_m, out4, out6, out5, \
out4, out6, out5, out7); \ out7); \
MADD_BF(h1_m, h3_m, h5_m, h7_m, k0_m, k1_m, k2_m, k0_m, \ MADD_BF(h1_m, h3_m, h5_m, h7_m, k0_m, k1_m, k2_m, k0_m, out12, out14, \
out12, out14, out13, out15); \ out13, out15); \
\ \
/* stage 4 */ \ /* stage 4 */ \
k0_m = VP9_SET_COSPI_PAIR(cospi_16_64, cospi_16_64); \ k0_m = VP9_SET_COSPI_PAIR(cospi_16_64, cospi_16_64); \
@ -399,7 +400,7 @@
MADD_SHORT(out6, out7, k0_m, k3_m, out6, out7); \ MADD_SHORT(out6, out7, k0_m, k3_m, out6, out7); \
MADD_SHORT(out10, out11, k0_m, k3_m, out10, out11); \ MADD_SHORT(out10, out11, k0_m, k3_m, out10, out11); \
MADD_SHORT(out14, out15, k1_m, k2_m, out14, out15); \ MADD_SHORT(out14, out15, k1_m, k2_m, out14, out15); \
} }
void vpx_idct16_1d_columns_addblk_msa(int16_t *input, uint8_t *dst, void vpx_idct16_1d_columns_addblk_msa(int16_t *input, uint8_t *dst,
int32_t dst_stride); int32_t dst_stride);

View File

@ -26,11 +26,11 @@ void idct16_rows_dspr2(const int16_t *input, int16_t *output,
int result1, result2, result3, result4; int result1, result2, result3, result4;
const int const_2_power_13 = 8192; const int const_2_power_13 = 8192;
for (i = no_rows; i--; ) { for (i = no_rows; i--;) {
/* prefetch row */ /* prefetch row */
prefetch_load((const uint8_t *)(input + 16)); prefetch_load((const uint8_t *)(input + 16));
__asm__ __volatile__ ( __asm__ __volatile__(
"lh %[load1], 0(%[input]) \n\t" "lh %[load1], 0(%[input]) \n\t"
"lh %[load2], 16(%[input]) \n\t" "lh %[load2], 16(%[input]) \n\t"
"lh %[load3], 8(%[input]) \n\t" "lh %[load3], 8(%[input]) \n\t"
@ -64,19 +64,18 @@ void idct16_rows_dspr2(const int16_t *input, int16_t *output,
"sub %[step1_2], %[step2_1], %[step2_2] \n\t" "sub %[step1_2], %[step2_1], %[step2_2] \n\t"
"sub %[step1_3], %[step2_0], %[step2_3] \n\t" "sub %[step1_3], %[step2_0], %[step2_3] \n\t"
: [load1] "=&r" (load1), [load2] "=&r" (load2), : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
[load3] "=&r" (load3), [load4] "=&r" (load4), [load4] "=&r"(load4), [result1] "=&r"(result1),
[result1] "=&r" (result1), [result2] "=&r" (result2), [result2] "=&r"(result2), [step2_0] "=&r"(step2_0),
[step2_0] "=&r" (step2_0), [step2_1] "=&r" (step2_1), [step2_1] "=&r"(step2_1), [step2_2] "=&r"(step2_2),
[step2_2] "=&r" (step2_2), [step2_3] "=&r" (step2_3), [step2_3] "=&r"(step2_3), [step1_0] "=r"(step1_0),
[step1_0] "=r" (step1_0), [step1_1] "=r" (step1_1), [step1_1] "=r"(step1_1), [step1_2] "=r"(step1_2),
[step1_2] "=r" (step1_2), [step1_3] "=r" (step1_3) [step1_3] "=r"(step1_3)
: [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
[cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64), [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64),
[cospi_16_64] "r" (cospi_16_64) [cospi_16_64] "r"(cospi_16_64));
);
__asm__ __volatile__ ( __asm__ __volatile__(
"lh %[load5], 2(%[input]) \n\t" "lh %[load5], 2(%[input]) \n\t"
"lh %[load6], 30(%[input]) \n\t" "lh %[load6], 30(%[input]) \n\t"
"lh %[load7], 18(%[input]) \n\t" "lh %[load7], 18(%[input]) \n\t"
@ -126,19 +125,18 @@ void idct16_rows_dspr2(const int16_t *input, int16_t *output,
"add %[step2_8], %[result1], %[result2] \n\t" "add %[step2_8], %[result1], %[result2] \n\t"
"add %[step2_15], %[result4], %[result3] \n\t" "add %[step2_15], %[result4], %[result3] \n\t"
: [load5] "=&r" (load5), [load6] "=&r" (load6), : [load5] "=&r"(load5), [load6] "=&r"(load6), [load7] "=&r"(load7),
[load7] "=&r" (load7), [load8] "=&r" (load8), [load8] "=&r"(load8), [result1] "=&r"(result1),
[result1] "=&r" (result1), [result2] "=&r" (result2), [result2] "=&r"(result2), [result3] "=&r"(result3),
[result3] "=&r" (result3), [result4] "=&r" (result4), [result4] "=&r"(result4), [step2_8] "=r"(step2_8),
[step2_8] "=r" (step2_8), [step2_15] "=r" (step2_15), [step2_15] "=r"(step2_15), [step2_9] "=r"(step2_9),
[step2_9] "=r" (step2_9), [step2_14] "=r" (step2_14) [step2_14] "=r"(step2_14)
: [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
[cospi_30_64] "r" (cospi_30_64), [cospi_2_64] "r" (cospi_2_64), [cospi_30_64] "r"(cospi_30_64), [cospi_2_64] "r"(cospi_2_64),
[cospi_14_64] "r" (cospi_14_64), [cospi_18_64] "r" (cospi_18_64), [cospi_14_64] "r"(cospi_14_64), [cospi_18_64] "r"(cospi_18_64),
[cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64) [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64));
);
__asm__ __volatile__ ( __asm__ __volatile__(
"lh %[load1], 10(%[input]) \n\t" "lh %[load1], 10(%[input]) \n\t"
"lh %[load2], 22(%[input]) \n\t" "lh %[load2], 22(%[input]) \n\t"
"lh %[load3], 26(%[input]) \n\t" "lh %[load3], 26(%[input]) \n\t"
@ -188,19 +186,18 @@ void idct16_rows_dspr2(const int16_t *input, int16_t *output,
"add %[step2_11], %[result1], %[result2] \n\t" "add %[step2_11], %[result1], %[result2] \n\t"
"add %[step2_12], %[result4], %[result3] \n\t" "add %[step2_12], %[result4], %[result3] \n\t"
: [load1] "=&r" (load1), [load2] "=&r" (load2), : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
[load3] "=&r" (load3), [load4] "=&r" (load4), [load4] "=&r"(load4), [result1] "=&r"(result1),
[result1] "=&r" (result1), [result2] "=&r" (result2), [result2] "=&r"(result2), [result3] "=&r"(result3),
[result3] "=&r" (result3), [result4] "=&r" (result4), [result4] "=&r"(result4), [step2_10] "=r"(step2_10),
[step2_10] "=r" (step2_10), [step2_11] "=r" (step2_11), [step2_11] "=r"(step2_11), [step2_12] "=r"(step2_12),
[step2_12] "=r" (step2_12), [step2_13] "=r" (step2_13) [step2_13] "=r"(step2_13)
: [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
[cospi_22_64] "r" (cospi_22_64), [cospi_10_64] "r" (cospi_10_64), [cospi_22_64] "r"(cospi_22_64), [cospi_10_64] "r"(cospi_10_64),
[cospi_6_64] "r" (cospi_6_64), [cospi_26_64] "r" (cospi_26_64), [cospi_6_64] "r"(cospi_6_64), [cospi_26_64] "r"(cospi_26_64),
[cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64) [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64));
);
__asm__ __volatile__ ( __asm__ __volatile__(
"lh %[load5], 4(%[input]) \n\t" "lh %[load5], 4(%[input]) \n\t"
"lh %[load6], 28(%[input]) \n\t" "lh %[load6], 28(%[input]) \n\t"
"lh %[load7], 20(%[input]) \n\t" "lh %[load7], 20(%[input]) \n\t"
@ -253,19 +250,18 @@ void idct16_rows_dspr2(const int16_t *input, int16_t *output,
"add %[step1_4], %[result1], %[result2] \n\t" "add %[step1_4], %[result1], %[result2] \n\t"
"add %[step1_7], %[result4], %[result3] \n\t" "add %[step1_7], %[result4], %[result3] \n\t"
: [load5] "=&r" (load5), [load6] "=&r" (load6), : [load5] "=&r"(load5), [load6] "=&r"(load6), [load7] "=&r"(load7),
[load7] "=&r" (load7), [load8] "=&r" (load8), [load8] "=&r"(load8), [result1] "=&r"(result1),
[result1] "=&r" (result1), [result2] "=&r" (result2), [result2] "=&r"(result2), [result3] "=&r"(result3),
[result3] "=&r" (result3), [result4] "=&r" (result4), [result4] "=&r"(result4), [step1_4] "=r"(step1_4),
[step1_4] "=r" (step1_4), [step1_5] "=r" (step1_5), [step1_5] "=r"(step1_5), [step1_6] "=r"(step1_6),
[step1_6] "=r" (step1_6), [step1_7] "=r" (step1_7) [step1_7] "=r"(step1_7)
: [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
[cospi_20_64] "r" (cospi_20_64), [cospi_12_64] "r" (cospi_12_64), [cospi_20_64] "r"(cospi_20_64), [cospi_12_64] "r"(cospi_12_64),
[cospi_4_64] "r" (cospi_4_64), [cospi_28_64] "r" (cospi_28_64), [cospi_4_64] "r"(cospi_4_64), [cospi_28_64] "r"(cospi_28_64),
[cospi_16_64] "r" (cospi_16_64) [cospi_16_64] "r"(cospi_16_64));
);
__asm__ __volatile__ ( __asm__ __volatile__(
"mtlo %[const_2_power_13], $ac0 \n\t" "mtlo %[const_2_power_13], $ac0 \n\t"
"mthi $zero, $ac0 \n\t" "mthi $zero, $ac0 \n\t"
"mtlo %[const_2_power_13], $ac1 \n\t" "mtlo %[const_2_power_13], $ac1 \n\t"
@ -305,18 +301,16 @@ void idct16_rows_dspr2(const int16_t *input, int16_t *output,
"extp %[step1_11], $ac2, 31 \n\t" "extp %[step1_11], $ac2, 31 \n\t"
"extp %[step1_12], $ac3, 31 \n\t" "extp %[step1_12], $ac3, 31 \n\t"
: [load5] "=&r" (load5), [load6] "=&r" (load6), : [load5] "=&r"(load5), [load6] "=&r"(load6), [step1_10] "=r"(step1_10),
[step1_10] "=r" (step1_10), [step1_11] "=r" (step1_11), [step1_11] "=r"(step1_11), [step1_12] "=r"(step1_12),
[step1_12] "=r" (step1_12), [step1_13] "=r" (step1_13) [step1_13] "=r"(step1_13)
: [const_2_power_13] "r" (const_2_power_13), : [const_2_power_13] "r"(const_2_power_13), [step2_14] "r"(step2_14),
[step2_14] "r" (step2_14), [step2_13] "r" (step2_13), [step2_13] "r"(step2_13), [step2_9] "r"(step2_9),
[step2_9] "r" (step2_9), [step2_10] "r" (step2_10), [step2_10] "r"(step2_10), [step2_15] "r"(step2_15),
[step2_15] "r" (step2_15), [step2_12] "r" (step2_12), [step2_12] "r"(step2_12), [step2_8] "r"(step2_8),
[step2_8] "r" (step2_8), [step2_11] "r" (step2_11), [step2_11] "r"(step2_11), [cospi_16_64] "r"(cospi_16_64));
[cospi_16_64] "r" (cospi_16_64)
);
__asm__ __volatile__ ( __asm__ __volatile__(
"add %[load5], %[step1_0], %[step1_7] \n\t" "add %[load5], %[step1_0], %[step1_7] \n\t"
"add %[load5], %[load5], %[step2_12] \n\t" "add %[load5], %[load5], %[step2_12] \n\t"
"add %[load5], %[load5], %[step2_15] \n\t" "add %[load5], %[load5], %[step2_15] \n\t"
@ -350,17 +344,15 @@ void idct16_rows_dspr2(const int16_t *input, int16_t *output,
"sh %[load5], 448(%[output]) \n\t" "sh %[load5], 448(%[output]) \n\t"
"sh %[load6], 480(%[output]) \n\t" "sh %[load6], 480(%[output]) \n\t"
: [load5] "=&r" (load5), [load6] "=&r" (load6) : [load5] "=&r"(load5), [load6] "=&r"(load6)
: [output] "r" (output), : [output] "r"(output), [step1_0] "r"(step1_0), [step1_1] "r"(step1_1),
[step1_0] "r" (step1_0), [step1_1] "r" (step1_1), [step1_6] "r"(step1_6), [step1_7] "r"(step1_7),
[step1_6] "r" (step1_6), [step1_7] "r" (step1_7), [step2_8] "r"(step2_8), [step2_9] "r"(step2_9),
[step2_8] "r" (step2_8), [step2_9] "r" (step2_9), [step2_10] "r"(step2_10), [step2_11] "r"(step2_11),
[step2_10] "r" (step2_10), [step2_11] "r" (step2_11), [step2_12] "r"(step2_12), [step2_13] "r"(step2_13),
[step2_12] "r" (step2_12), [step2_13] "r" (step2_13), [step2_14] "r"(step2_14), [step2_15] "r"(step2_15));
[step2_14] "r" (step2_14), [step2_15] "r" (step2_15)
);
__asm__ __volatile__ ( __asm__ __volatile__(
"add %[load5], %[step1_2], %[step1_5] \n\t" "add %[load5], %[step1_2], %[step1_5] \n\t"
"add %[load5], %[load5], %[step1_13] \n\t" "add %[load5], %[load5], %[step1_13] \n\t"
"add %[load6], %[step1_3], %[step1_4] \n\t" "add %[load6], %[step1_3], %[step1_4] \n\t"
@ -386,21 +378,18 @@ void idct16_rows_dspr2(const int16_t *input, int16_t *output,
"sh %[load5], 384(%[output]) \n\t" "sh %[load5], 384(%[output]) \n\t"
"sh %[load6], 416(%[output]) \n\t" "sh %[load6], 416(%[output]) \n\t"
: [load5] "=&r" (load5), [load6] "=&r" (load6) : [load5] "=&r"(load5), [load6] "=&r"(load6)
: [output] "r" (output), : [output] "r"(output), [step1_2] "r"(step1_2), [step1_3] "r"(step1_3),
[step1_2] "r" (step1_2), [step1_3] "r" (step1_3), [step1_4] "r"(step1_4), [step1_5] "r"(step1_5),
[step1_4] "r" (step1_4), [step1_5] "r" (step1_5), [step1_10] "r"(step1_10), [step1_11] "r"(step1_11),
[step1_10] "r" (step1_10), [step1_11] "r" (step1_11), [step1_12] "r"(step1_12), [step1_13] "r"(step1_13));
[step1_12] "r" (step1_12), [step1_13] "r" (step1_13)
);
input += 16; input += 16;
output += 1; output += 1;
} }
} }
void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, int dest_stride) {
int dest_stride) {
int i; int i;
int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7; int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7;
int step1_8, step1_9, step1_10, step1_11; int step1_8, step1_9, step1_10, step1_11;
@ -426,7 +415,7 @@ void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
for (i = 0; i < 16; ++i) { for (i = 0; i < 16; ++i) {
dest_pix = (dest + i); dest_pix = (dest + i);
__asm__ __volatile__ ( __asm__ __volatile__(
"lh %[load1], 0(%[input]) \n\t" "lh %[load1], 0(%[input]) \n\t"
"lh %[load2], 16(%[input]) \n\t" "lh %[load2], 16(%[input]) \n\t"
"lh %[load3], 8(%[input]) \n\t" "lh %[load3], 8(%[input]) \n\t"
@ -460,19 +449,18 @@ void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
"sub %[step1_2], %[step2_1], %[step2_2] \n\t" "sub %[step1_2], %[step2_1], %[step2_2] \n\t"
"sub %[step1_3], %[step2_0], %[step2_3] \n\t" "sub %[step1_3], %[step2_0], %[step2_3] \n\t"
: [load1] "=&r" (load1), [load2] "=&r" (load2), : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
[load3] "=&r" (load3), [load4] "=&r" (load4), [load4] "=&r"(load4), [result1] "=&r"(result1),
[result1] "=&r" (result1), [result2] "=&r" (result2), [result2] "=&r"(result2), [step2_0] "=&r"(step2_0),
[step2_0] "=&r" (step2_0), [step2_1] "=&r" (step2_1), [step2_1] "=&r"(step2_1), [step2_2] "=&r"(step2_2),
[step2_2] "=&r" (step2_2), [step2_3] "=&r" (step2_3), [step2_3] "=&r"(step2_3), [step1_0] "=r"(step1_0),
[step1_0] "=r" (step1_0), [step1_1] "=r" (step1_1), [step1_1] "=r"(step1_1), [step1_2] "=r"(step1_2),
[step1_2] "=r" (step1_2), [step1_3] "=r" (step1_3) [step1_3] "=r"(step1_3)
: [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
[cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64), [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64),
[cospi_16_64] "r" (cospi_16_64) [cospi_16_64] "r"(cospi_16_64));
);
__asm__ __volatile__ ( __asm__ __volatile__(
"lh %[load5], 2(%[input]) \n\t" "lh %[load5], 2(%[input]) \n\t"
"lh %[load6], 30(%[input]) \n\t" "lh %[load6], 30(%[input]) \n\t"
"lh %[load7], 18(%[input]) \n\t" "lh %[load7], 18(%[input]) \n\t"
@ -522,19 +510,18 @@ void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
"add %[step2_8], %[result1], %[result2] \n\t" "add %[step2_8], %[result1], %[result2] \n\t"
"add %[step2_15], %[result4], %[result3] \n\t" "add %[step2_15], %[result4], %[result3] \n\t"
: [load5] "=&r" (load5), [load6] "=&r" (load6), : [load5] "=&r"(load5), [load6] "=&r"(load6), [load7] "=&r"(load7),
[load7] "=&r" (load7), [load8] "=&r" (load8), [load8] "=&r"(load8), [result1] "=&r"(result1),
[result1] "=&r" (result1), [result2] "=&r" (result2), [result2] "=&r"(result2), [result3] "=&r"(result3),
[result3] "=&r" (result3), [result4] "=&r" (result4), [result4] "=&r"(result4), [step2_8] "=r"(step2_8),
[step2_8] "=r" (step2_8), [step2_15] "=r" (step2_15), [step2_15] "=r"(step2_15), [step2_9] "=r"(step2_9),
[step2_9] "=r" (step2_9), [step2_14] "=r" (step2_14) [step2_14] "=r"(step2_14)
: [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
[cospi_30_64] "r" (cospi_30_64), [cospi_2_64] "r" (cospi_2_64), [cospi_30_64] "r"(cospi_30_64), [cospi_2_64] "r"(cospi_2_64),
[cospi_14_64] "r" (cospi_14_64), [cospi_18_64] "r" (cospi_18_64), [cospi_14_64] "r"(cospi_14_64), [cospi_18_64] "r"(cospi_18_64),
[cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64) [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64));
);
__asm__ __volatile__ ( __asm__ __volatile__(
"lh %[load1], 10(%[input]) \n\t" "lh %[load1], 10(%[input]) \n\t"
"lh %[load2], 22(%[input]) \n\t" "lh %[load2], 22(%[input]) \n\t"
"lh %[load3], 26(%[input]) \n\t" "lh %[load3], 26(%[input]) \n\t"
@ -584,19 +571,18 @@ void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
"add %[step2_11], %[result1], %[result2] \n\t" "add %[step2_11], %[result1], %[result2] \n\t"
"add %[step2_12], %[result4], %[result3] \n\t" "add %[step2_12], %[result4], %[result3] \n\t"
: [load1] "=&r" (load1), [load2] "=&r" (load2), : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
[load3] "=&r" (load3), [load4] "=&r" (load4), [load4] "=&r"(load4), [result1] "=&r"(result1),
[result1] "=&r" (result1), [result2] "=&r" (result2), [result2] "=&r"(result2), [result3] "=&r"(result3),
[result3] "=&r" (result3), [result4] "=&r" (result4), [result4] "=&r"(result4), [step2_10] "=r"(step2_10),
[step2_10] "=r" (step2_10), [step2_11] "=r" (step2_11), [step2_11] "=r"(step2_11), [step2_12] "=r"(step2_12),
[step2_12] "=r" (step2_12), [step2_13] "=r" (step2_13) [step2_13] "=r"(step2_13)
: [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
[cospi_22_64] "r" (cospi_22_64), [cospi_10_64] "r" (cospi_10_64), [cospi_22_64] "r"(cospi_22_64), [cospi_10_64] "r"(cospi_10_64),
[cospi_6_64] "r" (cospi_6_64), [cospi_26_64] "r" (cospi_26_64), [cospi_6_64] "r"(cospi_6_64), [cospi_26_64] "r"(cospi_26_64),
[cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64) [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64));
);
__asm__ __volatile__ ( __asm__ __volatile__(
"lh %[load5], 4(%[input]) \n\t" "lh %[load5], 4(%[input]) \n\t"
"lh %[load6], 28(%[input]) \n\t" "lh %[load6], 28(%[input]) \n\t"
"lh %[load7], 20(%[input]) \n\t" "lh %[load7], 20(%[input]) \n\t"
@ -650,19 +636,18 @@ void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
"add %[step1_4], %[result1], %[result2] \n\t" "add %[step1_4], %[result1], %[result2] \n\t"
"add %[step1_7], %[result4], %[result3] \n\t" "add %[step1_7], %[result4], %[result3] \n\t"
: [load5] "=&r" (load5), [load6] "=&r" (load6), : [load5] "=&r"(load5), [load6] "=&r"(load6), [load7] "=&r"(load7),
[load7] "=&r" (load7), [load8] "=&r" (load8), [load8] "=&r"(load8), [result1] "=&r"(result1),
[result1] "=&r" (result1), [result2] "=&r" (result2), [result2] "=&r"(result2), [result3] "=&r"(result3),
[result3] "=&r" (result3), [result4] "=&r" (result4), [result4] "=&r"(result4), [step1_4] "=r"(step1_4),
[step1_4] "=r" (step1_4), [step1_5] "=r" (step1_5), [step1_5] "=r"(step1_5), [step1_6] "=r"(step1_6),
[step1_6] "=r" (step1_6), [step1_7] "=r" (step1_7) [step1_7] "=r"(step1_7)
: [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
[cospi_20_64] "r" (cospi_20_64), [cospi_12_64] "r" (cospi_12_64), [cospi_20_64] "r"(cospi_20_64), [cospi_12_64] "r"(cospi_12_64),
[cospi_4_64] "r" (cospi_4_64), [cospi_28_64] "r" (cospi_28_64), [cospi_4_64] "r"(cospi_4_64), [cospi_28_64] "r"(cospi_28_64),
[cospi_16_64] "r" (cospi_16_64) [cospi_16_64] "r"(cospi_16_64));
);
__asm__ __volatile__ ( __asm__ __volatile__(
"mtlo %[const_2_power_13], $ac0 \n\t" "mtlo %[const_2_power_13], $ac0 \n\t"
"mthi $zero, $ac0 \n\t" "mthi $zero, $ac0 \n\t"
"mtlo %[const_2_power_13], $ac1 \n\t" "mtlo %[const_2_power_13], $ac1 \n\t"
@ -702,23 +687,21 @@ void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
"extp %[step1_11], $ac2, 31 \n\t" "extp %[step1_11], $ac2, 31 \n\t"
"extp %[step1_12], $ac3, 31 \n\t" "extp %[step1_12], $ac3, 31 \n\t"
: [load5] "=&r" (load5), [load6] "=&r" (load6), : [load5] "=&r"(load5), [load6] "=&r"(load6), [step1_10] "=r"(step1_10),
[step1_10] "=r" (step1_10), [step1_11] "=r" (step1_11), [step1_11] "=r"(step1_11), [step1_12] "=r"(step1_12),
[step1_12] "=r" (step1_12), [step1_13] "=r" (step1_13) [step1_13] "=r"(step1_13)
: [const_2_power_13] "r" (const_2_power_13), : [const_2_power_13] "r"(const_2_power_13), [step2_14] "r"(step2_14),
[step2_14] "r" (step2_14), [step2_13] "r" (step2_13), [step2_13] "r"(step2_13), [step2_9] "r"(step2_9),
[step2_9] "r" (step2_9), [step2_10] "r" (step2_10), [step2_10] "r"(step2_10), [step2_15] "r"(step2_15),
[step2_15] "r" (step2_15), [step2_12] "r" (step2_12), [step2_12] "r"(step2_12), [step2_8] "r"(step2_8),
[step2_8] "r" (step2_8), [step2_11] "r" (step2_11), [step2_11] "r"(step2_11), [cospi_16_64] "r"(cospi_16_64));
[cospi_16_64] "r" (cospi_16_64)
);
step1_8 = step2_8 + step2_11; step1_8 = step2_8 + step2_11;
step1_9 = step2_9 + step2_10; step1_9 = step2_9 + step2_10;
step1_14 = step2_13 + step2_14; step1_14 = step2_13 + step2_14;
step1_15 = step2_12 + step2_15; step1_15 = step2_12 + step2_15;
__asm__ __volatile__ ( __asm__ __volatile__(
"lbu %[load7], 0(%[dest_pix]) \n\t" "lbu %[load7], 0(%[dest_pix]) \n\t"
"add %[load5], %[step1_0], %[step1_7] \n\t" "add %[load5], %[step1_0], %[step1_7] \n\t"
"add %[load5], %[load5], %[step1_15] \n\t" "add %[load5], %[load5], %[step1_15] \n\t"
@ -870,18 +853,16 @@ void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
"lbux %[load6], %[load8](%[cm]) \n\t" "lbux %[load6], %[load8](%[cm]) \n\t"
"sb %[load6], 0(%[dest_pix]) \n\t" "sb %[load6], 0(%[dest_pix]) \n\t"
: [load5] "=&r" (load5), [load6] "=&r" (load6), [load7] "=&r" (load7), : [load5] "=&r"(load5), [load6] "=&r"(load6), [load7] "=&r"(load7),
[load8] "=&r" (load8), [dest_pix] "+r" (dest_pix) [load8] "=&r"(load8), [dest_pix] "+r"(dest_pix)
: [cm] "r" (cm), [dest_stride] "r" (dest_stride), :
[step1_0] "r" (step1_0), [step1_1] "r" (step1_1), [cm] "r"(cm), [dest_stride] "r"(dest_stride), [step1_0] "r"(step1_0),
[step1_2] "r" (step1_2), [step1_3] "r" (step1_3), [step1_1] "r"(step1_1), [step1_2] "r"(step1_2), [step1_3] "r"(step1_3),
[step1_4] "r" (step1_4), [step1_5] "r" (step1_5), [step1_4] "r"(step1_4), [step1_5] "r"(step1_5), [step1_6] "r"(step1_6),
[step1_6] "r" (step1_6), [step1_7] "r" (step1_7), [step1_7] "r"(step1_7), [step1_8] "r"(step1_8), [step1_9] "r"(step1_9),
[step1_8] "r" (step1_8), [step1_9] "r" (step1_9), [step1_10] "r"(step1_10), [step1_11] "r"(step1_11),
[step1_10] "r" (step1_10), [step1_11] "r" (step1_11), [step1_12] "r"(step1_12), [step1_13] "r"(step1_13),
[step1_12] "r" (step1_12), [step1_13] "r" (step1_13), [step1_14] "r"(step1_14), [step1_15] "r"(step1_15));
[step1_14] "r" (step1_14), [step1_15] "r" (step1_15)
);
input += 16; input += 16;
} }
@ -893,11 +874,7 @@ void vpx_idct16x16_256_add_dspr2(const int16_t *input, uint8_t *dest,
uint32_t pos = 45; uint32_t pos = 45;
/* bit position for extract from acc */ /* bit position for extract from acc */
__asm__ __volatile__ ( __asm__ __volatile__("wrdsp %[pos], 1 \n\t" : : [pos] "r"(pos));
"wrdsp %[pos], 1 \n\t"
:
: [pos] "r" (pos)
);
// First transform rows // First transform rows
idct16_rows_dspr2(input, out, 16); idct16_rows_dspr2(input, out, 16);
@ -914,11 +891,7 @@ void vpx_idct16x16_10_add_dspr2(const int16_t *input, uint8_t *dest,
uint32_t pos = 45; uint32_t pos = 45;
/* bit position for extract from acc */ /* bit position for extract from acc */
__asm__ __volatile__ ( __asm__ __volatile__("wrdsp %[pos], 1 \n\t" : : [pos] "r"(pos));
"wrdsp %[pos], 1 \n\t"
:
: [pos] "r" (pos)
);
// First transform rows. Since all non-zero dct coefficients are in // First transform rows. Since all non-zero dct coefficients are in
// upper-left 4x4 area, we only need to calculate first 4 rows here. // upper-left 4x4 area, we only need to calculate first 4 rows here.
@ -926,7 +899,7 @@ void vpx_idct16x16_10_add_dspr2(const int16_t *input, uint8_t *dest,
outptr += 4; outptr += 4;
for (i = 0; i < 6; ++i) { for (i = 0; i < 6; ++i) {
__asm__ __volatile__ ( __asm__ __volatile__(
"sw $zero, 0(%[outptr]) \n\t" "sw $zero, 0(%[outptr]) \n\t"
"sw $zero, 32(%[outptr]) \n\t" "sw $zero, 32(%[outptr]) \n\t"
"sw $zero, 64(%[outptr]) \n\t" "sw $zero, 64(%[outptr]) \n\t"
@ -945,8 +918,7 @@ void vpx_idct16x16_10_add_dspr2(const int16_t *input, uint8_t *dest,
"sw $zero, 480(%[outptr]) \n\t" "sw $zero, 480(%[outptr]) \n\t"
: :
: [outptr] "r" (outptr) : [outptr] "r"(outptr));
);
outptr += 2; outptr += 2;
} }
@ -966,35 +938,31 @@ void vpx_idct16x16_1_add_dspr2(const int16_t *input, uint8_t *dest,
int32_t vector_1, vector_2, vector_3, vector_4; int32_t vector_1, vector_2, vector_3, vector_4;
/* bit position for extract from acc */ /* bit position for extract from acc */
__asm__ __volatile__ ( __asm__ __volatile__("wrdsp %[pos], 1 \n\t"
"wrdsp %[pos], 1 \n\t"
: :
: [pos] "r" (pos) : [pos] "r"(pos));
);
out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input[0]); out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input[0]);
__asm__ __volatile__ ( __asm__ __volatile__(
"addi %[out], %[out], 32 \n\t" "addi %[out], %[out], 32 \n\t"
"sra %[a1], %[out], 6 \n\t" "sra %[a1], %[out], 6 \n\t"
: [out] "+r" (out), [a1] "=r" (a1) : [out] "+r"(out), [a1] "=r"(a1)
: :);
);
if (a1 < 0) { if (a1 < 0) {
/* use quad-byte /* use quad-byte
* input and output memory are four byte aligned */ * input and output memory are four byte aligned */
__asm__ __volatile__ ( __asm__ __volatile__(
"abs %[absa1], %[a1] \n\t" "abs %[absa1], %[a1] \n\t"
"replv.qb %[vector_a1], %[absa1] \n\t" "replv.qb %[vector_a1], %[absa1] \n\t"
: [absa1] "=r" (absa1), [vector_a1] "=r" (vector_a1) : [absa1] "=r"(absa1), [vector_a1] "=r"(vector_a1)
: [a1] "r" (a1) : [a1] "r"(a1));
);
for (r = 16; r--;) { for (r = 16; r--;) {
__asm__ __volatile__ ( __asm__ __volatile__(
"lw %[t1], 0(%[dest]) \n\t" "lw %[t1], 0(%[dest]) \n\t"
"lw %[t2], 4(%[dest]) \n\t" "lw %[t2], 4(%[dest]) \n\t"
"lw %[t3], 8(%[dest]) \n\t" "lw %[t3], 8(%[dest]) \n\t"
@ -1009,25 +977,22 @@ void vpx_idct16x16_1_add_dspr2(const int16_t *input, uint8_t *dest,
"sw %[vector_4], 12(%[dest]) \n\t" "sw %[vector_4], 12(%[dest]) \n\t"
"add %[dest], %[dest], %[dest_stride] \n\t" "add %[dest], %[dest], %[dest_stride] \n\t"
: [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3), [t4] "=&r" (t4), : [t1] "=&r"(t1), [t2] "=&r"(t2), [t3] "=&r"(t3), [t4] "=&r"(t4),
[vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2), [vector_1] "=&r"(vector_1), [vector_2] "=&r"(vector_2),
[vector_3] "=&r" (vector_3), [vector_4] "=&r" (vector_4), [vector_3] "=&r"(vector_3), [vector_4] "=&r"(vector_4),
[dest] "+&r" (dest) [dest] "+&r"(dest)
: [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1) : [dest_stride] "r"(dest_stride), [vector_a1] "r"(vector_a1));
);
} }
} else { } else {
/* use quad-byte /* use quad-byte
* input and output memory are four byte aligned */ * input and output memory are four byte aligned */
__asm__ __volatile__ ( __asm__ __volatile__("replv.qb %[vector_a1], %[a1] \n\t"
"replv.qb %[vector_a1], %[a1] \n\t"
: [vector_a1] "=r" (vector_a1) : [vector_a1] "=r"(vector_a1)
: [a1] "r" (a1) : [a1] "r"(a1));
);
for (r = 16; r--;) { for (r = 16; r--;) {
__asm__ __volatile__ ( __asm__ __volatile__(
"lw %[t1], 0(%[dest]) \n\t" "lw %[t1], 0(%[dest]) \n\t"
"lw %[t2], 4(%[dest]) \n\t" "lw %[t2], 4(%[dest]) \n\t"
"lw %[t3], 8(%[dest]) \n\t" "lw %[t3], 8(%[dest]) \n\t"
@ -1042,12 +1007,11 @@ void vpx_idct16x16_1_add_dspr2(const int16_t *input, uint8_t *dest,
"sw %[vector_4], 12(%[dest]) \n\t" "sw %[vector_4], 12(%[dest]) \n\t"
"add %[dest], %[dest], %[dest_stride] \n\t" "add %[dest], %[dest], %[dest_stride] \n\t"
: [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3), [t4] "=&r" (t4), : [t1] "=&r"(t1), [t2] "=&r"(t2), [t3] "=&r"(t3), [t4] "=&r"(t4),
[vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2), [vector_1] "=&r"(vector_1), [vector_2] "=&r"(vector_2),
[vector_3] "=&r" (vector_3), [vector_4] "=&r" (vector_4), [vector_3] "=&r"(vector_3), [vector_4] "=&r"(vector_4),
[dest] "+&r" (dest) [dest] "+&r"(dest)
: [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1) : [dest_stride] "r"(dest_stride), [vector_a1] "r"(vector_a1));
);
} }
} }
} }
@ -1072,12 +1036,11 @@ void iadst16_dspr2(const int16_t *input, int16_t *output) {
int x14 = input[1]; int x14 = input[1];
int x15 = input[14]; int x15 = input[14];
if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 | x9 | x10 | x11 | x12 |
| x9 | x10 | x11 | x12 | x13 | x14 | x15)) { x13 | x14 | x15)) {
output[0] = output[1] = output[2] = output[3] = output[4] output[0] = output[1] = output[2] = output[3] = output[4] = output[5] =
= output[5] = output[6] = output[7] = output[8] output[6] = output[7] = output[8] = output[9] = output[10] =
= output[9] = output[10] = output[11] = output[12] output[11] = output[12] = output[13] = output[14] = output[15] = 0;
= output[13] = output[14] = output[15] = 0;
return; return;
} }
@ -1129,9 +1092,9 @@ void iadst16_dspr2(const int16_t *input, int16_t *output) {
s9 = x8 * cospi_28_64 - x9 * cospi_4_64; s9 = x8 * cospi_28_64 - x9 * cospi_4_64;
s10 = x10 * cospi_20_64 + x11 * cospi_12_64; s10 = x10 * cospi_20_64 + x11 * cospi_12_64;
s11 = x10 * cospi_12_64 - x11 * cospi_20_64; s11 = x10 * cospi_12_64 - x11 * cospi_20_64;
s12 = - x12 * cospi_28_64 + x13 * cospi_4_64; s12 = -x12 * cospi_28_64 + x13 * cospi_4_64;
s13 = x12 * cospi_4_64 + x13 * cospi_28_64; s13 = x12 * cospi_4_64 + x13 * cospi_28_64;
s14 = - x14 * cospi_12_64 + x15 * cospi_20_64; s14 = -x14 * cospi_12_64 + x15 * cospi_20_64;
s15 = x14 * cospi_20_64 + x15 * cospi_12_64; s15 = x14 * cospi_20_64 + x15 * cospi_12_64;
x0 = s0 + s4; x0 = s0 + s4;
@ -1158,7 +1121,7 @@ void iadst16_dspr2(const int16_t *input, int16_t *output) {
s3 = x3; s3 = x3;
s4 = x4 * cospi_8_64 + x5 * cospi_24_64; s4 = x4 * cospi_8_64 + x5 * cospi_24_64;
s5 = x4 * cospi_24_64 - x5 * cospi_8_64; s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
s6 = - x6 * cospi_24_64 + x7 * cospi_8_64; s6 = -x6 * cospi_24_64 + x7 * cospi_8_64;
s7 = x6 * cospi_8_64 + x7 * cospi_24_64; s7 = x6 * cospi_8_64 + x7 * cospi_24_64;
s8 = x8; s8 = x8;
s9 = x9; s9 = x9;
@ -1166,7 +1129,7 @@ void iadst16_dspr2(const int16_t *input, int16_t *output) {
s11 = x11; s11 = x11;
s12 = x12 * cospi_8_64 + x13 * cospi_24_64; s12 = x12 * cospi_8_64 + x13 * cospi_24_64;
s13 = x12 * cospi_24_64 - x13 * cospi_8_64; s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
s14 = - x14 * cospi_24_64 + x15 * cospi_8_64; s14 = -x14 * cospi_24_64 + x15 * cospi_8_64;
s15 = x14 * cospi_8_64 + x15 * cospi_24_64; s15 = x14 * cospi_8_64 + x15 * cospi_24_64;
x0 = s0 + s2; x0 = s0 + s2;
@ -1187,13 +1150,13 @@ void iadst16_dspr2(const int16_t *input, int16_t *output) {
x15 = dct_const_round_shift(s13 - s15); x15 = dct_const_round_shift(s13 - s15);
// stage 4 // stage 4
s2 = (- cospi_16_64) * (x2 + x3); s2 = (-cospi_16_64) * (x2 + x3);
s3 = cospi_16_64 * (x2 - x3); s3 = cospi_16_64 * (x2 - x3);
s6 = cospi_16_64 * (x6 + x7); s6 = cospi_16_64 * (x6 + x7);
s7 = cospi_16_64 * (- x6 + x7); s7 = cospi_16_64 * (-x6 + x7);
s10 = cospi_16_64 * (x10 + x11); s10 = cospi_16_64 * (x10 + x11);
s11 = cospi_16_64 * (- x10 + x11); s11 = cospi_16_64 * (-x10 + x11);
s14 = (- cospi_16_64) * (x14 + x15); s14 = (-cospi_16_64) * (x14 + x15);
s15 = cospi_16_64 * (x14 - x15); s15 = cospi_16_64 * (x14 - x15);
x2 = dct_const_round_shift(s2); x2 = dct_const_round_shift(s2);
@ -1223,5 +1186,4 @@ void iadst16_dspr2(const int16_t *input, int16_t *output) {
output[15] = -x1; output[15] = -x1;
} }
#endif // HAVE_DSPR2 #endif // HAVE_DSPR2

View File

@ -51,7 +51,7 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
dest_pix = dest + i; dest_pix = dest + i;
dest_pix1 = dest + i + 31 * dest_stride; dest_pix1 = dest + i + 31 * dest_stride;
__asm__ __volatile__ ( __asm__ __volatile__(
"lh %[load1], 2(%[input]) \n\t" "lh %[load1], 2(%[input]) \n\t"
"lh %[load2], 62(%[input]) \n\t" "lh %[load2], 62(%[input]) \n\t"
"lh %[load3], 34(%[input]) \n\t" "lh %[load3], 34(%[input]) \n\t"
@ -101,18 +101,17 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
"add %[step1_16], %[temp0], %[temp1] \n\t" "add %[step1_16], %[temp0], %[temp1] \n\t"
"add %[step1_31], %[temp2], %[temp3] \n\t" "add %[step1_31], %[temp2], %[temp3] \n\t"
: [load1] "=&r" (load1), [load2] "=&r" (load2), [load3] "=&r" (load3), : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
[load4] "=&r" (load4), [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
[temp2] "=&r" (temp2), [temp3] "=&r" (temp3), [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_16] "=r"(step1_16),
[step1_16] "=r" (step1_16), [step1_17] "=r" (step1_17), [step1_17] "=r"(step1_17), [step1_30] "=r"(step1_30),
[step1_30] "=r" (step1_30), [step1_31] "=r" (step1_31) [step1_31] "=r"(step1_31)
: [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
[cospi_31_64] "r" (cospi_31_64), [cospi_1_64] "r" (cospi_1_64), [cospi_31_64] "r"(cospi_31_64), [cospi_1_64] "r"(cospi_1_64),
[cospi_4_64] "r" (cospi_4_64), [cospi_17_64] "r" (cospi_17_64), [cospi_4_64] "r"(cospi_4_64), [cospi_17_64] "r"(cospi_17_64),
[cospi_15_64] "r" (cospi_15_64), [cospi_28_64] "r" (cospi_28_64) [cospi_15_64] "r"(cospi_15_64), [cospi_28_64] "r"(cospi_28_64));
);
__asm__ __volatile__ ( __asm__ __volatile__(
"lh %[load1], 18(%[input]) \n\t" "lh %[load1], 18(%[input]) \n\t"
"lh %[load2], 46(%[input]) \n\t" "lh %[load2], 46(%[input]) \n\t"
"lh %[load3], 50(%[input]) \n\t" "lh %[load3], 50(%[input]) \n\t"
@ -162,18 +161,17 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
"add %[step1_19], %[temp0], %[temp1] \n\t" "add %[step1_19], %[temp0], %[temp1] \n\t"
"add %[step1_28], %[temp2], %[temp3] \n\t" "add %[step1_28], %[temp2], %[temp3] \n\t"
: [load1] "=&r" (load1), [load2] "=&r" (load2), [load3] "=&r" (load3), : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
[load4] "=&r" (load4), [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
[temp2] "=&r" (temp2), [temp3] "=&r" (temp3), [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_18] "=r"(step1_18),
[step1_18] "=r" (step1_18), [step1_19] "=r" (step1_19), [step1_19] "=r"(step1_19), [step1_28] "=r"(step1_28),
[step1_28] "=r" (step1_28), [step1_29] "=r" (step1_29) [step1_29] "=r"(step1_29)
: [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
[cospi_23_64] "r" (cospi_23_64), [cospi_9_64] "r" (cospi_9_64), [cospi_23_64] "r"(cospi_23_64), [cospi_9_64] "r"(cospi_9_64),
[cospi_4_64] "r" (cospi_4_64), [cospi_7_64] "r" (cospi_7_64), [cospi_4_64] "r"(cospi_4_64), [cospi_7_64] "r"(cospi_7_64),
[cospi_25_64] "r" (cospi_25_64), [cospi_28_64] "r" (cospi_28_64) [cospi_25_64] "r"(cospi_25_64), [cospi_28_64] "r"(cospi_28_64));
);
__asm__ __volatile__ ( __asm__ __volatile__(
"lh %[load1], 10(%[input]) \n\t" "lh %[load1], 10(%[input]) \n\t"
"lh %[load2], 54(%[input]) \n\t" "lh %[load2], 54(%[input]) \n\t"
"lh %[load3], 42(%[input]) \n\t" "lh %[load3], 42(%[input]) \n\t"
@ -223,18 +221,17 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
"add %[step1_20], %[temp0], %[temp1] \n\t" "add %[step1_20], %[temp0], %[temp1] \n\t"
"add %[step1_27], %[temp2], %[temp3] \n\t" "add %[step1_27], %[temp2], %[temp3] \n\t"
: [load1] "=&r" (load1), [load2] "=&r" (load2), [load3] "=&r" (load3), : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
[load4] "=&r" (load4), [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
[temp2] "=&r" (temp2), [temp3] "=&r" (temp3), [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_20] "=r"(step1_20),
[step1_20] "=r" (step1_20), [step1_21] "=r" (step1_21), [step1_21] "=r"(step1_21), [step1_26] "=r"(step1_26),
[step1_26] "=r" (step1_26), [step1_27] "=r" (step1_27) [step1_27] "=r"(step1_27)
: [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
[cospi_27_64] "r" (cospi_27_64), [cospi_5_64] "r" (cospi_5_64), [cospi_27_64] "r"(cospi_27_64), [cospi_5_64] "r"(cospi_5_64),
[cospi_11_64] "r" (cospi_11_64), [cospi_21_64] "r" (cospi_21_64), [cospi_11_64] "r"(cospi_11_64), [cospi_21_64] "r"(cospi_21_64),
[cospi_12_64] "r" (cospi_12_64), [cospi_20_64] "r" (cospi_20_64) [cospi_12_64] "r"(cospi_12_64), [cospi_20_64] "r"(cospi_20_64));
);
__asm__ __volatile__ ( __asm__ __volatile__(
"lh %[load1], 26(%[input]) \n\t" "lh %[load1], 26(%[input]) \n\t"
"lh %[load2], 38(%[input]) \n\t" "lh %[load2], 38(%[input]) \n\t"
"lh %[load3], 58(%[input]) \n\t" "lh %[load3], 58(%[input]) \n\t"
@ -280,18 +277,17 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
"add %[step1_23], %[temp0], %[temp1] \n\t" "add %[step1_23], %[temp0], %[temp1] \n\t"
"add %[step1_24], %[temp2], %[temp3] \n\t" "add %[step1_24], %[temp2], %[temp3] \n\t"
: [load1] "=&r" (load1), [load2] "=&r" (load2), [load3] "=&r" (load3), : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
[load4] "=&r" (load4), [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
[temp2] "=&r" (temp2), [temp3] "=&r" (temp3), [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_22] "=r"(step1_22),
[step1_22] "=r" (step1_22), [step1_23] "=r" (step1_23), [step1_23] "=r"(step1_23), [step1_24] "=r"(step1_24),
[step1_24] "=r" (step1_24), [step1_25] "=r" (step1_25) [step1_25] "=r"(step1_25)
: [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
[cospi_19_64] "r" (cospi_19_64), [cospi_13_64] "r" (cospi_13_64), [cospi_19_64] "r"(cospi_19_64), [cospi_13_64] "r"(cospi_13_64),
[cospi_3_64] "r" (cospi_3_64), [cospi_29_64] "r" (cospi_29_64), [cospi_3_64] "r"(cospi_3_64), [cospi_29_64] "r"(cospi_29_64),
[cospi_12_64] "r" (cospi_12_64), [cospi_20_64] "r" (cospi_20_64) [cospi_12_64] "r"(cospi_12_64), [cospi_20_64] "r"(cospi_20_64));
);
__asm__ __volatile__ ( __asm__ __volatile__(
"lh %[load1], 4(%[input]) \n\t" "lh %[load1], 4(%[input]) \n\t"
"lh %[load2], 60(%[input]) \n\t" "lh %[load2], 60(%[input]) \n\t"
"lh %[load3], 36(%[input]) \n\t" "lh %[load3], 36(%[input]) \n\t"
@ -337,18 +333,17 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
"add %[step2_8], %[temp0], %[temp1] \n\t" "add %[step2_8], %[temp0], %[temp1] \n\t"
"add %[step2_15], %[temp2], %[temp3] \n\t" "add %[step2_15], %[temp2], %[temp3] \n\t"
: [load1] "=&r" (load1), [load2] "=&r" (load2), [load3] "=&r" (load3), : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
[load4] "=&r" (load4), [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
[temp2] "=&r" (temp2), [temp3] "=&r" (temp3), [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step2_8] "=r"(step2_8),
[step2_8] "=r" (step2_8), [step2_9] "=r" (step2_9), [step2_9] "=r"(step2_9), [step2_14] "=r"(step2_14),
[step2_14] "=r" (step2_14), [step2_15] "=r" (step2_15) [step2_15] "=r"(step2_15)
: [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
[cospi_30_64] "r" (cospi_30_64), [cospi_2_64] "r" (cospi_2_64), [cospi_30_64] "r"(cospi_30_64), [cospi_2_64] "r"(cospi_2_64),
[cospi_14_64] "r" (cospi_14_64), [cospi_18_64] "r" (cospi_18_64), [cospi_14_64] "r"(cospi_14_64), [cospi_18_64] "r"(cospi_18_64),
[cospi_8_64] "r" (cospi_8_64), [cospi_24_64] "r" (cospi_24_64) [cospi_8_64] "r"(cospi_8_64), [cospi_24_64] "r"(cospi_24_64));
);
__asm__ __volatile__ ( __asm__ __volatile__(
"lh %[load1], 20(%[input]) \n\t" "lh %[load1], 20(%[input]) \n\t"
"lh %[load2], 44(%[input]) \n\t" "lh %[load2], 44(%[input]) \n\t"
"lh %[load3], 52(%[input]) \n\t" "lh %[load3], 52(%[input]) \n\t"
@ -394,18 +389,17 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
"add %[step2_11], %[temp0], %[temp1] \n\t" "add %[step2_11], %[temp0], %[temp1] \n\t"
"add %[step2_12], %[temp2], %[temp3] \n\t" "add %[step2_12], %[temp2], %[temp3] \n\t"
: [load1] "=&r" (load1), [load2] "=&r" (load2), [load3] "=&r" (load3), : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
[load4] "=&r" (load4), [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
[temp2] "=&r" (temp2), [temp3] "=&r" (temp3), [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step2_10] "=r"(step2_10),
[step2_10] "=r" (step2_10), [step2_11] "=r" (step2_11), [step2_11] "=r"(step2_11), [step2_12] "=r"(step2_12),
[step2_12] "=r" (step2_12), [step2_13] "=r" (step2_13) [step2_13] "=r"(step2_13)
: [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
[cospi_22_64] "r" (cospi_22_64), [cospi_10_64] "r" (cospi_10_64), [cospi_22_64] "r"(cospi_22_64), [cospi_10_64] "r"(cospi_10_64),
[cospi_6_64] "r" (cospi_6_64), [cospi_26_64] "r" (cospi_26_64), [cospi_6_64] "r"(cospi_6_64), [cospi_26_64] "r"(cospi_26_64),
[cospi_8_64] "r" (cospi_8_64), [cospi_24_64] "r" (cospi_24_64) [cospi_8_64] "r"(cospi_8_64), [cospi_24_64] "r"(cospi_24_64));
);
__asm__ __volatile__ ( __asm__ __volatile__(
"mtlo %[const_2_power_13], $ac0 \n\t" "mtlo %[const_2_power_13], $ac0 \n\t"
"mthi $zero, $ac0 \n\t" "mthi $zero, $ac0 \n\t"
"sub %[temp0], %[step2_14], %[step2_13] \n\t" "sub %[temp0], %[step2_14], %[step2_13] \n\t"
@ -440,33 +434,31 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
"extp %[step3_11], $ac2, 31 \n\t" "extp %[step3_11], $ac2, 31 \n\t"
"extp %[step3_12], $ac3, 31 \n\t" "extp %[step3_12], $ac3, 31 \n\t"
: [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [step3_8] "=r"(step3_8),
[step3_8] "=r" (step3_8), [step3_9] "=r" (step3_9), [step3_9] "=r"(step3_9), [step3_10] "=r"(step3_10),
[step3_10] "=r" (step3_10), [step3_11] "=r" (step3_11), [step3_11] "=r"(step3_11), [step3_12] "=r"(step3_12),
[step3_12] "=r" (step3_12), [step3_13] "=r" (step3_13), [step3_13] "=r"(step3_13), [step3_14] "=r"(step3_14),
[step3_14] "=r" (step3_14), [step3_15] "=r" (step3_15) [step3_15] "=r"(step3_15)
: [const_2_power_13] "r" (const_2_power_13), [step2_8] "r" (step2_8), : [const_2_power_13] "r"(const_2_power_13), [step2_8] "r"(step2_8),
[step2_9] "r" (step2_9), [step2_10] "r" (step2_10), [step2_9] "r"(step2_9), [step2_10] "r"(step2_10),
[step2_11] "r" (step2_11), [step2_12] "r" (step2_12), [step2_11] "r"(step2_11), [step2_12] "r"(step2_12),
[step2_13] "r" (step2_13), [step2_14] "r" (step2_14), [step2_13] "r"(step2_13), [step2_14] "r"(step2_14),
[step2_15] "r" (step2_15), [cospi_16_64] "r" (cospi_16_64) [step2_15] "r"(step2_15), [cospi_16_64] "r"(cospi_16_64));
);
step2_18 = step1_17 - step1_18; step2_18 = step1_17 - step1_18;
step2_29 = step1_30 - step1_29; step2_29 = step1_30 - step1_29;
__asm__ __volatile__ ( __asm__ __volatile__(
"mtlo %[const_2_power_13], $ac0 \n\t" "mtlo %[const_2_power_13], $ac0 \n\t"
"mthi $zero, $ac0 \n\t" "mthi $zero, $ac0 \n\t"
"msub $ac0, %[step2_18], %[cospi_8_64] \n\t" "msub $ac0, %[step2_18], %[cospi_8_64] \n\t"
"madd $ac0, %[step2_29], %[cospi_24_64] \n\t" "madd $ac0, %[step2_29], %[cospi_24_64] \n\t"
"extp %[step3_18], $ac0, 31 \n\t" "extp %[step3_18], $ac0, 31 \n\t"
: [step3_18] "=r" (step3_18) : [step3_18] "=r"(step3_18)
: [const_2_power_13] "r" (const_2_power_13), : [const_2_power_13] "r"(const_2_power_13), [step2_18] "r"(step2_18),
[step2_18] "r" (step2_18), [step2_29] "r" (step2_29), [step2_29] "r"(step2_29), [cospi_24_64] "r"(cospi_24_64),
[cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64) [cospi_8_64] "r"(cospi_8_64));
);
temp21 = step2_18 * cospi_24_64 + step2_29 * cospi_8_64; temp21 = step2_18 * cospi_24_64 + step2_29 * cospi_8_64;
step3_29 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; step3_29 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
@ -474,18 +466,17 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
step2_19 = step1_16 - step1_19; step2_19 = step1_16 - step1_19;
step2_28 = step1_31 - step1_28; step2_28 = step1_31 - step1_28;
__asm__ __volatile__ ( __asm__ __volatile__(
"mtlo %[const_2_power_13], $ac0 \n\t" "mtlo %[const_2_power_13], $ac0 \n\t"
"mthi $zero, $ac0 \n\t" "mthi $zero, $ac0 \n\t"
"msub $ac0, %[step2_19], %[cospi_8_64] \n\t" "msub $ac0, %[step2_19], %[cospi_8_64] \n\t"
"madd $ac0, %[step2_28], %[cospi_24_64] \n\t" "madd $ac0, %[step2_28], %[cospi_24_64] \n\t"
"extp %[step3_19], $ac0, 31 \n\t" "extp %[step3_19], $ac0, 31 \n\t"
: [step3_19] "=r" (step3_19) : [step3_19] "=r"(step3_19)
: [const_2_power_13] "r" (const_2_power_13), : [const_2_power_13] "r"(const_2_power_13), [step2_19] "r"(step2_19),
[step2_19] "r" (step2_19), [step2_28] "r" (step2_28), [step2_28] "r"(step2_28), [cospi_24_64] "r"(cospi_24_64),
[cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64) [cospi_8_64] "r"(cospi_8_64));
);
temp21 = step2_19 * cospi_24_64 + step2_28 * cospi_8_64; temp21 = step2_19 * cospi_24_64 + step2_28 * cospi_8_64;
step3_28 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; step3_28 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
@ -498,18 +489,17 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
step2_20 = step1_23 - step1_20; step2_20 = step1_23 - step1_20;
step2_27 = step1_24 - step1_27; step2_27 = step1_24 - step1_27;
__asm__ __volatile__ ( __asm__ __volatile__(
"mtlo %[const_2_power_13], $ac0 \n\t" "mtlo %[const_2_power_13], $ac0 \n\t"
"mthi $zero, $ac0 \n\t" "mthi $zero, $ac0 \n\t"
"msub $ac0, %[step2_20], %[cospi_24_64] \n\t" "msub $ac0, %[step2_20], %[cospi_24_64] \n\t"
"msub $ac0, %[step2_27], %[cospi_8_64] \n\t" "msub $ac0, %[step2_27], %[cospi_8_64] \n\t"
"extp %[step3_20], $ac0, 31 \n\t" "extp %[step3_20], $ac0, 31 \n\t"
: [step3_20] "=r" (step3_20) : [step3_20] "=r"(step3_20)
: [const_2_power_13] "r" (const_2_power_13), : [const_2_power_13] "r"(const_2_power_13), [step2_20] "r"(step2_20),
[step2_20] "r" (step2_20), [step2_27] "r" (step2_27), [step2_27] "r"(step2_27), [cospi_24_64] "r"(cospi_24_64),
[cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64) [cospi_8_64] "r"(cospi_8_64));
);
temp21 = -step2_20 * cospi_8_64 + step2_27 * cospi_24_64; temp21 = -step2_20 * cospi_8_64 + step2_27 * cospi_24_64;
step3_27 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; step3_27 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
@ -517,18 +507,17 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
step2_21 = step1_22 - step1_21; step2_21 = step1_22 - step1_21;
step2_26 = step1_25 - step1_26; step2_26 = step1_25 - step1_26;
__asm__ __volatile__ ( __asm__ __volatile__(
"mtlo %[const_2_power_13], $ac1 \n\t" "mtlo %[const_2_power_13], $ac1 \n\t"
"mthi $zero, $ac1 \n\t" "mthi $zero, $ac1 \n\t"
"msub $ac1, %[step2_21], %[cospi_24_64] \n\t" "msub $ac1, %[step2_21], %[cospi_24_64] \n\t"
"msub $ac1, %[step2_26], %[cospi_8_64] \n\t" "msub $ac1, %[step2_26], %[cospi_8_64] \n\t"
"extp %[step3_21], $ac1, 31 \n\t" "extp %[step3_21], $ac1, 31 \n\t"
: [step3_21] "=r" (step3_21) : [step3_21] "=r"(step3_21)
: [const_2_power_13] "r" (const_2_power_13), : [const_2_power_13] "r"(const_2_power_13), [step2_21] "r"(step2_21),
[step2_21] "r" (step2_21), [step2_26] "r" (step2_26), [step2_26] "r"(step2_26), [cospi_24_64] "r"(cospi_24_64),
[cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64) [cospi_8_64] "r"(cospi_8_64));
);
temp21 = -step2_21 * cospi_8_64 + step2_26 * cospi_24_64; temp21 = -step2_21 * cospi_8_64 + step2_26 * cospi_24_64;
step3_26 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; step3_26 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
@ -556,7 +545,7 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
step2_30 = step3_30 + step3_25; step2_30 = step3_30 + step3_25;
step2_31 = step3_31 + step3_24; step2_31 = step3_31 + step3_24;
__asm__ __volatile__ ( __asm__ __volatile__(
"lh %[load1], 0(%[input]) \n\t" "lh %[load1], 0(%[input]) \n\t"
"lh %[load2], 32(%[input]) \n\t" "lh %[load2], 32(%[input]) \n\t"
"lh %[load3], 16(%[input]) \n\t" "lh %[load3], 16(%[input]) \n\t"
@ -588,19 +577,17 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
"sub %[step1_2], %[temp1], %[temp2] \n\t" "sub %[step1_2], %[temp1], %[temp2] \n\t"
"sub %[step1_3], %[temp0], %[temp3] \n\t" "sub %[step1_3], %[temp0], %[temp3] \n\t"
: [load1] "=&r" (load1), [load2] "=&r" (load2), : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
[load3] "=&r" (load3), [load4] "=&r" (load4), [load4] "=&r"(load4), [result1] "=&r"(result1),
[result1] "=&r" (result1), [result2] "=&r" (result2), [result2] "=&r"(result2), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
[temp0] "=&r" (temp0), [temp1] "=&r" (temp1), [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_0] "=r"(step1_0),
[temp2] "=&r" (temp2), [temp3] "=&r" (temp3), [step1_1] "=r"(step1_1), [step1_2] "=r"(step1_2),
[step1_0] "=r" (step1_0), [step1_1] "=r" (step1_1), [step1_3] "=r"(step1_3)
[step1_2] "=r" (step1_2), [step1_3] "=r" (step1_3) : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
: [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64),
[cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64), [cospi_16_64] "r"(cospi_16_64));
[cospi_16_64] "r" (cospi_16_64)
);
__asm__ __volatile__ ( __asm__ __volatile__(
"lh %[load1], 8(%[input]) \n\t" "lh %[load1], 8(%[input]) \n\t"
"lh %[load2], 56(%[input]) \n\t" "lh %[load2], 56(%[input]) \n\t"
"lh %[load3], 40(%[input]) \n\t" "lh %[load3], 40(%[input]) \n\t"
@ -649,17 +636,15 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
"add %[step1_4], %[temp0], %[temp1] \n\t" "add %[step1_4], %[temp0], %[temp1] \n\t"
"add %[step1_7], %[temp3], %[temp2] \n\t" "add %[step1_7], %[temp3], %[temp2] \n\t"
: [load1] "=&r" (load1), [load2] "=&r" (load2), : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
[load3] "=&r" (load3), [load4] "=&r" (load4), [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
[temp0] "=&r" (temp0), [temp1] "=&r" (temp1), [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_4] "=r"(step1_4),
[temp2] "=&r" (temp2), [temp3] "=&r" (temp3), [step1_5] "=r"(step1_5), [step1_6] "=r"(step1_6),
[step1_4] "=r" (step1_4), [step1_5] "=r" (step1_5), [step1_7] "=r"(step1_7)
[step1_6] "=r" (step1_6), [step1_7] "=r" (step1_7) : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
: [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), [cospi_20_64] "r"(cospi_20_64), [cospi_12_64] "r"(cospi_12_64),
[cospi_20_64] "r" (cospi_20_64), [cospi_12_64] "r" (cospi_12_64), [cospi_4_64] "r"(cospi_4_64), [cospi_28_64] "r"(cospi_28_64),
[cospi_4_64] "r" (cospi_4_64), [cospi_28_64] "r" (cospi_28_64), [cospi_16_64] "r"(cospi_16_64));
[cospi_16_64] "r" (cospi_16_64)
);
step2_0 = step1_0 + step1_7; step2_0 = step1_0 + step1_7;
step2_1 = step1_1 + step1_6; step2_1 = step1_1 + step1_6;
@ -688,67 +673,63 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
step1_14 = step2_1 - step3_14; step1_14 = step2_1 - step3_14;
step1_15 = step2_0 - step3_15; step1_15 = step2_0 - step3_15;
__asm__ __volatile__ ( __asm__ __volatile__(
"sub %[temp0], %[step2_27], %[step2_20] \n\t" "sub %[temp0], %[step2_27], %[step2_20] \n\t"
"mtlo %[const_2_power_13], $ac0 \n\t" "mtlo %[const_2_power_13], $ac0 \n\t"
"mthi $zero, $ac0 \n\t" "mthi $zero, $ac0 \n\t"
"madd $ac0, %[temp0], %[cospi_16_64] \n\t" "madd $ac0, %[temp0], %[cospi_16_64] \n\t"
"extp %[step1_20], $ac0, 31 \n\t" "extp %[step1_20], $ac0, 31 \n\t"
: [temp0] "=&r" (temp0), [step1_20] "=r" (step1_20) : [temp0] "=&r"(temp0), [step1_20] "=r"(step1_20)
: [const_2_power_13] "r" (const_2_power_13), [step2_20] "r" (step2_20), : [const_2_power_13] "r"(const_2_power_13), [step2_20] "r"(step2_20),
[step2_27] "r" (step2_27), [cospi_16_64] "r" (cospi_16_64) [step2_27] "r"(step2_27), [cospi_16_64] "r"(cospi_16_64));
);
temp21 = (step2_20 + step2_27) * cospi_16_64; temp21 = (step2_20 + step2_27) * cospi_16_64;
step1_27 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; step1_27 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
__asm__ __volatile__ ( __asm__ __volatile__(
"sub %[temp0], %[step2_26], %[step2_21] \n\t" "sub %[temp0], %[step2_26], %[step2_21] \n\t"
"mtlo %[const_2_power_13], $ac0 \n\t" "mtlo %[const_2_power_13], $ac0 \n\t"
"mthi $zero, $ac0 \n\t" "mthi $zero, $ac0 \n\t"
"madd $ac0, %[temp0], %[cospi_16_64] \n\t" "madd $ac0, %[temp0], %[cospi_16_64] \n\t"
"extp %[step1_21], $ac0, 31 \n\t" "extp %[step1_21], $ac0, 31 \n\t"
: [temp0] "=&r" (temp0), [step1_21] "=r" (step1_21) : [temp0] "=&r"(temp0), [step1_21] "=r"(step1_21)
: [const_2_power_13] "r" (const_2_power_13), [step2_26] "r" (step2_26), : [const_2_power_13] "r"(const_2_power_13), [step2_26] "r"(step2_26),
[step2_21] "r" (step2_21), [cospi_16_64] "r" (cospi_16_64) [step2_21] "r"(step2_21), [cospi_16_64] "r"(cospi_16_64));
);
temp21 = (step2_21 + step2_26) * cospi_16_64; temp21 = (step2_21 + step2_26) * cospi_16_64;
step1_26 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; step1_26 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
__asm__ __volatile__ ( __asm__ __volatile__(
"sub %[temp0], %[step2_25], %[step2_22] \n\t" "sub %[temp0], %[step2_25], %[step2_22] \n\t"
"mtlo %[const_2_power_13], $ac0 \n\t" "mtlo %[const_2_power_13], $ac0 \n\t"
"mthi $zero, $ac0 \n\t" "mthi $zero, $ac0 \n\t"
"madd $ac0, %[temp0], %[cospi_16_64] \n\t" "madd $ac0, %[temp0], %[cospi_16_64] \n\t"
"extp %[step1_22], $ac0, 31 \n\t" "extp %[step1_22], $ac0, 31 \n\t"
: [temp0] "=&r" (temp0), [step1_22] "=r" (step1_22) : [temp0] "=&r"(temp0), [step1_22] "=r"(step1_22)
: [const_2_power_13] "r" (const_2_power_13), [step2_25] "r" (step2_25), : [const_2_power_13] "r"(const_2_power_13), [step2_25] "r"(step2_25),
[step2_22] "r" (step2_22), [cospi_16_64] "r" (cospi_16_64) [step2_22] "r"(step2_22), [cospi_16_64] "r"(cospi_16_64));
);
temp21 = (step2_22 + step2_25) * cospi_16_64; temp21 = (step2_22 + step2_25) * cospi_16_64;
step1_25 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; step1_25 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
__asm__ __volatile__ ( __asm__ __volatile__(
"sub %[temp0], %[step2_24], %[step2_23] \n\t" "sub %[temp0], %[step2_24], %[step2_23] \n\t"
"mtlo %[const_2_power_13], $ac0 \n\t" "mtlo %[const_2_power_13], $ac0 \n\t"
"mthi $zero, $ac0 \n\t" "mthi $zero, $ac0 \n\t"
"madd $ac0, %[temp0], %[cospi_16_64] \n\t" "madd $ac0, %[temp0], %[cospi_16_64] \n\t"
"extp %[step1_23], $ac0, 31 \n\t" "extp %[step1_23], $ac0, 31 \n\t"
: [temp0] "=&r" (temp0), [step1_23] "=r" (step1_23) : [temp0] "=&r"(temp0), [step1_23] "=r"(step1_23)
: [const_2_power_13] "r" (const_2_power_13), [step2_24] "r" (step2_24), : [const_2_power_13] "r"(const_2_power_13), [step2_24] "r"(step2_24),
[step2_23] "r" (step2_23), [cospi_16_64] "r" (cospi_16_64) [step2_23] "r"(step2_23), [cospi_16_64] "r"(cospi_16_64));
);
temp21 = (step2_23 + step2_24) * cospi_16_64; temp21 = (step2_23 + step2_24) * cospi_16_64;
step1_24 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; step1_24 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
__asm__ __volatile__ ( __asm__ __volatile__(
"lbu %[temp2], 0(%[dest_pix]) \n\t" "lbu %[temp2], 0(%[dest_pix]) \n\t"
"add %[temp0], %[step1_0], %[step2_31] \n\t" "add %[temp0], %[step1_0], %[step2_31] \n\t"
"addi %[temp0], %[temp0], 32 \n\t" "addi %[temp0], %[temp0], 32 \n\t"
@ -783,21 +764,20 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
"sb %[temp1], 0(%[dest_pix]) \n\t" "sb %[temp1], 0(%[dest_pix]) \n\t"
"addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
: [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), [temp2] "=&r" (temp2), : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2),
[temp3] "=&r" (temp3), [dest_pix] "+r" (dest_pix) [temp3] "=&r"(temp3), [dest_pix] "+r"(dest_pix)
: [cm] "r" (cm), [dest_stride] "r" (dest_stride), : [cm] "r"(cm), [dest_stride] "r"(dest_stride), [step1_0] "r"(step1_0),
[step1_0] "r" (step1_0), [step1_1] "r" (step1_1), [step1_1] "r"(step1_1), [step1_2] "r"(step1_2),
[step1_2] "r" (step1_2), [step1_3] "r" (step1_3), [step1_3] "r"(step1_3), [step2_28] "r"(step2_28),
[step2_28] "r" (step2_28), [step2_29] "r" (step2_29), [step2_29] "r"(step2_29), [step2_30] "r"(step2_30),
[step2_30] "r" (step2_30), [step2_31] "r" (step2_31) [step2_31] "r"(step2_31));
);
step3_12 = ROUND_POWER_OF_TWO((step1_3 - step2_28), 6); step3_12 = ROUND_POWER_OF_TWO((step1_3 - step2_28), 6);
step3_13 = ROUND_POWER_OF_TWO((step1_2 - step2_29), 6); step3_13 = ROUND_POWER_OF_TWO((step1_2 - step2_29), 6);
step3_14 = ROUND_POWER_OF_TWO((step1_1 - step2_30), 6); step3_14 = ROUND_POWER_OF_TWO((step1_1 - step2_30), 6);
step3_15 = ROUND_POWER_OF_TWO((step1_0 - step2_31), 6); step3_15 = ROUND_POWER_OF_TWO((step1_0 - step2_31), 6);
__asm__ __volatile__ ( __asm__ __volatile__(
"lbu %[temp2], 0(%[dest_pix1]) \n\t" "lbu %[temp2], 0(%[dest_pix1]) \n\t"
"add %[temp2], %[temp2], %[step3_15] \n\t" "add %[temp2], %[temp2], %[step3_15] \n\t"
"lbux %[temp0], %[temp2](%[cm]) \n\t" "lbux %[temp0], %[temp2](%[cm]) \n\t"
@ -820,14 +800,13 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
"sb %[temp1], 0(%[dest_pix1]) \n\t" "sb %[temp1], 0(%[dest_pix1]) \n\t"
"subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t"
: [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), [temp2] "=&r" (temp2), : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2),
[temp3] "=&r" (temp3), [dest_pix1] "+r" (dest_pix1) [temp3] "=&r"(temp3), [dest_pix1] "+r"(dest_pix1)
: [cm] "r" (cm), [dest_stride] "r" (dest_stride), : [cm] "r"(cm), [dest_stride] "r"(dest_stride),
[step3_12] "r" (step3_12), [step3_13] "r" (step3_13), [step3_12] "r"(step3_12), [step3_13] "r"(step3_13),
[step3_14] "r" (step3_14), [step3_15] "r" (step3_15) [step3_14] "r"(step3_14), [step3_15] "r"(step3_15));
);
__asm__ __volatile__ ( __asm__ __volatile__(
"lbu %[temp2], 0(%[dest_pix]) \n\t" "lbu %[temp2], 0(%[dest_pix]) \n\t"
"add %[temp0], %[step1_4], %[step1_27] \n\t" "add %[temp0], %[step1_4], %[step1_27] \n\t"
"addi %[temp0], %[temp0], 32 \n\t" "addi %[temp0], %[temp0], 32 \n\t"
@ -862,21 +841,20 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
"sb %[temp1], 0(%[dest_pix]) \n\t" "sb %[temp1], 0(%[dest_pix]) \n\t"
"addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
: [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), [temp2] "=&r" (temp2), : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2),
[temp3] "=&r" (temp3), [dest_pix] "+r" (dest_pix) [temp3] "=&r"(temp3), [dest_pix] "+r"(dest_pix)
: [cm] "r" (cm), [dest_stride] "r" (dest_stride), : [cm] "r"(cm), [dest_stride] "r"(dest_stride), [step1_4] "r"(step1_4),
[step1_4] "r" (step1_4), [step1_5] "r" (step1_5), [step1_5] "r"(step1_5), [step1_6] "r"(step1_6),
[step1_6] "r" (step1_6), [step1_7] "r" (step1_7), [step1_7] "r"(step1_7), [step1_24] "r"(step1_24),
[step1_24] "r" (step1_24), [step1_25] "r" (step1_25), [step1_25] "r"(step1_25), [step1_26] "r"(step1_26),
[step1_26] "r" (step1_26), [step1_27] "r" (step1_27) [step1_27] "r"(step1_27));
);
step3_12 = ROUND_POWER_OF_TWO((step1_7 - step1_24), 6); step3_12 = ROUND_POWER_OF_TWO((step1_7 - step1_24), 6);
step3_13 = ROUND_POWER_OF_TWO((step1_6 - step1_25), 6); step3_13 = ROUND_POWER_OF_TWO((step1_6 - step1_25), 6);
step3_14 = ROUND_POWER_OF_TWO((step1_5 - step1_26), 6); step3_14 = ROUND_POWER_OF_TWO((step1_5 - step1_26), 6);
step3_15 = ROUND_POWER_OF_TWO((step1_4 - step1_27), 6); step3_15 = ROUND_POWER_OF_TWO((step1_4 - step1_27), 6);
__asm__ __volatile__ ( __asm__ __volatile__(
"lbu %[temp2], 0(%[dest_pix1]) \n\t" "lbu %[temp2], 0(%[dest_pix1]) \n\t"
"add %[temp2], %[temp2], %[step3_15] \n\t" "add %[temp2], %[temp2], %[step3_15] \n\t"
"lbux %[temp0], %[temp2](%[cm]) \n\t" "lbux %[temp0], %[temp2](%[cm]) \n\t"
@ -899,14 +877,13 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
"sb %[temp1], 0(%[dest_pix1]) \n\t" "sb %[temp1], 0(%[dest_pix1]) \n\t"
"subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t"
: [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), [temp2] "=&r" (temp2), : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2),
[temp3] "=&r" (temp3), [dest_pix1] "+r" (dest_pix1) [temp3] "=&r"(temp3), [dest_pix1] "+r"(dest_pix1)
: [cm] "r" (cm), [dest_stride] "r" (dest_stride), : [cm] "r"(cm), [dest_stride] "r"(dest_stride),
[step3_12] "r" (step3_12), [step3_13] "r" (step3_13), [step3_12] "r"(step3_12), [step3_13] "r"(step3_13),
[step3_14] "r" (step3_14), [step3_15] "r" (step3_15) [step3_14] "r"(step3_14), [step3_15] "r"(step3_15));
);
__asm__ __volatile__ ( __asm__ __volatile__(
"lbu %[temp2], 0(%[dest_pix]) \n\t" "lbu %[temp2], 0(%[dest_pix]) \n\t"
"add %[temp0], %[step1_8], %[step1_23] \n\t" "add %[temp0], %[step1_8], %[step1_23] \n\t"
"addi %[temp0], %[temp0], 32 \n\t" "addi %[temp0], %[temp0], 32 \n\t"
@ -941,21 +918,20 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
"sb %[temp1], 0(%[dest_pix]) \n\t" "sb %[temp1], 0(%[dest_pix]) \n\t"
"addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
: [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), [temp2] "=&r" (temp2), : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2),
[temp3] "=&r" (temp3), [dest_pix] "+r" (dest_pix) [temp3] "=&r"(temp3), [dest_pix] "+r"(dest_pix)
: [cm] "r" (cm), [dest_stride] "r" (dest_stride), : [cm] "r"(cm), [dest_stride] "r"(dest_stride), [step1_8] "r"(step1_8),
[step1_8] "r" (step1_8), [step1_9] "r" (step1_9), [step1_9] "r"(step1_9), [step1_10] "r"(step1_10),
[step1_10] "r" (step1_10), [step1_11] "r" (step1_11), [step1_11] "r"(step1_11), [step1_20] "r"(step1_20),
[step1_20] "r" (step1_20), [step1_21] "r" (step1_21), [step1_21] "r"(step1_21), [step1_22] "r"(step1_22),
[step1_22] "r" (step1_22), [step1_23] "r" (step1_23) [step1_23] "r"(step1_23));
);
step3_12 = ROUND_POWER_OF_TWO((step1_11 - step1_20), 6); step3_12 = ROUND_POWER_OF_TWO((step1_11 - step1_20), 6);
step3_13 = ROUND_POWER_OF_TWO((step1_10 - step1_21), 6); step3_13 = ROUND_POWER_OF_TWO((step1_10 - step1_21), 6);
step3_14 = ROUND_POWER_OF_TWO((step1_9 - step1_22), 6); step3_14 = ROUND_POWER_OF_TWO((step1_9 - step1_22), 6);
step3_15 = ROUND_POWER_OF_TWO((step1_8 - step1_23), 6); step3_15 = ROUND_POWER_OF_TWO((step1_8 - step1_23), 6);
__asm__ __volatile__ ( __asm__ __volatile__(
"lbu %[temp2], 0(%[dest_pix1]) \n\t" "lbu %[temp2], 0(%[dest_pix1]) \n\t"
"add %[temp2], %[temp2], %[step3_15] \n\t" "add %[temp2], %[temp2], %[step3_15] \n\t"
"lbux %[temp0], %[temp2](%[cm]) \n\t" "lbux %[temp0], %[temp2](%[cm]) \n\t"
@ -978,14 +954,13 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
"sb %[temp1], 0(%[dest_pix1]) \n\t" "sb %[temp1], 0(%[dest_pix1]) \n\t"
"subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t"
: [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), [temp2] "=&r" (temp2), : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2),
[temp3] "=&r" (temp3), [dest_pix1] "+r" (dest_pix1) [temp3] "=&r"(temp3), [dest_pix1] "+r"(dest_pix1)
: [cm] "r" (cm), [dest_stride] "r" (dest_stride), : [cm] "r"(cm), [dest_stride] "r"(dest_stride),
[step3_12] "r" (step3_12), [step3_13] "r" (step3_13), [step3_12] "r"(step3_12), [step3_13] "r"(step3_13),
[step3_14] "r" (step3_14), [step3_15] "r" (step3_15) [step3_14] "r"(step3_14), [step3_15] "r"(step3_15));
);
__asm__ __volatile__ ( __asm__ __volatile__(
"lbu %[temp2], 0(%[dest_pix]) \n\t" "lbu %[temp2], 0(%[dest_pix]) \n\t"
"add %[temp0], %[step1_12], %[step2_19] \n\t" "add %[temp0], %[step1_12], %[step2_19] \n\t"
"addi %[temp0], %[temp0], 32 \n\t" "addi %[temp0], %[temp0], 32 \n\t"
@ -1019,21 +994,20 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
"lbux %[temp1], %[temp3](%[cm]) \n\t" "lbux %[temp1], %[temp3](%[cm]) \n\t"
"sb %[temp1], 0(%[dest_pix]) \n\t" "sb %[temp1], 0(%[dest_pix]) \n\t"
: [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), [temp2] "=&r" (temp2), : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2),
[temp3] "=&r" (temp3), [dest_pix] "+r" (dest_pix) [temp3] "=&r"(temp3), [dest_pix] "+r"(dest_pix)
: [cm] "r" (cm), [dest_stride] "r" (dest_stride), : [cm] "r"(cm), [dest_stride] "r"(dest_stride),
[step1_12] "r" (step1_12), [step1_13] "r" (step1_13), [step1_12] "r"(step1_12), [step1_13] "r"(step1_13),
[step1_14] "r" (step1_14), [step1_15] "r" (step1_15), [step1_14] "r"(step1_14), [step1_15] "r"(step1_15),
[step2_16] "r" (step2_16), [step2_17] "r" (step2_17), [step2_16] "r"(step2_16), [step2_17] "r"(step2_17),
[step2_18] "r" (step2_18), [step2_19] "r" (step2_19) [step2_18] "r"(step2_18), [step2_19] "r"(step2_19));
);
step3_12 = ROUND_POWER_OF_TWO((step1_15 - step2_16), 6); step3_12 = ROUND_POWER_OF_TWO((step1_15 - step2_16), 6);
step3_13 = ROUND_POWER_OF_TWO((step1_14 - step2_17), 6); step3_13 = ROUND_POWER_OF_TWO((step1_14 - step2_17), 6);
step3_14 = ROUND_POWER_OF_TWO((step1_13 - step2_18), 6); step3_14 = ROUND_POWER_OF_TWO((step1_13 - step2_18), 6);
step3_15 = ROUND_POWER_OF_TWO((step1_12 - step2_19), 6); step3_15 = ROUND_POWER_OF_TWO((step1_12 - step2_19), 6);
__asm__ __volatile__ ( __asm__ __volatile__(
"lbu %[temp2], 0(%[dest_pix1]) \n\t" "lbu %[temp2], 0(%[dest_pix1]) \n\t"
"add %[temp2], %[temp2], %[step3_15] \n\t" "add %[temp2], %[temp2], %[step3_15] \n\t"
"lbux %[temp0], %[temp2](%[cm]) \n\t" "lbux %[temp0], %[temp2](%[cm]) \n\t"
@ -1055,12 +1029,11 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
"lbux %[temp1], %[temp3](%[cm]) \n\t" "lbux %[temp1], %[temp3](%[cm]) \n\t"
"sb %[temp1], 0(%[dest_pix1]) \n\t" "sb %[temp1], 0(%[dest_pix1]) \n\t"
: [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), [temp2] "=&r" (temp2), : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2),
[temp3] "=&r" (temp3), [dest_pix1] "+r" (dest_pix1) [temp3] "=&r"(temp3), [dest_pix1] "+r"(dest_pix1)
: [cm] "r" (cm), [dest_stride] "r" (dest_stride), : [cm] "r"(cm), [dest_stride] "r"(dest_stride),
[step3_12] "r" (step3_12), [step3_13] "r" (step3_13), [step3_12] "r"(step3_12), [step3_13] "r"(step3_13),
[step3_14] "r" (step3_14), [step3_15] "r" (step3_15) [step3_14] "r"(step3_14), [step3_15] "r"(step3_15));
);
input += 32; input += 32;
} }

View File

@ -40,7 +40,7 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output,
const int const_2_power_13 = 8192; const int const_2_power_13 = 8192;
const int32_t *input_int; const int32_t *input_int;
for (i = no_rows; i--; ) { for (i = no_rows; i--;) {
input_int = (const int32_t *)input; input_int = (const int32_t *)input;
if (!(input_int[0] | input_int[1] | input_int[2] | input_int[3] | if (!(input_int[0] | input_int[1] | input_int[2] | input_int[3] |
@ -49,7 +49,7 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output,
input_int[12] | input_int[13] | input_int[14] | input_int[15])) { input_int[12] | input_int[13] | input_int[14] | input_int[15])) {
input += 32; input += 32;
__asm__ __volatile__ ( __asm__ __volatile__(
"sh $zero, 0(%[output]) \n\t" "sh $zero, 0(%[output]) \n\t"
"sh $zero, 64(%[output]) \n\t" "sh $zero, 64(%[output]) \n\t"
"sh $zero, 128(%[output]) \n\t" "sh $zero, 128(%[output]) \n\t"
@ -84,8 +84,7 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output,
"sh $zero, 1984(%[output]) \n\t" "sh $zero, 1984(%[output]) \n\t"
: :
: [output] "r" (output) : [output] "r"(output));
);
output += 1; output += 1;
@ -96,7 +95,7 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output,
prefetch_load((const uint8_t *)(input + 32)); prefetch_load((const uint8_t *)(input + 32));
prefetch_load((const uint8_t *)(input + 48)); prefetch_load((const uint8_t *)(input + 48));
__asm__ __volatile__ ( __asm__ __volatile__(
"lh %[load1], 2(%[input]) \n\t" "lh %[load1], 2(%[input]) \n\t"
"lh %[load2], 62(%[input]) \n\t" "lh %[load2], 62(%[input]) \n\t"
"lh %[load3], 34(%[input]) \n\t" "lh %[load3], 34(%[input]) \n\t"
@ -146,19 +145,17 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output,
"add %[step1_16], %[temp0], %[temp1] \n\t" "add %[step1_16], %[temp0], %[temp1] \n\t"
"add %[step1_31], %[temp2], %[temp3] \n\t" "add %[step1_31], %[temp2], %[temp3] \n\t"
: [load1] "=&r" (load1), [load2] "=&r" (load2), : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
[load3] "=&r" (load3), [load4] "=&r" (load4), [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
[temp0] "=&r" (temp0), [temp1] "=&r" (temp1), [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_16] "=r"(step1_16),
[temp2] "=&r" (temp2), [temp3] "=&r" (temp3), [step1_17] "=r"(step1_17), [step1_30] "=r"(step1_30),
[step1_16] "=r" (step1_16), [step1_17] "=r" (step1_17), [step1_31] "=r"(step1_31)
[step1_30] "=r" (step1_30), [step1_31] "=r" (step1_31) : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
: [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), [cospi_31_64] "r"(cospi_31_64), [cospi_1_64] "r"(cospi_1_64),
[cospi_31_64] "r" (cospi_31_64), [cospi_1_64] "r" (cospi_1_64), [cospi_4_64] "r"(cospi_4_64), [cospi_17_64] "r"(cospi_17_64),
[cospi_4_64] "r" (cospi_4_64), [cospi_17_64] "r" (cospi_17_64), [cospi_15_64] "r"(cospi_15_64), [cospi_28_64] "r"(cospi_28_64));
[cospi_15_64] "r" (cospi_15_64), [cospi_28_64] "r" (cospi_28_64)
);
__asm__ __volatile__ ( __asm__ __volatile__(
"lh %[load1], 18(%[input]) \n\t" "lh %[load1], 18(%[input]) \n\t"
"lh %[load2], 46(%[input]) \n\t" "lh %[load2], 46(%[input]) \n\t"
"lh %[load3], 50(%[input]) \n\t" "lh %[load3], 50(%[input]) \n\t"
@ -208,19 +205,17 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output,
"add %[step1_19], %[temp0], %[temp1] \n\t" "add %[step1_19], %[temp0], %[temp1] \n\t"
"add %[step1_28], %[temp2], %[temp3] \n\t" "add %[step1_28], %[temp2], %[temp3] \n\t"
: [load1] "=&r" (load1), [load2] "=&r" (load2), : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
[load3] "=&r" (load3), [load4] "=&r" (load4), [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
[temp0] "=&r" (temp0), [temp1] "=&r" (temp1), [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_18] "=r"(step1_18),
[temp2] "=&r" (temp2), [temp3] "=&r" (temp3), [step1_19] "=r"(step1_19), [step1_28] "=r"(step1_28),
[step1_18] "=r" (step1_18), [step1_19] "=r" (step1_19), [step1_29] "=r"(step1_29)
[step1_28] "=r" (step1_28), [step1_29] "=r" (step1_29) : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
: [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), [cospi_23_64] "r"(cospi_23_64), [cospi_9_64] "r"(cospi_9_64),
[cospi_23_64] "r" (cospi_23_64), [cospi_9_64] "r" (cospi_9_64), [cospi_4_64] "r"(cospi_4_64), [cospi_7_64] "r"(cospi_7_64),
[cospi_4_64] "r" (cospi_4_64), [cospi_7_64] "r" (cospi_7_64), [cospi_25_64] "r"(cospi_25_64), [cospi_28_64] "r"(cospi_28_64));
[cospi_25_64] "r" (cospi_25_64), [cospi_28_64] "r" (cospi_28_64)
);
__asm__ __volatile__ ( __asm__ __volatile__(
"lh %[load1], 10(%[input]) \n\t" "lh %[load1], 10(%[input]) \n\t"
"lh %[load2], 54(%[input]) \n\t" "lh %[load2], 54(%[input]) \n\t"
"lh %[load3], 42(%[input]) \n\t" "lh %[load3], 42(%[input]) \n\t"
@ -270,19 +265,17 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output,
"add %[step1_20], %[temp0], %[temp1] \n\t" "add %[step1_20], %[temp0], %[temp1] \n\t"
"add %[step1_27], %[temp2], %[temp3] \n\t" "add %[step1_27], %[temp2], %[temp3] \n\t"
: [load1] "=&r" (load1), [load2] "=&r" (load2), : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
[load3] "=&r" (load3), [load4] "=&r" (load4), [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
[temp0] "=&r" (temp0), [temp1] "=&r" (temp1), [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_20] "=r"(step1_20),
[temp2] "=&r" (temp2), [temp3] "=&r" (temp3), [step1_21] "=r"(step1_21), [step1_26] "=r"(step1_26),
[step1_20] "=r" (step1_20), [step1_21] "=r" (step1_21), [step1_27] "=r"(step1_27)
[step1_26] "=r" (step1_26), [step1_27] "=r" (step1_27) : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
: [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), [cospi_27_64] "r"(cospi_27_64), [cospi_5_64] "r"(cospi_5_64),
[cospi_27_64] "r" (cospi_27_64), [cospi_5_64] "r" (cospi_5_64), [cospi_11_64] "r"(cospi_11_64), [cospi_21_64] "r"(cospi_21_64),
[cospi_11_64] "r" (cospi_11_64), [cospi_21_64] "r" (cospi_21_64), [cospi_12_64] "r"(cospi_12_64), [cospi_20_64] "r"(cospi_20_64));
[cospi_12_64] "r" (cospi_12_64), [cospi_20_64] "r" (cospi_20_64)
);
__asm__ __volatile__ ( __asm__ __volatile__(
"lh %[load1], 26(%[input]) \n\t" "lh %[load1], 26(%[input]) \n\t"
"lh %[load2], 38(%[input]) \n\t" "lh %[load2], 38(%[input]) \n\t"
"lh %[load3], 58(%[input]) \n\t" "lh %[load3], 58(%[input]) \n\t"
@ -332,19 +325,17 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output,
"add %[step1_23], %[temp0], %[temp1] \n\t" "add %[step1_23], %[temp0], %[temp1] \n\t"
"add %[step1_24], %[temp2], %[temp3] \n\t" "add %[step1_24], %[temp2], %[temp3] \n\t"
: [load1] "=&r" (load1), [load2] "=&r" (load2), : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
[load3] "=&r" (load3), [load4] "=&r" (load4), [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
[temp0] "=&r" (temp0), [temp1] "=&r" (temp1), [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_22] "=r"(step1_22),
[temp2] "=&r" (temp2), [temp3] "=&r" (temp3), [step1_23] "=r"(step1_23), [step1_24] "=r"(step1_24),
[step1_22] "=r" (step1_22), [step1_23] "=r" (step1_23), [step1_25] "=r"(step1_25)
[step1_24] "=r" (step1_24), [step1_25] "=r" (step1_25) : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
: [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), [cospi_19_64] "r"(cospi_19_64), [cospi_13_64] "r"(cospi_13_64),
[cospi_19_64] "r" (cospi_19_64), [cospi_13_64] "r" (cospi_13_64), [cospi_3_64] "r"(cospi_3_64), [cospi_29_64] "r"(cospi_29_64),
[cospi_3_64] "r" (cospi_3_64), [cospi_29_64] "r" (cospi_29_64), [cospi_12_64] "r"(cospi_12_64), [cospi_20_64] "r"(cospi_20_64));
[cospi_12_64] "r" (cospi_12_64), [cospi_20_64] "r" (cospi_20_64)
);
__asm__ __volatile__ ( __asm__ __volatile__(
"lh %[load1], 4(%[input]) \n\t" "lh %[load1], 4(%[input]) \n\t"
"lh %[load2], 60(%[input]) \n\t" "lh %[load2], 60(%[input]) \n\t"
"lh %[load3], 36(%[input]) \n\t" "lh %[load3], 36(%[input]) \n\t"
@ -394,19 +385,17 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output,
"add %[step2_8], %[temp0], %[temp1] \n\t" "add %[step2_8], %[temp0], %[temp1] \n\t"
"add %[step2_15], %[temp2], %[temp3] \n\t" "add %[step2_15], %[temp2], %[temp3] \n\t"
: [load1] "=&r" (load1), [load2] "=&r" (load2), : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
[load3] "=&r" (load3), [load4] "=&r" (load4), [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
[temp0] "=&r" (temp0), [temp1] "=&r" (temp1), [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step2_8] "=r"(step2_8),
[temp2] "=&r" (temp2), [temp3] "=&r" (temp3), [step2_9] "=r"(step2_9), [step2_14] "=r"(step2_14),
[step2_8] "=r" (step2_8), [step2_9] "=r" (step2_9), [step2_15] "=r"(step2_15)
[step2_14] "=r" (step2_14), [step2_15] "=r" (step2_15) : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
: [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), [cospi_30_64] "r"(cospi_30_64), [cospi_2_64] "r"(cospi_2_64),
[cospi_30_64] "r" (cospi_30_64), [cospi_2_64] "r" (cospi_2_64), [cospi_14_64] "r"(cospi_14_64), [cospi_18_64] "r"(cospi_18_64),
[cospi_14_64] "r" (cospi_14_64), [cospi_18_64] "r" (cospi_18_64), [cospi_8_64] "r"(cospi_8_64), [cospi_24_64] "r"(cospi_24_64));
[cospi_8_64] "r" (cospi_8_64), [cospi_24_64] "r" (cospi_24_64)
);
__asm__ __volatile__ ( __asm__ __volatile__(
"lh %[load1], 20(%[input]) \n\t" "lh %[load1], 20(%[input]) \n\t"
"lh %[load2], 44(%[input]) \n\t" "lh %[load2], 44(%[input]) \n\t"
"lh %[load3], 52(%[input]) \n\t" "lh %[load3], 52(%[input]) \n\t"
@ -456,19 +445,17 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output,
"add %[step2_11], %[temp0], %[temp1] \n\t" "add %[step2_11], %[temp0], %[temp1] \n\t"
"add %[step2_12], %[temp2], %[temp3] \n\t" "add %[step2_12], %[temp2], %[temp3] \n\t"
: [load1] "=&r" (load1), [load2] "=&r" (load2), : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
[load3] "=&r" (load3), [load4] "=&r" (load4), [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
[temp0] "=&r" (temp0), [temp1] "=&r" (temp1), [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step2_10] "=r"(step2_10),
[temp2] "=&r" (temp2), [temp3] "=&r" (temp3), [step2_11] "=r"(step2_11), [step2_12] "=r"(step2_12),
[step2_10] "=r" (step2_10), [step2_11] "=r" (step2_11), [step2_13] "=r"(step2_13)
[step2_12] "=r" (step2_12), [step2_13] "=r" (step2_13) : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
: [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), [cospi_22_64] "r"(cospi_22_64), [cospi_10_64] "r"(cospi_10_64),
[cospi_22_64] "r" (cospi_22_64), [cospi_10_64] "r" (cospi_10_64), [cospi_6_64] "r"(cospi_6_64), [cospi_26_64] "r"(cospi_26_64),
[cospi_6_64] "r" (cospi_6_64), [cospi_26_64] "r" (cospi_26_64), [cospi_8_64] "r"(cospi_8_64), [cospi_24_64] "r"(cospi_24_64));
[cospi_8_64] "r" (cospi_8_64), [cospi_24_64] "r" (cospi_24_64)
);
__asm__ __volatile__ ( __asm__ __volatile__(
"mtlo %[const_2_power_13], $ac0 \n\t" "mtlo %[const_2_power_13], $ac0 \n\t"
"mthi $zero, $ac0 \n\t" "mthi $zero, $ac0 \n\t"
"sub %[temp0], %[step2_14], %[step2_13] \n\t" "sub %[temp0], %[step2_14], %[step2_13] \n\t"
@ -507,34 +494,31 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output,
"extp %[step3_11], $ac2, 31 \n\t" "extp %[step3_11], $ac2, 31 \n\t"
"extp %[step3_12], $ac3, 31 \n\t" "extp %[step3_12], $ac3, 31 \n\t"
: [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [step3_8] "=r"(step3_8),
[step3_8] "=r" (step3_8), [step3_9] "=r" (step3_9), [step3_9] "=r"(step3_9), [step3_10] "=r"(step3_10),
[step3_10] "=r" (step3_10), [step3_11] "=r" (step3_11), [step3_11] "=r"(step3_11), [step3_12] "=r"(step3_12),
[step3_12] "=r" (step3_12), [step3_13] "=r" (step3_13), [step3_13] "=r"(step3_13), [step3_14] "=r"(step3_14),
[step3_14] "=r" (step3_14), [step3_15] "=r" (step3_15) [step3_15] "=r"(step3_15)
: [const_2_power_13] "r" (const_2_power_13), : [const_2_power_13] "r"(const_2_power_13), [step2_8] "r"(step2_8),
[step2_8] "r" (step2_8), [step2_9] "r" (step2_9), [step2_9] "r"(step2_9), [step2_10] "r"(step2_10),
[step2_10] "r" (step2_10), [step2_11] "r" (step2_11), [step2_11] "r"(step2_11), [step2_12] "r"(step2_12),
[step2_12] "r" (step2_12), [step2_13] "r" (step2_13), [step2_13] "r"(step2_13), [step2_14] "r"(step2_14),
[step2_14] "r" (step2_14), [step2_15] "r" (step2_15), [step2_15] "r"(step2_15), [cospi_16_64] "r"(cospi_16_64));
[cospi_16_64] "r" (cospi_16_64)
);
step2_18 = step1_17 - step1_18; step2_18 = step1_17 - step1_18;
step2_29 = step1_30 - step1_29; step2_29 = step1_30 - step1_29;
__asm__ __volatile__ ( __asm__ __volatile__(
"mtlo %[const_2_power_13], $ac0 \n\t" "mtlo %[const_2_power_13], $ac0 \n\t"
"mthi $zero, $ac0 \n\t" "mthi $zero, $ac0 \n\t"
"msub $ac0, %[step2_18], %[cospi_8_64] \n\t" "msub $ac0, %[step2_18], %[cospi_8_64] \n\t"
"madd $ac0, %[step2_29], %[cospi_24_64] \n\t" "madd $ac0, %[step2_29], %[cospi_24_64] \n\t"
"extp %[step3_18], $ac0, 31 \n\t" "extp %[step3_18], $ac0, 31 \n\t"
: [step3_18] "=r" (step3_18) : [step3_18] "=r"(step3_18)
: [const_2_power_13] "r" (const_2_power_13), : [const_2_power_13] "r"(const_2_power_13), [step2_18] "r"(step2_18),
[step2_18] "r" (step2_18), [step2_29] "r" (step2_29), [step2_29] "r"(step2_29), [cospi_24_64] "r"(cospi_24_64),
[cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64) [cospi_8_64] "r"(cospi_8_64));
);
temp21 = step2_18 * cospi_24_64 + step2_29 * cospi_8_64; temp21 = step2_18 * cospi_24_64 + step2_29 * cospi_8_64;
step3_29 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; step3_29 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
@ -542,18 +526,17 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output,
step2_19 = step1_16 - step1_19; step2_19 = step1_16 - step1_19;
step2_28 = step1_31 - step1_28; step2_28 = step1_31 - step1_28;
__asm__ __volatile__ ( __asm__ __volatile__(
"mtlo %[const_2_power_13], $ac0 \n\t" "mtlo %[const_2_power_13], $ac0 \n\t"
"mthi $zero, $ac0 \n\t" "mthi $zero, $ac0 \n\t"
"msub $ac0, %[step2_19], %[cospi_8_64] \n\t" "msub $ac0, %[step2_19], %[cospi_8_64] \n\t"
"madd $ac0, %[step2_28], %[cospi_24_64] \n\t" "madd $ac0, %[step2_28], %[cospi_24_64] \n\t"
"extp %[step3_19], $ac0, 31 \n\t" "extp %[step3_19], $ac0, 31 \n\t"
: [step3_19] "=r" (step3_19) : [step3_19] "=r"(step3_19)
: [const_2_power_13] "r" (const_2_power_13), : [const_2_power_13] "r"(const_2_power_13), [step2_19] "r"(step2_19),
[step2_19] "r" (step2_19), [step2_28] "r" (step2_28), [step2_28] "r"(step2_28), [cospi_24_64] "r"(cospi_24_64),
[cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64) [cospi_8_64] "r"(cospi_8_64));
);
temp21 = step2_19 * cospi_24_64 + step2_28 * cospi_8_64; temp21 = step2_19 * cospi_24_64 + step2_28 * cospi_8_64;
step3_28 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; step3_28 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
@ -566,18 +549,17 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output,
step2_20 = step1_23 - step1_20; step2_20 = step1_23 - step1_20;
step2_27 = step1_24 - step1_27; step2_27 = step1_24 - step1_27;
__asm__ __volatile__ ( __asm__ __volatile__(
"mtlo %[const_2_power_13], $ac0 \n\t" "mtlo %[const_2_power_13], $ac0 \n\t"
"mthi $zero, $ac0 \n\t" "mthi $zero, $ac0 \n\t"
"msub $ac0, %[step2_20], %[cospi_24_64] \n\t" "msub $ac0, %[step2_20], %[cospi_24_64] \n\t"
"msub $ac0, %[step2_27], %[cospi_8_64] \n\t" "msub $ac0, %[step2_27], %[cospi_8_64] \n\t"
"extp %[step3_20], $ac0, 31 \n\t" "extp %[step3_20], $ac0, 31 \n\t"
: [step3_20] "=r" (step3_20) : [step3_20] "=r"(step3_20)
: [const_2_power_13] "r" (const_2_power_13), : [const_2_power_13] "r"(const_2_power_13), [step2_20] "r"(step2_20),
[step2_20] "r" (step2_20), [step2_27] "r" (step2_27), [step2_27] "r"(step2_27), [cospi_24_64] "r"(cospi_24_64),
[cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64) [cospi_8_64] "r"(cospi_8_64));
);
temp21 = -step2_20 * cospi_8_64 + step2_27 * cospi_24_64; temp21 = -step2_20 * cospi_8_64 + step2_27 * cospi_24_64;
step3_27 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; step3_27 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
@ -585,18 +567,17 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output,
step2_21 = step1_22 - step1_21; step2_21 = step1_22 - step1_21;
step2_26 = step1_25 - step1_26; step2_26 = step1_25 - step1_26;
__asm__ __volatile__ ( __asm__ __volatile__(
"mtlo %[const_2_power_13], $ac1 \n\t" "mtlo %[const_2_power_13], $ac1 \n\t"
"mthi $zero, $ac1 \n\t" "mthi $zero, $ac1 \n\t"
"msub $ac1, %[step2_21], %[cospi_24_64] \n\t" "msub $ac1, %[step2_21], %[cospi_24_64] \n\t"
"msub $ac1, %[step2_26], %[cospi_8_64] \n\t" "msub $ac1, %[step2_26], %[cospi_8_64] \n\t"
"extp %[step3_21], $ac1, 31 \n\t" "extp %[step3_21], $ac1, 31 \n\t"
: [step3_21] "=r" (step3_21) : [step3_21] "=r"(step3_21)
: [const_2_power_13] "r" (const_2_power_13), : [const_2_power_13] "r"(const_2_power_13), [step2_21] "r"(step2_21),
[step2_21] "r" (step2_21), [step2_26] "r" (step2_26), [step2_26] "r"(step2_26), [cospi_24_64] "r"(cospi_24_64),
[cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64) [cospi_8_64] "r"(cospi_8_64));
);
temp21 = -step2_21 * cospi_8_64 + step2_26 * cospi_24_64; temp21 = -step2_21 * cospi_8_64 + step2_26 * cospi_24_64;
step3_26 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; step3_26 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
@ -624,7 +605,7 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output,
step2_30 = step3_30 + step3_25; step2_30 = step3_30 + step3_25;
step2_31 = step3_31 + step3_24; step2_31 = step3_31 + step3_24;
__asm__ __volatile__ ( __asm__ __volatile__(
"lh %[load1], 0(%[input]) \n\t" "lh %[load1], 0(%[input]) \n\t"
"lh %[load2], 32(%[input]) \n\t" "lh %[load2], 32(%[input]) \n\t"
"lh %[load3], 16(%[input]) \n\t" "lh %[load3], 16(%[input]) \n\t"
@ -658,20 +639,19 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output,
"sub %[step1_2], %[temp1], %[temp2] \n\t" "sub %[step1_2], %[temp1], %[temp2] \n\t"
"sub %[step1_3], %[temp0], %[temp3] \n\t" "sub %[step1_3], %[temp0], %[temp3] \n\t"
: [load1] "=&r" (load1), [load2] "=&r" (load2), : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
[load3] "=&r" (load3), [load4] "=&r" (load4), [load4] "=&r"(load4), [result1] "=&r"(result1),
[result1] "=&r" (result1), [result2] "=&r" (result2), [result2] "=&r"(result2), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
[temp0] "=&r" (temp0), [temp1] "=&r" (temp1), [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_0] "=r"(step1_0),
[temp2] "=&r" (temp2), [temp3] "=&r" (temp3), [step1_1] "=r"(step1_1), [step1_2] "=r"(step1_2),
[step1_0] "=r" (step1_0), [step1_1] "=r" (step1_1), [step1_3] "=r"(step1_3)
[step1_2] "=r" (step1_2), [step1_3] "=r" (step1_3) : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
: [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), [cospi_16_64] "r"(cospi_16_64), [cospi_24_64] "r"(cospi_24_64),
[cospi_16_64] "r" (cospi_16_64), [cospi_8_64] "r"(cospi_8_64)
[cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)
); );
__asm__ __volatile__ ( __asm__ __volatile__(
"lh %[load1], 8(%[input]) \n\t" "lh %[load1], 8(%[input]) \n\t"
"lh %[load2], 56(%[input]) \n\t" "lh %[load2], 56(%[input]) \n\t"
"lh %[load3], 40(%[input]) \n\t" "lh %[load3], 40(%[input]) \n\t"
@ -724,17 +704,15 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output,
"add %[step1_4], %[temp0], %[temp1] \n\t" "add %[step1_4], %[temp0], %[temp1] \n\t"
"add %[step1_7], %[temp3], %[temp2] \n\t" "add %[step1_7], %[temp3], %[temp2] \n\t"
: [load1] "=&r" (load1), [load2] "=&r" (load2), : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
[load3] "=&r" (load3), [load4] "=&r" (load4), [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
[temp0] "=&r" (temp0), [temp1] "=&r" (temp1), [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_4] "=r"(step1_4),
[temp2] "=&r" (temp2), [temp3] "=&r" (temp3), [step1_5] "=r"(step1_5), [step1_6] "=r"(step1_6),
[step1_4] "=r" (step1_4), [step1_5] "=r" (step1_5), [step1_7] "=r"(step1_7)
[step1_6] "=r" (step1_6), [step1_7] "=r" (step1_7) : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
: [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), [cospi_20_64] "r"(cospi_20_64), [cospi_12_64] "r"(cospi_12_64),
[cospi_20_64] "r" (cospi_20_64), [cospi_12_64] "r" (cospi_12_64), [cospi_4_64] "r"(cospi_4_64), [cospi_28_64] "r"(cospi_28_64),
[cospi_4_64] "r" (cospi_4_64), [cospi_28_64] "r" (cospi_28_64), [cospi_16_64] "r"(cospi_16_64));
[cospi_16_64] "r" (cospi_16_64)
);
step2_0 = step1_0 + step1_7; step2_0 = step1_0 + step1_7;
step2_1 = step1_1 + step1_6; step2_1 = step1_1 + step1_6;
@ -762,66 +740,58 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output,
step1_14 = step2_1 - step3_14; step1_14 = step2_1 - step3_14;
step1_15 = step2_0 - step3_15; step1_15 = step2_0 - step3_15;
__asm__ __volatile__ ( __asm__ __volatile__(
"sub %[temp0], %[step2_27], %[step2_20] \n\t" "sub %[temp0], %[step2_27], %[step2_20] \n\t"
"mtlo %[const_2_power_13], $ac0 \n\t" "mtlo %[const_2_power_13], $ac0 \n\t"
"mthi $zero, $ac0 \n\t" "mthi $zero, $ac0 \n\t"
"madd $ac0, %[temp0], %[cospi_16_64] \n\t" "madd $ac0, %[temp0], %[cospi_16_64] \n\t"
"extp %[step1_20], $ac0, 31 \n\t" "extp %[step1_20], $ac0, 31 \n\t"
: [temp0] "=&r" (temp0), [step1_20] "=r" (step1_20) : [temp0] "=&r"(temp0), [step1_20] "=r"(step1_20)
: [const_2_power_13] "r" (const_2_power_13), : [const_2_power_13] "r"(const_2_power_13), [step2_20] "r"(step2_20),
[step2_20] "r" (step2_20), [step2_27] "r" (step2_27), [step2_27] "r"(step2_27), [cospi_16_64] "r"(cospi_16_64));
[cospi_16_64] "r" (cospi_16_64)
);
temp21 = (step2_20 + step2_27) * cospi_16_64; temp21 = (step2_20 + step2_27) * cospi_16_64;
step1_27 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; step1_27 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
__asm__ __volatile__ ( __asm__ __volatile__(
"sub %[temp0], %[step2_26], %[step2_21] \n\t" "sub %[temp0], %[step2_26], %[step2_21] \n\t"
"mtlo %[const_2_power_13], $ac0 \n\t" "mtlo %[const_2_power_13], $ac0 \n\t"
"mthi $zero, $ac0 \n\t" "mthi $zero, $ac0 \n\t"
"madd $ac0, %[temp0], %[cospi_16_64] \n\t" "madd $ac0, %[temp0], %[cospi_16_64] \n\t"
"extp %[step1_21], $ac0, 31 \n\t" "extp %[step1_21], $ac0, 31 \n\t"
: [temp0] "=&r" (temp0), [step1_21] "=r" (step1_21) : [temp0] "=&r"(temp0), [step1_21] "=r"(step1_21)
: [const_2_power_13] "r" (const_2_power_13), : [const_2_power_13] "r"(const_2_power_13), [step2_26] "r"(step2_26),
[step2_26] "r" (step2_26), [step2_21] "r" (step2_21), [step2_21] "r"(step2_21), [cospi_16_64] "r"(cospi_16_64));
[cospi_16_64] "r" (cospi_16_64)
);
temp21 = (step2_21 + step2_26) * cospi_16_64; temp21 = (step2_21 + step2_26) * cospi_16_64;
step1_26 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; step1_26 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
__asm__ __volatile__ ( __asm__ __volatile__(
"sub %[temp0], %[step2_25], %[step2_22] \n\t" "sub %[temp0], %[step2_25], %[step2_22] \n\t"
"mtlo %[const_2_power_13], $ac0 \n\t" "mtlo %[const_2_power_13], $ac0 \n\t"
"mthi $zero, $ac0 \n\t" "mthi $zero, $ac0 \n\t"
"madd $ac0, %[temp0], %[cospi_16_64] \n\t" "madd $ac0, %[temp0], %[cospi_16_64] \n\t"
"extp %[step1_22], $ac0, 31 \n\t" "extp %[step1_22], $ac0, 31 \n\t"
: [temp0] "=&r" (temp0), [step1_22] "=r" (step1_22) : [temp0] "=&r"(temp0), [step1_22] "=r"(step1_22)
: [const_2_power_13] "r" (const_2_power_13), : [const_2_power_13] "r"(const_2_power_13), [step2_25] "r"(step2_25),
[step2_25] "r" (step2_25), [step2_22] "r" (step2_22), [step2_22] "r"(step2_22), [cospi_16_64] "r"(cospi_16_64));
[cospi_16_64] "r" (cospi_16_64)
);
temp21 = (step2_22 + step2_25) * cospi_16_64; temp21 = (step2_22 + step2_25) * cospi_16_64;
step1_25 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; step1_25 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
__asm__ __volatile__ ( __asm__ __volatile__(
"sub %[temp0], %[step2_24], %[step2_23] \n\t" "sub %[temp0], %[step2_24], %[step2_23] \n\t"
"mtlo %[const_2_power_13], $ac0 \n\t" "mtlo %[const_2_power_13], $ac0 \n\t"
"mthi $zero, $ac0 \n\t" "mthi $zero, $ac0 \n\t"
"madd $ac0, %[temp0], %[cospi_16_64] \n\t" "madd $ac0, %[temp0], %[cospi_16_64] \n\t"
"extp %[step1_23], $ac0, 31 \n\t" "extp %[step1_23], $ac0, 31 \n\t"
: [temp0] "=&r" (temp0), [step1_23] "=r" (step1_23) : [temp0] "=&r"(temp0), [step1_23] "=r"(step1_23)
: [const_2_power_13] "r" (const_2_power_13), : [const_2_power_13] "r"(const_2_power_13), [step2_24] "r"(step2_24),
[step2_24] "r" (step2_24), [step2_23] "r" (step2_23), [step2_23] "r"(step2_23), [cospi_16_64] "r"(cospi_16_64));
[cospi_16_64] "r" (cospi_16_64)
);
temp21 = (step2_23 + step2_24) * cospi_16_64; temp21 = (step2_23 + step2_24) * cospi_16_64;
step1_24 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; step1_24 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
@ -872,11 +842,9 @@ void vpx_idct32x32_1024_add_dspr2(const int16_t *input, uint8_t *dest,
uint32_t pos = 45; uint32_t pos = 45;
/* bit positon for extract from acc */ /* bit positon for extract from acc */
__asm__ __volatile__ ( __asm__ __volatile__("wrdsp %[pos], 1 \n\t"
"wrdsp %[pos], 1 \n\t"
: :
: [pos] "r" (pos) : [pos] "r"(pos));
);
// Rows // Rows
idct32_rows_dspr2(input, outptr, 32); idct32_rows_dspr2(input, outptr, 32);
@ -893,17 +861,15 @@ void vpx_idct32x32_34_add_dspr2(const int16_t *input, uint8_t *dest,
uint32_t pos = 45; uint32_t pos = 45;
/* bit positon for extract from acc */ /* bit positon for extract from acc */
__asm__ __volatile__ ( __asm__ __volatile__("wrdsp %[pos], 1 \n\t"
"wrdsp %[pos], 1 \n\t"
: :
: [pos] "r" (pos) : [pos] "r"(pos));
);
// Rows // Rows
idct32_rows_dspr2(input, outptr, 8); idct32_rows_dspr2(input, outptr, 8);
outptr += 8; outptr += 8;
__asm__ __volatile__ ( __asm__ __volatile__(
"sw $zero, 0(%[outptr]) \n\t" "sw $zero, 0(%[outptr]) \n\t"
"sw $zero, 4(%[outptr]) \n\t" "sw $zero, 4(%[outptr]) \n\t"
"sw $zero, 8(%[outptr]) \n\t" "sw $zero, 8(%[outptr]) \n\t"
@ -918,13 +884,12 @@ void vpx_idct32x32_34_add_dspr2(const int16_t *input, uint8_t *dest,
"sw $zero, 44(%[outptr]) \n\t" "sw $zero, 44(%[outptr]) \n\t"
: :
: [outptr] "r" (outptr) : [outptr] "r"(outptr));
);
for (i = 0; i < 31; ++i) { for (i = 0; i < 31; ++i) {
outptr += 32; outptr += 32;
__asm__ __volatile__ ( __asm__ __volatile__(
"sw $zero, 0(%[outptr]) \n\t" "sw $zero, 0(%[outptr]) \n\t"
"sw $zero, 4(%[outptr]) \n\t" "sw $zero, 4(%[outptr]) \n\t"
"sw $zero, 8(%[outptr]) \n\t" "sw $zero, 8(%[outptr]) \n\t"
@ -939,8 +904,7 @@ void vpx_idct32x32_34_add_dspr2(const int16_t *input, uint8_t *dest,
"sw $zero, 44(%[outptr]) \n\t" "sw $zero, 44(%[outptr]) \n\t"
: :
: [outptr] "r" (outptr) : [outptr] "r"(outptr));
);
} }
// Columns // Columns
@ -957,35 +921,31 @@ void vpx_idct32x32_1_add_dspr2(const int16_t *input, uint8_t *dest,
uint32_t pos = 45; uint32_t pos = 45;
/* bit positon for extract from acc */ /* bit positon for extract from acc */
__asm__ __volatile__ ( __asm__ __volatile__("wrdsp %[pos], 1 \n\t"
"wrdsp %[pos], 1 \n\t"
: :
: [pos] "r" (pos) : [pos] "r"(pos));
);
out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input[0]); out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input[0]);
__asm__ __volatile__ ( __asm__ __volatile__(
"addi %[out], %[out], 32 \n\t" "addi %[out], %[out], 32 \n\t"
"sra %[a1], %[out], 6 \n\t" "sra %[a1], %[out], 6 \n\t"
: [out] "+r" (out), [a1] "=r" (a1) : [out] "+r"(out), [a1] "=r"(a1)
: :);
);
if (a1 < 0) { if (a1 < 0) {
/* use quad-byte /* use quad-byte
* input and output memory are four byte aligned */ * input and output memory are four byte aligned */
__asm__ __volatile__ ( __asm__ __volatile__(
"abs %[absa1], %[a1] \n\t" "abs %[absa1], %[a1] \n\t"
"replv.qb %[vector_a1], %[absa1] \n\t" "replv.qb %[vector_a1], %[absa1] \n\t"
: [absa1] "=r" (absa1), [vector_a1] "=r" (vector_a1) : [absa1] "=r"(absa1), [vector_a1] "=r"(vector_a1)
: [a1] "r" (a1) : [a1] "r"(a1));
);
for (r = 32; r--;) { for (r = 32; r--;) {
__asm__ __volatile__ ( __asm__ __volatile__(
"lw %[t1], 0(%[dest]) \n\t" "lw %[t1], 0(%[dest]) \n\t"
"lw %[t2], 4(%[dest]) \n\t" "lw %[t2], 4(%[dest]) \n\t"
"lw %[t3], 8(%[dest]) \n\t" "lw %[t3], 8(%[dest]) \n\t"
@ -1014,25 +974,22 @@ void vpx_idct32x32_1_add_dspr2(const int16_t *input, uint8_t *dest,
"add %[dest], %[dest], %[stride] \n\t" "add %[dest], %[dest], %[stride] \n\t"
: [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3), [t4] "=&r" (t4), : [t1] "=&r"(t1), [t2] "=&r"(t2), [t3] "=&r"(t3), [t4] "=&r"(t4),
[vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2), [vector_1] "=&r"(vector_1), [vector_2] "=&r"(vector_2),
[vector_3] "=&r" (vector_3), [vector_4] "=&r" (vector_4), [vector_3] "=&r"(vector_3), [vector_4] "=&r"(vector_4),
[dest] "+&r" (dest) [dest] "+&r"(dest)
: [stride] "r" (stride), [vector_a1] "r" (vector_a1) : [stride] "r"(stride), [vector_a1] "r"(vector_a1));
);
} }
} else { } else {
/* use quad-byte /* use quad-byte
* input and output memory are four byte aligned */ * input and output memory are four byte aligned */
__asm__ __volatile__ ( __asm__ __volatile__("replv.qb %[vector_a1], %[a1] \n\t"
"replv.qb %[vector_a1], %[a1] \n\t"
: [vector_a1] "=r" (vector_a1) : [vector_a1] "=r"(vector_a1)
: [a1] "r" (a1) : [a1] "r"(a1));
);
for (r = 32; r--;) { for (r = 32; r--;) {
__asm__ __volatile__ ( __asm__ __volatile__(
"lw %[t1], 0(%[dest]) \n\t" "lw %[t1], 0(%[dest]) \n\t"
"lw %[t2], 4(%[dest]) \n\t" "lw %[t2], 4(%[dest]) \n\t"
"lw %[t3], 8(%[dest]) \n\t" "lw %[t3], 8(%[dest]) \n\t"
@ -1061,12 +1018,11 @@ void vpx_idct32x32_1_add_dspr2(const int16_t *input, uint8_t *dest,
"add %[dest], %[dest], %[stride] \n\t" "add %[dest], %[dest], %[stride] \n\t"
: [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3), [t4] "=&r" (t4), : [t1] "=&r"(t1), [t2] "=&r"(t2), [t3] "=&r"(t3), [t4] "=&r"(t4),
[vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2), [vector_1] "=&r"(vector_1), [vector_2] "=&r"(vector_2),
[vector_3] "=&r" (vector_3), [vector_4] "=&r" (vector_4), [vector_3] "=&r"(vector_3), [vector_4] "=&r"(vector_4),
[dest] "+&r" (dest) [dest] "+&r"(dest)
: [stride] "r" (stride), [vector_a1] "r" (vector_a1) : [stride] "r"(stride), [vector_a1] "r"(vector_a1));
);
} }
} }
} }

View File

@ -20,8 +20,8 @@ void vpx_idct4_rows_dspr2(const int16_t *input, int16_t *output) {
const int const_2_power_13 = 8192; const int const_2_power_13 = 8192;
int i; int i;
for (i = 4; i--; ) { for (i = 4; i--;) {
__asm__ __volatile__ ( __asm__ __volatile__(
/* /*
temp_1 = (input[0] + input[2]) * cospi_16_64; temp_1 = (input[0] + input[2]) * cospi_16_64;
step_0 = dct_const_round_shift(temp_1); step_0 = dct_const_round_shift(temp_1);
@ -83,16 +83,12 @@ void vpx_idct4_rows_dspr2(const int16_t *input, int16_t *output) {
"sub %[Temp3], %[step_0], %[step_3] \n\t" "sub %[Temp3], %[step_0], %[step_3] \n\t"
"sh %[Temp3], 24(%[output]) \n\t" "sh %[Temp3], 24(%[output]) \n\t"
: [Temp0] "=&r" (Temp0), [Temp1] "=&r" (Temp1), : [Temp0] "=&r"(Temp0), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2),
[Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3), [Temp3] "=&r"(Temp3), [step_0] "=&r"(step_0), [step_1] "=&r"(step_1),
[step_0] "=&r" (step_0), [step_1] "=&r" (step_1), [step_2] "=&r"(step_2), [step_3] "=&r"(step_3), [output] "+r"(output)
[step_2] "=&r" (step_2), [step_3] "=&r" (step_3), : [const_2_power_13] "r"(const_2_power_13),
[output] "+r" (output) [cospi_8_64] "r"(cospi_8_64), [cospi_16_64] "r"(cospi_16_64),
: [const_2_power_13] "r" (const_2_power_13), [cospi_24_64] "r"(cospi_24_64), [input] "r"(input));
[cospi_8_64] "r" (cospi_8_64), [cospi_16_64] "r" (cospi_16_64),
[cospi_24_64] "r" (cospi_24_64),
[input] "r" (input)
);
input += 4; input += 4;
output += 1; output += 1;
@ -121,7 +117,7 @@ void vpx_idct4_columns_add_blk_dspr2(int16_t *input, uint8_t *dest,
for (i = 0; i < 4; ++i) { for (i = 0; i < 4; ++i) {
dest_pix = (dest + i); dest_pix = (dest + i);
__asm__ __volatile__ ( __asm__ __volatile__(
/* /*
temp_1 = (input[0] + input[2]) * cospi_16_64; temp_1 = (input[0] + input[2]) * cospi_16_64;
step_0 = dct_const_round_shift(temp_1); step_0 = dct_const_round_shift(temp_1);
@ -206,16 +202,14 @@ void vpx_idct4_columns_add_blk_dspr2(int16_t *input, uint8_t *dest,
"lbux %[Temp2], %[Temp1](%[cm]) \n\t" "lbux %[Temp2], %[Temp1](%[cm]) \n\t"
"sb %[Temp2], 0(%[dest_pix]) \n\t" "sb %[Temp2], 0(%[dest_pix]) \n\t"
: [Temp0] "=&r" (Temp0), [Temp1] "=&r" (Temp1), : [Temp0] "=&r"(Temp0), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2),
[Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3), [Temp3] "=&r"(Temp3), [step_0] "=&r"(step_0), [step_1] "=&r"(step_1),
[step_0] "=&r" (step_0), [step_1] "=&r" (step_1), [step_2] "=&r"(step_2), [step_3] "=&r"(step_3),
[step_2] "=&r" (step_2), [step_3] "=&r" (step_3), [dest_pix] "+r"(dest_pix)
[dest_pix] "+r" (dest_pix) : [const_2_power_13] "r"(const_2_power_13),
: [const_2_power_13] "r" (const_2_power_13), [cospi_8_64] "r"(cospi_8_64), [cospi_16_64] "r"(cospi_16_64),
[cospi_8_64] "r" (cospi_8_64), [cospi_16_64] "r" (cospi_16_64), [cospi_24_64] "r"(cospi_24_64), [input] "r"(input), [cm] "r"(cm),
[cospi_24_64] "r" (cospi_24_64), [dest_stride] "r"(dest_stride));
[input] "r" (input), [cm] "r" (cm), [dest_stride] "r" (dest_stride)
);
input += 4; input += 4;
} }
@ -228,11 +222,9 @@ void vpx_idct4x4_16_add_dspr2(const int16_t *input, uint8_t *dest,
uint32_t pos = 45; uint32_t pos = 45;
/* bit positon for extract from acc */ /* bit positon for extract from acc */
__asm__ __volatile__ ( __asm__ __volatile__("wrdsp %[pos], 1 \n\t"
"wrdsp %[pos], 1 \n\t"
: :
: [pos] "r" (pos) : [pos] "r"(pos));
);
// Rows // Rows
vpx_idct4_rows_dspr2(input, outptr); vpx_idct4_rows_dspr2(input, outptr);
@ -251,65 +243,55 @@ void vpx_idct4x4_1_add_dspr2(const int16_t *input, uint8_t *dest,
int16_t input_dc = input[0]; int16_t input_dc = input[0];
/* bit positon for extract from acc */ /* bit positon for extract from acc */
__asm__ __volatile__ ( __asm__ __volatile__("wrdsp %[pos], 1 \n\t"
"wrdsp %[pos], 1 \n\t"
: :
: [pos] "r" (pos) : [pos] "r"(pos));
);
out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input_dc); out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input_dc);
__asm__ __volatile__ ( __asm__ __volatile__(
"addi %[out], %[out], 8 \n\t" "addi %[out], %[out], 8 \n\t"
"sra %[a1], %[out], 4 \n\t" "sra %[a1], %[out], 4 \n\t"
: [out] "+r" (out), [a1] "=r" (a1) : [out] "+r"(out), [a1] "=r"(a1)
: :);
);
if (a1 < 0) { if (a1 < 0) {
/* use quad-byte /* use quad-byte
* input and output memory are four byte aligned */ * input and output memory are four byte aligned */
__asm__ __volatile__ ( __asm__ __volatile__(
"abs %[absa1], %[a1] \n\t" "abs %[absa1], %[a1] \n\t"
"replv.qb %[vector_a1], %[absa1] \n\t" "replv.qb %[vector_a1], %[absa1] \n\t"
: [absa1] "=r" (absa1), [vector_a1] "=r" (vector_a1) : [absa1] "=r"(absa1), [vector_a1] "=r"(vector_a1)
: [a1] "r" (a1) : [a1] "r"(a1));
);
for (r = 4; r--;) { for (r = 4; r--;) {
__asm__ __volatile__ ( __asm__ __volatile__(
"lw %[t2], 0(%[dest]) \n\t" "lw %[t2], 0(%[dest]) \n\t"
"subu_s.qb %[vector_a], %[t2], %[vector_a1] \n\t" "subu_s.qb %[vector_a], %[t2], %[vector_a1] \n\t"
"sw %[vector_a], 0(%[dest]) \n\t" "sw %[vector_a], 0(%[dest]) \n\t"
"add %[dest], %[dest], %[dest_stride] \n\t" "add %[dest], %[dest], %[dest_stride] \n\t"
: [t2] "=&r" (t2), [vector_a] "=&r" (vector_a), : [t2] "=&r"(t2), [vector_a] "=&r"(vector_a), [dest] "+&r"(dest)
[dest] "+&r" (dest) : [dest_stride] "r"(dest_stride), [vector_a1] "r"(vector_a1));
: [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1)
);
} }
} else { } else {
/* use quad-byte /* use quad-byte
* input and output memory are four byte aligned */ * input and output memory are four byte aligned */
__asm__ __volatile__ ( __asm__ __volatile__("replv.qb %[vector_a1], %[a1] \n\t"
"replv.qb %[vector_a1], %[a1] \n\t" : [vector_a1] "=r"(vector_a1)
: [vector_a1] "=r" (vector_a1) : [a1] "r"(a1));
: [a1] "r" (a1)
);
for (r = 4; r--;) { for (r = 4; r--;) {
__asm__ __volatile__ ( __asm__ __volatile__(
"lw %[t2], 0(%[dest]) \n\t" "lw %[t2], 0(%[dest]) \n\t"
"addu_s.qb %[vector_a], %[t2], %[vector_a1] \n\t" "addu_s.qb %[vector_a], %[t2], %[vector_a1] \n\t"
"sw %[vector_a], 0(%[dest]) \n\t" "sw %[vector_a], 0(%[dest]) \n\t"
"add %[dest], %[dest], %[dest_stride] \n\t" "add %[dest], %[dest], %[dest_stride] \n\t"
: [t2] "=&r" (t2), [vector_a] "=&r" (vector_a), : [t2] "=&r"(t2), [vector_a] "=&r"(vector_a), [dest] "+&r"(dest)
[dest] "+&r" (dest) : [dest_stride] "r"(dest_stride), [vector_a1] "r"(vector_a1));
: [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1)
);
} }
} }
} }

View File

@ -20,8 +20,8 @@ void idct8_rows_dspr2(const int16_t *input, int16_t *output, uint32_t no_rows) {
int Temp0, Temp1, Temp2, Temp3, Temp4; int Temp0, Temp1, Temp2, Temp3, Temp4;
int i; int i;
for (i = no_rows; i--; ) { for (i = no_rows; i--;) {
__asm__ __volatile__ ( __asm__ __volatile__(
/* /*
temp_1 = (input[0] + input[4]) * cospi_16_64; temp_1 = (input[0] + input[4]) * cospi_16_64;
step2_0 = dct_const_round_shift(temp_1); step2_0 = dct_const_round_shift(temp_1);
@ -174,20 +174,18 @@ void idct8_rows_dspr2(const int16_t *input, int16_t *output, uint32_t no_rows) {
"sub %[Temp1], %[step1_0], %[step1_7] \n\t" "sub %[Temp1], %[step1_0], %[step1_7] \n\t"
"sh %[Temp1], 112(%[output]) \n\t" "sh %[Temp1], 112(%[output]) \n\t"
: [step1_0] "=&r" (step1_0), [step1_1] "=&r" (step1_1), : [step1_0] "=&r"(step1_0), [step1_1] "=&r"(step1_1),
[step1_2] "=&r" (step1_2), [step1_3] "=&r" (step1_3), [step1_2] "=&r"(step1_2), [step1_3] "=&r"(step1_3),
[step1_4] "=&r" (step1_4), [step1_5] "=&r" (step1_5), [step1_4] "=&r"(step1_4), [step1_5] "=&r"(step1_5),
[step1_6] "=&r" (step1_6), [step1_7] "=&r" (step1_7), [step1_6] "=&r"(step1_6), [step1_7] "=&r"(step1_7),
[Temp0] "=&r" (Temp0), [Temp1] "=&r" (Temp1), [Temp0] "=&r"(Temp0), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2),
[Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3), [Temp3] "=&r"(Temp3), [Temp4] "=&r"(Temp4)
[Temp4] "=&r" (Temp4) : [const_2_power_13] "r"(const_2_power_13),
: [const_2_power_13] "r" (const_2_power_13), [cospi_16_64] "r"(cospi_16_64), [cospi_28_64] "r"(cospi_28_64),
[cospi_16_64] "r" (cospi_16_64), [cospi_28_64] "r" (cospi_28_64), [cospi_4_64] "r"(cospi_4_64), [cospi_12_64] "r"(cospi_12_64),
[cospi_4_64] "r" (cospi_4_64), [cospi_12_64] "r" (cospi_12_64), [cospi_20_64] "r"(cospi_20_64), [cospi_8_64] "r"(cospi_8_64),
[cospi_20_64] "r" (cospi_20_64), [cospi_8_64] "r" (cospi_8_64), [cospi_24_64] "r"(cospi_24_64), [output] "r"(output),
[cospi_24_64] "r" (cospi_24_64), [input] "r"(input));
[output] "r" (output), [input] "r" (input)
);
input += 8; input += 8;
output += 1; output += 1;
@ -216,7 +214,7 @@ void idct8_columns_add_blk_dspr2(int16_t *input, uint8_t *dest,
for (i = 0; i < 8; ++i) { for (i = 0; i < 8; ++i) {
dest_pix = (dest + i); dest_pix = (dest + i);
__asm__ __volatile__ ( __asm__ __volatile__(
/* /*
temp_1 = (input[0] + input[4]) * cospi_16_64; temp_1 = (input[0] + input[4]) * cospi_16_64;
step2_0 = dct_const_round_shift(temp_1); step2_0 = dct_const_round_shift(temp_1);
@ -423,20 +421,18 @@ void idct8_columns_add_blk_dspr2(int16_t *input, uint8_t *dest,
"lbux %[Temp2], %[Temp1](%[cm]) \n\t" "lbux %[Temp2], %[Temp1](%[cm]) \n\t"
"sb %[Temp2], 0(%[dest_pix]) \n\t" "sb %[Temp2], 0(%[dest_pix]) \n\t"
: [step1_0] "=&r" (step1_0), [step1_1] "=&r" (step1_1), : [step1_0] "=&r"(step1_0), [step1_1] "=&r"(step1_1),
[step1_2] "=&r" (step1_2), [step1_3] "=&r" (step1_3), [step1_2] "=&r"(step1_2), [step1_3] "=&r"(step1_3),
[step1_4] "=&r" (step1_4), [step1_5] "=&r" (step1_5), [step1_4] "=&r"(step1_4), [step1_5] "=&r"(step1_5),
[step1_6] "=&r" (step1_6), [step1_7] "=&r" (step1_7), [step1_6] "=&r"(step1_6), [step1_7] "=&r"(step1_7),
[Temp0] "=&r" (Temp0), [Temp1] "=&r" (Temp1), [Temp0] "=&r"(Temp0), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2),
[Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3), [Temp3] "=&r"(Temp3), [dest_pix] "+r"(dest_pix)
[dest_pix] "+r" (dest_pix) : [const_2_power_13] "r"(const_2_power_13),
: [const_2_power_13] "r" (const_2_power_13), [cospi_16_64] "r"(cospi_16_64), [cospi_28_64] "r"(cospi_28_64),
[cospi_16_64] "r" (cospi_16_64), [cospi_28_64] "r" (cospi_28_64), [cospi_4_64] "r"(cospi_4_64), [cospi_12_64] "r"(cospi_12_64),
[cospi_4_64] "r" (cospi_4_64), [cospi_12_64] "r" (cospi_12_64), [cospi_20_64] "r"(cospi_20_64), [cospi_8_64] "r"(cospi_8_64),
[cospi_20_64] "r" (cospi_20_64), [cospi_8_64] "r" (cospi_8_64), [cospi_24_64] "r"(cospi_24_64), [input] "r"(input), [cm] "r"(cm),
[cospi_24_64] "r" (cospi_24_64), [dest_stride] "r"(dest_stride));
[input] "r" (input), [cm] "r" (cm), [dest_stride] "r" (dest_stride)
);
input += 8; input += 8;
} }
@ -449,11 +445,7 @@ void vpx_idct8x8_64_add_dspr2(const int16_t *input, uint8_t *dest,
uint32_t pos = 45; uint32_t pos = 45;
/* bit positon for extract from acc */ /* bit positon for extract from acc */
__asm__ __volatile__ ( __asm__ __volatile__("wrdsp %[pos], 1 \n\t" : : [pos] "r"(pos));
"wrdsp %[pos], 1 \n\t"
:
: [pos] "r" (pos)
);
// First transform rows // First transform rows
idct8_rows_dspr2(input, outptr, 8); idct8_rows_dspr2(input, outptr, 8);
@ -469,18 +461,14 @@ void vpx_idct8x8_12_add_dspr2(const int16_t *input, uint8_t *dest,
uint32_t pos = 45; uint32_t pos = 45;
/* bit positon for extract from acc */ /* bit positon for extract from acc */
__asm__ __volatile__ ( __asm__ __volatile__("wrdsp %[pos], 1 \n\t" : : [pos] "r"(pos));
"wrdsp %[pos], 1 \n\t"
:
: [pos] "r" (pos)
);
// First transform rows // First transform rows
idct8_rows_dspr2(input, outptr, 4); idct8_rows_dspr2(input, outptr, 4);
outptr += 4; outptr += 4;
__asm__ __volatile__ ( __asm__ __volatile__(
"sw $zero, 0(%[outptr]) \n\t" "sw $zero, 0(%[outptr]) \n\t"
"sw $zero, 4(%[outptr]) \n\t" "sw $zero, 4(%[outptr]) \n\t"
"sw $zero, 16(%[outptr]) \n\t" "sw $zero, 16(%[outptr]) \n\t"
@ -499,9 +487,7 @@ void vpx_idct8x8_12_add_dspr2(const int16_t *input, uint8_t *dest,
"sw $zero, 116(%[outptr]) \n\t" "sw $zero, 116(%[outptr]) \n\t"
: :
: [outptr] "r" (outptr) : [outptr] "r"(outptr));
);
// Then transform columns and add to dest // Then transform columns and add to dest
idct8_columns_add_blk_dspr2(&out[0], dest, dest_stride); idct8_columns_add_blk_dspr2(&out[0], dest, dest_stride);
@ -516,35 +502,31 @@ void vpx_idct8x8_1_add_dspr2(const int16_t *input, uint8_t *dest,
int32_t t1, t2, vector_a1, vector_1, vector_2; int32_t t1, t2, vector_a1, vector_1, vector_2;
/* bit positon for extract from acc */ /* bit positon for extract from acc */
__asm__ __volatile__ ( __asm__ __volatile__("wrdsp %[pos], 1 \n\t"
"wrdsp %[pos], 1 \n\t"
: :
: [pos] "r" (pos) : [pos] "r"(pos));
);
out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input[0]); out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input[0]);
__asm__ __volatile__ ( __asm__ __volatile__(
"addi %[out], %[out], 16 \n\t" "addi %[out], %[out], 16 \n\t"
"sra %[a1], %[out], 5 \n\t" "sra %[a1], %[out], 5 \n\t"
: [out] "+r" (out), [a1] "=r" (a1) : [out] "+r"(out), [a1] "=r"(a1)
: :);
);
if (a1 < 0) { if (a1 < 0) {
/* use quad-byte /* use quad-byte
* input and output memory are four byte aligned */ * input and output memory are four byte aligned */
__asm__ __volatile__ ( __asm__ __volatile__(
"abs %[absa1], %[a1] \n\t" "abs %[absa1], %[a1] \n\t"
"replv.qb %[vector_a1], %[absa1] \n\t" "replv.qb %[vector_a1], %[absa1] \n\t"
: [absa1] "=r" (absa1), [vector_a1] "=r" (vector_a1) : [absa1] "=r"(absa1), [vector_a1] "=r"(vector_a1)
: [a1] "r" (a1) : [a1] "r"(a1));
);
for (r = 8; r--;) { for (r = 8; r--;) {
__asm__ __volatile__ ( __asm__ __volatile__(
"lw %[t1], 0(%[dest]) \n\t" "lw %[t1], 0(%[dest]) \n\t"
"lw %[t2], 4(%[dest]) \n\t" "lw %[t2], 4(%[dest]) \n\t"
"subu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t" "subu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t"
@ -553,24 +535,20 @@ void vpx_idct8x8_1_add_dspr2(const int16_t *input, uint8_t *dest,
"sw %[vector_2], 4(%[dest]) \n\t" "sw %[vector_2], 4(%[dest]) \n\t"
"add %[dest], %[dest], %[dest_stride] \n\t" "add %[dest], %[dest], %[dest_stride] \n\t"
: [t1] "=&r" (t1), [t2] "=&r" (t2), : [t1] "=&r"(t1), [t2] "=&r"(t2), [vector_1] "=&r"(vector_1),
[vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2), [vector_2] "=&r"(vector_2), [dest] "+&r"(dest)
[dest] "+&r" (dest) : [dest_stride] "r"(dest_stride), [vector_a1] "r"(vector_a1));
: [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1)
);
} }
} else { } else {
/* use quad-byte /* use quad-byte
* input and output memory are four byte aligned */ * input and output memory are four byte aligned */
__asm__ __volatile__ ( __asm__ __volatile__("replv.qb %[vector_a1], %[a1] \n\t"
"replv.qb %[vector_a1], %[a1] \n\t"
: [vector_a1] "=r" (vector_a1) : [vector_a1] "=r"(vector_a1)
: [a1] "r" (a1) : [a1] "r"(a1));
);
for (r = 8; r--;) { for (r = 8; r--;) {
__asm__ __volatile__ ( __asm__ __volatile__(
"lw %[t1], 0(%[dest]) \n\t" "lw %[t1], 0(%[dest]) \n\t"
"lw %[t2], 4(%[dest]) \n\t" "lw %[t2], 4(%[dest]) \n\t"
"addu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t" "addu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t"
@ -579,11 +557,9 @@ void vpx_idct8x8_1_add_dspr2(const int16_t *input, uint8_t *dest,
"sw %[vector_2], 4(%[dest]) \n\t" "sw %[vector_2], 4(%[dest]) \n\t"
"add %[dest], %[dest], %[dest_stride] \n\t" "add %[dest], %[dest], %[dest_stride] \n\t"
: [t1] "=&r" (t1), [t2] "=&r" (t2), : [t1] "=&r"(t1), [t2] "=&r"(t2), [vector_1] "=&r"(vector_1),
[vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2), [vector_2] "=&r"(vector_2), [dest] "+r"(dest)
[dest] "+r" (dest) : [dest_stride] "r"(dest_stride), [vector_a1] "r"(vector_a1));
: [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1)
);
} }
} }
} }
@ -602,8 +578,8 @@ void iadst8_dspr2(const int16_t *input, int16_t *output) {
x7 = input[6]; x7 = input[6];
if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) { if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {
output[0] = output[1] = output[2] = output[3] = output[4] output[0] = output[1] = output[2] = output[3] = output[4] = output[5] =
= output[5] = output[6] = output[7] = 0; output[6] = output[7] = 0;
return; return;
} }

View File

@ -11,8 +11,7 @@
#include "vpx_ports/mem.h" #include "vpx_ports/mem.h"
#include "vpx_dsp/mips/loopfilter_msa.h" #include "vpx_dsp/mips/loopfilter_msa.h"
int32_t vpx_hz_lpf_t4_and_t8_16w(uint8_t *src, int32_t pitch, int32_t vpx_hz_lpf_t4_and_t8_16w(uint8_t *src, int32_t pitch, uint8_t *filter48,
uint8_t *filter48,
const uint8_t *b_limit_ptr, const uint8_t *b_limit_ptr,
const uint8_t *limit_ptr, const uint8_t *limit_ptr,
const uint8_t *thresh_ptr) { const uint8_t *thresh_ptr) {
@ -33,8 +32,8 @@ int32_t vpx_hz_lpf_t4_and_t8_16w(uint8_t *src, int32_t pitch,
limit = (v16u8)__msa_fill_b(*limit_ptr); limit = (v16u8)__msa_fill_b(*limit_ptr);
/* mask and hev */ /* mask and hev */
LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
hev, mask, flat); mask, flat);
VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat); VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out); VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);
@ -43,9 +42,8 @@ int32_t vpx_hz_lpf_t4_and_t8_16w(uint8_t *src, int32_t pitch,
return 1; return 1;
} else { } else {
ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero,
zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r);
q2_r, q3_r);
VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r, VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r); p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
@ -107,8 +105,8 @@ void vpx_hz_lpf_t16_16w(uint8_t *src, int32_t pitch, uint8_t *filter48) {
} else { } else {
src -= 7 * pitch; src -= 7 * pitch;
ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2, ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2, zero,
zero, p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in, p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in,
p2_r_in, p1_r_in, p0_r_in); p2_r_in, p1_r_in, p0_r_in);
q0_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q0); q0_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q0);
@ -408,8 +406,7 @@ void vpx_hz_lpf_t16_16w(uint8_t *src, int32_t pitch, uint8_t *filter48) {
void vpx_lpf_horizontal_16_dual_msa(uint8_t *src, int32_t pitch, void vpx_lpf_horizontal_16_dual_msa(uint8_t *src, int32_t pitch,
const uint8_t *b_limit_ptr, const uint8_t *b_limit_ptr,
const uint8_t *limit_ptr, const uint8_t *limit_ptr,
const uint8_t *thresh_ptr, const uint8_t *thresh_ptr, int32_t count) {
int32_t count) {
DECLARE_ALIGNED(32, uint8_t, filter48[16 * 8]); DECLARE_ALIGNED(32, uint8_t, filter48[16 * 8]);
uint8_t early_exit = 0; uint8_t early_exit = 0;
@ -426,8 +423,7 @@ void vpx_lpf_horizontal_16_dual_msa(uint8_t *src, int32_t pitch,
static void mb_lpf_horizontal_edge(uint8_t *src, int32_t pitch, static void mb_lpf_horizontal_edge(uint8_t *src, int32_t pitch,
const uint8_t *b_limit_ptr, const uint8_t *b_limit_ptr,
const uint8_t *limit_ptr, const uint8_t *limit_ptr,
const uint8_t *thresh_ptr, const uint8_t *thresh_ptr, int32_t count) {
int32_t count) {
if (1 == count) { if (1 == count) {
uint64_t p2_d, p1_d, p0_d, q0_d, q1_d, q2_d; uint64_t p2_d, p1_d, p0_d, q0_d, q1_d, q2_d;
uint64_t dword0, dword1; uint64_t dword0, dword1;
@ -449,8 +445,8 @@ static void mb_lpf_horizontal_edge(uint8_t *src, int32_t pitch,
b_limit = (v16u8)__msa_fill_b(*b_limit_ptr); b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
limit = (v16u8)__msa_fill_b(*limit_ptr); limit = (v16u8)__msa_fill_b(*limit_ptr);
LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
hev, mask, flat); mask, flat);
VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat); VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
VP9_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, VP9_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
q1_out); q1_out);
@ -472,9 +468,8 @@ static void mb_lpf_horizontal_edge(uint8_t *src, int32_t pitch,
p1_filter8, p0_filter8, q0_filter8, q1_filter8, q2_filter8); p1_filter8, p0_filter8, q0_filter8, q1_filter8, q2_filter8);
/* convert 16 bit output data into 8 bit */ /* convert 16 bit output data into 8 bit */
PCKEV_B4_SH(zero, p2_filter8, zero, p1_filter8, zero, p0_filter8, PCKEV_B4_SH(zero, p2_filter8, zero, p1_filter8, zero, p0_filter8, zero,
zero, q0_filter8, p2_filter8, p1_filter8, p0_filter8, q0_filter8, p2_filter8, p1_filter8, p0_filter8, q0_filter8);
q0_filter8);
PCKEV_B2_SH(zero, q1_filter8, zero, q2_filter8, q1_filter8, q2_filter8); PCKEV_B2_SH(zero, q1_filter8, zero, q2_filter8, q1_filter8, q2_filter8);
/* store pixel values */ /* store pixel values */
@ -668,8 +663,8 @@ static void transpose_16x8_to_8x16(uint8_t *input, int32_t in_pitch,
v16i8 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; v16i8 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7; v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
LD_UB8(input, in_pitch, LD_UB8(input, in_pitch, p7_org, p6_org, p5_org, p4_org, p3_org, p2_org,
p7_org, p6_org, p5_org, p4_org, p3_org, p2_org, p1_org, p0_org); p1_org, p0_org);
/* 8x8 transpose */ /* 8x8 transpose */
TRANSPOSE8x8_UB_UB(p7_org, p6_org, p5_org, p4_org, p3_org, p2_org, p1_org, TRANSPOSE8x8_UB_UB(p7_org, p6_org, p5_org, p4_org, p3_org, p2_org, p1_org,
p0_org, p7, p6, p5, p4, p3, p2, p1, p0); p0_org, p7, p6, p5, p4, p3, p2, p1, p0);
@ -699,8 +694,8 @@ static void transpose_8x16_to_16x8(uint8_t *input, int32_t in_pitch,
ST_UB8(p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o, output, out_pitch); ST_UB8(p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o, output, out_pitch);
} }
static void transpose_16x16(uint8_t *input, int32_t in_pitch, static void transpose_16x16(uint8_t *input, int32_t in_pitch, uint8_t *output,
uint8_t *output, int32_t out_pitch) { int32_t out_pitch) {
v16u8 row0, row1, row2, row3, row4, row5, row6, row7; v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
v16u8 row8, row9, row10, row11, row12, row13, row14, row15; v16u8 row8, row9, row10, row11, row12, row13, row14, row15;
v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7; v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
@ -709,12 +704,11 @@ static void transpose_16x16(uint8_t *input, int32_t in_pitch,
LD_UB8(input, in_pitch, row0, row1, row2, row3, row4, row5, row6, row7); LD_UB8(input, in_pitch, row0, row1, row2, row3, row4, row5, row6, row7);
input += (8 * in_pitch); input += (8 * in_pitch);
LD_UB8(input, in_pitch, LD_UB8(input, in_pitch, row8, row9, row10, row11, row12, row13, row14, row15);
row8, row9, row10, row11, row12, row13, row14, row15);
TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7, TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7, row8,
row8, row9, row10, row11, row12, row13, row14, row15, row9, row10, row11, row12, row13, row14, row15, p7, p6,
p7, p6, p5, p4, p3, p2, p1, p0); p5, p4, p3, p2, p1, p0);
/* transpose 16x8 matrix into 8x16 */ /* transpose 16x8 matrix into 8x16 */
/* total 8 intermediate register and 32 instructions */ /* total 8 intermediate register and 32 instructions */
@ -779,8 +773,8 @@ int32_t vpx_vt_lpf_t4_and_t8_8w(uint8_t *src, uint8_t *filter48,
limit = (v16u8)__msa_fill_b(*limit_ptr); limit = (v16u8)__msa_fill_b(*limit_ptr);
/* mask and hev */ /* mask and hev */
LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
hev, mask, flat); mask, flat);
/* flat4 */ /* flat4 */
VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat); VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
/* filter4 */ /* filter4 */
@ -794,9 +788,8 @@ int32_t vpx_vt_lpf_t4_and_t8_8w(uint8_t *src, uint8_t *filter48,
ST4x8_UB(vec2, vec3, (src_org - 2), pitch_org); ST4x8_UB(vec2, vec3, (src_org - 2), pitch_org);
return 1; return 1;
} else { } else {
ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero,
zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r);
q3_r);
VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r, VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r); p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
@ -864,9 +857,9 @@ int32_t vpx_vt_lpf_t16_8w(uint8_t *src, uint8_t *src_org, int32_t pitch,
} else { } else {
src -= 7 * 16; src -= 7 * 16;
ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2, ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2, zero,
zero, p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in, p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in,
p3_r_in, p2_r_in, p1_r_in, p0_r_in); p2_r_in, p1_r_in, p0_r_in);
q0_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q0); q0_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q0);
tmp0_r = p7_r_in << 3; tmp0_r = p7_r_in << 3;
@ -1056,9 +1049,9 @@ void vpx_lpf_vertical_16_msa(uint8_t *src, int32_t pitch,
transpose_16x8_to_8x16(src - 8, pitch, transposed_input, 16); transpose_16x8_to_8x16(src - 8, pitch, transposed_input, 16);
early_exit = vpx_vt_lpf_t4_and_t8_8w((transposed_input + 16 * 8), early_exit =
&filter48[0], src, pitch, b_limit_ptr, vpx_vt_lpf_t4_and_t8_8w((transposed_input + 16 * 8), &filter48[0], src,
limit_ptr, thresh_ptr); pitch, b_limit_ptr, limit_ptr, thresh_ptr);
if (0 == early_exit) { if (0 == early_exit) {
early_exit = vpx_vt_lpf_t16_8w((transposed_input + 16 * 8), src, pitch, early_exit = vpx_vt_lpf_t16_8w((transposed_input + 16 * 8), src, pitch,
@ -1093,8 +1086,8 @@ int32_t vpx_vt_lpf_t4_and_t8_16w(uint8_t *src, uint8_t *filter48,
limit = (v16u8)__msa_fill_b(*limit_ptr); limit = (v16u8)__msa_fill_b(*limit_ptr);
/* mask and hev */ /* mask and hev */
LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
hev, mask, flat); mask, flat);
/* flat4 */ /* flat4 */
VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat); VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
/* filter4 */ /* filter4 */
@ -1113,9 +1106,8 @@ int32_t vpx_vt_lpf_t4_and_t8_16w(uint8_t *src, uint8_t *filter48,
return 1; return 1;
} else { } else {
ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero,
zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r);
q3_r);
VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r, VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r); p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l, p0_l); ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l, p0_l);
@ -1196,9 +1188,9 @@ int32_t vpx_vt_lpf_t16_16w(uint8_t *src, uint8_t *src_org, int32_t pitch,
} else { } else {
src -= 7 * 16; src -= 7 * 16;
ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2, ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2, zero,
zero, p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in, p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in,
p3_r_in, p2_r_in, p1_r_in, p0_r_in); p2_r_in, p1_r_in, p0_r_in);
q0_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q0); q0_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q0);
tmp0_r = p7_r_in << 3; tmp0_r = p7_r_in << 3;
@ -1479,9 +1471,9 @@ void vpx_lpf_vertical_16_dual_msa(uint8_t *src, int32_t pitch,
transpose_16x16((src - 8), pitch, &transposed_input[0], 16); transpose_16x16((src - 8), pitch, &transposed_input[0], 16);
early_exit = vpx_vt_lpf_t4_and_t8_16w((transposed_input + 16 * 8), early_exit =
&filter48[0], src, pitch, b_limit_ptr, vpx_vt_lpf_t4_and_t8_16w((transposed_input + 16 * 8), &filter48[0], src,
limit_ptr, thresh_ptr); pitch, b_limit_ptr, limit_ptr, thresh_ptr);
if (0 == early_exit) { if (0 == early_exit) {
early_exit = vpx_vt_lpf_t16_16w((transposed_input + 16 * 8), src, pitch, early_exit = vpx_vt_lpf_t16_16w((transposed_input + 16 * 8), src, pitch,

View File

@ -25,8 +25,8 @@ void vpx_lpf_horizontal_4_msa(uint8_t *src, int32_t pitch,
b_limit = (v16u8)__msa_fill_b(*b_limit_ptr); b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
limit = (v16u8)__msa_fill_b(*limit_ptr); limit = (v16u8)__msa_fill_b(*limit_ptr);
LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
hev, mask, flat); mask, flat);
VP9_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out); VP9_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);
p1_d = __msa_copy_u_d((v2i64)p1_out, 0); p1_d = __msa_copy_u_d((v2i64)p1_out, 0);
@ -61,8 +61,8 @@ void vpx_lpf_horizontal_4_dual_msa(uint8_t *src, int32_t pitch,
limit1 = (v16u8)__msa_fill_b(*limit1_ptr); limit1 = (v16u8)__msa_fill_b(*limit1_ptr);
limit0 = (v16u8)__msa_ilvr_d((v2i64)limit1, (v2i64)limit0); limit0 = (v16u8)__msa_ilvr_d((v2i64)limit1, (v2i64)limit0);
LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0, LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0, hev,
hev, mask, flat); mask, flat);
VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1); VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1);
ST_UB4(p1, p0, q0, q1, (src - 2 * pitch), pitch); ST_UB4(p1, p0, q0, q1, (src - 2 * pitch), pitch);
@ -82,10 +82,10 @@ void vpx_lpf_vertical_4_msa(uint8_t *src, int32_t pitch,
b_limit = (v16u8)__msa_fill_b(*b_limit_ptr); b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
limit = (v16u8)__msa_fill_b(*limit_ptr); limit = (v16u8)__msa_fill_b(*limit_ptr);
TRANSPOSE8x8_UB_UB(p3, p2, p1, p0, q0, q1, q2, q3, TRANSPOSE8x8_UB_UB(p3, p2, p1, p0, q0, q1, q2, q3, p3, p2, p1, p0, q0, q1, q2,
p3, p2, p1, p0, q0, q1, q2, q3); q3);
LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
hev, mask, flat); mask, flat);
VP9_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1); VP9_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1);
ILVR_B2_SH(p0, p1, q1, q0, vec0, vec1); ILVR_B2_SH(p0, p1, q1, q0, vec0, vec1);
ILVRL_H2_SH(vec1, vec0, vec2, vec3); ILVRL_H2_SH(vec1, vec0, vec2, vec3);
@ -111,12 +111,12 @@ void vpx_lpf_vertical_4_dual_msa(uint8_t *src, int32_t pitch,
v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5; v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
LD_UB8(src - 4, pitch, row0, row1, row2, row3, row4, row5, row6, row7); LD_UB8(src - 4, pitch, row0, row1, row2, row3, row4, row5, row6, row7);
LD_UB8(src - 4 + (8 * pitch), pitch, LD_UB8(src - 4 + (8 * pitch), pitch, row8, row9, row10, row11, row12, row13,
row8, row9, row10, row11, row12, row13, row14, row15); row14, row15);
TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7, TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7, row8,
row8, row9, row10, row11, row12, row13, row14, row15, row9, row10, row11, row12, row13, row14, row15, p3, p2,
p3, p2, p1, p0, q0, q1, q2, q3); p1, p0, q0, q1, q2, q3);
thresh0 = (v16u8)__msa_fill_b(*thresh0_ptr); thresh0 = (v16u8)__msa_fill_b(*thresh0_ptr);
thresh1 = (v16u8)__msa_fill_b(*thresh1_ptr); thresh1 = (v16u8)__msa_fill_b(*thresh1_ptr);
@ -130,8 +130,8 @@ void vpx_lpf_vertical_4_dual_msa(uint8_t *src, int32_t pitch,
limit1 = (v16u8)__msa_fill_b(*limit1_ptr); limit1 = (v16u8)__msa_fill_b(*limit1_ptr);
limit0 = (v16u8)__msa_ilvr_d((v2i64)limit1, (v2i64)limit0); limit0 = (v16u8)__msa_ilvr_d((v2i64)limit1, (v2i64)limit0);
LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0, LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0, hev,
hev, mask, flat); mask, flat);
VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1); VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1);
ILVR_B2_SH(p0, p1, q1, q0, tmp0, tmp1); ILVR_B2_SH(p0, p1, q1, q0, tmp0, tmp1);
ILVRL_H2_SH(tmp1, tmp0, tmp2, tmp3); ILVRL_H2_SH(tmp1, tmp0, tmp2, tmp3);

View File

@ -29,8 +29,8 @@ void vpx_lpf_horizontal_8_msa(uint8_t *src, int32_t pitch,
b_limit = (v16u8)__msa_fill_b(*b_limit_ptr); b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
limit = (v16u8)__msa_fill_b(*limit_ptr); limit = (v16u8)__msa_fill_b(*limit_ptr);
LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
hev, mask, flat); mask, flat);
VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat); VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
VP9_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out); VP9_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);
@ -43,16 +43,14 @@ void vpx_lpf_horizontal_8_msa(uint8_t *src, int32_t pitch,
q1_d = __msa_copy_u_d((v2i64)q1_out, 0); q1_d = __msa_copy_u_d((v2i64)q1_out, 0);
SD4(p1_d, p0_d, q0_d, q1_d, (src - 2 * pitch), pitch); SD4(p1_d, p0_d, q0_d, q1_d, (src - 2 * pitch), pitch);
} else { } else {
ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero,
zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r);
q2_r, q3_r);
VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filter8, VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filter8,
p1_filter8, p0_filter8, q0_filter8, q1_filter8, q2_filter8); p1_filter8, p0_filter8, q0_filter8, q1_filter8, q2_filter8);
/* convert 16 bit output data into 8 bit */ /* convert 16 bit output data into 8 bit */
PCKEV_B4_SH(zero, p2_filter8, zero, p1_filter8, zero, p0_filter8, PCKEV_B4_SH(zero, p2_filter8, zero, p1_filter8, zero, p0_filter8, zero,
zero, q0_filter8, p2_filter8, p1_filter8, p0_filter8, q0_filter8, p2_filter8, p1_filter8, p0_filter8, q0_filter8);
q0_filter8);
PCKEV_B2_SH(zero, q1_filter8, zero, q2_filter8, q1_filter8, q2_filter8); PCKEV_B2_SH(zero, q1_filter8, zero, q2_filter8, q1_filter8, q2_filter8);
/* store pixel values */ /* store pixel values */
@ -80,12 +78,9 @@ void vpx_lpf_horizontal_8_msa(uint8_t *src, int32_t pitch,
} }
} }
void vpx_lpf_horizontal_8_dual_msa(uint8_t *src, int32_t pitch, void vpx_lpf_horizontal_8_dual_msa(
const uint8_t *b_limit0, uint8_t *src, int32_t pitch, const uint8_t *b_limit0, const uint8_t *limit0,
const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *b_limit1, const uint8_t *limit1,
const uint8_t *thresh0,
const uint8_t *b_limit1,
const uint8_t *limit1,
const uint8_t *thresh1) { const uint8_t *thresh1) {
v16u8 p3, p2, p1, p0, q3, q2, q1, q0; v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out; v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
@ -112,17 +107,16 @@ void vpx_lpf_horizontal_8_dual_msa(uint8_t *src, int32_t pitch,
limit = (v16u8)__msa_ilvr_d((v2i64)tmp, (v2i64)limit); limit = (v16u8)__msa_ilvr_d((v2i64)tmp, (v2i64)limit);
/* mask and hev */ /* mask and hev */
LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
hev, mask, flat); mask, flat);
VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat); VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out); VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);
if (__msa_test_bz_v(flat)) { if (__msa_test_bz_v(flat)) {
ST_UB4(p1_out, p0_out, q0_out, q1_out, (src - 2 * pitch), pitch); ST_UB4(p1_out, p0_out, q0_out, q1_out, (src - 2 * pitch), pitch);
} else { } else {
ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero,
zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r);
q2_r, q3_r);
VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r, VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r); p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
@ -170,16 +164,16 @@ void vpx_lpf_vertical_8_msa(uint8_t *src, int32_t pitch,
/* load vector elements */ /* load vector elements */
LD_UB8(src - 4, pitch, p3, p2, p1, p0, q0, q1, q2, q3); LD_UB8(src - 4, pitch, p3, p2, p1, p0, q0, q1, q2, q3);
TRANSPOSE8x8_UB_UB(p3, p2, p1, p0, q0, q1, q2, q3, TRANSPOSE8x8_UB_UB(p3, p2, p1, p0, q0, q1, q2, q3, p3, p2, p1, p0, q0, q1, q2,
p3, p2, p1, p0, q0, q1, q2, q3); q3);
thresh = (v16u8)__msa_fill_b(*thresh_ptr); thresh = (v16u8)__msa_fill_b(*thresh_ptr);
b_limit = (v16u8)__msa_fill_b(*b_limit_ptr); b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
limit = (v16u8)__msa_fill_b(*limit_ptr); limit = (v16u8)__msa_fill_b(*limit_ptr);
/* mask and hev */ /* mask and hev */
LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
hev, mask, flat); mask, flat);
/* flat4 */ /* flat4 */
VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat); VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
/* filter4 */ /* filter4 */
@ -197,9 +191,8 @@ void vpx_lpf_vertical_8_msa(uint8_t *src, int32_t pitch,
src += 4 * pitch; src += 4 * pitch;
ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch); ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch);
} else { } else {
ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero,
zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r);
q3_r);
VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r, VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r); p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
/* convert 16 bit output data into 8 bit */ /* convert 16 bit output data into 8 bit */
@ -232,11 +225,9 @@ void vpx_lpf_vertical_8_msa(uint8_t *src, int32_t pitch,
} }
void vpx_lpf_vertical_8_dual_msa(uint8_t *src, int32_t pitch, void vpx_lpf_vertical_8_dual_msa(uint8_t *src, int32_t pitch,
const uint8_t *b_limit0, const uint8_t *b_limit0, const uint8_t *limit0,
const uint8_t *limit0,
const uint8_t *thresh0, const uint8_t *thresh0,
const uint8_t *b_limit1, const uint8_t *b_limit1, const uint8_t *limit1,
const uint8_t *limit1,
const uint8_t *thresh1) { const uint8_t *thresh1) {
uint8_t *temp_src; uint8_t *temp_src;
v16u8 p3, p2, p1, p0, q3, q2, q1, q0; v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
@ -257,9 +248,9 @@ void vpx_lpf_vertical_8_dual_msa(uint8_t *src, int32_t pitch,
LD_UB8(temp_src, pitch, q3, q2, q1, q0, row12, row13, row14, row15); LD_UB8(temp_src, pitch, q3, q2, q1, q0, row12, row13, row14, row15);
/* transpose 16x8 matrix into 8x16 */ /* transpose 16x8 matrix into 8x16 */
TRANSPOSE16x8_UB_UB(p0, p1, p2, p3, row4, row5, row6, row7, TRANSPOSE16x8_UB_UB(p0, p1, p2, p3, row4, row5, row6, row7, q3, q2, q1, q0,
q3, q2, q1, q0, row12, row13, row14, row15, row12, row13, row14, row15, p3, p2, p1, p0, q0, q1, q2,
p3, p2, p1, p0, q0, q1, q2, q3); q3);
thresh = (v16u8)__msa_fill_b(*thresh0); thresh = (v16u8)__msa_fill_b(*thresh0);
vec0 = (v8i16)__msa_fill_b(*thresh1); vec0 = (v8i16)__msa_fill_b(*thresh1);
@ -274,8 +265,8 @@ void vpx_lpf_vertical_8_dual_msa(uint8_t *src, int32_t pitch,
limit = (v16u8)__msa_ilvr_d((v2i64)vec0, (v2i64)limit); limit = (v16u8)__msa_ilvr_d((v2i64)vec0, (v2i64)limit);
/* mask and hev */ /* mask and hev */
LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
hev, mask, flat); mask, flat);
/* flat4 */ /* flat4 */
VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat); VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
/* filter4 */ /* filter4 */
@ -292,9 +283,8 @@ void vpx_lpf_vertical_8_dual_msa(uint8_t *src, int32_t pitch,
src += 8 * pitch; src += 8 * pitch;
ST4x8_UB(vec4, vec5, src, pitch); ST4x8_UB(vec4, vec5, src, pitch);
} else { } else {
ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero,
zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r);
q3_r);
VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r, VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r); p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);

View File

@ -19,10 +19,8 @@
#include "vpx_mem/vpx_mem.h" #include "vpx_mem/vpx_mem.h"
#if HAVE_DSPR2 #if HAVE_DSPR2
void vpx_lpf_horizontal_4_dspr2(unsigned char *s, void vpx_lpf_horizontal_4_dspr2(unsigned char *s, int pitch,
int pitch, const uint8_t *blimit, const uint8_t *limit,
const uint8_t *blimit,
const uint8_t *limit,
const uint8_t *thresh) { const uint8_t *thresh) {
uint8_t i; uint8_t i;
uint32_t mask; uint32_t mask;
@ -37,15 +35,14 @@ void vpx_lpf_horizontal_4_dspr2(unsigned char *s,
uthresh = *thresh; uthresh = *thresh;
/* create quad-byte */ /* create quad-byte */
__asm__ __volatile__ ( __asm__ __volatile__(
"replv.qb %[thresh_vec], %[uthresh] \n\t" "replv.qb %[thresh_vec], %[uthresh] \n\t"
"replv.qb %[flimit_vec], %[uflimit] \n\t" "replv.qb %[flimit_vec], %[uflimit] \n\t"
"replv.qb %[limit_vec], %[ulimit] \n\t" "replv.qb %[limit_vec], %[ulimit] \n\t"
: [thresh_vec] "=&r" (thresh_vec), [flimit_vec] "=&r" (flimit_vec), : [thresh_vec] "=&r"(thresh_vec), [flimit_vec] "=&r"(flimit_vec),
[limit_vec] "=r" (limit_vec) [limit_vec] "=r"(limit_vec)
: [uthresh] "r" (uthresh), [uflimit] "r" (uflimit), [ulimit] "r" (ulimit) : [uthresh] "r"(uthresh), [uflimit] "r"(uflimit), [ulimit] "r"(ulimit));
);
/* prefetch data for store */ /* prefetch data for store */
prefetch_store(s); prefetch_store(s);
@ -62,49 +59,44 @@ void vpx_lpf_horizontal_4_dspr2(unsigned char *s,
s5 = s4 + pitch; s5 = s4 + pitch;
s6 = s5 + pitch; s6 = s5 + pitch;
__asm__ __volatile__ ( __asm__ __volatile__(
"lw %[p1], (%[s1]) \n\t" "lw %[p1], (%[s1]) \n\t"
"lw %[p2], (%[s2]) \n\t" "lw %[p2], (%[s2]) \n\t"
"lw %[p3], (%[s3]) \n\t" "lw %[p3], (%[s3]) \n\t"
"lw %[p4], (%[s4]) \n\t" "lw %[p4], (%[s4]) \n\t"
: [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4) : [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4)
: [s1] "r" (s1), [s2] "r" (s2), [s3] "r" (s3), [s4] "r" (s4) : [s1] "r"(s1), [s2] "r"(s2), [s3] "r"(s3), [s4] "r"(s4));
);
/* if (p1 - p4 == 0) and (p2 - p3 == 0) /* if (p1 - p4 == 0) and (p2 - p3 == 0)
mask will be zero and filtering is not needed */ mask will be zero and filtering is not needed */
if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) { if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) {
__asm__ __volatile__ ( __asm__ __volatile__(
"lw %[pm1], (%[sm1]) \n\t" "lw %[pm1], (%[sm1]) \n\t"
"lw %[p0], (%[s0]) \n\t" "lw %[p0], (%[s0]) \n\t"
"lw %[p5], (%[s5]) \n\t" "lw %[p5], (%[s5]) \n\t"
"lw %[p6], (%[s6]) \n\t" "lw %[p6], (%[s6]) \n\t"
: [pm1] "=&r" (pm1), [p0] "=&r" (p0), [p5] "=&r" (p5), : [pm1] "=&r"(pm1), [p0] "=&r"(p0), [p5] "=&r"(p5), [p6] "=&r"(p6)
[p6] "=&r" (p6) : [sm1] "r"(sm1), [s0] "r"(s0), [s5] "r"(s5), [s6] "r"(s6));
: [sm1] "r" (sm1), [s0] "r" (s0), [s5] "r" (s5), [s6] "r" (s6)
);
filter_hev_mask_dspr2(limit_vec, flimit_vec, p1, p2, filter_hev_mask_dspr2(limit_vec, flimit_vec, p1, p2, pm1, p0, p3, p4, p5,
pm1, p0, p3, p4, p5, p6, p6, thresh_vec, &hev, &mask);
thresh_vec, &hev, &mask);
/* if mask == 0 do filtering is not needed */ /* if mask == 0 do filtering is not needed */
if (mask) { if (mask) {
/* filtering */ /* filtering */
filter_dspr2(mask, hev, &p1, &p2, &p3, &p4); filter_dspr2(mask, hev, &p1, &p2, &p3, &p4);
__asm__ __volatile__ ( __asm__ __volatile__(
"sw %[p1], (%[s1]) \n\t" "sw %[p1], (%[s1]) \n\t"
"sw %[p2], (%[s2]) \n\t" "sw %[p2], (%[s2]) \n\t"
"sw %[p3], (%[s3]) \n\t" "sw %[p3], (%[s3]) \n\t"
"sw %[p4], (%[s4]) \n\t" "sw %[p4], (%[s4]) \n\t"
: :
: [p1] "r" (p1), [p2] "r" (p2), [p3] "r" (p3), [p4] "r" (p4), : [p1] "r"(p1), [p2] "r"(p2), [p3] "r"(p3), [p4] "r"(p4),
[s1] "r" (s1), [s2] "r" (s2), [s3] "r" (s3), [s4] "r" (s4) [s1] "r"(s1), [s2] "r"(s2), [s3] "r"(s3), [s4] "r"(s4));
);
} }
} }
@ -112,10 +104,8 @@ void vpx_lpf_horizontal_4_dspr2(unsigned char *s,
} }
} }
void vpx_lpf_vertical_4_dspr2(unsigned char *s, void vpx_lpf_vertical_4_dspr2(unsigned char *s, int pitch,
int pitch, const uint8_t *blimit, const uint8_t *limit,
const uint8_t *blimit,
const uint8_t *limit,
const uint8_t *thresh) { const uint8_t *thresh) {
uint8_t i; uint8_t i;
uint32_t mask, hev; uint32_t mask, hev;
@ -130,15 +120,14 @@ void vpx_lpf_vertical_4_dspr2(unsigned char *s,
uthresh = *thresh; uthresh = *thresh;
/* create quad-byte */ /* create quad-byte */
__asm__ __volatile__ ( __asm__ __volatile__(
"replv.qb %[thresh_vec], %[uthresh] \n\t" "replv.qb %[thresh_vec], %[uthresh] \n\t"
"replv.qb %[flimit_vec], %[uflimit] \n\t" "replv.qb %[flimit_vec], %[uflimit] \n\t"
"replv.qb %[limit_vec], %[ulimit] \n\t" "replv.qb %[limit_vec], %[ulimit] \n\t"
: [thresh_vec] "=&r" (thresh_vec), [flimit_vec] "=&r" (flimit_vec), : [thresh_vec] "=&r"(thresh_vec), [flimit_vec] "=&r"(flimit_vec),
[limit_vec] "=r" (limit_vec) [limit_vec] "=r"(limit_vec)
: [uthresh] "r" (uthresh), [uflimit] "r" (uflimit), [ulimit] "r" (ulimit) : [uthresh] "r"(uthresh), [uflimit] "r"(uflimit), [ulimit] "r"(ulimit));
);
/* prefetch data for store */ /* prefetch data for store */
prefetch_store(s + pitch); prefetch_store(s + pitch);
@ -163,7 +152,7 @@ void vpx_lpf_vertical_4_dspr2(unsigned char *s,
p3 = *((uint32_t *)(s4)); p3 = *((uint32_t *)(s4));
/* transpose pm1, p0, p1, p2 */ /* transpose pm1, p0, p1, p2 */
__asm__ __volatile__ ( __asm__ __volatile__(
"precrq.qb.ph %[prim1], %[p2], %[p1] \n\t" "precrq.qb.ph %[prim1], %[p2], %[p1] \n\t"
"precr.qb.ph %[prim2], %[p2], %[p1] \n\t" "precr.qb.ph %[prim2], %[p2], %[p1] \n\t"
"precrq.qb.ph %[prim3], %[p0], %[pm1] \n\t" "precrq.qb.ph %[prim3], %[p0], %[pm1] \n\t"
@ -179,15 +168,13 @@ void vpx_lpf_vertical_4_dspr2(unsigned char *s,
"append %[p1], %[sec3], 16 \n\t" "append %[p1], %[sec3], 16 \n\t"
"append %[pm1], %[sec4], 16 \n\t" "append %[pm1], %[sec4], 16 \n\t"
: [prim1] "=&r" (prim1), [prim2] "=&r" (prim2), : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3),
[prim3] "=&r" (prim3), [prim4] "=&r" (prim4), [prim4] "=&r"(prim4), [p2] "+r"(p2), [p1] "+r"(p1), [p0] "+r"(p0),
[p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0), [pm1] "+r" (pm1), [pm1] "+r"(pm1), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4)
[sec3] "=&r" (sec3), [sec4] "=&r" (sec4) :);
:
);
/* transpose p3, p4, p5, p6 */ /* transpose p3, p4, p5, p6 */
__asm__ __volatile__ ( __asm__ __volatile__(
"precrq.qb.ph %[prim1], %[p6], %[p5] \n\t" "precrq.qb.ph %[prim1], %[p6], %[p5] \n\t"
"precr.qb.ph %[prim2], %[p6], %[p5] \n\t" "precr.qb.ph %[prim2], %[p6], %[p5] \n\t"
"precrq.qb.ph %[prim3], %[p4], %[p3] \n\t" "precrq.qb.ph %[prim3], %[p4], %[p3] \n\t"
@ -203,20 +190,17 @@ void vpx_lpf_vertical_4_dspr2(unsigned char *s,
"append %[p5], %[sec3], 16 \n\t" "append %[p5], %[sec3], 16 \n\t"
"append %[p3], %[sec4], 16 \n\t" "append %[p3], %[sec4], 16 \n\t"
: [prim1] "=&r" (prim1), [prim2] "=&r" (prim2), : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3),
[prim3] "=&r" (prim3), [prim4] "=&r" (prim4), [prim4] "=&r"(prim4), [p6] "+r"(p6), [p5] "+r"(p5), [p4] "+r"(p4),
[p6] "+r" (p6), [p5] "+r" (p5), [p4] "+r" (p4), [p3] "+r" (p3), [p3] "+r"(p3), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4)
[sec3] "=&r" (sec3), [sec4] "=&r" (sec4) :);
:
);
/* if (p1 - p4 == 0) and (p2 - p3 == 0) /* if (p1 - p4 == 0) and (p2 - p3 == 0)
* mask will be zero and filtering is not needed * mask will be zero and filtering is not needed
*/ */
if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) { if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) {
filter_hev_mask_dspr2(limit_vec, flimit_vec, p1, p2, pm1, filter_hev_mask_dspr2(limit_vec, flimit_vec, p1, p2, pm1, p0, p3, p4, p5,
p0, p3, p4, p5, p6, thresh_vec, p6, thresh_vec, &hev, &mask);
&hev, &mask);
/* if mask == 0 do filtering is not needed */ /* if mask == 0 do filtering is not needed */
if (mask) { if (mask) {
@ -227,107 +211,93 @@ void vpx_lpf_vertical_4_dspr2(unsigned char *s,
* don't use transpose on output data * don't use transpose on output data
* because memory isn't aligned * because memory isn't aligned
*/ */
__asm__ __volatile__ ( __asm__ __volatile__(
"sb %[p4], 1(%[s4]) \n\t" "sb %[p4], 1(%[s4]) \n\t"
"sb %[p3], 0(%[s4]) \n\t" "sb %[p3], 0(%[s4]) \n\t"
"sb %[p2], -1(%[s4]) \n\t" "sb %[p2], -1(%[s4]) \n\t"
"sb %[p1], -2(%[s4]) \n\t" "sb %[p1], -2(%[s4]) \n\t"
: :
: [p4] "r" (p4), [p3] "r" (p3), [p2] "r" (p2), [p1] "r" (p1), : [p4] "r"(p4), [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1),
[s4] "r" (s4) [s4] "r"(s4));
);
__asm__ __volatile__ ( __asm__ __volatile__(
"srl %[p4], %[p4], 8 \n\t" "srl %[p4], %[p4], 8 \n\t"
"srl %[p3], %[p3], 8 \n\t" "srl %[p3], %[p3], 8 \n\t"
"srl %[p2], %[p2], 8 \n\t" "srl %[p2], %[p2], 8 \n\t"
"srl %[p1], %[p1], 8 \n\t" "srl %[p1], %[p1], 8 \n\t"
: [p4] "+r" (p4), [p3] "+r" (p3), [p2] "+r" (p2), [p1] "+r" (p1) : [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), [p1] "+r"(p1)
: :);
);
__asm__ __volatile__ ( __asm__ __volatile__(
"sb %[p4], 1(%[s3]) \n\t" "sb %[p4], 1(%[s3]) \n\t"
"sb %[p3], 0(%[s3]) \n\t" "sb %[p3], 0(%[s3]) \n\t"
"sb %[p2], -1(%[s3]) \n\t" "sb %[p2], -1(%[s3]) \n\t"
"sb %[p1], -2(%[s3]) \n\t" "sb %[p1], -2(%[s3]) \n\t"
: [p1] "+r" (p1) : [p1] "+r"(p1)
: [p4] "r" (p4), [p3] "r" (p3), [p2] "r" (p2), [s3] "r" (s3) : [p4] "r"(p4), [p3] "r"(p3), [p2] "r"(p2), [s3] "r"(s3));
);
__asm__ __volatile__ ( __asm__ __volatile__(
"srl %[p4], %[p4], 8 \n\t" "srl %[p4], %[p4], 8 \n\t"
"srl %[p3], %[p3], 8 \n\t" "srl %[p3], %[p3], 8 \n\t"
"srl %[p2], %[p2], 8 \n\t" "srl %[p2], %[p2], 8 \n\t"
"srl %[p1], %[p1], 8 \n\t" "srl %[p1], %[p1], 8 \n\t"
: [p4] "+r" (p4), [p3] "+r" (p3), [p2] "+r" (p2), [p1] "+r" (p1) : [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), [p1] "+r"(p1)
: :);
);
__asm__ __volatile__ ( __asm__ __volatile__(
"sb %[p4], 1(%[s2]) \n\t" "sb %[p4], 1(%[s2]) \n\t"
"sb %[p3], 0(%[s2]) \n\t" "sb %[p3], 0(%[s2]) \n\t"
"sb %[p2], -1(%[s2]) \n\t" "sb %[p2], -1(%[s2]) \n\t"
"sb %[p1], -2(%[s2]) \n\t" "sb %[p1], -2(%[s2]) \n\t"
: :
: [p4] "r" (p4), [p3] "r" (p3), [p2] "r" (p2), [p1] "r" (p1), : [p4] "r"(p4), [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1),
[s2] "r" (s2) [s2] "r"(s2));
);
__asm__ __volatile__ ( __asm__ __volatile__(
"srl %[p4], %[p4], 8 \n\t" "srl %[p4], %[p4], 8 \n\t"
"srl %[p3], %[p3], 8 \n\t" "srl %[p3], %[p3], 8 \n\t"
"srl %[p2], %[p2], 8 \n\t" "srl %[p2], %[p2], 8 \n\t"
"srl %[p1], %[p1], 8 \n\t" "srl %[p1], %[p1], 8 \n\t"
: [p4] "+r" (p4), [p3] "+r" (p3), [p2] "+r" (p2), [p1] "+r" (p1) : [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), [p1] "+r"(p1)
: :);
);
__asm__ __volatile__ ( __asm__ __volatile__(
"sb %[p4], 1(%[s1]) \n\t" "sb %[p4], 1(%[s1]) \n\t"
"sb %[p3], 0(%[s1]) \n\t" "sb %[p3], 0(%[s1]) \n\t"
"sb %[p2], -1(%[s1]) \n\t" "sb %[p2], -1(%[s1]) \n\t"
"sb %[p1], -2(%[s1]) \n\t" "sb %[p1], -2(%[s1]) \n\t"
: :
: [p4] "r" (p4), [p3] "r" (p3), [p2] "r" (p2), [p1] "r" (p1), : [p4] "r"(p4), [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1),
[s1] "r" (s1) [s1] "r"(s1));
);
} }
} }
} }
} }
void vpx_lpf_horizontal_4_dual_dspr2(uint8_t *s, int p /* pitch */, void vpx_lpf_horizontal_4_dual_dspr2(
const uint8_t *blimit0, uint8_t *s, int p /* pitch */, const uint8_t *blimit0,
const uint8_t *limit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1,
const uint8_t *thresh0, const uint8_t *limit1, const uint8_t *thresh1) {
const uint8_t *blimit1,
const uint8_t *limit1,
const uint8_t *thresh1) {
vpx_lpf_horizontal_4_dspr2(s, p, blimit0, limit0, thresh0); vpx_lpf_horizontal_4_dspr2(s, p, blimit0, limit0, thresh0);
vpx_lpf_horizontal_4_dspr2(s + 8, p, blimit1, limit1, thresh1); vpx_lpf_horizontal_4_dspr2(s + 8, p, blimit1, limit1, thresh1);
} }
void vpx_lpf_horizontal_8_dual_dspr2(uint8_t *s, int p /* pitch */, void vpx_lpf_horizontal_8_dual_dspr2(
const uint8_t *blimit0, uint8_t *s, int p /* pitch */, const uint8_t *blimit0,
const uint8_t *limit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1,
const uint8_t *thresh0, const uint8_t *limit1, const uint8_t *thresh1) {
const uint8_t *blimit1,
const uint8_t *limit1,
const uint8_t *thresh1) {
vpx_lpf_horizontal_8_dspr2(s, p, blimit0, limit0, thresh0); vpx_lpf_horizontal_8_dspr2(s, p, blimit0, limit0, thresh0);
vpx_lpf_horizontal_8_dspr2(s + 8, p, blimit1, limit1, thresh1); vpx_lpf_horizontal_8_dspr2(s + 8, p, blimit1, limit1, thresh1);
} }
void vpx_lpf_vertical_4_dual_dspr2(uint8_t *s, int p, void vpx_lpf_vertical_4_dual_dspr2(uint8_t *s, int p, const uint8_t *blimit0,
const uint8_t *blimit0,
const uint8_t *limit0, const uint8_t *limit0,
const uint8_t *thresh0, const uint8_t *thresh0,
const uint8_t *blimit1, const uint8_t *blimit1,
@ -337,8 +307,7 @@ void vpx_lpf_vertical_4_dual_dspr2(uint8_t *s, int p,
vpx_lpf_vertical_4_dspr2(s + 8 * p, p, blimit1, limit1, thresh1); vpx_lpf_vertical_4_dspr2(s + 8 * p, p, blimit1, limit1, thresh1);
} }
void vpx_lpf_vertical_8_dual_dspr2(uint8_t *s, int p, void vpx_lpf_vertical_8_dual_dspr2(uint8_t *s, int p, const uint8_t *blimit0,
const uint8_t *blimit0,
const uint8_t *limit0, const uint8_t *limit0,
const uint8_t *thresh0, const uint8_t *thresh0,
const uint8_t *blimit1, const uint8_t *blimit1,
@ -348,8 +317,7 @@ void vpx_lpf_vertical_8_dual_dspr2(uint8_t *s, int p,
vpx_lpf_vertical_8_dspr2(s + 8 * p, p, blimit1, limit1, thresh1); vpx_lpf_vertical_8_dspr2(s + 8 * p, p, blimit1, limit1, thresh1);
} }
void vpx_lpf_vertical_16_dual_dspr2(uint8_t *s, int p, void vpx_lpf_vertical_16_dual_dspr2(uint8_t *s, int p, const uint8_t *blimit,
const uint8_t *blimit,
const uint8_t *limit, const uint8_t *limit,
const uint8_t *thresh) { const uint8_t *thresh) {
vpx_lpf_vertical_16_dspr2(s, p, blimit, limit, thresh); vpx_lpf_vertical_16_dspr2(s, p, blimit, limit, thresh);

View File

@ -24,9 +24,8 @@ extern "C" {
#if HAVE_DSPR2 #if HAVE_DSPR2
/* inputs & outputs are quad-byte vectors */ /* inputs & outputs are quad-byte vectors */
static INLINE void filter_dspr2(uint32_t mask, uint32_t hev, static INLINE void filter_dspr2(uint32_t mask, uint32_t hev, uint32_t *ps1,
uint32_t *ps1, uint32_t *ps0, uint32_t *ps0, uint32_t *qs0, uint32_t *qs1) {
uint32_t *qs0, uint32_t *qs1) {
int32_t vpx_filter_l, vpx_filter_r; int32_t vpx_filter_l, vpx_filter_r;
int32_t Filter1_l, Filter1_r, Filter2_l, Filter2_r; int32_t Filter1_l, Filter1_r, Filter2_l, Filter2_r;
int32_t subr_r, subr_l; int32_t subr_r, subr_l;
@ -72,7 +71,7 @@ static INLINE void filter_dspr2(uint32_t mask, uint32_t hev,
hev_r = hev << 8; hev_r = hev << 8;
hev_r = hev_r & HWM; hev_r = hev_r & HWM;
__asm__ __volatile__ ( __asm__ __volatile__(
/* vpx_filter = vp8_signed_char_clamp(ps1 - qs1); */ /* vpx_filter = vp8_signed_char_clamp(ps1 - qs1); */
"subq_s.ph %[vpx_filter_l], %[vps1_l], %[vqs1_l] \n\t" "subq_s.ph %[vpx_filter_l], %[vps1_l], %[vqs1_l] \n\t"
"subq_s.ph %[vpx_filter_r], %[vps1_r], %[vqs1_r] \n\t" "subq_s.ph %[vpx_filter_r], %[vps1_r], %[vqs1_r] \n\t"
@ -99,20 +98,17 @@ static INLINE void filter_dspr2(uint32_t mask, uint32_t hev,
"and %[vpx_filter_l], %[vpx_filter_l], %[mask_l] \n\t" "and %[vpx_filter_l], %[vpx_filter_l], %[mask_l] \n\t"
"and %[vpx_filter_r], %[vpx_filter_r], %[mask_r] \n\t" "and %[vpx_filter_r], %[vpx_filter_r], %[mask_r] \n\t"
: [vpx_filter_l] "=&r" (vpx_filter_l), : [vpx_filter_l] "=&r"(vpx_filter_l), [vpx_filter_r] "=&r"(vpx_filter_r),
[vpx_filter_r] "=&r" (vpx_filter_r), [subr_l] "=&r"(subr_l), [subr_r] "=&r"(subr_r),
[subr_l] "=&r" (subr_l), [subr_r] "=&r" (subr_r), [invhev_l] "=&r"(invhev_l), [invhev_r] "=&r"(invhev_r)
[invhev_l] "=&r" (invhev_l), [invhev_r] "=&r" (invhev_r) : [vps0_l] "r"(vps0_l), [vps0_r] "r"(vps0_r), [vps1_l] "r"(vps1_l),
: [vps0_l] "r" (vps0_l), [vps0_r] "r" (vps0_r), [vps1_l] "r" (vps1_l), [vps1_r] "r"(vps1_r), [vqs0_l] "r"(vqs0_l), [vqs0_r] "r"(vqs0_r),
[vps1_r] "r" (vps1_r), [vqs0_l] "r" (vqs0_l), [vqs0_r] "r" (vqs0_r), [vqs1_l] "r"(vqs1_l), [vqs1_r] "r"(vqs1_r), [mask_l] "r"(mask_l),
[vqs1_l] "r" (vqs1_l), [vqs1_r] "r" (vqs1_r), [mask_r] "r"(mask_r), [hev_l] "r"(hev_l), [hev_r] "r"(hev_r),
[mask_l] "r" (mask_l), [mask_r] "r" (mask_r), [HWM] "r"(HWM));
[hev_l] "r" (hev_l), [hev_r] "r" (hev_r),
[HWM] "r" (HWM)
);
/* save bottom 3 bits so that we round one side +4 and the other +3 */ /* save bottom 3 bits so that we round one side +4 and the other +3 */
__asm__ __volatile__ ( __asm__ __volatile__(
/* Filter2 = vp8_signed_char_clamp(vpx_filter + 3) >>= 3; */ /* Filter2 = vp8_signed_char_clamp(vpx_filter + 3) >>= 3; */
"addq_s.ph %[Filter1_l], %[vpx_filter_l], %[t2] \n\t" "addq_s.ph %[Filter1_l], %[vpx_filter_l], %[t2] \n\t"
"addq_s.ph %[Filter1_r], %[vpx_filter_r], %[t2] \n\t" "addq_s.ph %[Filter1_r], %[vpx_filter_r], %[t2] \n\t"
@ -137,15 +133,14 @@ static INLINE void filter_dspr2(uint32_t mask, uint32_t hev,
"subq_s.ph %[vqs0_l], %[vqs0_l], %[Filter1_l] \n\t" "subq_s.ph %[vqs0_l], %[vqs0_l], %[Filter1_l] \n\t"
"subq_s.ph %[vqs0_r], %[vqs0_r], %[Filter1_r] \n\t" "subq_s.ph %[vqs0_r], %[vqs0_r], %[Filter1_r] \n\t"
: [Filter1_l] "=&r" (Filter1_l), [Filter1_r] "=&r" (Filter1_r), : [Filter1_l] "=&r"(Filter1_l), [Filter1_r] "=&r"(Filter1_r),
[Filter2_l] "=&r" (Filter2_l), [Filter2_r] "=&r" (Filter2_r), [Filter2_l] "=&r"(Filter2_l), [Filter2_r] "=&r"(Filter2_r),
[vps0_l] "+r" (vps0_l), [vps0_r] "+r" (vps0_r), [vps0_l] "+r"(vps0_l), [vps0_r] "+r"(vps0_r), [vqs0_l] "+r"(vqs0_l),
[vqs0_l] "+r" (vqs0_l), [vqs0_r] "+r" (vqs0_r) [vqs0_r] "+r"(vqs0_r)
: [t1] "r" (t1), [t2] "r" (t2), [HWM] "r" (HWM), : [t1] "r"(t1), [t2] "r"(t2), [HWM] "r"(HWM),
[vpx_filter_l] "r" (vpx_filter_l), [vpx_filter_r] "r" (vpx_filter_r) [vpx_filter_l] "r"(vpx_filter_l), [vpx_filter_r] "r"(vpx_filter_r));
);
__asm__ __volatile__ ( __asm__ __volatile__(
/* (vpx_filter += 1) >>= 1 */ /* (vpx_filter += 1) >>= 1 */
"addqh.ph %[Filter1_l], %[Filter1_l], %[t3] \n\t" "addqh.ph %[Filter1_l], %[Filter1_l], %[t3] \n\t"
"addqh.ph %[Filter1_r], %[Filter1_r], %[t3] \n\t" "addqh.ph %[Filter1_r], %[Filter1_r], %[t3] \n\t"
@ -162,11 +157,10 @@ static INLINE void filter_dspr2(uint32_t mask, uint32_t hev,
"subq_s.ph %[vqs1_l], %[vqs1_l], %[Filter1_l] \n\t" "subq_s.ph %[vqs1_l], %[vqs1_l], %[Filter1_l] \n\t"
"subq_s.ph %[vqs1_r], %[vqs1_r], %[Filter1_r] \n\t" "subq_s.ph %[vqs1_r], %[vqs1_r], %[Filter1_r] \n\t"
: [Filter1_l] "+r" (Filter1_l), [Filter1_r] "+r" (Filter1_r), : [Filter1_l] "+r"(Filter1_l), [Filter1_r] "+r"(Filter1_r),
[vps1_l] "+r" (vps1_l), [vps1_r] "+r" (vps1_r), [vps1_l] "+r"(vps1_l), [vps1_r] "+r"(vps1_r), [vqs1_l] "+r"(vqs1_l),
[vqs1_l] "+r" (vqs1_l), [vqs1_r] "+r" (vqs1_r) [vqs1_r] "+r"(vqs1_r)
: [t3] "r" (t3), [invhev_l] "r" (invhev_l), [invhev_r] "r" (invhev_r) : [t3] "r"(t3), [invhev_l] "r"(invhev_l), [invhev_r] "r"(invhev_r));
);
/* Create quad-bytes from halfword pairs */ /* Create quad-bytes from halfword pairs */
vqs0_l = vqs0_l & HWM; vqs0_l = vqs0_l & HWM;
@ -174,16 +168,15 @@ static INLINE void filter_dspr2(uint32_t mask, uint32_t hev,
vps0_l = vps0_l & HWM; vps0_l = vps0_l & HWM;
vps1_l = vps1_l & HWM; vps1_l = vps1_l & HWM;
__asm__ __volatile__ ( __asm__ __volatile__(
"shrl.ph %[vqs0_r], %[vqs0_r], 8 \n\t" "shrl.ph %[vqs0_r], %[vqs0_r], 8 \n\t"
"shrl.ph %[vps0_r], %[vps0_r], 8 \n\t" "shrl.ph %[vps0_r], %[vps0_r], 8 \n\t"
"shrl.ph %[vqs1_r], %[vqs1_r], 8 \n\t" "shrl.ph %[vqs1_r], %[vqs1_r], 8 \n\t"
"shrl.ph %[vps1_r], %[vps1_r], 8 \n\t" "shrl.ph %[vps1_r], %[vps1_r], 8 \n\t"
: [vps1_r] "+r" (vps1_r), [vqs1_r] "+r" (vqs1_r), : [vps1_r] "+r"(vps1_r), [vqs1_r] "+r"(vqs1_r), [vps0_r] "+r"(vps0_r),
[vps0_r] "+r" (vps0_r), [vqs0_r] "+r" (vqs0_r) [vqs0_r] "+r"(vqs0_r)
: :);
);
vqs0 = vqs0_l | vqs0_r; vqs0 = vqs0_l | vqs0_r;
vqs1 = vqs1_l | vqs1_r; vqs1 = vqs1_l | vqs1_r;
@ -196,9 +189,8 @@ static INLINE void filter_dspr2(uint32_t mask, uint32_t hev,
*qs1 = vqs1 ^ N128; *qs1 = vqs1 ^ N128;
} }
static INLINE void filter1_dspr2(uint32_t mask, uint32_t hev, static INLINE void filter1_dspr2(uint32_t mask, uint32_t hev, uint32_t ps1,
uint32_t ps1, uint32_t ps0, uint32_t ps0, uint32_t qs0, uint32_t qs1,
uint32_t qs0, uint32_t qs1,
uint32_t *p1_f0, uint32_t *p0_f0, uint32_t *p1_f0, uint32_t *p0_f0,
uint32_t *q0_f0, uint32_t *q1_f0) { uint32_t *q0_f0, uint32_t *q1_f0) {
int32_t vpx_filter_l, vpx_filter_r; int32_t vpx_filter_l, vpx_filter_r;
@ -246,7 +238,7 @@ static INLINE void filter1_dspr2(uint32_t mask, uint32_t hev,
hev_r = hev << 8; hev_r = hev << 8;
hev_r = hev_r & HWM; hev_r = hev_r & HWM;
__asm__ __volatile__ ( __asm__ __volatile__(
/* vpx_filter = vp8_signed_char_clamp(ps1 - qs1); */ /* vpx_filter = vp8_signed_char_clamp(ps1 - qs1); */
"subq_s.ph %[vpx_filter_l], %[vps1_l], %[vqs1_l] \n\t" "subq_s.ph %[vpx_filter_l], %[vps1_l], %[vqs1_l] \n\t"
"subq_s.ph %[vpx_filter_r], %[vps1_r], %[vqs1_r] \n\t" "subq_s.ph %[vpx_filter_r], %[vps1_r], %[vqs1_r] \n\t"
@ -273,19 +265,17 @@ static INLINE void filter1_dspr2(uint32_t mask, uint32_t hev,
"and %[vpx_filter_l], %[vpx_filter_l], %[mask_l] \n\t" "and %[vpx_filter_l], %[vpx_filter_l], %[mask_l] \n\t"
"and %[vpx_filter_r], %[vpx_filter_r], %[mask_r] \n\t" "and %[vpx_filter_r], %[vpx_filter_r], %[mask_r] \n\t"
: [vpx_filter_l] "=&r" (vpx_filter_l), : [vpx_filter_l] "=&r"(vpx_filter_l), [vpx_filter_r] "=&r"(vpx_filter_r),
[vpx_filter_r] "=&r" (vpx_filter_r), [subr_l] "=&r"(subr_l), [subr_r] "=&r"(subr_r),
[subr_l] "=&r" (subr_l), [subr_r] "=&r" (subr_r), [invhev_l] "=&r"(invhev_l), [invhev_r] "=&r"(invhev_r)
[invhev_l] "=&r" (invhev_l), [invhev_r] "=&r" (invhev_r) : [vps0_l] "r"(vps0_l), [vps0_r] "r"(vps0_r), [vps1_l] "r"(vps1_l),
: [vps0_l] "r" (vps0_l), [vps0_r] "r" (vps0_r), [vps1_l] "r" (vps1_l), [vps1_r] "r"(vps1_r), [vqs0_l] "r"(vqs0_l), [vqs0_r] "r"(vqs0_r),
[vps1_r] "r" (vps1_r), [vqs0_l] "r" (vqs0_l), [vqs0_r] "r" (vqs0_r), [vqs1_l] "r"(vqs1_l), [vqs1_r] "r"(vqs1_r), [mask_l] "r"(mask_l),
[vqs1_l] "r" (vqs1_l), [vqs1_r] "r" (vqs1_r), [mask_r] "r"(mask_r), [hev_l] "r"(hev_l), [hev_r] "r"(hev_r),
[mask_l] "r" (mask_l), [mask_r] "r" (mask_r), [HWM] "r"(HWM));
[hev_l] "r" (hev_l), [hev_r] "r" (hev_r), [HWM] "r" (HWM)
);
/* save bottom 3 bits so that we round one side +4 and the other +3 */ /* save bottom 3 bits so that we round one side +4 and the other +3 */
__asm__ __volatile__ ( __asm__ __volatile__(
/* Filter2 = vp8_signed_char_clamp(vpx_filter + 3) >>= 3; */ /* Filter2 = vp8_signed_char_clamp(vpx_filter + 3) >>= 3; */
"addq_s.ph %[Filter1_l], %[vpx_filter_l], %[t2] \n\t" "addq_s.ph %[Filter1_l], %[vpx_filter_l], %[t2] \n\t"
"addq_s.ph %[Filter1_r], %[vpx_filter_r], %[t2] \n\t" "addq_s.ph %[Filter1_r], %[vpx_filter_r], %[t2] \n\t"
@ -310,15 +300,14 @@ static INLINE void filter1_dspr2(uint32_t mask, uint32_t hev,
"subq_s.ph %[vqs0_l], %[vqs0_l], %[Filter1_l] \n\t" "subq_s.ph %[vqs0_l], %[vqs0_l], %[Filter1_l] \n\t"
"subq_s.ph %[vqs0_r], %[vqs0_r], %[Filter1_r] \n\t" "subq_s.ph %[vqs0_r], %[vqs0_r], %[Filter1_r] \n\t"
: [Filter1_l] "=&r" (Filter1_l), [Filter1_r] "=&r" (Filter1_r), : [Filter1_l] "=&r"(Filter1_l), [Filter1_r] "=&r"(Filter1_r),
[Filter2_l] "=&r" (Filter2_l), [Filter2_r] "=&r" (Filter2_r), [Filter2_l] "=&r"(Filter2_l), [Filter2_r] "=&r"(Filter2_r),
[vps0_l] "+r" (vps0_l), [vps0_r] "+r" (vps0_r), [vps0_l] "+r"(vps0_l), [vps0_r] "+r"(vps0_r), [vqs0_l] "+r"(vqs0_l),
[vqs0_l] "+r" (vqs0_l), [vqs0_r] "+r" (vqs0_r) [vqs0_r] "+r"(vqs0_r)
: [t1] "r" (t1), [t2] "r" (t2), [HWM] "r" (HWM), : [t1] "r"(t1), [t2] "r"(t2), [HWM] "r"(HWM),
[vpx_filter_l] "r" (vpx_filter_l), [vpx_filter_r] "r" (vpx_filter_r) [vpx_filter_l] "r"(vpx_filter_l), [vpx_filter_r] "r"(vpx_filter_r));
);
__asm__ __volatile__ ( __asm__ __volatile__(
/* (vpx_filter += 1) >>= 1 */ /* (vpx_filter += 1) >>= 1 */
"addqh.ph %[Filter1_l], %[Filter1_l], %[t3] \n\t" "addqh.ph %[Filter1_l], %[Filter1_l], %[t3] \n\t"
"addqh.ph %[Filter1_r], %[Filter1_r], %[t3] \n\t" "addqh.ph %[Filter1_r], %[Filter1_r], %[t3] \n\t"
@ -335,11 +324,10 @@ static INLINE void filter1_dspr2(uint32_t mask, uint32_t hev,
"subq_s.ph %[vqs1_l], %[vqs1_l], %[Filter1_l] \n\t" "subq_s.ph %[vqs1_l], %[vqs1_l], %[Filter1_l] \n\t"
"subq_s.ph %[vqs1_r], %[vqs1_r], %[Filter1_r] \n\t" "subq_s.ph %[vqs1_r], %[vqs1_r], %[Filter1_r] \n\t"
: [Filter1_l] "+r" (Filter1_l), [Filter1_r] "+r" (Filter1_r), : [Filter1_l] "+r"(Filter1_l), [Filter1_r] "+r"(Filter1_r),
[vps1_l] "+r" (vps1_l), [vps1_r] "+r" (vps1_r), [vps1_l] "+r"(vps1_l), [vps1_r] "+r"(vps1_r), [vqs1_l] "+r"(vqs1_l),
[vqs1_l] "+r" (vqs1_l), [vqs1_r] "+r" (vqs1_r) [vqs1_r] "+r"(vqs1_r)
: [t3] "r" (t3), [invhev_l] "r" (invhev_l), [invhev_r] "r" (invhev_r) : [t3] "r"(t3), [invhev_l] "r"(invhev_l), [invhev_r] "r"(invhev_r));
);
/* Create quad-bytes from halfword pairs */ /* Create quad-bytes from halfword pairs */
vqs0_l = vqs0_l & HWM; vqs0_l = vqs0_l & HWM;
@ -347,16 +335,15 @@ static INLINE void filter1_dspr2(uint32_t mask, uint32_t hev,
vps0_l = vps0_l & HWM; vps0_l = vps0_l & HWM;
vps1_l = vps1_l & HWM; vps1_l = vps1_l & HWM;
__asm__ __volatile__ ( __asm__ __volatile__(
"shrl.ph %[vqs0_r], %[vqs0_r], 8 \n\t" "shrl.ph %[vqs0_r], %[vqs0_r], 8 \n\t"
"shrl.ph %[vps0_r], %[vps0_r], 8 \n\t" "shrl.ph %[vps0_r], %[vps0_r], 8 \n\t"
"shrl.ph %[vqs1_r], %[vqs1_r], 8 \n\t" "shrl.ph %[vqs1_r], %[vqs1_r], 8 \n\t"
"shrl.ph %[vps1_r], %[vps1_r], 8 \n\t" "shrl.ph %[vps1_r], %[vps1_r], 8 \n\t"
: [vps1_r] "+r" (vps1_r), [vqs1_r] "+r" (vqs1_r), : [vps1_r] "+r"(vps1_r), [vqs1_r] "+r"(vqs1_r), [vps0_r] "+r"(vps0_r),
[vps0_r] "+r" (vps0_r), [vqs0_r] "+r" (vqs0_r) [vqs0_r] "+r"(vqs0_r)
: :);
);
vqs0 = vqs0_l | vqs0_r; vqs0 = vqs0_l | vqs0_r;
vqs1 = vqs1_l | vqs1_r; vqs1 = vqs1_l | vqs1_r;
@ -369,9 +356,8 @@ static INLINE void filter1_dspr2(uint32_t mask, uint32_t hev,
*q1_f0 = vqs1 ^ N128; *q1_f0 = vqs1 ^ N128;
} }
static INLINE void mbfilter_dspr2(uint32_t *op3, uint32_t *op2, static INLINE void mbfilter_dspr2(uint32_t *op3, uint32_t *op2, uint32_t *op1,
uint32_t *op1, uint32_t *op0, uint32_t *op0, uint32_t *oq0, uint32_t *oq1,
uint32_t *oq0, uint32_t *oq1,
uint32_t *oq2, uint32_t *oq3) { uint32_t *oq2, uint32_t *oq3) {
/* use a 7 tap filter [1, 1, 1, 2, 1, 1, 1] for flat line */ /* use a 7 tap filter [1, 1, 1, 2, 1, 1, 1] for flat line */
const uint32_t p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0; const uint32_t p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0;
@ -389,7 +375,7 @@ static INLINE void mbfilter_dspr2(uint32_t *op3, uint32_t *op2,
/* *oq1 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + q1 + q1 + q2 + q3 + q3, 3) 5 */ /* *oq1 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + q1 + q1 + q2 + q3 + q3, 3) 5 */
/* *oq2 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + q2 + q2 + q3 + q3 + q3, 3) 6 */ /* *oq2 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + q2 + q2 + q3 + q3 + q3, 3) 6 */
__asm__ __volatile__ ( __asm__ __volatile__(
"addu.ph %[add_p210_q012], %[p2], %[p1] \n\t" "addu.ph %[add_p210_q012], %[p2], %[p1] \n\t"
"addu.ph %[add_p210_q012], %[add_p210_q012], %[p0] \n\t" "addu.ph %[add_p210_q012], %[add_p210_q012], %[p0] \n\t"
"addu.ph %[add_p210_q012], %[add_p210_q012], %[q0] \n\t" "addu.ph %[add_p210_q012], %[add_p210_q012], %[q0] \n\t"
@ -428,15 +414,12 @@ static INLINE void mbfilter_dspr2(uint32_t *op3, uint32_t *op2,
"shrl.ph %[res_op0], %[res_op0], 3 \n\t" "shrl.ph %[res_op0], %[res_op0], 3 \n\t"
"shrl.ph %[res_oq2], %[res_oq2], 3 \n\t" "shrl.ph %[res_oq2], %[res_oq2], 3 \n\t"
: [add_p210_q012] "=&r" (add_p210_q012), : [add_p210_q012] "=&r"(add_p210_q012), [tmp] "=&r"(tmp),
[tmp] "=&r" (tmp), [res_op2] "=&r" (res_op2), [res_op2] "=&r"(res_op2), [res_op1] "=&r"(res_op1),
[res_op1] "=&r" (res_op1), [res_op0] "=&r" (res_op0), [res_op0] "=&r"(res_op0), [res_oq0] "=&r"(res_oq0),
[res_oq0] "=&r" (res_oq0), [res_oq1] "=&r" (res_oq1), [res_oq1] "=&r"(res_oq1), [res_oq2] "=&r"(res_oq2)
[res_oq2] "=&r" (res_oq2) : [p0] "r"(p0), [q0] "r"(q0), [p1] "r"(p1), [q1] "r"(q1), [p2] "r"(p2),
: [p0] "r" (p0), [q0] "r" (q0), [p1] "r" (p1), [q1] "r" (q1), [q2] "r"(q2), [p3] "r"(p3), [q3] "r"(q3), [u32Four] "r"(u32Four));
[p2] "r" (p2), [q2] "r" (q2), [p3] "r" (p3), [q3] "r" (q3),
[u32Four] "r" (u32Four)
);
*op2 = res_op2; *op2 = res_op2;
*op1 = res_op1; *op1 = res_op1;
@ -446,11 +429,9 @@ static INLINE void mbfilter_dspr2(uint32_t *op3, uint32_t *op2,
*oq2 = res_oq2; *oq2 = res_oq2;
} }
static INLINE void mbfilter1_dspr2(uint32_t p3, uint32_t p2, static INLINE void mbfilter1_dspr2(uint32_t p3, uint32_t p2, uint32_t p1,
uint32_t p1, uint32_t p0, uint32_t p0, uint32_t q0, uint32_t q1,
uint32_t q0, uint32_t q1, uint32_t q2, uint32_t q3, uint32_t *op2_f1,
uint32_t q2, uint32_t q3,
uint32_t *op2_f1,
uint32_t *op1_f1, uint32_t *op0_f1, uint32_t *op1_f1, uint32_t *op0_f1,
uint32_t *oq0_f1, uint32_t *oq1_f1, uint32_t *oq0_f1, uint32_t *oq1_f1,
uint32_t *oq2_f1) { uint32_t *oq2_f1) {
@ -468,7 +449,7 @@ static INLINE void mbfilter1_dspr2(uint32_t p3, uint32_t p2,
/* *oq1 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + q1 + q1 + q2 + q3 + q3, 3) 5 */ /* *oq1 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + q1 + q1 + q2 + q3 + q3, 3) 5 */
/* *oq2 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + q2 + q2 + q3 + q3 + q3, 3) 6 */ /* *oq2 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + q2 + q2 + q3 + q3 + q3, 3) 6 */
__asm__ __volatile__ ( __asm__ __volatile__(
"addu.ph %[add_p210_q012], %[p2], %[p1] \n\t" "addu.ph %[add_p210_q012], %[p2], %[p1] \n\t"
"addu.ph %[add_p210_q012], %[add_p210_q012], %[p0] \n\t" "addu.ph %[add_p210_q012], %[add_p210_q012], %[p0] \n\t"
"addu.ph %[add_p210_q012], %[add_p210_q012], %[q0] \n\t" "addu.ph %[add_p210_q012], %[add_p210_q012], %[q0] \n\t"
@ -507,14 +488,12 @@ static INLINE void mbfilter1_dspr2(uint32_t p3, uint32_t p2,
"shrl.ph %[res_op0], %[res_op0], 3 \n\t" "shrl.ph %[res_op0], %[res_op0], 3 \n\t"
"shrl.ph %[res_oq2], %[res_oq2], 3 \n\t" "shrl.ph %[res_oq2], %[res_oq2], 3 \n\t"
: [add_p210_q012] "=&r" (add_p210_q012), [tmp] "=&r" (tmp), : [add_p210_q012] "=&r"(add_p210_q012), [tmp] "=&r"(tmp),
[res_op2] "=&r" (res_op2), [res_op1] "=&r" (res_op1), [res_op2] "=&r"(res_op2), [res_op1] "=&r"(res_op1),
[res_op0] "=&r" (res_op0), [res_oq0] "=&r" (res_oq0), [res_op0] "=&r"(res_op0), [res_oq0] "=&r"(res_oq0),
[res_oq1] "=&r" (res_oq1), [res_oq2] "=&r" (res_oq2) [res_oq1] "=&r"(res_oq1), [res_oq2] "=&r"(res_oq2)
: [p0] "r" (p0), [q0] "r" (q0), [p1] "r" (p1), [q1] "r" (q1), : [p0] "r"(p0), [q0] "r"(q0), [p1] "r"(p1), [q1] "r"(q1), [p2] "r"(p2),
[p2] "r" (p2), [q2] "r" (q2), [p3] "r" (p3), [q3] "r" (q3), [q2] "r"(q2), [p3] "r"(p3), [q3] "r"(q3), [u32Four] "r"(u32Four));
[u32Four] "r" (u32Four)
);
*op2_f1 = res_op2; *op2_f1 = res_op2;
*op1_f1 = res_op1; *op1_f1 = res_op1;
@ -524,14 +503,11 @@ static INLINE void mbfilter1_dspr2(uint32_t p3, uint32_t p2,
*oq2_f1 = res_oq2; *oq2_f1 = res_oq2;
} }
static INLINE void wide_mbfilter_dspr2(uint32_t *op7, uint32_t *op6, static INLINE void wide_mbfilter_dspr2(
uint32_t *op5, uint32_t *op4, uint32_t *op7, uint32_t *op6, uint32_t *op5, uint32_t *op4, uint32_t *op3,
uint32_t *op3, uint32_t *op2, uint32_t *op2, uint32_t *op1, uint32_t *op0, uint32_t *oq0, uint32_t *oq1,
uint32_t *op1, uint32_t *op0, uint32_t *oq2, uint32_t *oq3, uint32_t *oq4, uint32_t *oq5, uint32_t *oq6,
uint32_t *oq0, uint32_t *oq1, uint32_t *oq7) {
uint32_t *oq2, uint32_t *oq3,
uint32_t *oq4, uint32_t *oq5,
uint32_t *oq6, uint32_t *oq7) {
const uint32_t p7 = *op7, p6 = *op6, p5 = *op5, p4 = *op4; const uint32_t p7 = *op7, p6 = *op6, p5 = *op5, p4 = *op4;
const uint32_t p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0; const uint32_t p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0;
const uint32_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3; const uint32_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3;
@ -542,7 +518,7 @@ static INLINE void wide_mbfilter_dspr2(uint32_t *op7, uint32_t *op6,
uint32_t add_p6toq6; uint32_t add_p6toq6;
uint32_t u32Eight = 0x00080008; uint32_t u32Eight = 0x00080008;
__asm__ __volatile__ ( __asm__ __volatile__(
/* addition of p6,p5,p4,p3,p2,p1,p0,q0,q1,q2,q3,q4,q5,q6 /* addition of p6,p5,p4,p3,p2,p1,p0,q0,q1,q2,q3,q4,q5,q6
which is used most of the time */ which is used most of the time */
"addu.ph %[add_p6toq6], %[p6], %[p5] \n\t" "addu.ph %[add_p6toq6], %[p6], %[p5] \n\t"
@ -560,15 +536,13 @@ static INLINE void wide_mbfilter_dspr2(uint32_t *op7, uint32_t *op6,
"addu.ph %[add_p6toq6], %[add_p6toq6], %[q6] \n\t" "addu.ph %[add_p6toq6], %[add_p6toq6], %[q6] \n\t"
"addu.ph %[add_p6toq6], %[add_p6toq6], %[u32Eight] \n\t" "addu.ph %[add_p6toq6], %[add_p6toq6], %[u32Eight] \n\t"
: [add_p6toq6] "=&r" (add_p6toq6) : [add_p6toq6] "=&r"(add_p6toq6)
: [p6] "r" (p6), [p5] "r" (p5), [p4] "r" (p4), : [p6] "r"(p6), [p5] "r"(p5), [p4] "r"(p4), [p3] "r"(p3), [p2] "r"(p2),
[p3] "r" (p3), [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0), [p1] "r"(p1), [p0] "r"(p0), [q0] "r"(q0), [q1] "r"(q1), [q2] "r"(q2),
[q0] "r" (q0), [q1] "r" (q1), [q2] "r" (q2), [q3] "r" (q3), [q3] "r"(q3), [q4] "r"(q4), [q5] "r"(q5), [q6] "r"(q6),
[q4] "r" (q4), [q5] "r" (q5), [q6] "r" (q6), [u32Eight] "r"(u32Eight));
[u32Eight] "r" (u32Eight)
);
__asm__ __volatile__ ( __asm__ __volatile__(
/* *op6 = ROUND_POWER_OF_TWO(p7 * 7 + p6 * 2 + p5 + p4 + /* *op6 = ROUND_POWER_OF_TWO(p7 * 7 + p6 * 2 + p5 + p4 +
p3 + p2 + p1 + p0 + q0, 4) */ p3 + p2 + p1 + p0 + q0, 4) */
"shll.ph %[tmp], %[p7], 3 \n\t" "shll.ph %[tmp], %[p7], 3 \n\t"
@ -643,16 +617,14 @@ static INLINE void wide_mbfilter_dspr2(uint32_t *op7, uint32_t *op6,
"addu.ph %[res_op0], %[res_op0], %[add_p6toq6] \n\t" "addu.ph %[res_op0], %[res_op0], %[add_p6toq6] \n\t"
"shrl.ph %[res_op0], %[res_op0], 4 \n\t" "shrl.ph %[res_op0], %[res_op0], 4 \n\t"
: [res_op6] "=&r" (res_op6), [res_op5] "=&r" (res_op5), : [res_op6] "=&r"(res_op6), [res_op5] "=&r"(res_op5),
[res_op4] "=&r" (res_op4), [res_op3] "=&r" (res_op3), [res_op4] "=&r"(res_op4), [res_op3] "=&r"(res_op3),
[res_op2] "=&r" (res_op2), [res_op1] "=&r" (res_op1), [res_op2] "=&r"(res_op2), [res_op1] "=&r"(res_op1),
[res_op0] "=&r" (res_op0), [tmp] "=&r" (tmp) [res_op0] "=&r"(res_op0), [tmp] "=&r"(tmp)
: [p7] "r" (p7), [p6] "r" (p6), [p5] "r" (p5), [p4] "r" (p4), : [p7] "r"(p7), [p6] "r"(p6), [p5] "r"(p5), [p4] "r"(p4), [p3] "r"(p3),
[p3] "r" (p3), [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0), [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0), [q2] "r"(q2), [q1] "r"(q1),
[q2] "r" (q2), [q1] "r" (q1), [q3] "r"(q3), [q4] "r"(q4), [q5] "r"(q5), [q6] "r"(q6),
[q3] "r" (q3), [q4] "r" (q4), [q5] "r" (q5), [q6] "r" (q6), [add_p6toq6] "r"(add_p6toq6));
[add_p6toq6] "r" (add_p6toq6)
);
*op6 = res_op6; *op6 = res_op6;
*op5 = res_op5; *op5 = res_op5;
@ -662,7 +634,7 @@ static INLINE void wide_mbfilter_dspr2(uint32_t *op7, uint32_t *op6,
*op1 = res_op1; *op1 = res_op1;
*op0 = res_op0; *op0 = res_op0;
__asm__ __volatile__ ( __asm__ __volatile__(
/* *oq0 = ROUND_POWER_OF_TWO(p6 + p5 + p4 + p3 + p2 + p1 + p0 + q0 * 2 + /* *oq0 = ROUND_POWER_OF_TWO(p6 + p5 + p4 + p3 + p2 + p1 + p0 + q0 * 2 +
q1 + q2 + q3 + q4 + q5 + q6 + q7, 4); */ q1 + q2 + q3 + q4 + q5 + q6 + q7, 4); */
"addu.ph %[res_oq0], %[q7], %[q0] \n\t" "addu.ph %[res_oq0], %[q7], %[q0] \n\t"
@ -737,16 +709,14 @@ static INLINE void wide_mbfilter_dspr2(uint32_t *op7, uint32_t *op6,
"subu.ph %[res_oq6], %[res_oq6], %[p6] \n\t" "subu.ph %[res_oq6], %[res_oq6], %[p6] \n\t"
"shrl.ph %[res_oq6], %[res_oq6], 4 \n\t" "shrl.ph %[res_oq6], %[res_oq6], 4 \n\t"
: [res_oq6] "=&r" (res_oq6), [res_oq5] "=&r" (res_oq5), : [res_oq6] "=&r"(res_oq6), [res_oq5] "=&r"(res_oq5),
[res_oq4] "=&r" (res_oq4), [res_oq3] "=&r" (res_oq3), [res_oq4] "=&r"(res_oq4), [res_oq3] "=&r"(res_oq3),
[res_oq2] "=&r" (res_oq2), [res_oq1] "=&r" (res_oq1), [res_oq2] "=&r"(res_oq2), [res_oq1] "=&r"(res_oq1),
[res_oq0] "=&r" (res_oq0), [tmp] "=&r" (tmp) [res_oq0] "=&r"(res_oq0), [tmp] "=&r"(tmp)
: [q7] "r" (q7), [q6] "r" (q6), [q5] "r" (q5), [q4] "r" (q4), : [q7] "r"(q7), [q6] "r"(q6), [q5] "r"(q5), [q4] "r"(q4), [q3] "r"(q3),
[q3] "r" (q3), [q2] "r" (q2), [q1] "r" (q1), [q0] "r" (q0), [q2] "r"(q2), [q1] "r"(q1), [q0] "r"(q0), [p1] "r"(p1), [p2] "r"(p2),
[p1] "r" (p1), [p2] "r" (p2), [p3] "r"(p3), [p4] "r"(p4), [p5] "r"(p5), [p6] "r"(p6),
[p3] "r" (p3), [p4] "r" (p4), [p5] "r" (p5), [p6] "r" (p6), [add_p6toq6] "r"(add_p6toq6));
[add_p6toq6] "r" (add_p6toq6)
);
*oq0 = res_oq0; *oq0 = res_oq0;
*oq1 = res_oq1; *oq1 = res_oq1;

View File

@ -22,90 +22,82 @@ extern "C" {
#endif #endif
#if HAVE_DSPR2 #if HAVE_DSPR2
#define STORE_F0() { \ #define STORE_F0() \
__asm__ __volatile__ ( \ { \
__asm__ __volatile__( \
"sb %[q1_f0], 1(%[s4]) \n\t" \ "sb %[q1_f0], 1(%[s4]) \n\t" \
"sb %[q0_f0], 0(%[s4]) \n\t" \ "sb %[q0_f0], 0(%[s4]) \n\t" \
"sb %[p0_f0], -1(%[s4]) \n\t" \ "sb %[p0_f0], -1(%[s4]) \n\t" \
"sb %[p1_f0], -2(%[s4]) \n\t" \ "sb %[p1_f0], -2(%[s4]) \n\t" \
\ \
: \ : \
: [q1_f0] "r" (q1_f0), [q0_f0] "r" (q0_f0), \ : [q1_f0] "r"(q1_f0), [q0_f0] "r"(q0_f0), [p0_f0] "r"(p0_f0), \
[p0_f0] "r" (p0_f0), [p1_f0] "r" (p1_f0), \ [p1_f0] "r"(p1_f0), [s4] "r"(s4)); \
[s4] "r" (s4) \
); \
\ \
__asm__ __volatile__ ( \ __asm__ __volatile__( \
"srl %[q1_f0], %[q1_f0], 8 \n\t" \ "srl %[q1_f0], %[q1_f0], 8 \n\t" \
"srl %[q0_f0], %[q0_f0], 8 \n\t" \ "srl %[q0_f0], %[q0_f0], 8 \n\t" \
"srl %[p0_f0], %[p0_f0], 8 \n\t" \ "srl %[p0_f0], %[p0_f0], 8 \n\t" \
"srl %[p1_f0], %[p1_f0], 8 \n\t" \ "srl %[p1_f0], %[p1_f0], 8 \n\t" \
\ \
: [q1_f0] "+r" (q1_f0), [q0_f0] "+r" (q0_f0), \ : [q1_f0] "+r"(q1_f0), [q0_f0] "+r"(q0_f0), [p0_f0] "+r"(p0_f0), \
[p0_f0] "+r" (p0_f0), [p1_f0] "+r" (p1_f0) \ [p1_f0] "+r"(p1_f0) \
: \ :); \
); \
\ \
__asm__ __volatile__ ( \ __asm__ __volatile__( \
"sb %[q1_f0], 1(%[s3]) \n\t" \ "sb %[q1_f0], 1(%[s3]) \n\t" \
"sb %[q0_f0], 0(%[s3]) \n\t" \ "sb %[q0_f0], 0(%[s3]) \n\t" \
"sb %[p0_f0], -1(%[s3]) \n\t" \ "sb %[p0_f0], -1(%[s3]) \n\t" \
"sb %[p1_f0], -2(%[s3]) \n\t" \ "sb %[p1_f0], -2(%[s3]) \n\t" \
\ \
: [p1_f0] "+r" (p1_f0) \ : [p1_f0] "+r"(p1_f0) \
: [q1_f0] "r" (q1_f0), [q0_f0] "r" (q0_f0), \ : [q1_f0] "r"(q1_f0), [q0_f0] "r"(q0_f0), [s3] "r"(s3), \
[s3] "r" (s3), [p0_f0] "r" (p0_f0) \ [p0_f0] "r"(p0_f0)); \
); \
\ \
__asm__ __volatile__ ( \ __asm__ __volatile__( \
"srl %[q1_f0], %[q1_f0], 8 \n\t" \ "srl %[q1_f0], %[q1_f0], 8 \n\t" \
"srl %[q0_f0], %[q0_f0], 8 \n\t" \ "srl %[q0_f0], %[q0_f0], 8 \n\t" \
"srl %[p0_f0], %[p0_f0], 8 \n\t" \ "srl %[p0_f0], %[p0_f0], 8 \n\t" \
"srl %[p1_f0], %[p1_f0], 8 \n\t" \ "srl %[p1_f0], %[p1_f0], 8 \n\t" \
\ \
: [q1_f0] "+r" (q1_f0), [q0_f0] "+r" (q0_f0), \ : [q1_f0] "+r"(q1_f0), [q0_f0] "+r"(q0_f0), [p0_f0] "+r"(p0_f0), \
[p0_f0] "+r" (p0_f0), [p1_f0] "+r" (p1_f0) \ [p1_f0] "+r"(p1_f0) \
: \ :); \
); \
\ \
__asm__ __volatile__ ( \ __asm__ __volatile__( \
"sb %[q1_f0], 1(%[s2]) \n\t" \ "sb %[q1_f0], 1(%[s2]) \n\t" \
"sb %[q0_f0], 0(%[s2]) \n\t" \ "sb %[q0_f0], 0(%[s2]) \n\t" \
"sb %[p0_f0], -1(%[s2]) \n\t" \ "sb %[p0_f0], -1(%[s2]) \n\t" \
"sb %[p1_f0], -2(%[s2]) \n\t" \ "sb %[p1_f0], -2(%[s2]) \n\t" \
\ \
: \ : \
: [q1_f0] "r" (q1_f0), [q0_f0] "r" (q0_f0), \ : [q1_f0] "r"(q1_f0), [q0_f0] "r"(q0_f0), [p0_f0] "r"(p0_f0), \
[p0_f0] "r" (p0_f0), [p1_f0] "r" (p1_f0), \ [p1_f0] "r"(p1_f0), [s2] "r"(s2)); \
[s2] "r" (s2) \
); \
\ \
__asm__ __volatile__ ( \ __asm__ __volatile__( \
"srl %[q1_f0], %[q1_f0], 8 \n\t" \ "srl %[q1_f0], %[q1_f0], 8 \n\t" \
"srl %[q0_f0], %[q0_f0], 8 \n\t" \ "srl %[q0_f0], %[q0_f0], 8 \n\t" \
"srl %[p0_f0], %[p0_f0], 8 \n\t" \ "srl %[p0_f0], %[p0_f0], 8 \n\t" \
"srl %[p1_f0], %[p1_f0], 8 \n\t" \ "srl %[p1_f0], %[p1_f0], 8 \n\t" \
\ \
: [q1_f0] "+r" (q1_f0), [q0_f0] "+r" (q0_f0), \ : [q1_f0] "+r"(q1_f0), [q0_f0] "+r"(q0_f0), [p0_f0] "+r"(p0_f0), \
[p0_f0] "+r" (p0_f0), [p1_f0] "+r" (p1_f0) \ [p1_f0] "+r"(p1_f0) \
: \ :); \
); \
\ \
__asm__ __volatile__ ( \ __asm__ __volatile__( \
"sb %[q1_f0], 1(%[s1]) \n\t" \ "sb %[q1_f0], 1(%[s1]) \n\t" \
"sb %[q0_f0], 0(%[s1]) \n\t" \ "sb %[q0_f0], 0(%[s1]) \n\t" \
"sb %[p0_f0], -1(%[s1]) \n\t" \ "sb %[p0_f0], -1(%[s1]) \n\t" \
"sb %[p1_f0], -2(%[s1]) \n\t" \ "sb %[p1_f0], -2(%[s1]) \n\t" \
\ \
: \ : \
: [q1_f0] "r" (q1_f0), [q0_f0] "r" (q0_f0), \ : [q1_f0] "r"(q1_f0), [q0_f0] "r"(q0_f0), [p0_f0] "r"(p0_f0), \
[p0_f0] "r" (p0_f0), [p1_f0] "r" (p1_f0), \ [p1_f0] "r"(p1_f0), [s1] "r"(s1)); \
[s1] "r" (s1) \ }
); \
}
#define STORE_F1() { \ #define STORE_F1() \
__asm__ __volatile__ ( \ { \
__asm__ __volatile__( \
"sb %[q2_r], 2(%[s4]) \n\t" \ "sb %[q2_r], 2(%[s4]) \n\t" \
"sb %[q1_r], 1(%[s4]) \n\t" \ "sb %[q1_r], 1(%[s4]) \n\t" \
"sb %[q0_r], 0(%[s4]) \n\t" \ "sb %[q0_r], 0(%[s4]) \n\t" \
@ -114,12 +106,10 @@ extern "C" {
"sb %[p2_r], -3(%[s4]) \n\t" \ "sb %[p2_r], -3(%[s4]) \n\t" \
\ \
: \ : \
: [q2_r] "r" (q2_r), [q1_r] "r" (q1_r), [q0_r] "r" (q0_r), \ : [q2_r] "r"(q2_r), [q1_r] "r"(q1_r), [q0_r] "r"(q0_r), \
[p0_r] "r" (p0_r), [p1_r] "r" (p1_r), [p2_r] "r" (p2_r), \ [p0_r] "r"(p0_r), [p1_r] "r"(p1_r), [p2_r] "r"(p2_r), [s4] "r"(s4)); \
[s4] "r" (s4) \
); \
\ \
__asm__ __volatile__ ( \ __asm__ __volatile__( \
"srl %[q2_r], %[q2_r], 16 \n\t" \ "srl %[q2_r], %[q2_r], 16 \n\t" \
"srl %[q1_r], %[q1_r], 16 \n\t" \ "srl %[q1_r], %[q1_r], 16 \n\t" \
"srl %[q0_r], %[q0_r], 16 \n\t" \ "srl %[q0_r], %[q0_r], 16 \n\t" \
@ -127,12 +117,11 @@ extern "C" {
"srl %[p1_r], %[p1_r], 16 \n\t" \ "srl %[p1_r], %[p1_r], 16 \n\t" \
"srl %[p2_r], %[p2_r], 16 \n\t" \ "srl %[p2_r], %[p2_r], 16 \n\t" \
\ \
: [q2_r] "+r" (q2_r), [q1_r] "+r" (q1_r), [q0_r] "+r" (q0_r), \ : [q2_r] "+r"(q2_r), [q1_r] "+r"(q1_r), [q0_r] "+r"(q0_r), \
[p0_r] "+r" (p0_r), [p1_r] "+r" (p1_r), [p2_r] "+r" (p2_r) \ [p0_r] "+r"(p0_r), [p1_r] "+r"(p1_r), [p2_r] "+r"(p2_r) \
: \ :); \
); \
\ \
__asm__ __volatile__ ( \ __asm__ __volatile__( \
"sb %[q2_r], 2(%[s3]) \n\t" \ "sb %[q2_r], 2(%[s3]) \n\t" \
"sb %[q1_r], 1(%[s3]) \n\t" \ "sb %[q1_r], 1(%[s3]) \n\t" \
"sb %[q0_r], 0(%[s3]) \n\t" \ "sb %[q0_r], 0(%[s3]) \n\t" \
@ -141,12 +130,10 @@ extern "C" {
"sb %[p2_r], -3(%[s3]) \n\t" \ "sb %[p2_r], -3(%[s3]) \n\t" \
\ \
: \ : \
: [q2_r] "r" (q2_r), [q1_r] "r" (q1_r), [q0_r] "r" (q0_r), \ : [q2_r] "r"(q2_r), [q1_r] "r"(q1_r), [q0_r] "r"(q0_r), \
[p0_r] "r" (p0_r), [p1_r] "r" (p1_r), [p2_r] "r" (p2_r), \ [p0_r] "r"(p0_r), [p1_r] "r"(p1_r), [p2_r] "r"(p2_r), [s3] "r"(s3)); \
[s3] "r" (s3) \
); \
\ \
__asm__ __volatile__ ( \ __asm__ __volatile__( \
"sb %[q2_l], 2(%[s2]) \n\t" \ "sb %[q2_l], 2(%[s2]) \n\t" \
"sb %[q1_l], 1(%[s2]) \n\t" \ "sb %[q1_l], 1(%[s2]) \n\t" \
"sb %[q0_l], 0(%[s2]) \n\t" \ "sb %[q0_l], 0(%[s2]) \n\t" \
@ -155,12 +142,10 @@ extern "C" {
"sb %[p2_l], -3(%[s2]) \n\t" \ "sb %[p2_l], -3(%[s2]) \n\t" \
\ \
: \ : \
: [q2_l] "r" (q2_l), [q1_l] "r" (q1_l), [q0_l] "r" (q0_l), \ : [q2_l] "r"(q2_l), [q1_l] "r"(q1_l), [q0_l] "r"(q0_l), \
[p0_l] "r" (p0_l), [p1_l] "r" (p1_l), [p2_l] "r" (p2_l), \ [p0_l] "r"(p0_l), [p1_l] "r"(p1_l), [p2_l] "r"(p2_l), [s2] "r"(s2)); \
[s2] "r" (s2) \
); \
\ \
__asm__ __volatile__ ( \ __asm__ __volatile__( \
"srl %[q2_l], %[q2_l], 16 \n\t" \ "srl %[q2_l], %[q2_l], 16 \n\t" \
"srl %[q1_l], %[q1_l], 16 \n\t" \ "srl %[q1_l], %[q1_l], 16 \n\t" \
"srl %[q0_l], %[q0_l], 16 \n\t" \ "srl %[q0_l], %[q0_l], 16 \n\t" \
@ -168,12 +153,11 @@ extern "C" {
"srl %[p1_l], %[p1_l], 16 \n\t" \ "srl %[p1_l], %[p1_l], 16 \n\t" \
"srl %[p2_l], %[p2_l], 16 \n\t" \ "srl %[p2_l], %[p2_l], 16 \n\t" \
\ \
: [q2_l] "+r" (q2_l), [q1_l] "+r" (q1_l), [q0_l] "+r" (q0_l), \ : [q2_l] "+r"(q2_l), [q1_l] "+r"(q1_l), [q0_l] "+r"(q0_l), \
[p0_l] "+r" (p0_l), [p1_l] "+r" (p1_l), [p2_l] "+r" (p2_l) \ [p0_l] "+r"(p0_l), [p1_l] "+r"(p1_l), [p2_l] "+r"(p2_l) \
: \ :); \
); \
\ \
__asm__ __volatile__ ( \ __asm__ __volatile__( \
"sb %[q2_l], 2(%[s1]) \n\t" \ "sb %[q2_l], 2(%[s1]) \n\t" \
"sb %[q1_l], 1(%[s1]) \n\t" \ "sb %[q1_l], 1(%[s1]) \n\t" \
"sb %[q0_l], 0(%[s1]) \n\t" \ "sb %[q0_l], 0(%[s1]) \n\t" \
@ -182,14 +166,13 @@ extern "C" {
"sb %[p2_l], -3(%[s1]) \n\t" \ "sb %[p2_l], -3(%[s1]) \n\t" \
\ \
: \ : \
: [q2_l] "r" (q2_l), [q1_l] "r" (q1_l), [q0_l] "r" (q0_l), \ : [q2_l] "r"(q2_l), [q1_l] "r"(q1_l), [q0_l] "r"(q0_l), \
[p0_l] "r" (p0_l), [p1_l] "r" (p1_l), [p2_l] "r" (p2_l), \ [p0_l] "r"(p0_l), [p1_l] "r"(p1_l), [p2_l] "r"(p2_l), [s1] "r"(s1)); \
[s1] "r" (s1) \ }
); \
}
#define STORE_F2() { \ #define STORE_F2() \
__asm__ __volatile__ ( \ { \
__asm__ __volatile__( \
"sb %[q6_r], 6(%[s4]) \n\t" \ "sb %[q6_r], 6(%[s4]) \n\t" \
"sb %[q5_r], 5(%[s4]) \n\t" \ "sb %[q5_r], 5(%[s4]) \n\t" \
"sb %[q4_r], 4(%[s4]) \n\t" \ "sb %[q4_r], 4(%[s4]) \n\t" \
@ -206,16 +189,13 @@ extern "C" {
"sb %[p6_r], -7(%[s4]) \n\t" \ "sb %[p6_r], -7(%[s4]) \n\t" \
\ \
: \ : \
: [q6_r] "r" (q6_r), [q5_r] "r" (q5_r), [q4_r] "r" (q4_r), \ : [q6_r] "r"(q6_r), [q5_r] "r"(q5_r), [q4_r] "r"(q4_r), \
[q3_r] "r" (q3_r), [q2_r] "r" (q2_r), [q1_r] "r" (q1_r), \ [q3_r] "r"(q3_r), [q2_r] "r"(q2_r), [q1_r] "r"(q1_r), \
[q0_r] "r" (q0_r), \ [q0_r] "r"(q0_r), [p0_r] "r"(p0_r), [p1_r] "r"(p1_r), \
[p0_r] "r" (p0_r), [p1_r] "r" (p1_r), [p2_r] "r" (p2_r), \ [p2_r] "r"(p2_r), [p3_r] "r"(p3_r), [p4_r] "r"(p4_r), \
[p3_r] "r" (p3_r), [p4_r] "r" (p4_r), [p5_r] "r" (p5_r), \ [p5_r] "r"(p5_r), [p6_r] "r"(p6_r), [s4] "r"(s4)); \
[p6_r] "r" (p6_r), \
[s4] "r" (s4) \
); \
\ \
__asm__ __volatile__ ( \ __asm__ __volatile__( \
"srl %[q6_r], %[q6_r], 16 \n\t" \ "srl %[q6_r], %[q6_r], 16 \n\t" \
"srl %[q5_r], %[q5_r], 16 \n\t" \ "srl %[q5_r], %[q5_r], 16 \n\t" \
"srl %[q4_r], %[q4_r], 16 \n\t" \ "srl %[q4_r], %[q4_r], 16 \n\t" \
@ -231,16 +211,14 @@ extern "C" {
"srl %[p5_r], %[p5_r], 16 \n\t" \ "srl %[p5_r], %[p5_r], 16 \n\t" \
"srl %[p6_r], %[p6_r], 16 \n\t" \ "srl %[p6_r], %[p6_r], 16 \n\t" \
\ \
: [q6_r] "+r" (q6_r), [q5_r] "+r" (q5_r), [q4_r] "+r" (q4_r), \ : [q6_r] "+r"(q6_r), [q5_r] "+r"(q5_r), [q4_r] "+r"(q4_r), \
[q3_r] "+r" (q3_r), [q2_r] "+r" (q2_r), [q1_r] "+r" (q1_r), \ [q3_r] "+r"(q3_r), [q2_r] "+r"(q2_r), [q1_r] "+r"(q1_r), \
[q0_r] "+r" (q0_r), \ [q0_r] "+r"(q0_r), [p0_r] "+r"(p0_r), [p1_r] "+r"(p1_r), \
[p0_r] "+r" (p0_r), [p1_r] "+r" (p1_r), [p2_r] "+r" (p2_r), \ [p2_r] "+r"(p2_r), [p3_r] "+r"(p3_r), [p4_r] "+r"(p4_r), \
[p3_r] "+r" (p3_r), [p4_r] "+r" (p4_r), [p5_r] "+r" (p5_r), \ [p5_r] "+r"(p5_r), [p6_r] "+r"(p6_r) \
[p6_r] "+r" (p6_r) \ :); \
: \
); \
\ \
__asm__ __volatile__ ( \ __asm__ __volatile__( \
"sb %[q6_r], 6(%[s3]) \n\t" \ "sb %[q6_r], 6(%[s3]) \n\t" \
"sb %[q5_r], 5(%[s3]) \n\t" \ "sb %[q5_r], 5(%[s3]) \n\t" \
"sb %[q4_r], 4(%[s3]) \n\t" \ "sb %[q4_r], 4(%[s3]) \n\t" \
@ -257,16 +235,13 @@ extern "C" {
"sb %[p6_r], -7(%[s3]) \n\t" \ "sb %[p6_r], -7(%[s3]) \n\t" \
\ \
: \ : \
: [q6_r] "r" (q6_r), [q5_r] "r" (q5_r), [q4_r] "r" (q4_r), \ : [q6_r] "r"(q6_r), [q5_r] "r"(q5_r), [q4_r] "r"(q4_r), \
[q3_r] "r" (q3_r), [q2_r] "r" (q2_r), [q1_r] "r" (q1_r), \ [q3_r] "r"(q3_r), [q2_r] "r"(q2_r), [q1_r] "r"(q1_r), \
[q0_r] "r" (q0_r), \ [q0_r] "r"(q0_r), [p0_r] "r"(p0_r), [p1_r] "r"(p1_r), \
[p0_r] "r" (p0_r), [p1_r] "r" (p1_r), [p2_r] "r" (p2_r), \ [p2_r] "r"(p2_r), [p3_r] "r"(p3_r), [p4_r] "r"(p4_r), \
[p3_r] "r" (p3_r), [p4_r] "r" (p4_r), [p5_r] "r" (p5_r), \ [p5_r] "r"(p5_r), [p6_r] "r"(p6_r), [s3] "r"(s3)); \
[p6_r] "r" (p6_r), \
[s3] "r" (s3) \
); \
\ \
__asm__ __volatile__ ( \ __asm__ __volatile__( \
"sb %[q6_l], 6(%[s2]) \n\t" \ "sb %[q6_l], 6(%[s2]) \n\t" \
"sb %[q5_l], 5(%[s2]) \n\t" \ "sb %[q5_l], 5(%[s2]) \n\t" \
"sb %[q4_l], 4(%[s2]) \n\t" \ "sb %[q4_l], 4(%[s2]) \n\t" \
@ -283,16 +258,13 @@ extern "C" {
"sb %[p6_l], -7(%[s2]) \n\t" \ "sb %[p6_l], -7(%[s2]) \n\t" \
\ \
: \ : \
: [q6_l] "r" (q6_l), [q5_l] "r" (q5_l), [q4_l] "r" (q4_l), \ : [q6_l] "r"(q6_l), [q5_l] "r"(q5_l), [q4_l] "r"(q4_l), \
[q3_l] "r" (q3_l), [q2_l] "r" (q2_l), [q1_l] "r" (q1_l), \ [q3_l] "r"(q3_l), [q2_l] "r"(q2_l), [q1_l] "r"(q1_l), \
[q0_l] "r" (q0_l), \ [q0_l] "r"(q0_l), [p0_l] "r"(p0_l), [p1_l] "r"(p1_l), \
[p0_l] "r" (p0_l), [p1_l] "r" (p1_l), [p2_l] "r" (p2_l), \ [p2_l] "r"(p2_l), [p3_l] "r"(p3_l), [p4_l] "r"(p4_l), \
[p3_l] "r" (p3_l), [p4_l] "r" (p4_l), [p5_l] "r" (p5_l), \ [p5_l] "r"(p5_l), [p6_l] "r"(p6_l), [s2] "r"(s2)); \
[p6_l] "r" (p6_l), \
[s2] "r" (s2) \
); \
\ \
__asm__ __volatile__ ( \ __asm__ __volatile__( \
"srl %[q6_l], %[q6_l], 16 \n\t" \ "srl %[q6_l], %[q6_l], 16 \n\t" \
"srl %[q5_l], %[q5_l], 16 \n\t" \ "srl %[q5_l], %[q5_l], 16 \n\t" \
"srl %[q4_l], %[q4_l], 16 \n\t" \ "srl %[q4_l], %[q4_l], 16 \n\t" \
@ -308,16 +280,14 @@ extern "C" {
"srl %[p5_l], %[p5_l], 16 \n\t" \ "srl %[p5_l], %[p5_l], 16 \n\t" \
"srl %[p6_l], %[p6_l], 16 \n\t" \ "srl %[p6_l], %[p6_l], 16 \n\t" \
\ \
: [q6_l] "+r" (q6_l), [q5_l] "+r" (q5_l), [q4_l] "+r" (q4_l), \ : [q6_l] "+r"(q6_l), [q5_l] "+r"(q5_l), [q4_l] "+r"(q4_l), \
[q3_l] "+r" (q3_l), [q2_l] "+r" (q2_l), [q1_l] "+r" (q1_l), \ [q3_l] "+r"(q3_l), [q2_l] "+r"(q2_l), [q1_l] "+r"(q1_l), \
[q0_l] "+r" (q0_l), \ [q0_l] "+r"(q0_l), [p0_l] "+r"(p0_l), [p1_l] "+r"(p1_l), \
[p0_l] "+r" (p0_l), [p1_l] "+r" (p1_l), [p2_l] "+r" (p2_l), \ [p2_l] "+r"(p2_l), [p3_l] "+r"(p3_l), [p4_l] "+r"(p4_l), \
[p3_l] "+r" (p3_l), [p4_l] "+r" (p4_l), [p5_l] "+r" (p5_l), \ [p5_l] "+r"(p5_l), [p6_l] "+r"(p6_l) \
[p6_l] "+r" (p6_l) \ :); \
: \
); \
\ \
__asm__ __volatile__ ( \ __asm__ __volatile__( \
"sb %[q6_l], 6(%[s1]) \n\t" \ "sb %[q6_l], 6(%[s1]) \n\t" \
"sb %[q5_l], 5(%[s1]) \n\t" \ "sb %[q5_l], 5(%[s1]) \n\t" \
"sb %[q4_l], 4(%[s1]) \n\t" \ "sb %[q4_l], 4(%[s1]) \n\t" \
@ -334,18 +304,16 @@ extern "C" {
"sb %[p6_l], -7(%[s1]) \n\t" \ "sb %[p6_l], -7(%[s1]) \n\t" \
\ \
: \ : \
: [q6_l] "r" (q6_l), [q5_l] "r" (q5_l), [q4_l] "r" (q4_l), \ : [q6_l] "r"(q6_l), [q5_l] "r"(q5_l), [q4_l] "r"(q4_l), \
[q3_l] "r" (q3_l), [q2_l] "r" (q2_l), [q1_l] "r" (q1_l), \ [q3_l] "r"(q3_l), [q2_l] "r"(q2_l), [q1_l] "r"(q1_l), \
[q0_l] "r" (q0_l), \ [q0_l] "r"(q0_l), [p0_l] "r"(p0_l), [p1_l] "r"(p1_l), \
[p0_l] "r" (p0_l), [p1_l] "r" (p1_l), [p2_l] "r" (p2_l), \ [p2_l] "r"(p2_l), [p3_l] "r"(p3_l), [p4_l] "r"(p4_l), \
[p3_l] "r" (p3_l), [p4_l] "r" (p4_l), [p5_l] "r" (p5_l), \ [p5_l] "r"(p5_l), [p6_l] "r"(p6_l), [s1] "r"(s1)); \
[p6_l] "r" (p6_l), \ }
[s1] "r" (s1) \
); \
}
#define PACK_LEFT_0TO3() { \ #define PACK_LEFT_0TO3() \
__asm__ __volatile__ ( \ { \
__asm__ __volatile__( \
"preceu.ph.qbl %[p3_l], %[p3] \n\t" \ "preceu.ph.qbl %[p3_l], %[p3] \n\t" \
"preceu.ph.qbl %[p2_l], %[p2] \n\t" \ "preceu.ph.qbl %[p2_l], %[p2] \n\t" \
"preceu.ph.qbl %[p1_l], %[p1] \n\t" \ "preceu.ph.qbl %[p1_l], %[p1] \n\t" \
@ -355,17 +323,16 @@ extern "C" {
"preceu.ph.qbl %[q2_l], %[q2] \n\t" \ "preceu.ph.qbl %[q2_l], %[q2] \n\t" \
"preceu.ph.qbl %[q3_l], %[q3] \n\t" \ "preceu.ph.qbl %[q3_l], %[q3] \n\t" \
\ \
: [p3_l] "=&r" (p3_l), [p2_l] "=&r" (p2_l), \ : [p3_l] "=&r"(p3_l), [p2_l] "=&r"(p2_l), [p1_l] "=&r"(p1_l), \
[p1_l] "=&r" (p1_l), [p0_l] "=&r" (p0_l), \ [p0_l] "=&r"(p0_l), [q0_l] "=&r"(q0_l), [q1_l] "=&r"(q1_l), \
[q0_l] "=&r" (q0_l), [q1_l] "=&r" (q1_l), \ [q2_l] "=&r"(q2_l), [q3_l] "=&r"(q3_l) \
[q2_l] "=&r" (q2_l), [q3_l] "=&r" (q3_l) \ : [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0), \
: [p3] "r" (p3), [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0), \ [q0] "r"(q0), [q1] "r"(q1), [q2] "r"(q2), [q3] "r"(q3)); \
[q0] "r" (q0), [q1] "r" (q1), [q2] "r" (q2), [q3] "r" (q3) \ }
); \
}
#define PACK_LEFT_4TO7() { \ #define PACK_LEFT_4TO7() \
__asm__ __volatile__ ( \ { \
__asm__ __volatile__( \
"preceu.ph.qbl %[p7_l], %[p7] \n\t" \ "preceu.ph.qbl %[p7_l], %[p7] \n\t" \
"preceu.ph.qbl %[p6_l], %[p6] \n\t" \ "preceu.ph.qbl %[p6_l], %[p6] \n\t" \
"preceu.ph.qbl %[p5_l], %[p5] \n\t" \ "preceu.ph.qbl %[p5_l], %[p5] \n\t" \
@ -375,17 +342,16 @@ extern "C" {
"preceu.ph.qbl %[q6_l], %[q6] \n\t" \ "preceu.ph.qbl %[q6_l], %[q6] \n\t" \
"preceu.ph.qbl %[q7_l], %[q7] \n\t" \ "preceu.ph.qbl %[q7_l], %[q7] \n\t" \
\ \
: [p7_l] "=&r" (p7_l), [p6_l] "=&r" (p6_l), \ : [p7_l] "=&r"(p7_l), [p6_l] "=&r"(p6_l), [p5_l] "=&r"(p5_l), \
[p5_l] "=&r" (p5_l), [p4_l] "=&r" (p4_l), \ [p4_l] "=&r"(p4_l), [q4_l] "=&r"(q4_l), [q5_l] "=&r"(q5_l), \
[q4_l] "=&r" (q4_l), [q5_l] "=&r" (q5_l), \ [q6_l] "=&r"(q6_l), [q7_l] "=&r"(q7_l) \
[q6_l] "=&r" (q6_l), [q7_l] "=&r" (q7_l) \ : [p7] "r"(p7), [p6] "r"(p6), [p5] "r"(p5), [p4] "r"(p4), \
: [p7] "r" (p7), [p6] "r" (p6), [p5] "r" (p5), [p4] "r" (p4), \ [q4] "r"(q4), [q5] "r"(q5), [q6] "r"(q6), [q7] "r"(q7)); \
[q4] "r" (q4), [q5] "r" (q5), [q6] "r" (q6), [q7] "r" (q7) \ }
); \
}
#define PACK_RIGHT_0TO3() { \ #define PACK_RIGHT_0TO3() \
__asm__ __volatile__ ( \ { \
__asm__ __volatile__( \
"preceu.ph.qbr %[p3_r], %[p3] \n\t" \ "preceu.ph.qbr %[p3_r], %[p3] \n\t" \
"preceu.ph.qbr %[p2_r], %[p2] \n\t" \ "preceu.ph.qbr %[p2_r], %[p2] \n\t" \
"preceu.ph.qbr %[p1_r], %[p1] \n\t" \ "preceu.ph.qbr %[p1_r], %[p1] \n\t" \
@ -395,17 +361,16 @@ extern "C" {
"preceu.ph.qbr %[q2_r], %[q2] \n\t" \ "preceu.ph.qbr %[q2_r], %[q2] \n\t" \
"preceu.ph.qbr %[q3_r], %[q3] \n\t" \ "preceu.ph.qbr %[q3_r], %[q3] \n\t" \
\ \
: [p3_r] "=&r" (p3_r), [p2_r] "=&r" (p2_r), \ : [p3_r] "=&r"(p3_r), [p2_r] "=&r"(p2_r), [p1_r] "=&r"(p1_r), \
[p1_r] "=&r" (p1_r), [p0_r] "=&r" (p0_r), \ [p0_r] "=&r"(p0_r), [q0_r] "=&r"(q0_r), [q1_r] "=&r"(q1_r), \
[q0_r] "=&r" (q0_r), [q1_r] "=&r" (q1_r), \ [q2_r] "=&r"(q2_r), [q3_r] "=&r"(q3_r) \
[q2_r] "=&r" (q2_r), [q3_r] "=&r" (q3_r) \ : [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0), \
: [p3] "r" (p3), [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0), \ [q0] "r"(q0), [q1] "r"(q1), [q2] "r"(q2), [q3] "r"(q3)); \
[q0] "r" (q0), [q1] "r" (q1), [q2] "r" (q2), [q3] "r" (q3) \ }
); \
}
#define PACK_RIGHT_4TO7() { \ #define PACK_RIGHT_4TO7() \
__asm__ __volatile__ ( \ { \
__asm__ __volatile__( \
"preceu.ph.qbr %[p7_r], %[p7] \n\t" \ "preceu.ph.qbr %[p7_r], %[p7] \n\t" \
"preceu.ph.qbr %[p6_r], %[p6] \n\t" \ "preceu.ph.qbr %[p6_r], %[p6] \n\t" \
"preceu.ph.qbr %[p5_r], %[p5] \n\t" \ "preceu.ph.qbr %[p5_r], %[p5] \n\t" \
@ -415,17 +380,16 @@ extern "C" {
"preceu.ph.qbr %[q6_r], %[q6] \n\t" \ "preceu.ph.qbr %[q6_r], %[q6] \n\t" \
"preceu.ph.qbr %[q7_r], %[q7] \n\t" \ "preceu.ph.qbr %[q7_r], %[q7] \n\t" \
\ \
: [p7_r] "=&r" (p7_r), [p6_r] "=&r" (p6_r), \ : [p7_r] "=&r"(p7_r), [p6_r] "=&r"(p6_r), [p5_r] "=&r"(p5_r), \
[p5_r] "=&r" (p5_r), [p4_r] "=&r" (p4_r), \ [p4_r] "=&r"(p4_r), [q4_r] "=&r"(q4_r), [q5_r] "=&r"(q5_r), \
[q4_r] "=&r" (q4_r), [q5_r] "=&r" (q5_r), \ [q6_r] "=&r"(q6_r), [q7_r] "=&r"(q7_r) \
[q6_r] "=&r" (q6_r), [q7_r] "=&r" (q7_r) \ : [p7] "r"(p7), [p6] "r"(p6), [p5] "r"(p5), [p4] "r"(p4), \
: [p7] "r" (p7), [p6] "r" (p6), [p5] "r" (p5), [p4] "r" (p4), \ [q4] "r"(q4), [q5] "r"(q5), [q6] "r"(q6), [q7] "r"(q7)); \
[q4] "r" (q4), [q5] "r" (q5), [q6] "r" (q6), [q7] "r" (q7) \ }
); \
}
#define COMBINE_LEFT_RIGHT_0TO2() { \ #define COMBINE_LEFT_RIGHT_0TO2() \
__asm__ __volatile__ ( \ { \
__asm__ __volatile__( \
"precr.qb.ph %[p2], %[p2_l], %[p2_r] \n\t" \ "precr.qb.ph %[p2], %[p2_l], %[p2_r] \n\t" \
"precr.qb.ph %[p1], %[p1_l], %[p1_r] \n\t" \ "precr.qb.ph %[p1], %[p1_l], %[p1_r] \n\t" \
"precr.qb.ph %[p0], %[p0_l], %[p0_r] \n\t" \ "precr.qb.ph %[p0], %[p0_l], %[p0_r] \n\t" \
@ -433,19 +397,17 @@ extern "C" {
"precr.qb.ph %[q1], %[q1_l], %[q1_r] \n\t" \ "precr.qb.ph %[q1], %[q1_l], %[q1_r] \n\t" \
"precr.qb.ph %[q2], %[q2_l], %[q2_r] \n\t" \ "precr.qb.ph %[q2], %[q2_l], %[q2_r] \n\t" \
\ \
: [p2] "=&r" (p2), [p1] "=&r" (p1), [p0] "=&r" (p0), \ : [p2] "=&r"(p2), [p1] "=&r"(p1), [p0] "=&r"(p0), [q0] "=&r"(q0), \
[q0] "=&r" (q0), [q1] "=&r" (q1), [q2] "=&r" (q2) \ [q1] "=&r"(q1), [q2] "=&r"(q2) \
: [p2_l] "r" (p2_l), [p2_r] "r" (p2_r), \ : [p2_l] "r"(p2_l), [p2_r] "r"(p2_r), [p1_l] "r"(p1_l), \
[p1_l] "r" (p1_l), [p1_r] "r" (p1_r), \ [p1_r] "r"(p1_r), [p0_l] "r"(p0_l), [p0_r] "r"(p0_r), \
[p0_l] "r" (p0_l), [p0_r] "r" (p0_r), \ [q0_l] "r"(q0_l), [q0_r] "r"(q0_r), [q1_l] "r"(q1_l), \
[q0_l] "r" (q0_l), [q0_r] "r" (q0_r), \ [q1_r] "r"(q1_r), [q2_l] "r"(q2_l), [q2_r] "r"(q2_r)); \
[q1_l] "r" (q1_l), [q1_r] "r" (q1_r), \ }
[q2_l] "r" (q2_l), [q2_r] "r" (q2_r) \
); \
}
#define COMBINE_LEFT_RIGHT_3TO6() { \ #define COMBINE_LEFT_RIGHT_3TO6() \
__asm__ __volatile__ ( \ { \
__asm__ __volatile__( \
"precr.qb.ph %[p6], %[p6_l], %[p6_r] \n\t" \ "precr.qb.ph %[p6], %[p6_l], %[p6_r] \n\t" \
"precr.qb.ph %[p5], %[p5_l], %[p5_r] \n\t" \ "precr.qb.ph %[p5], %[p5_l], %[p5_r] \n\t" \
"precr.qb.ph %[p4], %[p4_l], %[p4_r] \n\t" \ "precr.qb.ph %[p4], %[p4_l], %[p4_r] \n\t" \
@ -455,20 +417,15 @@ extern "C" {
"precr.qb.ph %[q5], %[q5_l], %[q5_r] \n\t" \ "precr.qb.ph %[q5], %[q5_l], %[q5_r] \n\t" \
"precr.qb.ph %[q6], %[q6_l], %[q6_r] \n\t" \ "precr.qb.ph %[q6], %[q6_l], %[q6_r] \n\t" \
\ \
: [p6] "=&r" (p6),[p5] "=&r" (p5), \ : [p6] "=&r"(p6), [p5] "=&r"(p5), [p4] "=&r"(p4), [p3] "=&r"(p3), \
[p4] "=&r" (p4),[p3] "=&r" (p3), \ [q3] "=&r"(q3), [q4] "=&r"(q4), [q5] "=&r"(q5), [q6] "=&r"(q6) \
[q3] "=&r" (q3),[q4] "=&r" (q4), \ : [p6_l] "r"(p6_l), [p5_l] "r"(p5_l), [p4_l] "r"(p4_l), \
[q5] "=&r" (q5),[q6] "=&r" (q6) \ [p3_l] "r"(p3_l), [p6_r] "r"(p6_r), [p5_r] "r"(p5_r), \
: [p6_l] "r" (p6_l), [p5_l] "r" (p5_l), \ [p4_r] "r"(p4_r), [p3_r] "r"(p3_r), [q3_l] "r"(q3_l), \
[p4_l] "r" (p4_l), [p3_l] "r" (p3_l), \ [q4_l] "r"(q4_l), [q5_l] "r"(q5_l), [q6_l] "r"(q6_l), \
[p6_r] "r" (p6_r), [p5_r] "r" (p5_r), \ [q3_r] "r"(q3_r), [q4_r] "r"(q4_r), [q5_r] "r"(q5_r), \
[p4_r] "r" (p4_r), [p3_r] "r" (p3_r), \ [q6_r] "r"(q6_r)); \
[q3_l] "r" (q3_l), [q4_l] "r" (q4_l), \ }
[q5_l] "r" (q5_l), [q6_l] "r" (q6_l), \
[q3_r] "r" (q3_r), [q4_r] "r" (q4_r), \
[q5_r] "r" (q5_r), [q6_r] "r" (q6_r) \
); \
}
#endif // #if HAVE_DSPR2 #endif // #if HAVE_DSPR2
#ifdef __cplusplus #ifdef __cplusplus

View File

@ -25,9 +25,8 @@ extern "C" {
/* processing 4 pixels at the same time /* processing 4 pixels at the same time
* compute hev and mask in the same function */ * compute hev and mask in the same function */
static INLINE void filter_hev_mask_dspr2(uint32_t limit, uint32_t flimit, static INLINE void filter_hev_mask_dspr2(uint32_t limit, uint32_t flimit,
uint32_t p1, uint32_t p0, uint32_t p1, uint32_t p0, uint32_t p3,
uint32_t p3, uint32_t p2, uint32_t p2, uint32_t q0, uint32_t q1,
uint32_t q0, uint32_t q1,
uint32_t q2, uint32_t q3, uint32_t q2, uint32_t q3,
uint32_t thresh, uint32_t *hev, uint32_t thresh, uint32_t *hev,
uint32_t *mask) { uint32_t *mask) {
@ -36,7 +35,7 @@ static INLINE void filter_hev_mask_dspr2(uint32_t limit, uint32_t flimit,
uint32_t ones = 0xFFFFFFFF; uint32_t ones = 0xFFFFFFFF;
uint32_t hev1; uint32_t hev1;
__asm__ __volatile__ ( __asm__ __volatile__(
/* mask |= (abs(p3 - p2) > limit) */ /* mask |= (abs(p3 - p2) > limit) */
"subu_s.qb %[c], %[p3], %[p2] \n\t" "subu_s.qb %[c], %[p3], %[p2] \n\t"
"subu_s.qb %[r_k], %[p2], %[p3] \n\t" "subu_s.qb %[r_k], %[p2], %[p3] \n\t"
@ -88,14 +87,12 @@ static INLINE void filter_hev_mask_dspr2(uint32_t limit, uint32_t flimit,
"cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t" "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t"
"or %[r], %[r], %[c] \n\t" "or %[r], %[r], %[c] \n\t"
: [c] "=&r" (c), [r_k] "=&r" (r_k), : [c] "=&r"(c), [r_k] "=&r"(r_k), [r] "=&r"(r), [r3] "=&r"(r3)
[r] "=&r" (r), [r3] "=&r" (r3) : [limit] "r"(limit), [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1),
: [limit] "r" (limit), [p3] "r" (p3), [p2] "r" (p2), [p0] "r"(p0), [q1] "r"(q1), [q0] "r"(q0), [q2] "r"(q2), [q3] "r"(q3),
[p1] "r" (p1), [p0] "r" (p0), [q1] "r" (q1), [q0] "r" (q0), [thresh] "r"(thresh));
[q2] "r" (q2), [q3] "r" (q3), [thresh] "r" (thresh)
);
__asm__ __volatile__ ( __asm__ __volatile__(
/* abs(p0 - q0) */ /* abs(p0 - q0) */
"subu_s.qb %[c], %[p0], %[q0] \n\t" "subu_s.qb %[c], %[p0], %[q0] \n\t"
"subu_s.qb %[r_k], %[q0], %[p0] \n\t" "subu_s.qb %[r_k], %[q0], %[p0] \n\t"
@ -119,26 +116,19 @@ static INLINE void filter_hev_mask_dspr2(uint32_t limit, uint32_t flimit,
"wrdsp %[r] \n\t" "wrdsp %[r] \n\t"
"pick.qb %[s2], $0, %[ones] \n\t" "pick.qb %[s2], $0, %[ones] \n\t"
: [c] "=&r" (c), [r_k] "=&r" (r_k), [s1] "=&r" (s1), [hev1] "=&r" (hev1), : [c] "=&r"(c), [r_k] "=&r"(r_k), [s1] "=&r"(s1), [hev1] "=&r"(hev1),
[s2] "=&r" (s2), [r] "+r" (r), [s3] "=&r" (s3) [s2] "=&r"(s2), [r] "+r"(r), [s3] "=&r"(s3)
: [p0] "r" (p0), [q0] "r" (q0), [p1] "r" (p1), [r3] "r" (r3), : [p0] "r"(p0), [q0] "r"(q0), [p1] "r"(p1), [r3] "r"(r3), [q1] "r"(q1),
[q1] "r" (q1), [ones] "r" (ones), [flimit] "r" (flimit) [ones] "r"(ones), [flimit] "r"(flimit));
);
*hev = hev1; *hev = hev1;
*mask = s2; *mask = s2;
} }
static INLINE void filter_hev_mask_flatmask4_dspr2(uint32_t limit, static INLINE void filter_hev_mask_flatmask4_dspr2(
uint32_t flimit, uint32_t limit, uint32_t flimit, uint32_t thresh, uint32_t p1, uint32_t p0,
uint32_t thresh, uint32_t p3, uint32_t p2, uint32_t q0, uint32_t q1, uint32_t q2,
uint32_t p1, uint32_t p0, uint32_t q3, uint32_t *hev, uint32_t *mask, uint32_t *flat) {
uint32_t p3, uint32_t p2,
uint32_t q0, uint32_t q1,
uint32_t q2, uint32_t q3,
uint32_t *hev,
uint32_t *mask,
uint32_t *flat) {
uint32_t c, r, r3, r_k, r_flat; uint32_t c, r, r3, r_k, r_flat;
uint32_t s1, s2, s3; uint32_t s1, s2, s3;
uint32_t ones = 0xFFFFFFFF; uint32_t ones = 0xFFFFFFFF;
@ -146,7 +136,7 @@ static INLINE void filter_hev_mask_flatmask4_dspr2(uint32_t limit,
uint32_t hev1; uint32_t hev1;
uint32_t flat1; uint32_t flat1;
__asm__ __volatile__ ( __asm__ __volatile__(
/* mask |= (abs(p3 - p2) > limit) */ /* mask |= (abs(p3 - p2) > limit) */
"subu_s.qb %[c], %[p3], %[p2] \n\t" "subu_s.qb %[c], %[p3], %[p2] \n\t"
"subu_s.qb %[r_k], %[p2], %[p3] \n\t" "subu_s.qb %[r_k], %[p2], %[p3] \n\t"
@ -236,15 +226,13 @@ static INLINE void filter_hev_mask_flatmask4_dspr2(uint32_t limit,
"cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t" "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t"
"or %[r], %[r], %[c] \n\t" "or %[r], %[r], %[c] \n\t"
: [c] "=&r" (c), [r_k] "=&r" (r_k), [r] "=&r" (r), [r3] "=&r" (r3), : [c] "=&r"(c), [r_k] "=&r"(r_k), [r] "=&r"(r), [r3] "=&r"(r3),
[r_flat] "=&r" (r_flat), [flat1] "=&r" (flat1) [r_flat] "=&r"(r_flat), [flat1] "=&r"(flat1)
: [limit] "r" (limit), [p3] "r" (p3), [p2] "r" (p2), : [limit] "r"(limit), [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1),
[p1] "r" (p1), [p0] "r" (p0), [q1] "r" (q1), [q0] "r" (q0), [p0] "r"(p0), [q1] "r"(q1), [q0] "r"(q0), [q2] "r"(q2), [q3] "r"(q3),
[q2] "r" (q2), [q3] "r" (q3), [thresh] "r" (thresh), [thresh] "r"(thresh), [flat_thresh] "r"(flat_thresh), [ones] "r"(ones));
[flat_thresh] "r" (flat_thresh), [ones] "r" (ones)
);
__asm__ __volatile__ ( __asm__ __volatile__(
/* abs(p0 - q0) */ /* abs(p0 - q0) */
"subu_s.qb %[c], %[p0], %[q0] \n\t" "subu_s.qb %[c], %[p0], %[q0] \n\t"
"subu_s.qb %[r_k], %[q0], %[p0] \n\t" "subu_s.qb %[r_k], %[q0], %[p0] \n\t"
@ -268,29 +256,25 @@ static INLINE void filter_hev_mask_flatmask4_dspr2(uint32_t limit,
"wrdsp %[r] \n\t" "wrdsp %[r] \n\t"
"pick.qb %[s2], $0, %[ones] \n\t" "pick.qb %[s2], $0, %[ones] \n\t"
: [c] "=&r" (c), [r_k] "=&r" (r_k), [s1] "=&r" (s1), [hev1] "=&r" (hev1), : [c] "=&r"(c), [r_k] "=&r"(r_k), [s1] "=&r"(s1), [hev1] "=&r"(hev1),
[s2] "=&r" (s2), [r] "+r" (r), [s3] "=&r" (s3) [s2] "=&r"(s2), [r] "+r"(r), [s3] "=&r"(s3)
: [p0] "r" (p0), [q0] "r" (q0), [p1] "r" (p1), [r3] "r" (r3), : [p0] "r"(p0), [q0] "r"(q0), [p1] "r"(p1), [r3] "r"(r3), [q1] "r"(q1),
[q1] "r" (q1), [ones] "r" (ones), [flimit] "r" (flimit) [ones] "r"(ones), [flimit] "r"(flimit));
);
*hev = hev1; *hev = hev1;
*mask = s2; *mask = s2;
*flat = flat1; *flat = flat1;
} }
static INLINE void flatmask5(uint32_t p4, uint32_t p3, static INLINE void flatmask5(uint32_t p4, uint32_t p3, uint32_t p2, uint32_t p1,
uint32_t p2, uint32_t p1, uint32_t p0, uint32_t q0, uint32_t q1, uint32_t q2,
uint32_t p0, uint32_t q0, uint32_t q3, uint32_t q4, uint32_t *flat2) {
uint32_t q1, uint32_t q2,
uint32_t q3, uint32_t q4,
uint32_t *flat2) {
uint32_t c, r, r_k, r_flat; uint32_t c, r, r_k, r_flat;
uint32_t ones = 0xFFFFFFFF; uint32_t ones = 0xFFFFFFFF;
uint32_t flat_thresh = 0x01010101; uint32_t flat_thresh = 0x01010101;
uint32_t flat1, flat3; uint32_t flat1, flat3;
__asm__ __volatile__ ( __asm__ __volatile__(
/* flat |= (abs(p4 - p0) > thresh) */ /* flat |= (abs(p4 - p0) > thresh) */
"subu_s.qb %[c], %[p4], %[p0] \n\t" "subu_s.qb %[c], %[p4], %[p0] \n\t"
"subu_s.qb %[r_k], %[p0], %[p4] \n\t" "subu_s.qb %[r_k], %[p0], %[p4] \n\t"
@ -355,13 +339,11 @@ static INLINE void flatmask5(uint32_t p4, uint32_t p3,
/* flat & flatmask4(thresh, p3, p2, p1, p0, q0, q1, q2, q3) */ /* flat & flatmask4(thresh, p3, p2, p1, p0, q0, q1, q2, q3) */
"and %[flat1], %[flat3], %[flat1] \n\t" "and %[flat1], %[flat3], %[flat1] \n\t"
: [c] "=&r" (c), [r_k] "=&r" (r_k), [r] "=&r" (r), : [c] "=&r"(c), [r_k] "=&r"(r_k), [r] "=&r"(r), [r_flat] "=&r"(r_flat),
[r_flat] "=&r" (r_flat), [flat1] "=&r" (flat1), [flat3] "=&r" (flat3) [flat1] "=&r"(flat1), [flat3] "=&r"(flat3)
: [p4] "r" (p4), [p3] "r" (p3), [p2] "r" (p2), : [p4] "r"(p4), [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0),
[p1] "r" (p1), [p0] "r" (p0), [q0] "r" (q0), [q1] "r" (q1), [q0] "r"(q0), [q1] "r"(q1), [q2] "r"(q2), [q3] "r"(q3), [q4] "r"(q4),
[q2] "r" (q2), [q3] "r" (q3), [q4] "r" (q4), [flat_thresh] "r"(flat_thresh), [ones] "r"(ones));
[flat_thresh] "r" (flat_thresh), [ones] "r" (ones)
);
*flat2 = flat1; *flat2 = flat1;
} }

View File

@ -19,10 +19,8 @@
#include "vpx_mem/vpx_mem.h" #include "vpx_mem/vpx_mem.h"
#if HAVE_DSPR2 #if HAVE_DSPR2
void vpx_lpf_horizontal_8_dspr2(unsigned char *s, void vpx_lpf_horizontal_8_dspr2(unsigned char *s, int pitch,
int pitch, const uint8_t *blimit, const uint8_t *limit,
const uint8_t *blimit,
const uint8_t *limit,
const uint8_t *thresh) { const uint8_t *thresh) {
uint32_t mask; uint32_t mask;
uint32_t hev, flat; uint32_t hev, flat;
@ -40,15 +38,14 @@ void vpx_lpf_horizontal_8_dspr2(unsigned char *s,
uthresh = *thresh; uthresh = *thresh;
/* create quad-byte */ /* create quad-byte */
__asm__ __volatile__ ( __asm__ __volatile__(
"replv.qb %[thresh_vec], %[uthresh] \n\t" "replv.qb %[thresh_vec], %[uthresh] \n\t"
"replv.qb %[flimit_vec], %[uflimit] \n\t" "replv.qb %[flimit_vec], %[uflimit] \n\t"
"replv.qb %[limit_vec], %[ulimit] \n\t" "replv.qb %[limit_vec], %[ulimit] \n\t"
: [thresh_vec] "=&r" (thresh_vec), [flimit_vec] "=&r" (flimit_vec), : [thresh_vec] "=&r"(thresh_vec), [flimit_vec] "=&r"(flimit_vec),
[limit_vec] "=r" (limit_vec) [limit_vec] "=r"(limit_vec)
: [uthresh] "r" (uthresh), [uflimit] "r" (uflimit), [ulimit] "r" (ulimit) : [uthresh] "r"(uthresh), [uflimit] "r"(uflimit), [ulimit] "r"(ulimit));
);
/* prefetch data for store */ /* prefetch data for store */
prefetch_store(s); prefetch_store(s);
@ -63,7 +60,7 @@ void vpx_lpf_horizontal_8_dspr2(unsigned char *s,
sq2 = sq1 + pitch; sq2 = sq1 + pitch;
sq3 = sq2 + pitch; sq3 = sq2 + pitch;
__asm__ __volatile__ ( __asm__ __volatile__(
"lw %[p3], (%[sp3]) \n\t" "lw %[p3], (%[sp3]) \n\t"
"lw %[p2], (%[sp2]) \n\t" "lw %[p2], (%[sp2]) \n\t"
"lw %[p1], (%[sp1]) \n\t" "lw %[p1], (%[sp1]) \n\t"
@ -73,46 +70,39 @@ void vpx_lpf_horizontal_8_dspr2(unsigned char *s,
"lw %[q2], (%[sq2]) \n\t" "lw %[q2], (%[sq2]) \n\t"
"lw %[q3], (%[sq3]) \n\t" "lw %[q3], (%[sq3]) \n\t"
: [p3] "=&r" (p3), [p2] "=&r" (p2), [p1] "=&r" (p1), [p0] "=&r" (p0), : [p3] "=&r"(p3), [p2] "=&r"(p2), [p1] "=&r"(p1), [p0] "=&r"(p0),
[q3] "=&r" (q3), [q2] "=&r" (q2), [q1] "=&r" (q1), [q0] "=&r" (q0) [q3] "=&r"(q3), [q2] "=&r"(q2), [q1] "=&r"(q1), [q0] "=&r"(q0)
: [sp3] "r" (sp3), [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0), : [sp3] "r"(sp3), [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0),
[sq3] "r" (sq3), [sq2] "r" (sq2), [sq1] "r" (sq1), [sq0] "r" (sq0) [sq3] "r"(sq3), [sq2] "r"(sq2), [sq1] "r"(sq1), [sq0] "r"(sq0));
);
filter_hev_mask_flatmask4_dspr2(limit_vec, flimit_vec, thresh_vec, filter_hev_mask_flatmask4_dspr2(limit_vec, flimit_vec, thresh_vec, p1, p0,
p1, p0, p3, p2, q0, q1, q2, q3, p3, p2, q0, q1, q2, q3, &hev, &mask, &flat);
&hev, &mask, &flat);
if ((flat == 0) && (mask != 0)) { if ((flat == 0) && (mask != 0)) {
filter1_dspr2(mask, hev, p1, p0, q0, q1, filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0);
&p1_f0, &p0_f0, &q0_f0, &q1_f0);
__asm__ __volatile__ ( __asm__ __volatile__(
"sw %[p1_f0], (%[sp1]) \n\t" "sw %[p1_f0], (%[sp1]) \n\t"
"sw %[p0_f0], (%[sp0]) \n\t" "sw %[p0_f0], (%[sp0]) \n\t"
"sw %[q0_f0], (%[sq0]) \n\t" "sw %[q0_f0], (%[sq0]) \n\t"
"sw %[q1_f0], (%[sq1]) \n\t" "sw %[q1_f0], (%[sq1]) \n\t"
: :
: [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0), : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
[q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0), [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0),
[sp1] "r" (sp1), [sp0] "r" (sp0), [sq1] "r"(sq1));
[sq0] "r" (sq0), [sq1] "r" (sq1)
);
} else if ((mask & flat) == 0xFFFFFFFF) { } else if ((mask & flat) == 0xFFFFFFFF) {
/* left 2 element operation */ /* left 2 element operation */
PACK_LEFT_0TO3() PACK_LEFT_0TO3()
mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, &q0_l, &q1_l, &q2_l, &q3_l);
&q0_l, &q1_l, &q2_l, &q3_l);
/* right 2 element operation */ /* right 2 element operation */
PACK_RIGHT_0TO3() PACK_RIGHT_0TO3()
mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, &q0_r, &q1_r, &q2_r, &q3_r);
&q0_r, &q1_r, &q2_r, &q3_r);
COMBINE_LEFT_RIGHT_0TO2() COMBINE_LEFT_RIGHT_0TO2()
__asm__ __volatile__ ( __asm__ __volatile__(
"sw %[p2], (%[sp2]) \n\t" "sw %[p2], (%[sp2]) \n\t"
"sw %[p1], (%[sp1]) \n\t" "sw %[p1], (%[sp1]) \n\t"
"sw %[p0], (%[sp0]) \n\t" "sw %[p0], (%[sp0]) \n\t"
@ -121,28 +111,23 @@ void vpx_lpf_horizontal_8_dspr2(unsigned char *s,
"sw %[q2], (%[sq2]) \n\t" "sw %[q2], (%[sq2]) \n\t"
: :
: [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0), : [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0), [q0] "r"(q0),
[q0] "r" (q0), [q1] "r" (q1), [q2] "r" (q2), [q1] "r"(q1), [q2] "r"(q2), [sp2] "r"(sp2), [sp1] "r"(sp1),
[sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0), [sp0] "r"(sp0), [sq0] "r"(sq0), [sq1] "r"(sq1), [sq2] "r"(sq2));
[sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2)
);
} else if ((flat != 0) && (mask != 0)) { } else if ((flat != 0) && (mask != 0)) {
/* filtering */ /* filtering */
filter1_dspr2(mask, hev, p1, p0, q0, q1, filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0);
&p1_f0, &p0_f0, &q0_f0, &q1_f0);
/* left 2 element operation */ /* left 2 element operation */
PACK_LEFT_0TO3() PACK_LEFT_0TO3()
mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, &q0_l, &q1_l, &q2_l, &q3_l);
&q0_l, &q1_l, &q2_l, &q3_l);
/* right 2 element operation */ /* right 2 element operation */
PACK_RIGHT_0TO3() PACK_RIGHT_0TO3()
mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, &q0_r, &q1_r, &q2_r, &q3_r);
&q0_r, &q1_r, &q2_r, &q3_r);
if (mask & flat & 0x000000FF) { if (mask & flat & 0x000000FF) {
__asm__ __volatile__ ( __asm__ __volatile__(
"sb %[p2_r], (%[sp2]) \n\t" "sb %[p2_r], (%[sp2]) \n\t"
"sb %[p1_r], (%[sp1]) \n\t" "sb %[p1_r], (%[sp1]) \n\t"
"sb %[p0_r], (%[sp0]) \n\t" "sb %[p0_r], (%[sp0]) \n\t"
@ -151,27 +136,24 @@ void vpx_lpf_horizontal_8_dspr2(unsigned char *s,
"sb %[q2_r], (%[sq2]) \n\t" "sb %[q2_r], (%[sq2]) \n\t"
: :
: [p2_r] "r" (p2_r), [p1_r] "r" (p1_r), [p0_r] "r" (p0_r), : [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), [p0_r] "r"(p0_r),
[q0_r] "r" (q0_r), [q1_r] "r" (q1_r), [q2_r] "r" (q2_r), [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r),
[sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0), [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0),
[sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2) [sq1] "r"(sq1), [sq2] "r"(sq2));
);
} else if (mask & 0x000000FF) { } else if (mask & 0x000000FF) {
__asm__ __volatile__ ( __asm__ __volatile__(
"sb %[p1_f0], (%[sp1]) \n\t" "sb %[p1_f0], (%[sp1]) \n\t"
"sb %[p0_f0], (%[sp0]) \n\t" "sb %[p0_f0], (%[sp0]) \n\t"
"sb %[q0_f0], (%[sq0]) \n\t" "sb %[q0_f0], (%[sq0]) \n\t"
"sb %[q1_f0], (%[sq1]) \n\t" "sb %[q1_f0], (%[sq1]) \n\t"
: :
: [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0), : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
[q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0), [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0),
[sp1] "r" (sp1), [sp0] "r" (sp0), [sq0] "r"(sq0), [sq1] "r"(sq1));
[sq0] "r" (sq0), [sq1] "r" (sq1)
);
} }
__asm__ __volatile__ ( __asm__ __volatile__(
"srl %[p2_r], %[p2_r], 16 \n\t" "srl %[p2_r], %[p2_r], 16 \n\t"
"srl %[p1_r], %[p1_r], 16 \n\t" "srl %[p1_r], %[p1_r], 16 \n\t"
"srl %[p0_r], %[p0_r], 16 \n\t" "srl %[p0_r], %[p0_r], 16 \n\t"
@ -183,15 +165,14 @@ void vpx_lpf_horizontal_8_dspr2(unsigned char *s,
"srl %[q0_f0], %[q0_f0], 8 \n\t" "srl %[q0_f0], %[q0_f0], 8 \n\t"
"srl %[q1_f0], %[q1_f0], 8 \n\t" "srl %[q1_f0], %[q1_f0], 8 \n\t"
: [p2_r] "+r" (p2_r), [p1_r] "+r" (p1_r), [p0_r] "+r" (p0_r), : [p2_r] "+r"(p2_r), [p1_r] "+r"(p1_r), [p0_r] "+r"(p0_r),
[q0_r] "+r" (q0_r), [q1_r] "+r" (q1_r), [q2_r] "+r" (q2_r), [q0_r] "+r"(q0_r), [q1_r] "+r"(q1_r), [q2_r] "+r"(q2_r),
[p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0), [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0),
[q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0) [q1_f0] "+r"(q1_f0)
: :);
);
if (mask & flat & 0x0000FF00) { if (mask & flat & 0x0000FF00) {
__asm__ __volatile__ ( __asm__ __volatile__(
"sb %[p2_r], +1(%[sp2]) \n\t" "sb %[p2_r], +1(%[sp2]) \n\t"
"sb %[p1_r], +1(%[sp1]) \n\t" "sb %[p1_r], +1(%[sp1]) \n\t"
"sb %[p0_r], +1(%[sp0]) \n\t" "sb %[p0_r], +1(%[sp0]) \n\t"
@ -200,41 +181,36 @@ void vpx_lpf_horizontal_8_dspr2(unsigned char *s,
"sb %[q2_r], +1(%[sq2]) \n\t" "sb %[q2_r], +1(%[sq2]) \n\t"
: :
: [p2_r] "r" (p2_r), [p1_r] "r" (p1_r), [p0_r] "r" (p0_r), : [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), [p0_r] "r"(p0_r),
[q0_r] "r" (q0_r), [q1_r] "r" (q1_r), [q2_r] "r" (q2_r), [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r),
[sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0), [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0),
[sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2) [sq1] "r"(sq1), [sq2] "r"(sq2));
);
} else if (mask & 0x0000FF00) { } else if (mask & 0x0000FF00) {
__asm__ __volatile__ ( __asm__ __volatile__(
"sb %[p1_f0], +1(%[sp1]) \n\t" "sb %[p1_f0], +1(%[sp1]) \n\t"
"sb %[p0_f0], +1(%[sp0]) \n\t" "sb %[p0_f0], +1(%[sp0]) \n\t"
"sb %[q0_f0], +1(%[sq0]) \n\t" "sb %[q0_f0], +1(%[sq0]) \n\t"
"sb %[q1_f0], +1(%[sq1]) \n\t" "sb %[q1_f0], +1(%[sq1]) \n\t"
: :
: [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0), : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
[q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0), [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0),
[sp1] "r" (sp1), [sp0] "r" (sp0), [sq0] "r"(sq0), [sq1] "r"(sq1));
[sq0] "r" (sq0), [sq1] "r" (sq1)
);
} }
__asm__ __volatile__ ( __asm__ __volatile__(
"srl %[p1_f0], %[p1_f0], 8 \n\t" "srl %[p1_f0], %[p1_f0], 8 \n\t"
"srl %[p0_f0], %[p0_f0], 8 \n\t" "srl %[p0_f0], %[p0_f0], 8 \n\t"
"srl %[q0_f0], %[q0_f0], 8 \n\t" "srl %[q0_f0], %[q0_f0], 8 \n\t"
"srl %[q1_f0], %[q1_f0], 8 \n\t" "srl %[q1_f0], %[q1_f0], 8 \n\t"
: [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0), : [p2] "+r"(p2), [p1] "+r"(p1), [p0] "+r"(p0), [q0] "+r"(q0),
[q0] "+r" (q0), [q1] "+r" (q1), [q2] "+r" (q2), [q1] "+r"(q1), [q2] "+r"(q2), [p1_f0] "+r"(p1_f0),
[p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), [q1_f0] "+r"(q1_f0)
[q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0) :);
:
);
if (mask & flat & 0x00FF0000) { if (mask & flat & 0x00FF0000) {
__asm__ __volatile__ ( __asm__ __volatile__(
"sb %[p2_l], +2(%[sp2]) \n\t" "sb %[p2_l], +2(%[sp2]) \n\t"
"sb %[p1_l], +2(%[sp1]) \n\t" "sb %[p1_l], +2(%[sp1]) \n\t"
"sb %[p0_l], +2(%[sp0]) \n\t" "sb %[p0_l], +2(%[sp0]) \n\t"
@ -243,27 +219,24 @@ void vpx_lpf_horizontal_8_dspr2(unsigned char *s,
"sb %[q2_l], +2(%[sq2]) \n\t" "sb %[q2_l], +2(%[sq2]) \n\t"
: :
: [p2_l] "r" (p2_l), [p1_l] "r" (p1_l), [p0_l] "r" (p0_l), : [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), [p0_l] "r"(p0_l),
[q0_l] "r" (q0_l), [q1_l] "r" (q1_l), [q2_l] "r" (q2_l), [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l),
[sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0), [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0),
[sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2) [sq1] "r"(sq1), [sq2] "r"(sq2));
);
} else if (mask & 0x00FF0000) { } else if (mask & 0x00FF0000) {
__asm__ __volatile__ ( __asm__ __volatile__(
"sb %[p1_f0], +2(%[sp1]) \n\t" "sb %[p1_f0], +2(%[sp1]) \n\t"
"sb %[p0_f0], +2(%[sp0]) \n\t" "sb %[p0_f0], +2(%[sp0]) \n\t"
"sb %[q0_f0], +2(%[sq0]) \n\t" "sb %[q0_f0], +2(%[sq0]) \n\t"
"sb %[q1_f0], +2(%[sq1]) \n\t" "sb %[q1_f0], +2(%[sq1]) \n\t"
: :
: [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0), : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
[q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0), [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0),
[sp1] "r" (sp1), [sp0] "r" (sp0), [sq0] "r"(sq0), [sq1] "r"(sq1));
[sq0] "r" (sq0), [sq1] "r" (sq1)
);
} }
__asm__ __volatile__ ( __asm__ __volatile__(
"srl %[p2_l], %[p2_l], 16 \n\t" "srl %[p2_l], %[p2_l], 16 \n\t"
"srl %[p1_l], %[p1_l], 16 \n\t" "srl %[p1_l], %[p1_l], 16 \n\t"
"srl %[p0_l], %[p0_l], 16 \n\t" "srl %[p0_l], %[p0_l], 16 \n\t"
@ -275,15 +248,14 @@ void vpx_lpf_horizontal_8_dspr2(unsigned char *s,
"srl %[q0_f0], %[q0_f0], 8 \n\t" "srl %[q0_f0], %[q0_f0], 8 \n\t"
"srl %[q1_f0], %[q1_f0], 8 \n\t" "srl %[q1_f0], %[q1_f0], 8 \n\t"
: [p2_l] "+r" (p2_l), [p1_l] "+r" (p1_l), [p0_l] "+r" (p0_l), : [p2_l] "+r"(p2_l), [p1_l] "+r"(p1_l), [p0_l] "+r"(p0_l),
[q0_l] "+r" (q0_l), [q1_l] "+r" (q1_l), [q2_l] "+r" (q2_l), [q0_l] "+r"(q0_l), [q1_l] "+r"(q1_l), [q2_l] "+r"(q2_l),
[p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0), [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0),
[q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0) [q1_f0] "+r"(q1_f0)
: :);
);
if (mask & flat & 0xFF000000) { if (mask & flat & 0xFF000000) {
__asm__ __volatile__ ( __asm__ __volatile__(
"sb %[p2_l], +3(%[sp2]) \n\t" "sb %[p2_l], +3(%[sp2]) \n\t"
"sb %[p1_l], +3(%[sp1]) \n\t" "sb %[p1_l], +3(%[sp1]) \n\t"
"sb %[p0_l], +3(%[sp0]) \n\t" "sb %[p0_l], +3(%[sp0]) \n\t"
@ -292,24 +264,21 @@ void vpx_lpf_horizontal_8_dspr2(unsigned char *s,
"sb %[q2_l], +3(%[sq2]) \n\t" "sb %[q2_l], +3(%[sq2]) \n\t"
: :
: [p2_l] "r" (p2_l), [p1_l] "r" (p1_l), [p0_l] "r" (p0_l), : [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), [p0_l] "r"(p0_l),
[q0_l] "r" (q0_l), [q1_l] "r" (q1_l), [q2_l] "r" (q2_l), [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l),
[sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0), [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0),
[sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2) [sq1] "r"(sq1), [sq2] "r"(sq2));
);
} else if (mask & 0xFF000000) { } else if (mask & 0xFF000000) {
__asm__ __volatile__ ( __asm__ __volatile__(
"sb %[p1_f0], +3(%[sp1]) \n\t" "sb %[p1_f0], +3(%[sp1]) \n\t"
"sb %[p0_f0], +3(%[sp0]) \n\t" "sb %[p0_f0], +3(%[sp0]) \n\t"
"sb %[q0_f0], +3(%[sq0]) \n\t" "sb %[q0_f0], +3(%[sq0]) \n\t"
"sb %[q1_f0], +3(%[sq1]) \n\t" "sb %[q1_f0], +3(%[sq1]) \n\t"
: :
: [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0), : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
[q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0), [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0),
[sp1] "r" (sp1), [sp0] "r" (sp0), [sq0] "r"(sq0), [sq1] "r"(sq1));
[sq0] "r" (sq0), [sq1] "r" (sq1)
);
} }
} }
@ -317,10 +286,8 @@ void vpx_lpf_horizontal_8_dspr2(unsigned char *s,
} }
} }
void vpx_lpf_vertical_8_dspr2(unsigned char *s, void vpx_lpf_vertical_8_dspr2(unsigned char *s, int pitch,
int pitch, const uint8_t *blimit, const uint8_t *limit,
const uint8_t *blimit,
const uint8_t *limit,
const uint8_t *thresh) { const uint8_t *thresh) {
uint8_t i; uint8_t i;
uint32_t mask, hev, flat; uint32_t mask, hev, flat;
@ -338,15 +305,14 @@ void vpx_lpf_vertical_8_dspr2(unsigned char *s,
uthresh = *thresh; uthresh = *thresh;
/* create quad-byte */ /* create quad-byte */
__asm__ __volatile__ ( __asm__ __volatile__(
"replv.qb %[thresh_vec], %[uthresh] \n\t" "replv.qb %[thresh_vec], %[uthresh] \n\t"
"replv.qb %[flimit_vec], %[uflimit] \n\t" "replv.qb %[flimit_vec], %[uflimit] \n\t"
"replv.qb %[limit_vec], %[ulimit] \n\t" "replv.qb %[limit_vec], %[ulimit] \n\t"
: [thresh_vec] "=&r" (thresh_vec), [flimit_vec] "=&r" (flimit_vec), : [thresh_vec] "=&r"(thresh_vec), [flimit_vec] "=&r"(flimit_vec),
[limit_vec] "=r" (limit_vec) [limit_vec] "=r"(limit_vec)
: [uthresh] "r" (uthresh), [uflimit] "r" (uflimit), [ulimit] "r" (ulimit) : [uthresh] "r"(uthresh), [uflimit] "r"(uflimit), [ulimit] "r"(ulimit));
);
prefetch_store(s + pitch); prefetch_store(s + pitch);
@ -357,7 +323,7 @@ void vpx_lpf_vertical_8_dspr2(unsigned char *s,
s4 = s3 + pitch; s4 = s3 + pitch;
s = s4 + pitch; s = s4 + pitch;
__asm__ __volatile__ ( __asm__ __volatile__(
"lw %[p0], -4(%[s1]) \n\t" "lw %[p0], -4(%[s1]) \n\t"
"lw %[p1], -4(%[s2]) \n\t" "lw %[p1], -4(%[s2]) \n\t"
"lw %[p2], -4(%[s3]) \n\t" "lw %[p2], -4(%[s3]) \n\t"
@ -367,10 +333,9 @@ void vpx_lpf_vertical_8_dspr2(unsigned char *s,
"lw %[q1], (%[s3]) \n\t" "lw %[q1], (%[s3]) \n\t"
"lw %[q0], (%[s4]) \n\t" "lw %[q0], (%[s4]) \n\t"
: [p3] "=&r" (p3), [p2] "=&r" (p2), [p1] "=&r" (p1), [p0] "=&r" (p0), : [p3] "=&r"(p3), [p2] "=&r"(p2), [p1] "=&r"(p1), [p0] "=&r"(p0),
[q0] "=&r" (q0), [q1] "=&r" (q1), [q2] "=&r" (q2), [q3] "=&r" (q3) [q0] "=&r"(q0), [q1] "=&r"(q1), [q2] "=&r"(q2), [q3] "=&r"(q3)
: [s1] "r" (s1), [s2] "r" (s2), [s3] "r" (s3), [s4] "r" (s4) : [s1] "r"(s1), [s2] "r"(s2), [s3] "r"(s3), [s4] "r"(s4));
);
/* transpose p3, p2, p1, p0 /* transpose p3, p2, p1, p0
original (when loaded from memory) original (when loaded from memory)
@ -387,7 +352,7 @@ void vpx_lpf_vertical_8_dspr2(unsigned char *s,
p2 p3_1 p2_1 p1_1 p0_1 p2 p3_1 p2_1 p1_1 p0_1
p3 p3_0 p2_0 p1_0 p0_0 p3 p3_0 p2_0 p1_0 p0_0
*/ */
__asm__ __volatile__ ( __asm__ __volatile__(
"precrq.qb.ph %[prim1], %[p0], %[p1] \n\t" "precrq.qb.ph %[prim1], %[p0], %[p1] \n\t"
"precr.qb.ph %[prim2], %[p0], %[p1] \n\t" "precr.qb.ph %[prim2], %[p0], %[p1] \n\t"
"precrq.qb.ph %[prim3], %[p2], %[p3] \n\t" "precrq.qb.ph %[prim3], %[p2], %[p3] \n\t"
@ -403,12 +368,10 @@ void vpx_lpf_vertical_8_dspr2(unsigned char *s,
"append %[p1], %[sec3], 16 \n\t" "append %[p1], %[sec3], 16 \n\t"
"append %[p3], %[sec4], 16 \n\t" "append %[p3], %[sec4], 16 \n\t"
: [prim1] "=&r" (prim1), [prim2] "=&r" (prim2), : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3),
[prim3] "=&r" (prim3), [prim4] "=&r" (prim4), [prim4] "=&r"(prim4), [p0] "+r"(p0), [p1] "+r"(p1), [p2] "+r"(p2),
[p0] "+r" (p0), [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3), [p3] "+r"(p3), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4)
[sec3] "=&r" (sec3), [sec4] "=&r" (sec4) :);
:
);
/* transpose q0, q1, q2, q3 /* transpose q0, q1, q2, q3
original (when loaded from memory) original (when loaded from memory)
@ -425,7 +388,7 @@ void vpx_lpf_vertical_8_dspr2(unsigned char *s,
q1 q0_1 q1_1 q2_1 q3_1 q1 q0_1 q1_1 q2_1 q3_1
q0 q0_0 q1_0 q2_0 q3_0 q0 q0_0 q1_0 q2_0 q3_0
*/ */
__asm__ __volatile__ ( __asm__ __volatile__(
"precrq.qb.ph %[prim1], %[q3], %[q2] \n\t" "precrq.qb.ph %[prim1], %[q3], %[q2] \n\t"
"precr.qb.ph %[prim2], %[q3], %[q2] \n\t" "precr.qb.ph %[prim2], %[q3], %[q2] \n\t"
"precrq.qb.ph %[prim3], %[q1], %[q0] \n\t" "precrq.qb.ph %[prim3], %[q1], %[q0] \n\t"
@ -441,49 +404,40 @@ void vpx_lpf_vertical_8_dspr2(unsigned char *s,
"append %[q2], %[sec3], 16 \n\t" "append %[q2], %[sec3], 16 \n\t"
"append %[q0], %[sec4], 16 \n\t" "append %[q0], %[sec4], 16 \n\t"
: [prim1] "=&r" (prim1), [prim2] "=&r" (prim2), : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3),
[prim3] "=&r" (prim3), [prim4] "=&r" (prim4), [prim4] "=&r"(prim4), [q3] "+r"(q3), [q2] "+r"(q2), [q1] "+r"(q1),
[q3] "+r" (q3), [q2] "+r" (q2), [q1] "+r" (q1), [q0] "+r" (q0), [q0] "+r"(q0), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4)
[sec3] "=&r" (sec3), [sec4] "=&r" (sec4) :);
:
);
filter_hev_mask_flatmask4_dspr2(limit_vec, flimit_vec, thresh_vec, filter_hev_mask_flatmask4_dspr2(limit_vec, flimit_vec, thresh_vec, p1, p0,
p1, p0, p3, p2, q0, q1, q2, q3, p3, p2, q0, q1, q2, q3, &hev, &mask, &flat);
&hev, &mask, &flat);
if ((flat == 0) && (mask != 0)) { if ((flat == 0) && (mask != 0)) {
filter1_dspr2(mask, hev, p1, p0, q0, q1, filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0);
&p1_f0, &p0_f0, &q0_f0, &q1_f0);
STORE_F0() STORE_F0()
} else if ((mask & flat) == 0xFFFFFFFF) { } else if ((mask & flat) == 0xFFFFFFFF) {
/* left 2 element operation */ /* left 2 element operation */
PACK_LEFT_0TO3() PACK_LEFT_0TO3()
mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, &q0_l, &q1_l, &q2_l, &q3_l);
&q0_l, &q1_l, &q2_l, &q3_l);
/* right 2 element operation */ /* right 2 element operation */
PACK_RIGHT_0TO3() PACK_RIGHT_0TO3()
mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, &q0_r, &q1_r, &q2_r, &q3_r);
&q0_r, &q1_r, &q2_r, &q3_r);
STORE_F1() STORE_F1()
} else if ((flat != 0) && (mask != 0)) { } else if ((flat != 0) && (mask != 0)) {
filter1_dspr2(mask, hev, p1, p0, q0, q1, filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0);
&p1_f0, &p0_f0, &q0_f0, &q1_f0);
/* left 2 element operation */ /* left 2 element operation */
PACK_LEFT_0TO3() PACK_LEFT_0TO3()
mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, &q0_l, &q1_l, &q2_l, &q3_l);
&q0_l, &q1_l, &q2_l, &q3_l);
/* right 2 element operation */ /* right 2 element operation */
PACK_RIGHT_0TO3() PACK_RIGHT_0TO3()
mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, &q0_r, &q1_r, &q2_r, &q3_r);
&q0_r, &q1_r, &q2_r, &q3_r);
if (mask & flat & 0x000000FF) { if (mask & flat & 0x000000FF) {
__asm__ __volatile__ ( __asm__ __volatile__(
"sb %[p2_r], -3(%[s4]) \n\t" "sb %[p2_r], -3(%[s4]) \n\t"
"sb %[p1_r], -2(%[s4]) \n\t" "sb %[p1_r], -2(%[s4]) \n\t"
"sb %[p0_r], -1(%[s4]) \n\t" "sb %[p0_r], -1(%[s4]) \n\t"
@ -492,25 +446,22 @@ void vpx_lpf_vertical_8_dspr2(unsigned char *s,
"sb %[q2_r], +2(%[s4]) \n\t" "sb %[q2_r], +2(%[s4]) \n\t"
: :
: [p2_r] "r" (p2_r), [p1_r] "r" (p1_r), [p0_r] "r" (p0_r), : [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), [p0_r] "r"(p0_r),
[q0_r] "r" (q0_r), [q1_r] "r" (q1_r), [q2_r] "r" (q2_r), [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r),
[s4] "r" (s4) [s4] "r"(s4));
);
} else if (mask & 0x000000FF) { } else if (mask & 0x000000FF) {
__asm__ __volatile__ ( __asm__ __volatile__(
"sb %[p1_f0], -2(%[s4]) \n\t" "sb %[p1_f0], -2(%[s4]) \n\t"
"sb %[p0_f0], -1(%[s4]) \n\t" "sb %[p0_f0], -1(%[s4]) \n\t"
"sb %[q0_f0], (%[s4]) \n\t" "sb %[q0_f0], (%[s4]) \n\t"
"sb %[q1_f0], +1(%[s4]) \n\t" "sb %[q1_f0], +1(%[s4]) \n\t"
: :
: [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0), : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
[q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0), [q1_f0] "r"(q1_f0), [s4] "r"(s4));
[s4] "r" (s4)
);
} }
__asm__ __volatile__ ( __asm__ __volatile__(
"srl %[p2_r], %[p2_r], 16 \n\t" "srl %[p2_r], %[p2_r], 16 \n\t"
"srl %[p1_r], %[p1_r], 16 \n\t" "srl %[p1_r], %[p1_r], 16 \n\t"
"srl %[p0_r], %[p0_r], 16 \n\t" "srl %[p0_r], %[p0_r], 16 \n\t"
@ -522,15 +473,14 @@ void vpx_lpf_vertical_8_dspr2(unsigned char *s,
"srl %[q0_f0], %[q0_f0], 8 \n\t" "srl %[q0_f0], %[q0_f0], 8 \n\t"
"srl %[q1_f0], %[q1_f0], 8 \n\t" "srl %[q1_f0], %[q1_f0], 8 \n\t"
: [p2_r] "+r" (p2_r), [p1_r] "+r" (p1_r), [p0_r] "+r" (p0_r), : [p2_r] "+r"(p2_r), [p1_r] "+r"(p1_r), [p0_r] "+r"(p0_r),
[q0_r] "+r" (q0_r), [q1_r] "+r" (q1_r), [q2_r] "+r" (q2_r), [q0_r] "+r"(q0_r), [q1_r] "+r"(q1_r), [q2_r] "+r"(q2_r),
[p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0), [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0),
[q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0) [q1_f0] "+r"(q1_f0)
: :);
);
if (mask & flat & 0x0000FF00) { if (mask & flat & 0x0000FF00) {
__asm__ __volatile__ ( __asm__ __volatile__(
"sb %[p2_r], -3(%[s3]) \n\t" "sb %[p2_r], -3(%[s3]) \n\t"
"sb %[p1_r], -2(%[s3]) \n\t" "sb %[p1_r], -2(%[s3]) \n\t"
"sb %[p0_r], -1(%[s3]) \n\t" "sb %[p0_r], -1(%[s3]) \n\t"
@ -539,39 +489,34 @@ void vpx_lpf_vertical_8_dspr2(unsigned char *s,
"sb %[q2_r], +2(%[s3]) \n\t" "sb %[q2_r], +2(%[s3]) \n\t"
: :
: [p2_r] "r" (p2_r), [p1_r] "r" (p1_r), [p0_r] "r" (p0_r), : [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), [p0_r] "r"(p0_r),
[q0_r] "r" (q0_r), [q1_r] "r" (q1_r), [q2_r] "r" (q2_r), [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r),
[s3] "r" (s3) [s3] "r"(s3));
);
} else if (mask & 0x0000FF00) { } else if (mask & 0x0000FF00) {
__asm__ __volatile__ ( __asm__ __volatile__(
"sb %[p1_f0], -2(%[s3]) \n\t" "sb %[p1_f0], -2(%[s3]) \n\t"
"sb %[p0_f0], -1(%[s3]) \n\t" "sb %[p0_f0], -1(%[s3]) \n\t"
"sb %[q0_f0], (%[s3]) \n\t" "sb %[q0_f0], (%[s3]) \n\t"
"sb %[q1_f0], +1(%[s3]) \n\t" "sb %[q1_f0], +1(%[s3]) \n\t"
: :
: [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0), : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
[q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0), [q1_f0] "r"(q1_f0), [s3] "r"(s3));
[s3] "r" (s3)
);
} }
__asm__ __volatile__ ( __asm__ __volatile__(
"srl %[p1_f0], %[p1_f0], 8 \n\t" "srl %[p1_f0], %[p1_f0], 8 \n\t"
"srl %[p0_f0], %[p0_f0], 8 \n\t" "srl %[p0_f0], %[p0_f0], 8 \n\t"
"srl %[q0_f0], %[q0_f0], 8 \n\t" "srl %[q0_f0], %[q0_f0], 8 \n\t"
"srl %[q1_f0], %[q1_f0], 8 \n\t" "srl %[q1_f0], %[q1_f0], 8 \n\t"
: [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0), : [p2] "+r"(p2), [p1] "+r"(p1), [p0] "+r"(p0), [q0] "+r"(q0),
[q0] "+r" (q0), [q1] "+r" (q1), [q2] "+r" (q2), [q1] "+r"(q1), [q2] "+r"(q2), [p1_f0] "+r"(p1_f0),
[p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), [q1_f0] "+r"(q1_f0)
[q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0) :);
:
);
if (mask & flat & 0x00FF0000) { if (mask & flat & 0x00FF0000) {
__asm__ __volatile__ ( __asm__ __volatile__(
"sb %[p2_l], -3(%[s2]) \n\t" "sb %[p2_l], -3(%[s2]) \n\t"
"sb %[p1_l], -2(%[s2]) \n\t" "sb %[p1_l], -2(%[s2]) \n\t"
"sb %[p0_l], -1(%[s2]) \n\t" "sb %[p0_l], -1(%[s2]) \n\t"
@ -580,25 +525,22 @@ void vpx_lpf_vertical_8_dspr2(unsigned char *s,
"sb %[q2_l], +2(%[s2]) \n\t" "sb %[q2_l], +2(%[s2]) \n\t"
: :
: [p2_l] "r" (p2_l), [p1_l] "r" (p1_l), [p0_l] "r" (p0_l), : [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), [p0_l] "r"(p0_l),
[q0_l] "r" (q0_l), [q1_l] "r" (q1_l), [q2_l] "r" (q2_l), [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l),
[s2] "r" (s2) [s2] "r"(s2));
);
} else if (mask & 0x00FF0000) { } else if (mask & 0x00FF0000) {
__asm__ __volatile__ ( __asm__ __volatile__(
"sb %[p1_f0], -2(%[s2]) \n\t" "sb %[p1_f0], -2(%[s2]) \n\t"
"sb %[p0_f0], -1(%[s2]) \n\t" "sb %[p0_f0], -1(%[s2]) \n\t"
"sb %[q0_f0], (%[s2]) \n\t" "sb %[q0_f0], (%[s2]) \n\t"
"sb %[q1_f0], +1(%[s2]) \n\t" "sb %[q1_f0], +1(%[s2]) \n\t"
: :
: [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0), : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
[q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0), [q1_f0] "r"(q1_f0), [s2] "r"(s2));
[s2] "r" (s2)
);
} }
__asm__ __volatile__ ( __asm__ __volatile__(
"srl %[p2_l], %[p2_l], 16 \n\t" "srl %[p2_l], %[p2_l], 16 \n\t"
"srl %[p1_l], %[p1_l], 16 \n\t" "srl %[p1_l], %[p1_l], 16 \n\t"
"srl %[p0_l], %[p0_l], 16 \n\t" "srl %[p0_l], %[p0_l], 16 \n\t"
@ -610,15 +552,14 @@ void vpx_lpf_vertical_8_dspr2(unsigned char *s,
"srl %[q0_f0], %[q0_f0], 8 \n\t" "srl %[q0_f0], %[q0_f0], 8 \n\t"
"srl %[q1_f0], %[q1_f0], 8 \n\t" "srl %[q1_f0], %[q1_f0], 8 \n\t"
: [p2_l] "+r" (p2_l), [p1_l] "+r" (p1_l), [p0_l] "+r" (p0_l), : [p2_l] "+r"(p2_l), [p1_l] "+r"(p1_l), [p0_l] "+r"(p0_l),
[q0_l] "+r" (q0_l), [q1_l] "+r" (q1_l), [q2_l] "+r" (q2_l), [q0_l] "+r"(q0_l), [q1_l] "+r"(q1_l), [q2_l] "+r"(q2_l),
[p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0), [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0),
[q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0) [q1_f0] "+r"(q1_f0)
: :);
);
if (mask & flat & 0xFF000000) { if (mask & flat & 0xFF000000) {
__asm__ __volatile__ ( __asm__ __volatile__(
"sb %[p2_l], -3(%[s1]) \n\t" "sb %[p2_l], -3(%[s1]) \n\t"
"sb %[p1_l], -2(%[s1]) \n\t" "sb %[p1_l], -2(%[s1]) \n\t"
"sb %[p0_l], -1(%[s1]) \n\t" "sb %[p0_l], -1(%[s1]) \n\t"
@ -627,21 +568,19 @@ void vpx_lpf_vertical_8_dspr2(unsigned char *s,
"sb %[q2_l], +2(%[s1]) \n\t" "sb %[q2_l], +2(%[s1]) \n\t"
: :
: [p2_l] "r" (p2_l), [p1_l] "r" (p1_l), [p0_l] "r" (p0_l), : [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), [p0_l] "r"(p0_l),
[q0_l] "r" (q0_l), [q1_l] "r" (q1_l), [q2_l] "r" (q2_l), [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l),
[s1] "r" (s1) [s1] "r"(s1));
);
} else if (mask & 0xFF000000) { } else if (mask & 0xFF000000) {
__asm__ __volatile__ ( __asm__ __volatile__(
"sb %[p1_f0], -2(%[s1]) \n\t" "sb %[p1_f0], -2(%[s1]) \n\t"
"sb %[p0_f0], -1(%[s1]) \n\t" "sb %[p0_f0], -1(%[s1]) \n\t"
"sb %[q0_f0], (%[s1]) \n\t" "sb %[q0_f0], (%[s1]) \n\t"
"sb %[q1_f0], +1(%[s1]) \n\t" "sb %[q1_f0], +1(%[s1]) \n\t"
: :
: [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0), [q0_f0] "r" (q0_f0), : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
[q1_f0] "r" (q1_f0), [s1] "r" (s1) [q1_f0] "r"(q1_f0), [s1] "r"(s1));
);
} }
} }
} }

View File

@ -19,12 +19,9 @@
#include "vpx_mem/vpx_mem.h" #include "vpx_mem/vpx_mem.h"
#if HAVE_DSPR2 #if HAVE_DSPR2
static void mb_lpf_horizontal_edge(unsigned char *s, static void mb_lpf_horizontal_edge(unsigned char *s, int pitch,
int pitch, const uint8_t *blimit, const uint8_t *limit,
const uint8_t *blimit, const uint8_t *thresh, int count) {
const uint8_t *limit,
const uint8_t *thresh,
int count) {
uint32_t mask; uint32_t mask;
uint32_t hev, flat, flat2; uint32_t hev, flat, flat2;
uint8_t i; uint8_t i;
@ -46,15 +43,14 @@ static void mb_lpf_horizontal_edge(unsigned char *s,
uthresh = *thresh; uthresh = *thresh;
/* create quad-byte */ /* create quad-byte */
__asm__ __volatile__ ( __asm__ __volatile__(
"replv.qb %[thresh_vec], %[uthresh] \n\t" "replv.qb %[thresh_vec], %[uthresh] \n\t"
"replv.qb %[flimit_vec], %[uflimit] \n\t" "replv.qb %[flimit_vec], %[uflimit] \n\t"
"replv.qb %[limit_vec], %[ulimit] \n\t" "replv.qb %[limit_vec], %[ulimit] \n\t"
: [thresh_vec] "=&r" (thresh_vec), [flimit_vec] "=&r" (flimit_vec), : [thresh_vec] "=&r"(thresh_vec), [flimit_vec] "=&r"(flimit_vec),
[limit_vec] "=r" (limit_vec) [limit_vec] "=r"(limit_vec)
: [uthresh] "r" (uthresh), [uflimit] "r" (uflimit), [ulimit] "r" (ulimit) : [uthresh] "r"(uthresh), [uflimit] "r"(uflimit), [ulimit] "r"(ulimit));
);
/* prefetch data for store */ /* prefetch data for store */
prefetch_store(s); prefetch_store(s);
@ -77,7 +73,7 @@ static void mb_lpf_horizontal_edge(unsigned char *s,
sq6 = sq5 + pitch; sq6 = sq5 + pitch;
sq7 = sq6 + pitch; sq7 = sq6 + pitch;
__asm__ __volatile__ ( __asm__ __volatile__(
"lw %[p7], (%[sp7]) \n\t" "lw %[p7], (%[sp7]) \n\t"
"lw %[p6], (%[sp6]) \n\t" "lw %[p6], (%[sp6]) \n\t"
"lw %[p5], (%[sp5]) \n\t" "lw %[p5], (%[sp5]) \n\t"
@ -87,13 +83,12 @@ static void mb_lpf_horizontal_edge(unsigned char *s,
"lw %[p1], (%[sp1]) \n\t" "lw %[p1], (%[sp1]) \n\t"
"lw %[p0], (%[sp0]) \n\t" "lw %[p0], (%[sp0]) \n\t"
: [p3] "=&r" (p3), [p2] "=&r" (p2), [p1] "=&r" (p1), [p0] "=&r" (p0), : [p3] "=&r"(p3), [p2] "=&r"(p2), [p1] "=&r"(p1), [p0] "=&r"(p0),
[p7] "=&r" (p7), [p6] "=&r" (p6), [p5] "=&r" (p5), [p4] "=&r" (p4) [p7] "=&r"(p7), [p6] "=&r"(p6), [p5] "=&r"(p5), [p4] "=&r"(p4)
: [sp3] "r" (sp3), [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0), : [sp3] "r"(sp3), [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0),
[sp4] "r" (sp4), [sp5] "r" (sp5), [sp6] "r" (sp6), [sp7] "r" (sp7) [sp4] "r"(sp4), [sp5] "r"(sp5), [sp6] "r"(sp6), [sp7] "r"(sp7));
);
__asm__ __volatile__ ( __asm__ __volatile__(
"lw %[q0], (%[sq0]) \n\t" "lw %[q0], (%[sq0]) \n\t"
"lw %[q1], (%[sq1]) \n\t" "lw %[q1], (%[sq1]) \n\t"
"lw %[q2], (%[sq2]) \n\t" "lw %[q2], (%[sq2]) \n\t"
@ -103,57 +98,50 @@ static void mb_lpf_horizontal_edge(unsigned char *s,
"lw %[q6], (%[sq6]) \n\t" "lw %[q6], (%[sq6]) \n\t"
"lw %[q7], (%[sq7]) \n\t" "lw %[q7], (%[sq7]) \n\t"
: [q3] "=&r" (q3), [q2] "=&r" (q2), [q1] "=&r" (q1), [q0] "=&r" (q0), : [q3] "=&r"(q3), [q2] "=&r"(q2), [q1] "=&r"(q1), [q0] "=&r"(q0),
[q7] "=&r" (q7), [q6] "=&r" (q6), [q5] "=&r" (q5), [q4] "=&r" (q4) [q7] "=&r"(q7), [q6] "=&r"(q6), [q5] "=&r"(q5), [q4] "=&r"(q4)
: [sq3] "r" (sq3), [sq2] "r" (sq2), [sq1] "r" (sq1), [sq0] "r" (sq0), : [sq3] "r"(sq3), [sq2] "r"(sq2), [sq1] "r"(sq1), [sq0] "r"(sq0),
[sq4] "r" (sq4), [sq5] "r" (sq5), [sq6] "r" (sq6), [sq7] "r" (sq7) [sq4] "r"(sq4), [sq5] "r"(sq5), [sq6] "r"(sq6), [sq7] "r"(sq7));
);
filter_hev_mask_flatmask4_dspr2(limit_vec, flimit_vec, thresh_vec, filter_hev_mask_flatmask4_dspr2(limit_vec, flimit_vec, thresh_vec, p1, p0,
p1, p0, p3, p2, q0, q1, q2, q3, p3, p2, q0, q1, q2, q3, &hev, &mask, &flat);
&hev, &mask, &flat);
flatmask5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, &flat2); flatmask5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, &flat2);
/* f0 */ /* f0 */
if (((flat2 == 0) && (flat == 0) && (mask != 0)) || if (((flat2 == 0) && (flat == 0) && (mask != 0)) ||
((flat2 != 0) && (flat == 0) && (mask != 0))) { ((flat2 != 0) && (flat == 0) && (mask != 0))) {
filter1_dspr2(mask, hev, p1, p0, q0, q1, filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0);
&p1_f0, &p0_f0, &q0_f0, &q1_f0);
__asm__ __volatile__ ( __asm__ __volatile__(
"sw %[p1_f0], (%[sp1]) \n\t" "sw %[p1_f0], (%[sp1]) \n\t"
"sw %[p0_f0], (%[sp0]) \n\t" "sw %[p0_f0], (%[sp0]) \n\t"
"sw %[q0_f0], (%[sq0]) \n\t" "sw %[q0_f0], (%[sq0]) \n\t"
"sw %[q1_f0], (%[sq1]) \n\t" "sw %[q1_f0], (%[sq1]) \n\t"
: :
: [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0), : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
[q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0), [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0),
[sp1] "r" (sp1), [sp0] "r" (sp0), [sq1] "r"(sq1));
[sq0] "r" (sq0), [sq1] "r" (sq1)
);
} else if ((flat2 == 0XFFFFFFFF) && (flat == 0xFFFFFFFF) && } else if ((flat2 == 0XFFFFFFFF) && (flat == 0xFFFFFFFF) &&
(mask == 0xFFFFFFFF)) { (mask == 0xFFFFFFFF)) {
/* f2 */ /* f2 */
PACK_LEFT_0TO3() PACK_LEFT_0TO3()
PACK_LEFT_4TO7() PACK_LEFT_4TO7()
wide_mbfilter_dspr2(&p7_l, &p6_l, &p5_l, &p4_l, wide_mbfilter_dspr2(&p7_l, &p6_l, &p5_l, &p4_l, &p3_l, &p2_l, &p1_l,
&p3_l, &p2_l, &p1_l, &p0_l, &p0_l, &q0_l, &q1_l, &q2_l, &q3_l, &q4_l, &q5_l,
&q0_l, &q1_l, &q2_l, &q3_l, &q6_l, &q7_l);
&q4_l, &q5_l, &q6_l, &q7_l);
PACK_RIGHT_0TO3() PACK_RIGHT_0TO3()
PACK_RIGHT_4TO7() PACK_RIGHT_4TO7()
wide_mbfilter_dspr2(&p7_r, &p6_r, &p5_r, &p4_r, wide_mbfilter_dspr2(&p7_r, &p6_r, &p5_r, &p4_r, &p3_r, &p2_r, &p1_r,
&p3_r, &p2_r, &p1_r, &p0_r, &p0_r, &q0_r, &q1_r, &q2_r, &q3_r, &q4_r, &q5_r,
&q0_r, &q1_r, &q2_r, &q3_r, &q6_r, &q7_r);
&q4_r, &q5_r, &q6_r, &q7_r);
COMBINE_LEFT_RIGHT_0TO2() COMBINE_LEFT_RIGHT_0TO2()
COMBINE_LEFT_RIGHT_3TO6() COMBINE_LEFT_RIGHT_3TO6()
__asm__ __volatile__ ( __asm__ __volatile__(
"sw %[p6], (%[sp6]) \n\t" "sw %[p6], (%[sp6]) \n\t"
"sw %[p5], (%[sp5]) \n\t" "sw %[p5], (%[sp5]) \n\t"
"sw %[p4], (%[sp4]) \n\t" "sw %[p4], (%[sp4]) \n\t"
@ -163,13 +151,12 @@ static void mb_lpf_horizontal_edge(unsigned char *s,
"sw %[p0], (%[sp0]) \n\t" "sw %[p0], (%[sp0]) \n\t"
: :
: [p6] "r" (p6), [p5] "r" (p5), [p4] "r" (p4), [p3] "r" (p3), : [p6] "r"(p6), [p5] "r"(p5), [p4] "r"(p4), [p3] "r"(p3),
[p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0), [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0), [sp6] "r"(sp6),
[sp6] "r" (sp6), [sp5] "r" (sp5), [sp4] "r" (sp4), [sp3] "r" (sp3), [sp5] "r"(sp5), [sp4] "r"(sp4), [sp3] "r"(sp3), [sp2] "r"(sp2),
[sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0) [sp1] "r"(sp1), [sp0] "r"(sp0));
);
__asm__ __volatile__ ( __asm__ __volatile__(
"sw %[q6], (%[sq6]) \n\t" "sw %[q6], (%[sq6]) \n\t"
"sw %[q5], (%[sq5]) \n\t" "sw %[q5], (%[sq5]) \n\t"
"sw %[q4], (%[sq4]) \n\t" "sw %[q4], (%[sq4]) \n\t"
@ -179,26 +166,23 @@ static void mb_lpf_horizontal_edge(unsigned char *s,
"sw %[q0], (%[sq0]) \n\t" "sw %[q0], (%[sq0]) \n\t"
: :
: [q6] "r" (q6), [q5] "r" (q5), [q4] "r" (q4), [q3] "r" (q3), : [q6] "r"(q6), [q5] "r"(q5), [q4] "r"(q4), [q3] "r"(q3),
[q2] "r" (q2), [q1] "r" (q1), [q0] "r" (q0), [q2] "r"(q2), [q1] "r"(q1), [q0] "r"(q0), [sq6] "r"(sq6),
[sq6] "r" (sq6), [sq5] "r" (sq5), [sq4] "r" (sq4), [sq3] "r" (sq3), [sq5] "r"(sq5), [sq4] "r"(sq4), [sq3] "r"(sq3), [sq2] "r"(sq2),
[sq2] "r" (sq2), [sq1] "r" (sq1), [sq0] "r" (sq0) [sq1] "r"(sq1), [sq0] "r"(sq0));
);
} else if ((flat2 == 0) && (flat == 0xFFFFFFFF) && (mask == 0xFFFFFFFF)) { } else if ((flat2 == 0) && (flat == 0xFFFFFFFF) && (mask == 0xFFFFFFFF)) {
/* f1 */ /* f1 */
/* left 2 element operation */ /* left 2 element operation */
PACK_LEFT_0TO3() PACK_LEFT_0TO3()
mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, &q0_l, &q1_l, &q2_l, &q3_l);
&q0_l, &q1_l, &q2_l, &q3_l);
/* right 2 element operation */ /* right 2 element operation */
PACK_RIGHT_0TO3() PACK_RIGHT_0TO3()
mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, &q0_r, &q1_r, &q2_r, &q3_r);
&q0_r, &q1_r, &q2_r, &q3_r);
COMBINE_LEFT_RIGHT_0TO2() COMBINE_LEFT_RIGHT_0TO2()
__asm__ __volatile__ ( __asm__ __volatile__(
"sw %[p2], (%[sp2]) \n\t" "sw %[p2], (%[sp2]) \n\t"
"sw %[p1], (%[sp1]) \n\t" "sw %[p1], (%[sp1]) \n\t"
"sw %[p0], (%[sp0]) \n\t" "sw %[p0], (%[sp0]) \n\t"
@ -207,28 +191,23 @@ static void mb_lpf_horizontal_edge(unsigned char *s,
"sw %[q2], (%[sq2]) \n\t" "sw %[q2], (%[sq2]) \n\t"
: :
: [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0), : [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0), [q0] "r"(q0),
[q0] "r" (q0), [q1] "r" (q1), [q2] "r" (q2), [q1] "r"(q1), [q2] "r"(q2), [sp2] "r"(sp2), [sp1] "r"(sp1),
[sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0), [sp0] "r"(sp0), [sq0] "r"(sq0), [sq1] "r"(sq1), [sq2] "r"(sq2));
[sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2)
);
} else if ((flat2 == 0) && (flat != 0) && (mask != 0)) { } else if ((flat2 == 0) && (flat != 0) && (mask != 0)) {
/* f0+f1 */ /* f0+f1 */
filter1_dspr2(mask, hev, p1, p0, q0, q1, filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0);
&p1_f0, &p0_f0, &q0_f0, &q1_f0);
/* left 2 element operation */ /* left 2 element operation */
PACK_LEFT_0TO3() PACK_LEFT_0TO3()
mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, &q0_l, &q1_l, &q2_l, &q3_l);
&q0_l, &q1_l, &q2_l, &q3_l);
/* right 2 element operation */ /* right 2 element operation */
PACK_RIGHT_0TO3() PACK_RIGHT_0TO3()
mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, &q0_r, &q1_r, &q2_r, &q3_r);
&q0_r, &q1_r, &q2_r, &q3_r);
if (mask & flat & 0x000000FF) { if (mask & flat & 0x000000FF) {
__asm__ __volatile__ ( __asm__ __volatile__(
"sb %[p2_r], (%[sp2]) \n\t" "sb %[p2_r], (%[sp2]) \n\t"
"sb %[p1_r], (%[sp1]) \n\t" "sb %[p1_r], (%[sp1]) \n\t"
"sb %[p0_r], (%[sp0]) \n\t" "sb %[p0_r], (%[sp0]) \n\t"
@ -237,27 +216,24 @@ static void mb_lpf_horizontal_edge(unsigned char *s,
"sb %[q2_r], (%[sq2]) \n\t" "sb %[q2_r], (%[sq2]) \n\t"
: :
: [p2_r] "r" (p2_r), [p1_r] "r" (p1_r), [p0_r] "r" (p0_r), : [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), [p0_r] "r"(p0_r),
[q0_r] "r" (q0_r), [q1_r] "r" (q1_r), [q2_r] "r" (q2_r), [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r),
[sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0), [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0),
[sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2) [sq1] "r"(sq1), [sq2] "r"(sq2));
);
} else if (mask & 0x000000FF) { } else if (mask & 0x000000FF) {
__asm__ __volatile__ ( __asm__ __volatile__(
"sb %[p1_f0], (%[sp1]) \n\t" "sb %[p1_f0], (%[sp1]) \n\t"
"sb %[p0_f0], (%[sp0]) \n\t" "sb %[p0_f0], (%[sp0]) \n\t"
"sb %[q0_f0], (%[sq0]) \n\t" "sb %[q0_f0], (%[sq0]) \n\t"
"sb %[q1_f0], (%[sq1]) \n\t" "sb %[q1_f0], (%[sq1]) \n\t"
: :
: [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0), : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
[q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0), [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0),
[sp1] "r" (sp1), [sp0] "r" (sp0), [sq0] "r"(sq0), [sq1] "r"(sq1));
[sq0] "r" (sq0), [sq1] "r" (sq1)
);
} }
__asm__ __volatile__ ( __asm__ __volatile__(
"srl %[p2_r], %[p2_r], 16 \n\t" "srl %[p2_r], %[p2_r], 16 \n\t"
"srl %[p1_r], %[p1_r], 16 \n\t" "srl %[p1_r], %[p1_r], 16 \n\t"
"srl %[p0_r], %[p0_r], 16 \n\t" "srl %[p0_r], %[p0_r], 16 \n\t"
@ -269,15 +245,14 @@ static void mb_lpf_horizontal_edge(unsigned char *s,
"srl %[q0_f0], %[q0_f0], 8 \n\t" "srl %[q0_f0], %[q0_f0], 8 \n\t"
"srl %[q1_f0], %[q1_f0], 8 \n\t" "srl %[q1_f0], %[q1_f0], 8 \n\t"
: [p2_r] "+r" (p2_r), [p1_r] "+r" (p1_r), [p0_r] "+r" (p0_r), : [p2_r] "+r"(p2_r), [p1_r] "+r"(p1_r), [p0_r] "+r"(p0_r),
[q0_r] "+r" (q0_r), [q1_r] "+r" (q1_r), [q2_r] "+r" (q2_r), [q0_r] "+r"(q0_r), [q1_r] "+r"(q1_r), [q2_r] "+r"(q2_r),
[p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0), [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0),
[q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0) [q1_f0] "+r"(q1_f0)
: :);
);
if (mask & flat & 0x0000FF00) { if (mask & flat & 0x0000FF00) {
__asm__ __volatile__ ( __asm__ __volatile__(
"sb %[p2_r], +1(%[sp2]) \n\t" "sb %[p2_r], +1(%[sp2]) \n\t"
"sb %[p1_r], +1(%[sp1]) \n\t" "sb %[p1_r], +1(%[sp1]) \n\t"
"sb %[p0_r], +1(%[sp0]) \n\t" "sb %[p0_r], +1(%[sp0]) \n\t"
@ -286,39 +261,35 @@ static void mb_lpf_horizontal_edge(unsigned char *s,
"sb %[q2_r], +1(%[sq2]) \n\t" "sb %[q2_r], +1(%[sq2]) \n\t"
: :
: [p2_r] "r" (p2_r), [p1_r] "r" (p1_r), [p0_r] "r" (p0_r), : [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), [p0_r] "r"(p0_r),
[q0_r] "r" (q0_r), [q1_r] "r" (q1_r), [q2_r] "r" (q2_r), [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r),
[sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0), [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0),
[sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2) [sq1] "r"(sq1), [sq2] "r"(sq2));
);
} else if (mask & 0x0000FF00) { } else if (mask & 0x0000FF00) {
__asm__ __volatile__ ( __asm__ __volatile__(
"sb %[p1_f0], +1(%[sp1]) \n\t" "sb %[p1_f0], +1(%[sp1]) \n\t"
"sb %[p0_f0], +1(%[sp0]) \n\t" "sb %[p0_f0], +1(%[sp0]) \n\t"
"sb %[q0_f0], +1(%[sq0]) \n\t" "sb %[q0_f0], +1(%[sq0]) \n\t"
"sb %[q1_f0], +1(%[sq1]) \n\t" "sb %[q1_f0], +1(%[sq1]) \n\t"
: :
: [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0), : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
[q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0), [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0),
[sp1] "r" (sp1), [sp0] "r" (sp0), [sq0] "r"(sq0), [sq1] "r"(sq1));
[sq0] "r" (sq0), [sq1] "r" (sq1)
);
} }
__asm__ __volatile__ ( __asm__ __volatile__(
"srl %[p1_f0], %[p1_f0], 8 \n\t" "srl %[p1_f0], %[p1_f0], 8 \n\t"
"srl %[p0_f0], %[p0_f0], 8 \n\t" "srl %[p0_f0], %[p0_f0], 8 \n\t"
"srl %[q0_f0], %[q0_f0], 8 \n\t" "srl %[q0_f0], %[q0_f0], 8 \n\t"
"srl %[q1_f0], %[q1_f0], 8 \n\t" "srl %[q1_f0], %[q1_f0], 8 \n\t"
: [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0), : [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0),
[q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0) [q1_f0] "+r"(q1_f0)
: :);
);
if (mask & flat & 0x00FF0000) { if (mask & flat & 0x00FF0000) {
__asm__ __volatile__ ( __asm__ __volatile__(
"sb %[p2_l], +2(%[sp2]) \n\t" "sb %[p2_l], +2(%[sp2]) \n\t"
"sb %[p1_l], +2(%[sp1]) \n\t" "sb %[p1_l], +2(%[sp1]) \n\t"
"sb %[p0_l], +2(%[sp0]) \n\t" "sb %[p0_l], +2(%[sp0]) \n\t"
@ -327,27 +298,24 @@ static void mb_lpf_horizontal_edge(unsigned char *s,
"sb %[q2_l], +2(%[sq2]) \n\t" "sb %[q2_l], +2(%[sq2]) \n\t"
: :
: [p2_l] "r" (p2_l), [p1_l] "r" (p1_l), [p0_l] "r" (p0_l), : [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), [p0_l] "r"(p0_l),
[q0_l] "r" (q0_l), [q1_l] "r" (q1_l), [q2_l] "r" (q2_l), [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l),
[sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0), [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0),
[sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2) [sq1] "r"(sq1), [sq2] "r"(sq2));
);
} else if (mask & 0x00FF0000) { } else if (mask & 0x00FF0000) {
__asm__ __volatile__ ( __asm__ __volatile__(
"sb %[p1_f0], +2(%[sp1]) \n\t" "sb %[p1_f0], +2(%[sp1]) \n\t"
"sb %[p0_f0], +2(%[sp0]) \n\t" "sb %[p0_f0], +2(%[sp0]) \n\t"
"sb %[q0_f0], +2(%[sq0]) \n\t" "sb %[q0_f0], +2(%[sq0]) \n\t"
"sb %[q1_f0], +2(%[sq1]) \n\t" "sb %[q1_f0], +2(%[sq1]) \n\t"
: :
: [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0), : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
[q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0), [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0),
[sp1] "r" (sp1), [sp0] "r" (sp0), [sq0] "r"(sq0), [sq1] "r"(sq1));
[sq0] "r" (sq0), [sq1] "r" (sq1)
);
} }
__asm__ __volatile__ ( __asm__ __volatile__(
"srl %[p2_l], %[p2_l], 16 \n\t" "srl %[p2_l], %[p2_l], 16 \n\t"
"srl %[p1_l], %[p1_l], 16 \n\t" "srl %[p1_l], %[p1_l], 16 \n\t"
"srl %[p0_l], %[p0_l], 16 \n\t" "srl %[p0_l], %[p0_l], 16 \n\t"
@ -359,15 +327,14 @@ static void mb_lpf_horizontal_edge(unsigned char *s,
"srl %[q0_f0], %[q0_f0], 8 \n\t" "srl %[q0_f0], %[q0_f0], 8 \n\t"
"srl %[q1_f0], %[q1_f0], 8 \n\t" "srl %[q1_f0], %[q1_f0], 8 \n\t"
: [p2_l] "+r" (p2_l), [p1_l] "+r" (p1_l), [p0_l] "+r" (p0_l), : [p2_l] "+r"(p2_l), [p1_l] "+r"(p1_l), [p0_l] "+r"(p0_l),
[q0_l] "+r" (q0_l), [q1_l] "+r" (q1_l), [q2_l] "+r" (q2_l), [q0_l] "+r"(q0_l), [q1_l] "+r"(q1_l), [q2_l] "+r"(q2_l),
[p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0), [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0),
[q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0) [q1_f0] "+r"(q1_f0)
: :);
);
if (mask & flat & 0xFF000000) { if (mask & flat & 0xFF000000) {
__asm__ __volatile__ ( __asm__ __volatile__(
"sb %[p2_l], +3(%[sp2]) \n\t" "sb %[p2_l], +3(%[sp2]) \n\t"
"sb %[p1_l], +3(%[sp1]) \n\t" "sb %[p1_l], +3(%[sp1]) \n\t"
"sb %[p0_l], +3(%[sp0]) \n\t" "sb %[p0_l], +3(%[sp0]) \n\t"
@ -376,61 +343,51 @@ static void mb_lpf_horizontal_edge(unsigned char *s,
"sb %[q2_l], +3(%[sq2]) \n\t" "sb %[q2_l], +3(%[sq2]) \n\t"
: :
: [p2_l] "r" (p2_l), [p1_l] "r" (p1_l), [p0_l] "r" (p0_l), : [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), [p0_l] "r"(p0_l),
[q0_l] "r" (q0_l), [q1_l] "r" (q1_l), [q2_l] "r" (q2_l), [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l),
[sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0), [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0),
[sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2) [sq1] "r"(sq1), [sq2] "r"(sq2));
);
} else if (mask & 0xFF000000) { } else if (mask & 0xFF000000) {
__asm__ __volatile__ ( __asm__ __volatile__(
"sb %[p1_f0], +3(%[sp1]) \n\t" "sb %[p1_f0], +3(%[sp1]) \n\t"
"sb %[p0_f0], +3(%[sp0]) \n\t" "sb %[p0_f0], +3(%[sp0]) \n\t"
"sb %[q0_f0], +3(%[sq0]) \n\t" "sb %[q0_f0], +3(%[sq0]) \n\t"
"sb %[q1_f0], +3(%[sq1]) \n\t" "sb %[q1_f0], +3(%[sq1]) \n\t"
: :
: [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0), : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
[q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0), [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0),
[sp1] "r" (sp1), [sp0] "r" (sp0), [sq0] "r"(sq0), [sq1] "r"(sq1));
[sq0] "r" (sq0), [sq1] "r" (sq1)
);
} }
} else if ((flat2 != 0) && (flat != 0) && (mask != 0)) { } else if ((flat2 != 0) && (flat != 0) && (mask != 0)) {
/* f0 + f1 + f2 */ /* f0 + f1 + f2 */
/* f0 function */ /* f0 function */
filter1_dspr2(mask, hev, p1, p0, q0, q1, filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0);
&p1_f0, &p0_f0, &q0_f0, &q1_f0);
/* f1 function */ /* f1 function */
/* left 2 element operation */ /* left 2 element operation */
PACK_LEFT_0TO3() PACK_LEFT_0TO3()
mbfilter1_dspr2(p3_l, p2_l, p1_l, p0_l, mbfilter1_dspr2(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, &p2_l_f1,
q0_l, q1_l, q2_l, q3_l, &p1_l_f1, &p0_l_f1, &q0_l_f1, &q1_l_f1, &q2_l_f1);
&p2_l_f1, &p1_l_f1, &p0_l_f1,
&q0_l_f1, &q1_l_f1, &q2_l_f1);
/* right 2 element operation */ /* right 2 element operation */
PACK_RIGHT_0TO3() PACK_RIGHT_0TO3()
mbfilter1_dspr2(p3_r, p2_r, p1_r, p0_r, mbfilter1_dspr2(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, &p2_r_f1,
q0_r, q1_r, q2_r, q3_r, &p1_r_f1, &p0_r_f1, &q0_r_f1, &q1_r_f1, &q2_r_f1);
&p2_r_f1, &p1_r_f1, &p0_r_f1,
&q0_r_f1, &q1_r_f1, &q2_r_f1);
/* f2 function */ /* f2 function */
PACK_LEFT_4TO7() PACK_LEFT_4TO7()
wide_mbfilter_dspr2(&p7_l, &p6_l, &p5_l, &p4_l, wide_mbfilter_dspr2(&p7_l, &p6_l, &p5_l, &p4_l, &p3_l, &p2_l, &p1_l,
&p3_l, &p2_l, &p1_l, &p0_l, &p0_l, &q0_l, &q1_l, &q2_l, &q3_l, &q4_l, &q5_l,
&q0_l, &q1_l, &q2_l, &q3_l, &q6_l, &q7_l);
&q4_l, &q5_l, &q6_l, &q7_l);
PACK_RIGHT_4TO7() PACK_RIGHT_4TO7()
wide_mbfilter_dspr2(&p7_r, &p6_r, &p5_r, &p4_r, wide_mbfilter_dspr2(&p7_r, &p6_r, &p5_r, &p4_r, &p3_r, &p2_r, &p1_r,
&p3_r, &p2_r, &p1_r, &p0_r, &p0_r, &q0_r, &q1_r, &q2_r, &q3_r, &q4_r, &q5_r,
&q0_r, &q1_r, &q2_r, &q3_r, &q6_r, &q7_r);
&q4_r, &q5_r, &q6_r, &q7_r);
if (mask & flat & flat2 & 0x000000FF) { if (mask & flat & flat2 & 0x000000FF) {
__asm__ __volatile__ ( __asm__ __volatile__(
"sb %[p6_r], (%[sp6]) \n\t" "sb %[p6_r], (%[sp6]) \n\t"
"sb %[p5_r], (%[sp5]) \n\t" "sb %[p5_r], (%[sp5]) \n\t"
"sb %[p4_r], (%[sp4]) \n\t" "sb %[p4_r], (%[sp4]) \n\t"
@ -440,14 +397,12 @@ static void mb_lpf_horizontal_edge(unsigned char *s,
"sb %[p0_r], (%[sp0]) \n\t" "sb %[p0_r], (%[sp0]) \n\t"
: :
: [p6_r] "r" (p6_r), [p5_r] "r" (p5_r), [p4_r] "r" (p4_r), : [p6_r] "r"(p6_r), [p5_r] "r"(p5_r), [p4_r] "r"(p4_r),
[p3_r] "r" (p3_r), [p2_r] "r" (p2_r), [p1_r] "r" (p1_r), [p3_r] "r"(p3_r), [p2_r] "r"(p2_r), [p1_r] "r"(p1_r),
[sp6] "r" (sp6), [sp5] "r" (sp5), [sp4] "r" (sp4), [sp6] "r"(sp6), [sp5] "r"(sp5), [sp4] "r"(sp4), [sp3] "r"(sp3),
[sp3] "r" (sp3), [sp2] "r" (sp2), [sp1] "r" (sp1), [sp2] "r"(sp2), [sp1] "r"(sp1), [p0_r] "r"(p0_r), [sp0] "r"(sp0));
[p0_r] "r" (p0_r), [sp0] "r" (sp0)
);
__asm__ __volatile__ ( __asm__ __volatile__(
"sb %[q0_r], (%[sq0]) \n\t" "sb %[q0_r], (%[sq0]) \n\t"
"sb %[q1_r], (%[sq1]) \n\t" "sb %[q1_r], (%[sq1]) \n\t"
"sb %[q2_r], (%[sq2]) \n\t" "sb %[q2_r], (%[sq2]) \n\t"
@ -457,15 +412,12 @@ static void mb_lpf_horizontal_edge(unsigned char *s,
"sb %[q6_r], (%[sq6]) \n\t" "sb %[q6_r], (%[sq6]) \n\t"
: :
: [q0_r] "r" (q0_r), [q1_r] "r" (q1_r), [q2_r] "r" (q2_r), : [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r),
[q3_r] "r" (q3_r), [q4_r] "r" (q4_r), [q5_r] "r" (q5_r), [q3_r] "r"(q3_r), [q4_r] "r"(q4_r), [q5_r] "r"(q5_r),
[q6_r] "r" (q6_r), [q6_r] "r"(q6_r), [sq0] "r"(sq0), [sq1] "r"(sq1), [sq2] "r"(sq2),
[sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2), [sq3] "r"(sq3), [sq4] "r"(sq4), [sq5] "r"(sq5), [sq6] "r"(sq6));
[sq3] "r" (sq3), [sq4] "r" (sq4), [sq5] "r" (sq5),
[sq6] "r" (sq6)
);
} else if (mask & flat & 0x000000FF) { } else if (mask & flat & 0x000000FF) {
__asm__ __volatile__ ( __asm__ __volatile__(
"sb %[p2_r_f1], (%[sp2]) \n\t" "sb %[p2_r_f1], (%[sp2]) \n\t"
"sb %[p1_r_f1], (%[sp1]) \n\t" "sb %[p1_r_f1], (%[sp1]) \n\t"
"sb %[p0_r_f1], (%[sp0]) \n\t" "sb %[p0_r_f1], (%[sp0]) \n\t"
@ -474,27 +426,25 @@ static void mb_lpf_horizontal_edge(unsigned char *s,
"sb %[q2_r_f1], (%[sq2]) \n\t" "sb %[q2_r_f1], (%[sq2]) \n\t"
: :
: [p2_r_f1] "r" (p2_r_f1), [p1_r_f1] "r" (p1_r_f1), : [p2_r_f1] "r"(p2_r_f1), [p1_r_f1] "r"(p1_r_f1),
[p0_r_f1] "r" (p0_r_f1), [q0_r_f1] "r" (q0_r_f1), [p0_r_f1] "r"(p0_r_f1), [q0_r_f1] "r"(q0_r_f1),
[q1_r_f1] "r" (q1_r_f1), [q2_r_f1] "r" (q2_r_f1), [q1_r_f1] "r"(q1_r_f1), [q2_r_f1] "r"(q2_r_f1), [sp2] "r"(sp2),
[sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), [sq1] "r"(sq1),
[sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2) [sq2] "r"(sq2));
);
} else if (mask & 0x000000FF) { } else if (mask & 0x000000FF) {
__asm__ __volatile__ ( __asm__ __volatile__(
"sb %[p1_f0], (%[sp1]) \n\t" "sb %[p1_f0], (%[sp1]) \n\t"
"sb %[p0_f0], (%[sp0]) \n\t" "sb %[p0_f0], (%[sp0]) \n\t"
"sb %[q0_f0], (%[sq0]) \n\t" "sb %[q0_f0], (%[sq0]) \n\t"
"sb %[q1_f0], (%[sq1]) \n\t" "sb %[q1_f0], (%[sq1]) \n\t"
: :
: [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0), [q0_f0] "r" (q0_f0), : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
[q1_f0] "r" (q1_f0), [sp1] "r" (sp1), [sp0] "r" (sp0), [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0),
[sq0] "r" (sq0), [sq1] "r" (sq1) [sq0] "r"(sq0), [sq1] "r"(sq1));
);
} }
__asm__ __volatile__ ( __asm__ __volatile__(
"srl %[p6_r], %[p6_r], 16 \n\t" "srl %[p6_r], %[p6_r], 16 \n\t"
"srl %[p5_r], %[p5_r], 16 \n\t" "srl %[p5_r], %[p5_r], 16 \n\t"
"srl %[p4_r], %[p4_r], 16 \n\t" "srl %[p4_r], %[p4_r], 16 \n\t"
@ -510,15 +460,14 @@ static void mb_lpf_horizontal_edge(unsigned char *s,
"srl %[q5_r], %[q5_r], 16 \n\t" "srl %[q5_r], %[q5_r], 16 \n\t"
"srl %[q6_r], %[q6_r], 16 \n\t" "srl %[q6_r], %[q6_r], 16 \n\t"
: [q0_r] "+r" (q0_r), [q1_r] "+r" (q1_r), [q2_r] "+r" (q2_r), : [q0_r] "+r"(q0_r), [q1_r] "+r"(q1_r), [q2_r] "+r"(q2_r),
[q3_r] "+r" (q3_r), [q4_r] "+r" (q4_r), [q5_r] "+r" (q5_r), [q3_r] "+r"(q3_r), [q4_r] "+r"(q4_r), [q5_r] "+r"(q5_r),
[p6_r] "+r" (p6_r), [p5_r] "+r" (p5_r), [p4_r] "+r" (p4_r), [p6_r] "+r"(p6_r), [p5_r] "+r"(p5_r), [p4_r] "+r"(p4_r),
[p3_r] "+r" (p3_r), [p2_r] "+r" (p2_r), [p1_r] "+r" (p1_r), [p3_r] "+r"(p3_r), [p2_r] "+r"(p2_r), [p1_r] "+r"(p1_r),
[q6_r] "+r" (q6_r), [p0_r] "+r" (p0_r) [q6_r] "+r"(q6_r), [p0_r] "+r"(p0_r)
: :);
);
__asm__ __volatile__ ( __asm__ __volatile__(
"srl %[p2_r_f1], %[p2_r_f1], 16 \n\t" "srl %[p2_r_f1], %[p2_r_f1], 16 \n\t"
"srl %[p1_r_f1], %[p1_r_f1], 16 \n\t" "srl %[p1_r_f1], %[p1_r_f1], 16 \n\t"
"srl %[p0_r_f1], %[p0_r_f1], 16 \n\t" "srl %[p0_r_f1], %[p0_r_f1], 16 \n\t"
@ -530,16 +479,15 @@ static void mb_lpf_horizontal_edge(unsigned char *s,
"srl %[q0_f0], %[q0_f0], 8 \n\t" "srl %[q0_f0], %[q0_f0], 8 \n\t"
"srl %[q1_f0], %[q1_f0], 8 \n\t" "srl %[q1_f0], %[q1_f0], 8 \n\t"
: [p2_r_f1] "+r" (p2_r_f1), [p1_r_f1] "+r" (p1_r_f1), : [p2_r_f1] "+r"(p2_r_f1), [p1_r_f1] "+r"(p1_r_f1),
[p0_r_f1] "+r" (p0_r_f1), [q0_r_f1] "+r" (q0_r_f1), [p0_r_f1] "+r"(p0_r_f1), [q0_r_f1] "+r"(q0_r_f1),
[q1_r_f1] "+r" (q1_r_f1), [q2_r_f1] "+r" (q2_r_f1), [q1_r_f1] "+r"(q1_r_f1), [q2_r_f1] "+r"(q2_r_f1),
[p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0), [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0),
[q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0) [q1_f0] "+r"(q1_f0)
: :);
);
if (mask & flat & flat2 & 0x0000FF00) { if (mask & flat & flat2 & 0x0000FF00) {
__asm__ __volatile__ ( __asm__ __volatile__(
"sb %[p6_r], +1(%[sp6]) \n\t" "sb %[p6_r], +1(%[sp6]) \n\t"
"sb %[p5_r], +1(%[sp5]) \n\t" "sb %[p5_r], +1(%[sp5]) \n\t"
"sb %[p4_r], +1(%[sp4]) \n\t" "sb %[p4_r], +1(%[sp4]) \n\t"
@ -549,14 +497,12 @@ static void mb_lpf_horizontal_edge(unsigned char *s,
"sb %[p0_r], +1(%[sp0]) \n\t" "sb %[p0_r], +1(%[sp0]) \n\t"
: :
: [p6_r] "r" (p6_r), [p5_r] "r" (p5_r), [p4_r] "r" (p4_r), : [p6_r] "r"(p6_r), [p5_r] "r"(p5_r), [p4_r] "r"(p4_r),
[p3_r] "r" (p3_r), [p2_r] "r" (p2_r), [p1_r] "r" (p1_r), [p3_r] "r"(p3_r), [p2_r] "r"(p2_r), [p1_r] "r"(p1_r),
[p0_r] "r" (p0_r), [sp6] "r" (sp6), [sp5] "r" (sp5), [p0_r] "r"(p0_r), [sp6] "r"(sp6), [sp5] "r"(sp5), [sp4] "r"(sp4),
[sp4] "r" (sp4), [sp3] "r" (sp3), [sp3] "r"(sp3), [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0));
[sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0)
);
__asm__ __volatile__ ( __asm__ __volatile__(
"sb %[q0_r], +1(%[sq0]) \n\t" "sb %[q0_r], +1(%[sq0]) \n\t"
"sb %[q1_r], +1(%[sq1]) \n\t" "sb %[q1_r], +1(%[sq1]) \n\t"
"sb %[q2_r], +1(%[sq2]) \n\t" "sb %[q2_r], +1(%[sq2]) \n\t"
@ -566,14 +512,12 @@ static void mb_lpf_horizontal_edge(unsigned char *s,
"sb %[q6_r], +1(%[sq6]) \n\t" "sb %[q6_r], +1(%[sq6]) \n\t"
: :
: [q0_r] "r" (q0_r), [q1_r] "r" (q1_r), [q2_r] "r" (q2_r), : [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r),
[q3_r] "r" (q3_r), [q4_r] "r" (q4_r), [q5_r] "r" (q5_r), [q3_r] "r"(q3_r), [q4_r] "r"(q4_r), [q5_r] "r"(q5_r),
[q6_r] "r" (q6_r), [sq0] "r" (sq0), [sq1] "r" (sq1), [q6_r] "r"(q6_r), [sq0] "r"(sq0), [sq1] "r"(sq1), [sq2] "r"(sq2),
[sq2] "r" (sq2), [sq3] "r" (sq3), [sq3] "r"(sq3), [sq4] "r"(sq4), [sq5] "r"(sq5), [sq6] "r"(sq6));
[sq4] "r" (sq4), [sq5] "r" (sq5), [sq6] "r" (sq6)
);
} else if (mask & flat & 0x0000FF00) { } else if (mask & flat & 0x0000FF00) {
__asm__ __volatile__ ( __asm__ __volatile__(
"sb %[p2_r_f1], +1(%[sp2]) \n\t" "sb %[p2_r_f1], +1(%[sp2]) \n\t"
"sb %[p1_r_f1], +1(%[sp1]) \n\t" "sb %[p1_r_f1], +1(%[sp1]) \n\t"
"sb %[p0_r_f1], +1(%[sp0]) \n\t" "sb %[p0_r_f1], +1(%[sp0]) \n\t"
@ -582,39 +526,36 @@ static void mb_lpf_horizontal_edge(unsigned char *s,
"sb %[q2_r_f1], +1(%[sq2]) \n\t" "sb %[q2_r_f1], +1(%[sq2]) \n\t"
: :
: [p2_r_f1] "r" (p2_r_f1), [p1_r_f1] "r" (p1_r_f1), : [p2_r_f1] "r"(p2_r_f1), [p1_r_f1] "r"(p1_r_f1),
[p0_r_f1] "r" (p0_r_f1), [q0_r_f1] "r" (q0_r_f1), [p0_r_f1] "r"(p0_r_f1), [q0_r_f1] "r"(q0_r_f1),
[q1_r_f1] "r" (q1_r_f1), [q2_r_f1] "r" (q2_r_f1), [q1_r_f1] "r"(q1_r_f1), [q2_r_f1] "r"(q2_r_f1), [sp2] "r"(sp2),
[sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), [sq1] "r"(sq1),
[sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2) [sq2] "r"(sq2));
);
} else if (mask & 0x0000FF00) { } else if (mask & 0x0000FF00) {
__asm__ __volatile__ ( __asm__ __volatile__(
"sb %[p1_f0], +1(%[sp1]) \n\t" "sb %[p1_f0], +1(%[sp1]) \n\t"
"sb %[p0_f0], +1(%[sp0]) \n\t" "sb %[p0_f0], +1(%[sp0]) \n\t"
"sb %[q0_f0], +1(%[sq0]) \n\t" "sb %[q0_f0], +1(%[sq0]) \n\t"
"sb %[q1_f0], +1(%[sq1]) \n\t" "sb %[q1_f0], +1(%[sq1]) \n\t"
: :
: [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0), [q0_f0] "r" (q0_f0), : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
[q1_f0] "r" (q1_f0), [sp1] "r" (sp1), [sp0] "r" (sp0), [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0),
[sq0] "r" (sq0), [sq1] "r" (sq1) [sq0] "r"(sq0), [sq1] "r"(sq1));
);
} }
__asm__ __volatile__ ( __asm__ __volatile__(
"srl %[p1_f0], %[p1_f0], 8 \n\t" "srl %[p1_f0], %[p1_f0], 8 \n\t"
"srl %[p0_f0], %[p0_f0], 8 \n\t" "srl %[p0_f0], %[p0_f0], 8 \n\t"
"srl %[q0_f0], %[q0_f0], 8 \n\t" "srl %[q0_f0], %[q0_f0], 8 \n\t"
"srl %[q1_f0], %[q1_f0], 8 \n\t" "srl %[q1_f0], %[q1_f0], 8 \n\t"
: [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0), : [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0),
[q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0) [q1_f0] "+r"(q1_f0)
: :);
);
if (mask & flat & flat2 & 0x00FF0000) { if (mask & flat & flat2 & 0x00FF0000) {
__asm__ __volatile__ ( __asm__ __volatile__(
"sb %[p6_l], +2(%[sp6]) \n\t" "sb %[p6_l], +2(%[sp6]) \n\t"
"sb %[p5_l], +2(%[sp5]) \n\t" "sb %[p5_l], +2(%[sp5]) \n\t"
"sb %[p4_l], +2(%[sp4]) \n\t" "sb %[p4_l], +2(%[sp4]) \n\t"
@ -624,14 +565,12 @@ static void mb_lpf_horizontal_edge(unsigned char *s,
"sb %[p0_l], +2(%[sp0]) \n\t" "sb %[p0_l], +2(%[sp0]) \n\t"
: :
: [p6_l] "r" (p6_l), [p5_l] "r" (p5_l), [p4_l] "r" (p4_l), : [p6_l] "r"(p6_l), [p5_l] "r"(p5_l), [p4_l] "r"(p4_l),
[p3_l] "r" (p3_l), [p2_l] "r" (p2_l), [p1_l] "r" (p1_l), [p3_l] "r"(p3_l), [p2_l] "r"(p2_l), [p1_l] "r"(p1_l),
[p0_l] "r" (p0_l), [sp6] "r" (sp6), [sp5] "r" (sp5), [p0_l] "r"(p0_l), [sp6] "r"(sp6), [sp5] "r"(sp5), [sp4] "r"(sp4),
[sp4] "r" (sp4), [sp3] "r" (sp3), [sp3] "r"(sp3), [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0));
[sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0)
);
__asm__ __volatile__ ( __asm__ __volatile__(
"sb %[q0_l], +2(%[sq0]) \n\t" "sb %[q0_l], +2(%[sq0]) \n\t"
"sb %[q1_l], +2(%[sq1]) \n\t" "sb %[q1_l], +2(%[sq1]) \n\t"
"sb %[q2_l], +2(%[sq2]) \n\t" "sb %[q2_l], +2(%[sq2]) \n\t"
@ -641,14 +580,12 @@ static void mb_lpf_horizontal_edge(unsigned char *s,
"sb %[q6_l], +2(%[sq6]) \n\t" "sb %[q6_l], +2(%[sq6]) \n\t"
: :
: [q0_l] "r" (q0_l), [q1_l] "r" (q1_l), [q2_l] "r" (q2_l), : [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l),
[q3_l] "r" (q3_l), [q4_l] "r" (q4_l), [q5_l] "r" (q5_l), [q3_l] "r"(q3_l), [q4_l] "r"(q4_l), [q5_l] "r"(q5_l),
[q6_l] "r" (q6_l), [sq0] "r" (sq0), [sq1] "r" (sq1), [q6_l] "r"(q6_l), [sq0] "r"(sq0), [sq1] "r"(sq1), [sq2] "r"(sq2),
[sq2] "r" (sq2), [sq3] "r" (sq3), [sq3] "r"(sq3), [sq4] "r"(sq4), [sq5] "r"(sq5), [sq6] "r"(sq6));
[sq4] "r" (sq4), [sq5] "r" (sq5), [sq6] "r" (sq6)
);
} else if (mask & flat & 0x00FF0000) { } else if (mask & flat & 0x00FF0000) {
__asm__ __volatile__ ( __asm__ __volatile__(
"sb %[p2_l_f1], +2(%[sp2]) \n\t" "sb %[p2_l_f1], +2(%[sp2]) \n\t"
"sb %[p1_l_f1], +2(%[sp1]) \n\t" "sb %[p1_l_f1], +2(%[sp1]) \n\t"
"sb %[p0_l_f1], +2(%[sp0]) \n\t" "sb %[p0_l_f1], +2(%[sp0]) \n\t"
@ -657,27 +594,25 @@ static void mb_lpf_horizontal_edge(unsigned char *s,
"sb %[q2_l_f1], +2(%[sq2]) \n\t" "sb %[q2_l_f1], +2(%[sq2]) \n\t"
: :
: [p2_l_f1] "r" (p2_l_f1), [p1_l_f1] "r" (p1_l_f1), : [p2_l_f1] "r"(p2_l_f1), [p1_l_f1] "r"(p1_l_f1),
[p0_l_f1] "r" (p0_l_f1), [q0_l_f1] "r" (q0_l_f1), [p0_l_f1] "r"(p0_l_f1), [q0_l_f1] "r"(q0_l_f1),
[q1_l_f1] "r" (q1_l_f1), [q2_l_f1] "r" (q2_l_f1), [q1_l_f1] "r"(q1_l_f1), [q2_l_f1] "r"(q2_l_f1), [sp2] "r"(sp2),
[sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), [sq1] "r"(sq1),
[sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2) [sq2] "r"(sq2));
);
} else if (mask & 0x00FF0000) { } else if (mask & 0x00FF0000) {
__asm__ __volatile__ ( __asm__ __volatile__(
"sb %[p1_f0], +2(%[sp1]) \n\t" "sb %[p1_f0], +2(%[sp1]) \n\t"
"sb %[p0_f0], +2(%[sp0]) \n\t" "sb %[p0_f0], +2(%[sp0]) \n\t"
"sb %[q0_f0], +2(%[sq0]) \n\t" "sb %[q0_f0], +2(%[sq0]) \n\t"
"sb %[q1_f0], +2(%[sq1]) \n\t" "sb %[q1_f0], +2(%[sq1]) \n\t"
: :
: [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0), [q0_f0] "r" (q0_f0), : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
[q1_f0] "r" (q1_f0), [sp1] "r" (sp1), [sp0] "r" (sp0), [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0),
[sq0] "r" (sq0), [sq1] "r" (sq1) [sq0] "r"(sq0), [sq1] "r"(sq1));
);
} }
__asm__ __volatile__ ( __asm__ __volatile__(
"srl %[p6_l], %[p6_l], 16 \n\t" "srl %[p6_l], %[p6_l], 16 \n\t"
"srl %[p5_l], %[p5_l], 16 \n\t" "srl %[p5_l], %[p5_l], 16 \n\t"
"srl %[p4_l], %[p4_l], 16 \n\t" "srl %[p4_l], %[p4_l], 16 \n\t"
@ -693,15 +628,14 @@ static void mb_lpf_horizontal_edge(unsigned char *s,
"srl %[q5_l], %[q5_l], 16 \n\t" "srl %[q5_l], %[q5_l], 16 \n\t"
"srl %[q6_l], %[q6_l], 16 \n\t" "srl %[q6_l], %[q6_l], 16 \n\t"
: [q0_l] "+r" (q0_l), [q1_l] "+r" (q1_l), [q2_l] "+r" (q2_l), : [q0_l] "+r"(q0_l), [q1_l] "+r"(q1_l), [q2_l] "+r"(q2_l),
[q3_l] "+r" (q3_l), [q4_l] "+r" (q4_l), [q5_l] "+r" (q5_l), [q3_l] "+r"(q3_l), [q4_l] "+r"(q4_l), [q5_l] "+r"(q5_l),
[q6_l] "+r" (q6_l), [p6_l] "+r" (p6_l), [p5_l] "+r" (p5_l), [q6_l] "+r"(q6_l), [p6_l] "+r"(p6_l), [p5_l] "+r"(p5_l),
[p4_l] "+r" (p4_l), [p3_l] "+r" (p3_l), [p2_l] "+r" (p2_l), [p4_l] "+r"(p4_l), [p3_l] "+r"(p3_l), [p2_l] "+r"(p2_l),
[p1_l] "+r" (p1_l), [p0_l] "+r" (p0_l) [p1_l] "+r"(p1_l), [p0_l] "+r"(p0_l)
: :);
);
__asm__ __volatile__ ( __asm__ __volatile__(
"srl %[p2_l_f1], %[p2_l_f1], 16 \n\t" "srl %[p2_l_f1], %[p2_l_f1], 16 \n\t"
"srl %[p1_l_f1], %[p1_l_f1], 16 \n\t" "srl %[p1_l_f1], %[p1_l_f1], 16 \n\t"
"srl %[p0_l_f1], %[p0_l_f1], 16 \n\t" "srl %[p0_l_f1], %[p0_l_f1], 16 \n\t"
@ -713,16 +647,15 @@ static void mb_lpf_horizontal_edge(unsigned char *s,
"srl %[q0_f0], %[q0_f0], 8 \n\t" "srl %[q0_f0], %[q0_f0], 8 \n\t"
"srl %[q1_f0], %[q1_f0], 8 \n\t" "srl %[q1_f0], %[q1_f0], 8 \n\t"
: [p2_l_f1] "+r" (p2_l_f1), [p1_l_f1] "+r" (p1_l_f1), : [p2_l_f1] "+r"(p2_l_f1), [p1_l_f1] "+r"(p1_l_f1),
[p0_l_f1] "+r" (p0_l_f1), [q0_l_f1] "+r" (q0_l_f1), [p0_l_f1] "+r"(p0_l_f1), [q0_l_f1] "+r"(q0_l_f1),
[q1_l_f1] "+r" (q1_l_f1), [q2_l_f1] "+r" (q2_l_f1), [q1_l_f1] "+r"(q1_l_f1), [q2_l_f1] "+r"(q2_l_f1),
[p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0), [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0),
[q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0) [q1_f0] "+r"(q1_f0)
: :);
);
if (mask & flat & flat2 & 0xFF000000) { if (mask & flat & flat2 & 0xFF000000) {
__asm__ __volatile__ ( __asm__ __volatile__(
"sb %[p6_l], +3(%[sp6]) \n\t" "sb %[p6_l], +3(%[sp6]) \n\t"
"sb %[p5_l], +3(%[sp5]) \n\t" "sb %[p5_l], +3(%[sp5]) \n\t"
"sb %[p4_l], +3(%[sp4]) \n\t" "sb %[p4_l], +3(%[sp4]) \n\t"
@ -732,14 +665,12 @@ static void mb_lpf_horizontal_edge(unsigned char *s,
"sb %[p0_l], +3(%[sp0]) \n\t" "sb %[p0_l], +3(%[sp0]) \n\t"
: :
: [p6_l] "r" (p6_l), [p5_l] "r" (p5_l), [p4_l] "r" (p4_l), : [p6_l] "r"(p6_l), [p5_l] "r"(p5_l), [p4_l] "r"(p4_l),
[p3_l] "r" (p3_l), [p2_l] "r" (p2_l), [p1_l] "r" (p1_l), [p3_l] "r"(p3_l), [p2_l] "r"(p2_l), [p1_l] "r"(p1_l),
[p0_l] "r" (p0_l), [sp6] "r" (sp6), [sp5] "r" (sp5), [p0_l] "r"(p0_l), [sp6] "r"(sp6), [sp5] "r"(sp5), [sp4] "r"(sp4),
[sp4] "r" (sp4), [sp3] "r" (sp3), [sp2] "r" (sp2), [sp3] "r"(sp3), [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0));
[sp1] "r" (sp1), [sp0] "r" (sp0)
);
__asm__ __volatile__ ( __asm__ __volatile__(
"sb %[q0_l], +3(%[sq0]) \n\t" "sb %[q0_l], +3(%[sq0]) \n\t"
"sb %[q1_l], +3(%[sq1]) \n\t" "sb %[q1_l], +3(%[sq1]) \n\t"
"sb %[q2_l], +3(%[sq2]) \n\t" "sb %[q2_l], +3(%[sq2]) \n\t"
@ -749,15 +680,12 @@ static void mb_lpf_horizontal_edge(unsigned char *s,
"sb %[q6_l], +3(%[sq6]) \n\t" "sb %[q6_l], +3(%[sq6]) \n\t"
: :
: [q0_l] "r" (q0_l), [q1_l] "r" (q1_l), : [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l),
[q2_l] "r" (q2_l), [q3_l] "r" (q3_l), [q3_l] "r"(q3_l), [q4_l] "r"(q4_l), [q5_l] "r"(q5_l),
[q4_l] "r" (q4_l), [q5_l] "r" (q5_l), [sq0] "r"(sq0), [sq1] "r"(sq1), [sq2] "r"(sq2), [sq3] "r"(sq3),
[sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2), [sq4] "r"(sq4), [sq5] "r"(sq5), [q6_l] "r"(q6_l), [sq6] "r"(sq6));
[sq3] "r" (sq3), [sq4] "r" (sq4), [sq5] "r" (sq5),
[q6_l] "r" (q6_l), [sq6] "r" (sq6)
);
} else if (mask & flat & 0xFF000000) { } else if (mask & flat & 0xFF000000) {
__asm__ __volatile__ ( __asm__ __volatile__(
"sb %[p2_l_f1], +3(%[sp2]) \n\t" "sb %[p2_l_f1], +3(%[sp2]) \n\t"
"sb %[p1_l_f1], +3(%[sp1]) \n\t" "sb %[p1_l_f1], +3(%[sp1]) \n\t"
"sb %[p0_l_f1], +3(%[sp0]) \n\t" "sb %[p0_l_f1], +3(%[sp0]) \n\t"
@ -766,25 +694,22 @@ static void mb_lpf_horizontal_edge(unsigned char *s,
"sb %[q2_l_f1], +3(%[sq2]) \n\t" "sb %[q2_l_f1], +3(%[sq2]) \n\t"
: :
: [p2_l_f1] "r" (p2_l_f1), [p1_l_f1] "r" (p1_l_f1), : [p2_l_f1] "r"(p2_l_f1), [p1_l_f1] "r"(p1_l_f1),
[p0_l_f1] "r" (p0_l_f1), [q0_l_f1] "r" (q0_l_f1), [p0_l_f1] "r"(p0_l_f1), [q0_l_f1] "r"(q0_l_f1),
[q1_l_f1] "r" (q1_l_f1), [q2_l_f1] "r" (q2_l_f1), [q1_l_f1] "r"(q1_l_f1), [q2_l_f1] "r"(q2_l_f1), [sp2] "r"(sp2),
[sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), [sq1] "r"(sq1),
[sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2) [sq2] "r"(sq2));
);
} else if (mask & 0xFF000000) { } else if (mask & 0xFF000000) {
__asm__ __volatile__ ( __asm__ __volatile__(
"sb %[p1_f0], +3(%[sp1]) \n\t" "sb %[p1_f0], +3(%[sp1]) \n\t"
"sb %[p0_f0], +3(%[sp0]) \n\t" "sb %[p0_f0], +3(%[sp0]) \n\t"
"sb %[q0_f0], +3(%[sq0]) \n\t" "sb %[q0_f0], +3(%[sq0]) \n\t"
"sb %[q1_f0], +3(%[sq1]) \n\t" "sb %[q1_f0], +3(%[sq1]) \n\t"
: :
: [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0), : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
[q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0), [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0),
[sp1] "r" (sp1), [sp0] "r" (sp0), [sq0] "r"(sq0), [sq1] "r"(sq1));
[sq0] "r" (sq0), [sq1] "r" (sq1)
);
} }
} }

View File

@ -19,11 +19,8 @@
#include "vpx_mem/vpx_mem.h" #include "vpx_mem/vpx_mem.h"
#if HAVE_DSPR2 #if HAVE_DSPR2
void vpx_lpf_vertical_16_dspr2(uint8_t *s, void vpx_lpf_vertical_16_dspr2(uint8_t *s, int pitch, const uint8_t *blimit,
int pitch, const uint8_t *limit, const uint8_t *thresh) {
const uint8_t *blimit,
const uint8_t *limit,
const uint8_t *thresh) {
uint8_t i; uint8_t i;
uint32_t mask, hev, flat, flat2; uint32_t mask, hev, flat, flat2;
uint8_t *s1, *s2, *s3, *s4; uint8_t *s1, *s2, *s3, *s4;
@ -44,15 +41,14 @@ void vpx_lpf_vertical_16_dspr2(uint8_t *s,
uthresh = *thresh; uthresh = *thresh;
/* create quad-byte */ /* create quad-byte */
__asm__ __volatile__ ( __asm__ __volatile__(
"replv.qb %[thresh_vec], %[uthresh] \n\t" "replv.qb %[thresh_vec], %[uthresh] \n\t"
"replv.qb %[flimit_vec], %[uflimit] \n\t" "replv.qb %[flimit_vec], %[uflimit] \n\t"
"replv.qb %[limit_vec], %[ulimit] \n\t" "replv.qb %[limit_vec], %[ulimit] \n\t"
: [thresh_vec] "=&r" (thresh_vec), [flimit_vec] "=&r" (flimit_vec), : [thresh_vec] "=&r"(thresh_vec), [flimit_vec] "=&r"(flimit_vec),
[limit_vec] "=r" (limit_vec) [limit_vec] "=r"(limit_vec)
: [uthresh] "r" (uthresh), [uflimit] "r" (uflimit), [ulimit] "r" (ulimit) : [uthresh] "r"(uthresh), [uflimit] "r"(uflimit), [ulimit] "r"(ulimit));
);
prefetch_store(s + pitch); prefetch_store(s + pitch);
@ -63,7 +59,7 @@ void vpx_lpf_vertical_16_dspr2(uint8_t *s,
s4 = s3 + pitch; s4 = s3 + pitch;
s = s4 + pitch; s = s4 + pitch;
__asm__ __volatile__ ( __asm__ __volatile__(
"lw %[p0], -4(%[s1]) \n\t" "lw %[p0], -4(%[s1]) \n\t"
"lw %[p1], -4(%[s2]) \n\t" "lw %[p1], -4(%[s2]) \n\t"
"lw %[p2], -4(%[s3]) \n\t" "lw %[p2], -4(%[s3]) \n\t"
@ -73,13 +69,11 @@ void vpx_lpf_vertical_16_dspr2(uint8_t *s,
"lw %[p6], -8(%[s3]) \n\t" "lw %[p6], -8(%[s3]) \n\t"
"lw %[p7], -8(%[s4]) \n\t" "lw %[p7], -8(%[s4]) \n\t"
: [p3] "=&r" (p3), [p2] "=&r" (p2), [p1] "=&r" (p1), : [p3] "=&r"(p3), [p2] "=&r"(p2), [p1] "=&r"(p1), [p0] "=&r"(p0),
[p0] "=&r" (p0), [p7] "=&r" (p7), [p6] "=&r" (p6), [p7] "=&r"(p7), [p6] "=&r"(p6), [p5] "=&r"(p5), [p4] "=&r"(p4)
[p5] "=&r" (p5), [p4] "=&r" (p4) : [s1] "r"(s1), [s2] "r"(s2), [s3] "r"(s3), [s4] "r"(s4));
: [s1] "r" (s1), [s2] "r" (s2), [s3] "r" (s3), [s4] "r" (s4)
);
__asm__ __volatile__ ( __asm__ __volatile__(
"lw %[q3], (%[s1]) \n\t" "lw %[q3], (%[s1]) \n\t"
"lw %[q2], (%[s2]) \n\t" "lw %[q2], (%[s2]) \n\t"
"lw %[q1], (%[s3]) \n\t" "lw %[q1], (%[s3]) \n\t"
@ -89,11 +83,9 @@ void vpx_lpf_vertical_16_dspr2(uint8_t *s,
"lw %[q5], +4(%[s3]) \n\t" "lw %[q5], +4(%[s3]) \n\t"
"lw %[q4], +4(%[s4]) \n\t" "lw %[q4], +4(%[s4]) \n\t"
: [q3] "=&r" (q3), [q2] "=&r" (q2), [q1] "=&r" (q1), : [q3] "=&r"(q3), [q2] "=&r"(q2), [q1] "=&r"(q1), [q0] "=&r"(q0),
[q0] "=&r" (q0), [q7] "=&r" (q7), [q6] "=&r" (q6), [q7] "=&r"(q7), [q6] "=&r"(q6), [q5] "=&r"(q5), [q4] "=&r"(q4)
[q5] "=&r" (q5), [q4] "=&r" (q4) : [s1] "r"(s1), [s2] "r"(s2), [s3] "r"(s3), [s4] "r"(s4));
: [s1] "r" (s1), [s2] "r" (s2), [s3] "r" (s3), [s4] "r" (s4)
);
/* transpose p3, p2, p1, p0 /* transpose p3, p2, p1, p0
original (when loaded from memory) original (when loaded from memory)
@ -110,7 +102,7 @@ void vpx_lpf_vertical_16_dspr2(uint8_t *s,
p2 p3_1 p2_1 p1_1 p0_1 p2 p3_1 p2_1 p1_1 p0_1
p3 p3_0 p2_0 p1_0 p0_0 p3 p3_0 p2_0 p1_0 p0_0
*/ */
__asm__ __volatile__ ( __asm__ __volatile__(
"precrq.qb.ph %[prim1], %[p0], %[p1] \n\t" "precrq.qb.ph %[prim1], %[p0], %[p1] \n\t"
"precr.qb.ph %[prim2], %[p0], %[p1] \n\t" "precr.qb.ph %[prim2], %[p0], %[p1] \n\t"
"precrq.qb.ph %[prim3], %[p2], %[p3] \n\t" "precrq.qb.ph %[prim3], %[p2], %[p3] \n\t"
@ -126,12 +118,10 @@ void vpx_lpf_vertical_16_dspr2(uint8_t *s,
"append %[p1], %[sec3], 16 \n\t" "append %[p1], %[sec3], 16 \n\t"
"append %[p3], %[sec4], 16 \n\t" "append %[p3], %[sec4], 16 \n\t"
: [prim1] "=&r" (prim1), [prim2] "=&r" (prim2), : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3),
[prim3] "=&r" (prim3), [prim4] "=&r" (prim4), [prim4] "=&r"(prim4), [p0] "+r"(p0), [p1] "+r"(p1), [p2] "+r"(p2),
[p0] "+r" (p0), [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3), [p3] "+r"(p3), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4)
[sec3] "=&r" (sec3), [sec4] "=&r" (sec4) :);
:
);
/* transpose q0, q1, q2, q3 /* transpose q0, q1, q2, q3
original (when loaded from memory) original (when loaded from memory)
@ -148,7 +138,7 @@ void vpx_lpf_vertical_16_dspr2(uint8_t *s,
q1 q0_1 q1_1 q2_1 q3_1 q1 q0_1 q1_1 q2_1 q3_1
q0 q0_0 q1_0 q2_0 q3_0 q0 q0_0 q1_0 q2_0 q3_0
*/ */
__asm__ __volatile__ ( __asm__ __volatile__(
"precrq.qb.ph %[prim1], %[q3], %[q2] \n\t" "precrq.qb.ph %[prim1], %[q3], %[q2] \n\t"
"precr.qb.ph %[prim2], %[q3], %[q2] \n\t" "precr.qb.ph %[prim2], %[q3], %[q2] \n\t"
"precrq.qb.ph %[prim3], %[q1], %[q0] \n\t" "precrq.qb.ph %[prim3], %[q1], %[q0] \n\t"
@ -164,12 +154,10 @@ void vpx_lpf_vertical_16_dspr2(uint8_t *s,
"append %[q2], %[sec3], 16 \n\t" "append %[q2], %[sec3], 16 \n\t"
"append %[q0], %[sec4], 16 \n\t" "append %[q0], %[sec4], 16 \n\t"
: [prim1] "=&r" (prim1), [prim2] "=&r" (prim2), : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3),
[prim3] "=&r" (prim3), [prim4] "=&r" (prim4), [prim4] "=&r"(prim4), [q3] "+r"(q3), [q2] "+r"(q2), [q1] "+r"(q1),
[q3] "+r" (q3), [q2] "+r" (q2), [q1] "+r" (q1), [q0] "+r" (q0), [q0] "+r"(q0), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4)
[sec3] "=&r" (sec3), [sec4] "=&r" (sec4) :);
:
);
/* transpose p7, p6, p5, p4 /* transpose p7, p6, p5, p4
original (when loaded from memory) original (when loaded from memory)
@ -186,7 +174,7 @@ void vpx_lpf_vertical_16_dspr2(uint8_t *s,
p6 p7_1 p6_1 p5_1 p4_1 p6 p7_1 p6_1 p5_1 p4_1
p7 p7_0 p6_0 p5_0 p4_0 p7 p7_0 p6_0 p5_0 p4_0
*/ */
__asm__ __volatile__ ( __asm__ __volatile__(
"precrq.qb.ph %[prim1], %[p4], %[p5] \n\t" "precrq.qb.ph %[prim1], %[p4], %[p5] \n\t"
"precr.qb.ph %[prim2], %[p4], %[p5] \n\t" "precr.qb.ph %[prim2], %[p4], %[p5] \n\t"
"precrq.qb.ph %[prim3], %[p6], %[p7] \n\t" "precrq.qb.ph %[prim3], %[p6], %[p7] \n\t"
@ -202,12 +190,10 @@ void vpx_lpf_vertical_16_dspr2(uint8_t *s,
"append %[p5], %[sec3], 16 \n\t" "append %[p5], %[sec3], 16 \n\t"
"append %[p7], %[sec4], 16 \n\t" "append %[p7], %[sec4], 16 \n\t"
: [prim1] "=&r" (prim1), [prim2] "=&r" (prim2), : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3),
[prim3] "=&r" (prim3), [prim4] "=&r" (prim4), [prim4] "=&r"(prim4), [p4] "+r"(p4), [p5] "+r"(p5), [p6] "+r"(p6),
[p4] "+r" (p4), [p5] "+r" (p5), [p6] "+r" (p6), [p7] "+r" (p7), [p7] "+r"(p7), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4)
[sec3] "=&r" (sec3), [sec4] "=&r" (sec4) :);
:
);
/* transpose q4, q5, q6, q7 /* transpose q4, q5, q6, q7
original (when loaded from memory) original (when loaded from memory)
@ -224,7 +210,7 @@ void vpx_lpf_vertical_16_dspr2(uint8_t *s,
q5 q4_1 q5_1 q26_1 q7_1 q5 q4_1 q5_1 q26_1 q7_1
q4 q4_0 q5_0 q26_0 q7_0 q4 q4_0 q5_0 q26_0 q7_0
*/ */
__asm__ __volatile__ ( __asm__ __volatile__(
"precrq.qb.ph %[prim1], %[q7], %[q6] \n\t" "precrq.qb.ph %[prim1], %[q7], %[q6] \n\t"
"precr.qb.ph %[prim2], %[q7], %[q6] \n\t" "precr.qb.ph %[prim2], %[q7], %[q6] \n\t"
"precrq.qb.ph %[prim3], %[q5], %[q4] \n\t" "precrq.qb.ph %[prim3], %[q5], %[q4] \n\t"
@ -240,71 +226,60 @@ void vpx_lpf_vertical_16_dspr2(uint8_t *s,
"append %[q6], %[sec3], 16 \n\t" "append %[q6], %[sec3], 16 \n\t"
"append %[q4], %[sec4], 16 \n\t" "append %[q4], %[sec4], 16 \n\t"
: [prim1] "=&r" (prim1), [prim2] "=&r" (prim2), : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3),
[prim3] "=&r" (prim3), [prim4] "=&r" (prim4), [prim4] "=&r"(prim4), [q7] "+r"(q7), [q6] "+r"(q6), [q5] "+r"(q5),
[q7] "+r" (q7), [q6] "+r" (q6), [q5] "+r" (q5), [q4] "+r" (q4), [q4] "+r"(q4), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4)
[sec3] "=&r" (sec3), [sec4] "=&r" (sec4) :);
:
);
filter_hev_mask_flatmask4_dspr2(limit_vec, flimit_vec, thresh_vec, filter_hev_mask_flatmask4_dspr2(limit_vec, flimit_vec, thresh_vec, p1, p0,
p1, p0, p3, p2, q0, q1, q2, q3, p3, p2, q0, q1, q2, q3, &hev, &mask, &flat);
&hev, &mask, &flat);
flatmask5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, &flat2); flatmask5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, &flat2);
/* f0 */ /* f0 */
if (((flat2 == 0) && (flat == 0) && (mask != 0)) || if (((flat2 == 0) && (flat == 0) && (mask != 0)) ||
((flat2 != 0) && (flat == 0) && (mask != 0))) { ((flat2 != 0) && (flat == 0) && (mask != 0))) {
filter1_dspr2(mask, hev, p1, p0, q0, q1, filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0);
&p1_f0, &p0_f0, &q0_f0, &q1_f0);
STORE_F0() STORE_F0()
} else if ((flat2 == 0XFFFFFFFF) && (flat == 0xFFFFFFFF) && } else if ((flat2 == 0XFFFFFFFF) && (flat == 0xFFFFFFFF) &&
(mask == 0xFFFFFFFF)) { (mask == 0xFFFFFFFF)) {
/* f2 */ /* f2 */
PACK_LEFT_0TO3() PACK_LEFT_0TO3()
PACK_LEFT_4TO7() PACK_LEFT_4TO7()
wide_mbfilter_dspr2(&p7_l, &p6_l, &p5_l, &p4_l, wide_mbfilter_dspr2(&p7_l, &p6_l, &p5_l, &p4_l, &p3_l, &p2_l, &p1_l,
&p3_l, &p2_l, &p1_l, &p0_l, &p0_l, &q0_l, &q1_l, &q2_l, &q3_l, &q4_l, &q5_l,
&q0_l, &q1_l, &q2_l, &q3_l, &q6_l, &q7_l);
&q4_l, &q5_l, &q6_l, &q7_l);
PACK_RIGHT_0TO3() PACK_RIGHT_0TO3()
PACK_RIGHT_4TO7() PACK_RIGHT_4TO7()
wide_mbfilter_dspr2(&p7_r, &p6_r, &p5_r, &p4_r, wide_mbfilter_dspr2(&p7_r, &p6_r, &p5_r, &p4_r, &p3_r, &p2_r, &p1_r,
&p3_r, &p2_r, &p1_r, &p0_r, &p0_r, &q0_r, &q1_r, &q2_r, &q3_r, &q4_r, &q5_r,
&q0_r, &q1_r, &q2_r, &q3_r, &q6_r, &q7_r);
&q4_r, &q5_r, &q6_r, &q7_r);
STORE_F2() STORE_F2()
} else if ((flat2 == 0) && (flat == 0xFFFFFFFF) && (mask == 0xFFFFFFFF)) { } else if ((flat2 == 0) && (flat == 0xFFFFFFFF) && (mask == 0xFFFFFFFF)) {
/* f1 */ /* f1 */
PACK_LEFT_0TO3() PACK_LEFT_0TO3()
mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, &q0_l, &q1_l, &q2_l, &q3_l);
&q0_l, &q1_l, &q2_l, &q3_l);
PACK_RIGHT_0TO3() PACK_RIGHT_0TO3()
mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, &q0_r, &q1_r, &q2_r, &q3_r);
&q0_r, &q1_r, &q2_r, &q3_r);
STORE_F1() STORE_F1()
} else if ((flat2 == 0) && (flat != 0) && (mask != 0)) { } else if ((flat2 == 0) && (flat != 0) && (mask != 0)) {
/* f0 + f1 */ /* f0 + f1 */
filter1_dspr2(mask, hev, p1, p0, q0, q1, filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0);
&p1_f0, &p0_f0, &q0_f0, &q1_f0);
/* left 2 element operation */ /* left 2 element operation */
PACK_LEFT_0TO3() PACK_LEFT_0TO3()
mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, &q0_l, &q1_l, &q2_l, &q3_l);
&q0_l, &q1_l, &q2_l, &q3_l);
/* right 2 element operation */ /* right 2 element operation */
PACK_RIGHT_0TO3() PACK_RIGHT_0TO3()
mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, &q0_r, &q1_r, &q2_r, &q3_r);
&q0_r, &q1_r, &q2_r, &q3_r);
if (mask & flat & 0x000000FF) { if (mask & flat & 0x000000FF) {
__asm__ __volatile__ ( __asm__ __volatile__(
"sb %[p2_r], -3(%[s4]) \n\t" "sb %[p2_r], -3(%[s4]) \n\t"
"sb %[p1_r], -2(%[s4]) \n\t" "sb %[p1_r], -2(%[s4]) \n\t"
"sb %[p0_r], -1(%[s4]) \n\t" "sb %[p0_r], -1(%[s4]) \n\t"
@ -313,25 +288,22 @@ void vpx_lpf_vertical_16_dspr2(uint8_t *s,
"sb %[q2_r], +2(%[s4]) \n\t" "sb %[q2_r], +2(%[s4]) \n\t"
: :
: [p2_r] "r" (p2_r), [p1_r] "r" (p1_r), [p0_r] "r" (p0_r), : [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), [p0_r] "r"(p0_r),
[q0_r] "r" (q0_r), [q1_r] "r" (q1_r), [q2_r] "r" (q2_r), [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r),
[s4] "r" (s4) [s4] "r"(s4));
);
} else if (mask & 0x000000FF) { } else if (mask & 0x000000FF) {
__asm__ __volatile__ ( __asm__ __volatile__(
"sb %[p1_f0], -2(%[s4]) \n\t" "sb %[p1_f0], -2(%[s4]) \n\t"
"sb %[p0_f0], -1(%[s4]) \n\t" "sb %[p0_f0], -1(%[s4]) \n\t"
"sb %[q0_f0], (%[s4]) \n\t" "sb %[q0_f0], (%[s4]) \n\t"
"sb %[q1_f0], +1(%[s4]) \n\t" "sb %[q1_f0], +1(%[s4]) \n\t"
: :
: [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0), : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
[q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0), [q1_f0] "r"(q1_f0), [s4] "r"(s4));
[s4] "r" (s4)
);
} }
__asm__ __volatile__ ( __asm__ __volatile__(
"srl %[p2_r], %[p2_r], 16 \n\t" "srl %[p2_r], %[p2_r], 16 \n\t"
"srl %[p1_r], %[p1_r], 16 \n\t" "srl %[p1_r], %[p1_r], 16 \n\t"
"srl %[p0_r], %[p0_r], 16 \n\t" "srl %[p0_r], %[p0_r], 16 \n\t"
@ -343,15 +315,14 @@ void vpx_lpf_vertical_16_dspr2(uint8_t *s,
"srl %[q0_f0], %[q0_f0], 8 \n\t" "srl %[q0_f0], %[q0_f0], 8 \n\t"
"srl %[q1_f0], %[q1_f0], 8 \n\t" "srl %[q1_f0], %[q1_f0], 8 \n\t"
: [p2_r] "+r" (p2_r), [p1_r] "+r" (p1_r), [p0_r] "+r" (p0_r), : [p2_r] "+r"(p2_r), [p1_r] "+r"(p1_r), [p0_r] "+r"(p0_r),
[q0_r] "+r" (q0_r), [q1_r] "+r" (q1_r), [q2_r] "+r" (q2_r), [q0_r] "+r"(q0_r), [q1_r] "+r"(q1_r), [q2_r] "+r"(q2_r),
[p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0), [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0),
[q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0) [q1_f0] "+r"(q1_f0)
: :);
);
if (mask & flat & 0x0000FF00) { if (mask & flat & 0x0000FF00) {
__asm__ __volatile__ ( __asm__ __volatile__(
"sb %[p2_r], -3(%[s3]) \n\t" "sb %[p2_r], -3(%[s3]) \n\t"
"sb %[p1_r], -2(%[s3]) \n\t" "sb %[p1_r], -2(%[s3]) \n\t"
"sb %[p0_r], -1(%[s3]) \n\t" "sb %[p0_r], -1(%[s3]) \n\t"
@ -360,37 +331,33 @@ void vpx_lpf_vertical_16_dspr2(uint8_t *s,
"sb %[q2_r], +2(%[s3]) \n\t" "sb %[q2_r], +2(%[s3]) \n\t"
: :
: [p2_r] "r" (p2_r), [p1_r] "r" (p1_r), [p0_r] "r" (p0_r), : [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), [p0_r] "r"(p0_r),
[q0_r] "r" (q0_r), [q1_r] "r" (q1_r), [q2_r] "r" (q2_r), [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r),
[s3] "r" (s3) [s3] "r"(s3));
);
} else if (mask & 0x0000FF00) { } else if (mask & 0x0000FF00) {
__asm__ __volatile__ ( __asm__ __volatile__(
"sb %[p1_f0], -2(%[s3]) \n\t" "sb %[p1_f0], -2(%[s3]) \n\t"
"sb %[p0_f0], -1(%[s3]) \n\t" "sb %[p0_f0], -1(%[s3]) \n\t"
"sb %[q0_f0], (%[s3]) \n\t" "sb %[q0_f0], (%[s3]) \n\t"
"sb %[q1_f0], +1(%[s3]) \n\t" "sb %[q1_f0], +1(%[s3]) \n\t"
: :
: [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0), : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
[q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0), [q1_f0] "r"(q1_f0), [s3] "r"(s3));
[s3] "r" (s3)
);
} }
__asm__ __volatile__ ( __asm__ __volatile__(
"srl %[p1_f0], %[p1_f0], 8 \n\t" "srl %[p1_f0], %[p1_f0], 8 \n\t"
"srl %[p0_f0], %[p0_f0], 8 \n\t" "srl %[p0_f0], %[p0_f0], 8 \n\t"
"srl %[q0_f0], %[q0_f0], 8 \n\t" "srl %[q0_f0], %[q0_f0], 8 \n\t"
"srl %[q1_f0], %[q1_f0], 8 \n\t" "srl %[q1_f0], %[q1_f0], 8 \n\t"
: [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0), : [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0),
[q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0) [q1_f0] "+r"(q1_f0)
: :);
);
if (mask & flat & 0x00FF0000) { if (mask & flat & 0x00FF0000) {
__asm__ __volatile__ ( __asm__ __volatile__(
"sb %[p2_l], -3(%[s2]) \n\t" "sb %[p2_l], -3(%[s2]) \n\t"
"sb %[p1_l], -2(%[s2]) \n\t" "sb %[p1_l], -2(%[s2]) \n\t"
"sb %[p0_l], -1(%[s2]) \n\t" "sb %[p0_l], -1(%[s2]) \n\t"
@ -399,25 +366,22 @@ void vpx_lpf_vertical_16_dspr2(uint8_t *s,
"sb %[q2_l], +2(%[s2]) \n\t" "sb %[q2_l], +2(%[s2]) \n\t"
: :
: [p2_l] "r" (p2_l), [p1_l] "r" (p1_l), [p0_l] "r" (p0_l), : [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), [p0_l] "r"(p0_l),
[q0_l] "r" (q0_l), [q1_l] "r" (q1_l), [q2_l] "r" (q2_l), [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l),
[s2] "r" (s2) [s2] "r"(s2));
);
} else if (mask & 0x00FF0000) { } else if (mask & 0x00FF0000) {
__asm__ __volatile__ ( __asm__ __volatile__(
"sb %[p1_f0], -2(%[s2]) \n\t" "sb %[p1_f0], -2(%[s2]) \n\t"
"sb %[p0_f0], -1(%[s2]) \n\t" "sb %[p0_f0], -1(%[s2]) \n\t"
"sb %[q0_f0], (%[s2]) \n\t" "sb %[q0_f0], (%[s2]) \n\t"
"sb %[q1_f0], +1(%[s2]) \n\t" "sb %[q1_f0], +1(%[s2]) \n\t"
: :
: [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0), : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
[q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0), [q1_f0] "r"(q1_f0), [s2] "r"(s2));
[s2] "r" (s2)
);
} }
__asm__ __volatile__ ( __asm__ __volatile__(
"srl %[p2_l], %[p2_l], 16 \n\t" "srl %[p2_l], %[p2_l], 16 \n\t"
"srl %[p1_l], %[p1_l], 16 \n\t" "srl %[p1_l], %[p1_l], 16 \n\t"
"srl %[p0_l], %[p0_l], 16 \n\t" "srl %[p0_l], %[p0_l], 16 \n\t"
@ -429,15 +393,14 @@ void vpx_lpf_vertical_16_dspr2(uint8_t *s,
"srl %[q0_f0], %[q0_f0], 8 \n\t" "srl %[q0_f0], %[q0_f0], 8 \n\t"
"srl %[q1_f0], %[q1_f0], 8 \n\t" "srl %[q1_f0], %[q1_f0], 8 \n\t"
: [p2_l] "+r" (p2_l), [p1_l] "+r" (p1_l), [p0_l] "+r" (p0_l), : [p2_l] "+r"(p2_l), [p1_l] "+r"(p1_l), [p0_l] "+r"(p0_l),
[q0_l] "+r" (q0_l), [q1_l] "+r" (q1_l), [q2_l] "+r" (q2_l), [q0_l] "+r"(q0_l), [q1_l] "+r"(q1_l), [q2_l] "+r"(q2_l),
[p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0), [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0),
[q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0) [q1_f0] "+r"(q1_f0)
: :);
);
if (mask & flat & 0xFF000000) { if (mask & flat & 0xFF000000) {
__asm__ __volatile__ ( __asm__ __volatile__(
"sb %[p2_l], -3(%[s1]) \n\t" "sb %[p2_l], -3(%[s1]) \n\t"
"sb %[p1_l], -2(%[s1]) \n\t" "sb %[p1_l], -2(%[s1]) \n\t"
"sb %[p0_l], -1(%[s1]) \n\t" "sb %[p0_l], -1(%[s1]) \n\t"
@ -446,54 +409,44 @@ void vpx_lpf_vertical_16_dspr2(uint8_t *s,
"sb %[q2_l], +2(%[s1]) \n\t" "sb %[q2_l], +2(%[s1]) \n\t"
: :
: [p2_l] "r" (p2_l), [p1_l] "r" (p1_l), [p0_l] "r" (p0_l), : [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), [p0_l] "r"(p0_l),
[q0_l] "r" (q0_l), [q1_l] "r" (q1_l), [q2_l] "r" (q2_l), [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l),
[s1] "r" (s1) [s1] "r"(s1));
);
} else if (mask & 0xFF000000) { } else if (mask & 0xFF000000) {
__asm__ __volatile__ ( __asm__ __volatile__(
"sb %[p1_f0], -2(%[s1]) \n\t" "sb %[p1_f0], -2(%[s1]) \n\t"
"sb %[p0_f0], -1(%[s1]) \n\t" "sb %[p0_f0], -1(%[s1]) \n\t"
"sb %[q0_f0], (%[s1]) \n\t" "sb %[q0_f0], (%[s1]) \n\t"
"sb %[q1_f0], +1(%[s1]) \n\t" "sb %[q1_f0], +1(%[s1]) \n\t"
: :
: [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0), : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
[q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0), [q1_f0] "r"(q1_f0), [s1] "r"(s1));
[s1] "r" (s1)
);
} }
} else if ((flat2 != 0) && (flat != 0) && (mask != 0)) { } else if ((flat2 != 0) && (flat != 0) && (mask != 0)) {
/* f0+f1+f2 */ /* f0+f1+f2 */
filter1_dspr2(mask, hev, p1, p0, q0, q1, filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0);
&p1_f0, &p0_f0, &q0_f0, &q1_f0);
PACK_LEFT_0TO3() PACK_LEFT_0TO3()
mbfilter1_dspr2(p3_l, p2_l, p1_l, p0_l, mbfilter1_dspr2(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, &p2_l_f1,
q0_l, q1_l, q2_l, q3_l, &p1_l_f1, &p0_l_f1, &q0_l_f1, &q1_l_f1, &q2_l_f1);
&p2_l_f1, &p1_l_f1, &p0_l_f1,
&q0_l_f1, &q1_l_f1, &q2_l_f1);
PACK_RIGHT_0TO3() PACK_RIGHT_0TO3()
mbfilter1_dspr2(p3_r, p2_r, p1_r, p0_r, mbfilter1_dspr2(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, &p2_r_f1,
q0_r, q1_r, q2_r, q3_r, &p1_r_f1, &p0_r_f1, &q0_r_f1, &q1_r_f1, &q2_r_f1);
&p2_r_f1, &p1_r_f1, &p0_r_f1,
&q0_r_f1, &q1_r_f1, &q2_r_f1);
PACK_LEFT_4TO7() PACK_LEFT_4TO7()
wide_mbfilter_dspr2(&p7_l, &p6_l, &p5_l, &p4_l, wide_mbfilter_dspr2(&p7_l, &p6_l, &p5_l, &p4_l, &p3_l, &p2_l, &p1_l,
&p3_l, &p2_l, &p1_l, &p0_l, &p0_l, &q0_l, &q1_l, &q2_l, &q3_l, &q4_l, &q5_l,
&q0_l, &q1_l, &q2_l, &q3_l, &q6_l, &q7_l);
&q4_l, &q5_l, &q6_l, &q7_l);
PACK_RIGHT_4TO7() PACK_RIGHT_4TO7()
wide_mbfilter_dspr2(&p7_r, &p6_r, &p5_r, &p4_r, wide_mbfilter_dspr2(&p7_r, &p6_r, &p5_r, &p4_r, &p3_r, &p2_r, &p1_r,
&p3_r, &p2_r, &p1_r, &p0_r, &p0_r, &q0_r, &q1_r, &q2_r, &q3_r, &q4_r, &q5_r,
&q0_r, &q1_r, &q2_r, &q3_r, &q6_r, &q7_r);
&q4_r, &q5_r, &q6_r, &q7_r);
if (mask & flat & flat2 & 0x000000FF) { if (mask & flat & flat2 & 0x000000FF) {
__asm__ __volatile__ ( __asm__ __volatile__(
"sb %[p6_r], -7(%[s4]) \n\t" "sb %[p6_r], -7(%[s4]) \n\t"
"sb %[p5_r], -6(%[s4]) \n\t" "sb %[p5_r], -6(%[s4]) \n\t"
"sb %[p4_r], -5(%[s4]) \n\t" "sb %[p4_r], -5(%[s4]) \n\t"
@ -503,13 +456,11 @@ void vpx_lpf_vertical_16_dspr2(uint8_t *s,
"sb %[p0_r], -1(%[s4]) \n\t" "sb %[p0_r], -1(%[s4]) \n\t"
: :
: [p6_r] "r" (p6_r), [p5_r] "r" (p5_r), : [p6_r] "r"(p6_r), [p5_r] "r"(p5_r), [p4_r] "r"(p4_r),
[p4_r] "r" (p4_r), [p3_r] "r" (p3_r), [p3_r] "r"(p3_r), [p2_r] "r"(p2_r), [p1_r] "r"(p1_r),
[p2_r] "r" (p2_r), [p1_r] "r" (p1_r), [p0_r] "r"(p0_r), [s4] "r"(s4));
[p0_r] "r" (p0_r), [s4] "r" (s4)
);
__asm__ __volatile__ ( __asm__ __volatile__(
"sb %[q0_r], (%[s4]) \n\t" "sb %[q0_r], (%[s4]) \n\t"
"sb %[q1_r], +1(%[s4]) \n\t" "sb %[q1_r], +1(%[s4]) \n\t"
"sb %[q2_r], +2(%[s4]) \n\t" "sb %[q2_r], +2(%[s4]) \n\t"
@ -519,13 +470,11 @@ void vpx_lpf_vertical_16_dspr2(uint8_t *s,
"sb %[q6_r], +6(%[s4]) \n\t" "sb %[q6_r], +6(%[s4]) \n\t"
: :
: [q0_r] "r" (q0_r), [q1_r] "r" (q1_r), : [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r),
[q2_r] "r" (q2_r), [q3_r] "r" (q3_r), [q3_r] "r"(q3_r), [q4_r] "r"(q4_r), [q5_r] "r"(q5_r),
[q4_r] "r" (q4_r), [q5_r] "r" (q5_r), [q6_r] "r"(q6_r), [s4] "r"(s4));
[q6_r] "r" (q6_r), [s4] "r" (s4)
);
} else if (mask & flat & 0x000000FF) { } else if (mask & flat & 0x000000FF) {
__asm__ __volatile__ ( __asm__ __volatile__(
"sb %[p2_r_f1], -3(%[s4]) \n\t" "sb %[p2_r_f1], -3(%[s4]) \n\t"
"sb %[p1_r_f1], -2(%[s4]) \n\t" "sb %[p1_r_f1], -2(%[s4]) \n\t"
"sb %[p0_r_f1], -1(%[s4]) \n\t" "sb %[p0_r_f1], -1(%[s4]) \n\t"
@ -534,26 +483,22 @@ void vpx_lpf_vertical_16_dspr2(uint8_t *s,
"sb %[q2_r_f1], +2(%[s4]) \n\t" "sb %[q2_r_f1], +2(%[s4]) \n\t"
: :
: [p2_r_f1] "r" (p2_r_f1), [p1_r_f1] "r" (p1_r_f1), : [p2_r_f1] "r"(p2_r_f1), [p1_r_f1] "r"(p1_r_f1),
[p0_r_f1] "r" (p0_r_f1), [q0_r_f1] "r" (q0_r_f1), [p0_r_f1] "r"(p0_r_f1), [q0_r_f1] "r"(q0_r_f1),
[q1_r_f1] "r" (q1_r_f1), [q2_r_f1] "r" (q2_r_f1), [q1_r_f1] "r"(q1_r_f1), [q2_r_f1] "r"(q2_r_f1), [s4] "r"(s4));
[s4] "r" (s4)
);
} else if (mask & 0x000000FF) { } else if (mask & 0x000000FF) {
__asm__ __volatile__ ( __asm__ __volatile__(
"sb %[p1_f0], -2(%[s4]) \n\t" "sb %[p1_f0], -2(%[s4]) \n\t"
"sb %[p0_f0], -1(%[s4]) \n\t" "sb %[p0_f0], -1(%[s4]) \n\t"
"sb %[q0_f0], (%[s4]) \n\t" "sb %[q0_f0], (%[s4]) \n\t"
"sb %[q1_f0], +1(%[s4]) \n\t" "sb %[q1_f0], +1(%[s4]) \n\t"
: :
: [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0), : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
[q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0), [q1_f0] "r"(q1_f0), [s4] "r"(s4));
[s4] "r" (s4)
);
} }
__asm__ __volatile__ ( __asm__ __volatile__(
"srl %[p6_r], %[p6_r], 16 \n\t" "srl %[p6_r], %[p6_r], 16 \n\t"
"srl %[p5_r], %[p5_r], 16 \n\t" "srl %[p5_r], %[p5_r], 16 \n\t"
"srl %[p4_r], %[p4_r], 16 \n\t" "srl %[p4_r], %[p4_r], 16 \n\t"
@ -569,17 +514,14 @@ void vpx_lpf_vertical_16_dspr2(uint8_t *s,
"srl %[q5_r], %[q5_r], 16 \n\t" "srl %[q5_r], %[q5_r], 16 \n\t"
"srl %[q6_r], %[q6_r], 16 \n\t" "srl %[q6_r], %[q6_r], 16 \n\t"
: [q0_r] "+r" (q0_r), [q1_r] "+r" (q1_r), : [q0_r] "+r"(q0_r), [q1_r] "+r"(q1_r), [q2_r] "+r"(q2_r),
[q2_r] "+r" (q2_r), [q3_r] "+r" (q3_r), [q3_r] "+r"(q3_r), [q4_r] "+r"(q4_r), [q5_r] "+r"(q5_r),
[q4_r] "+r" (q4_r), [q5_r] "+r" (q5_r), [q6_r] "+r"(q6_r), [p6_r] "+r"(p6_r), [p5_r] "+r"(p5_r),
[q6_r] "+r" (q6_r), [p6_r] "+r" (p6_r), [p4_r] "+r"(p4_r), [p3_r] "+r"(p3_r), [p2_r] "+r"(p2_r),
[p5_r] "+r" (p5_r), [p4_r] "+r" (p4_r), [p1_r] "+r"(p1_r), [p0_r] "+r"(p0_r)
[p3_r] "+r" (p3_r), [p2_r] "+r" (p2_r), :);
[p1_r] "+r" (p1_r), [p0_r] "+r" (p0_r)
:
);
__asm__ __volatile__ ( __asm__ __volatile__(
"srl %[p2_r_f1], %[p2_r_f1], 16 \n\t" "srl %[p2_r_f1], %[p2_r_f1], 16 \n\t"
"srl %[p1_r_f1], %[p1_r_f1], 16 \n\t" "srl %[p1_r_f1], %[p1_r_f1], 16 \n\t"
"srl %[p0_r_f1], %[p0_r_f1], 16 \n\t" "srl %[p0_r_f1], %[p0_r_f1], 16 \n\t"
@ -591,16 +533,15 @@ void vpx_lpf_vertical_16_dspr2(uint8_t *s,
"srl %[q0_f0], %[q0_f0], 8 \n\t" "srl %[q0_f0], %[q0_f0], 8 \n\t"
"srl %[q1_f0], %[q1_f0], 8 \n\t" "srl %[q1_f0], %[q1_f0], 8 \n\t"
: [p2_r_f1] "+r" (p2_r_f1), [p1_r_f1] "+r" (p1_r_f1), : [p2_r_f1] "+r"(p2_r_f1), [p1_r_f1] "+r"(p1_r_f1),
[p0_r_f1] "+r" (p0_r_f1), [q0_r_f1] "+r" (q0_r_f1), [p0_r_f1] "+r"(p0_r_f1), [q0_r_f1] "+r"(q0_r_f1),
[q1_r_f1] "+r" (q1_r_f1), [q2_r_f1] "+r" (q2_r_f1), [q1_r_f1] "+r"(q1_r_f1), [q2_r_f1] "+r"(q2_r_f1),
[p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0), [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0),
[q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0) [q1_f0] "+r"(q1_f0)
: :);
);
if (mask & flat & flat2 & 0x0000FF00) { if (mask & flat & flat2 & 0x0000FF00) {
__asm__ __volatile__ ( __asm__ __volatile__(
"sb %[p6_r], -7(%[s3]) \n\t" "sb %[p6_r], -7(%[s3]) \n\t"
"sb %[p5_r], -6(%[s3]) \n\t" "sb %[p5_r], -6(%[s3]) \n\t"
"sb %[p4_r], -5(%[s3]) \n\t" "sb %[p4_r], -5(%[s3]) \n\t"
@ -610,12 +551,11 @@ void vpx_lpf_vertical_16_dspr2(uint8_t *s,
"sb %[p0_r], -1(%[s3]) \n\t" "sb %[p0_r], -1(%[s3]) \n\t"
: :
: [p6_r] "r" (p6_r), [p5_r] "r" (p5_r), [p4_r] "r" (p4_r), : [p6_r] "r"(p6_r), [p5_r] "r"(p5_r), [p4_r] "r"(p4_r),
[p3_r] "r" (p3_r), [p2_r] "r" (p2_r), [p1_r] "r" (p1_r), [p3_r] "r"(p3_r), [p2_r] "r"(p2_r), [p1_r] "r"(p1_r),
[p0_r] "r" (p0_r), [s3] "r" (s3) [p0_r] "r"(p0_r), [s3] "r"(s3));
);
__asm__ __volatile__ ( __asm__ __volatile__(
"sb %[q0_r], (%[s3]) \n\t" "sb %[q0_r], (%[s3]) \n\t"
"sb %[q1_r], +1(%[s3]) \n\t" "sb %[q1_r], +1(%[s3]) \n\t"
"sb %[q2_r], +2(%[s3]) \n\t" "sb %[q2_r], +2(%[s3]) \n\t"
@ -625,13 +565,11 @@ void vpx_lpf_vertical_16_dspr2(uint8_t *s,
"sb %[q6_r], +6(%[s3]) \n\t" "sb %[q6_r], +6(%[s3]) \n\t"
: :
: [q0_r] "r" (q0_r), [q1_r] "r" (q1_r), : [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r),
[q2_r] "r" (q2_r), [q3_r] "r" (q3_r), [q3_r] "r"(q3_r), [q4_r] "r"(q4_r), [q5_r] "r"(q5_r),
[q4_r] "r" (q4_r), [q5_r] "r" (q5_r), [q6_r] "r"(q6_r), [s3] "r"(s3));
[q6_r] "r" (q6_r), [s3] "r" (s3)
);
} else if (mask & flat & 0x0000FF00) { } else if (mask & flat & 0x0000FF00) {
__asm__ __volatile__ ( __asm__ __volatile__(
"sb %[p2_r_f1], -3(%[s3]) \n\t" "sb %[p2_r_f1], -3(%[s3]) \n\t"
"sb %[p1_r_f1], -2(%[s3]) \n\t" "sb %[p1_r_f1], -2(%[s3]) \n\t"
"sb %[p0_r_f1], -1(%[s3]) \n\t" "sb %[p0_r_f1], -1(%[s3]) \n\t"
@ -640,38 +578,33 @@ void vpx_lpf_vertical_16_dspr2(uint8_t *s,
"sb %[q2_r_f1], +2(%[s3]) \n\t" "sb %[q2_r_f1], +2(%[s3]) \n\t"
: :
: [p2_r_f1] "r" (p2_r_f1), [p1_r_f1] "r" (p1_r_f1), : [p2_r_f1] "r"(p2_r_f1), [p1_r_f1] "r"(p1_r_f1),
[p0_r_f1] "r" (p0_r_f1), [q0_r_f1] "r" (q0_r_f1), [p0_r_f1] "r"(p0_r_f1), [q0_r_f1] "r"(q0_r_f1),
[q1_r_f1] "r" (q1_r_f1), [q2_r_f1] "r" (q2_r_f1), [q1_r_f1] "r"(q1_r_f1), [q2_r_f1] "r"(q2_r_f1), [s3] "r"(s3));
[s3] "r" (s3)
);
} else if (mask & 0x0000FF00) { } else if (mask & 0x0000FF00) {
__asm__ __volatile__ ( __asm__ __volatile__(
"sb %[p1_f0], -2(%[s3]) \n\t" "sb %[p1_f0], -2(%[s3]) \n\t"
"sb %[p0_f0], -1(%[s3]) \n\t" "sb %[p0_f0], -1(%[s3]) \n\t"
"sb %[q0_f0], (%[s3]) \n\t" "sb %[q0_f0], (%[s3]) \n\t"
"sb %[q1_f0], +1(%[s3]) \n\t" "sb %[q1_f0], +1(%[s3]) \n\t"
: :
: [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0), : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
[q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0), [q1_f0] "r"(q1_f0), [s3] "r"(s3));
[s3] "r" (s3)
);
} }
__asm__ __volatile__ ( __asm__ __volatile__(
"srl %[p1_f0], %[p1_f0], 8 \n\t" "srl %[p1_f0], %[p1_f0], 8 \n\t"
"srl %[p0_f0], %[p0_f0], 8 \n\t" "srl %[p0_f0], %[p0_f0], 8 \n\t"
"srl %[q0_f0], %[q0_f0], 8 \n\t" "srl %[q0_f0], %[q0_f0], 8 \n\t"
"srl %[q1_f0], %[q1_f0], 8 \n\t" "srl %[q1_f0], %[q1_f0], 8 \n\t"
: [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0), : [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0),
[q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0) [q1_f0] "+r"(q1_f0)
: :);
);
if (mask & flat & flat2 & 0x00FF0000) { if (mask & flat & flat2 & 0x00FF0000) {
__asm__ __volatile__ ( __asm__ __volatile__(
"sb %[p6_l], -7(%[s2]) \n\t" "sb %[p6_l], -7(%[s2]) \n\t"
"sb %[p5_l], -6(%[s2]) \n\t" "sb %[p5_l], -6(%[s2]) \n\t"
"sb %[p4_l], -5(%[s2]) \n\t" "sb %[p4_l], -5(%[s2]) \n\t"
@ -681,12 +614,11 @@ void vpx_lpf_vertical_16_dspr2(uint8_t *s,
"sb %[p0_l], -1(%[s2]) \n\t" "sb %[p0_l], -1(%[s2]) \n\t"
: :
: [p6_l] "r" (p6_l), [p5_l] "r" (p5_l), [p4_l] "r" (p4_l), : [p6_l] "r"(p6_l), [p5_l] "r"(p5_l), [p4_l] "r"(p4_l),
[p3_l] "r" (p3_l), [p2_l] "r" (p2_l), [p1_l] "r" (p1_l), [p3_l] "r"(p3_l), [p2_l] "r"(p2_l), [p1_l] "r"(p1_l),
[p0_l] "r" (p0_l), [s2] "r" (s2) [p0_l] "r"(p0_l), [s2] "r"(s2));
);
__asm__ __volatile__ ( __asm__ __volatile__(
"sb %[q0_l], (%[s2]) \n\t" "sb %[q0_l], (%[s2]) \n\t"
"sb %[q1_l], +1(%[s2]) \n\t" "sb %[q1_l], +1(%[s2]) \n\t"
"sb %[q2_l], +2(%[s2]) \n\t" "sb %[q2_l], +2(%[s2]) \n\t"
@ -696,12 +628,11 @@ void vpx_lpf_vertical_16_dspr2(uint8_t *s,
"sb %[q6_l], +6(%[s2]) \n\t" "sb %[q6_l], +6(%[s2]) \n\t"
: :
: [q0_l] "r" (q0_l), [q1_l] "r" (q1_l), [q2_l] "r" (q2_l), : [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l),
[q3_l] "r" (q3_l), [q4_l] "r" (q4_l), [q5_l] "r" (q5_l), [q3_l] "r"(q3_l), [q4_l] "r"(q4_l), [q5_l] "r"(q5_l),
[q6_l] "r" (q6_l), [s2] "r" (s2) [q6_l] "r"(q6_l), [s2] "r"(s2));
);
} else if (mask & flat & 0x00FF0000) { } else if (mask & flat & 0x00FF0000) {
__asm__ __volatile__ ( __asm__ __volatile__(
"sb %[p2_l_f1], -3(%[s2]) \n\t" "sb %[p2_l_f1], -3(%[s2]) \n\t"
"sb %[p1_l_f1], -2(%[s2]) \n\t" "sb %[p1_l_f1], -2(%[s2]) \n\t"
"sb %[p0_l_f1], -1(%[s2]) \n\t" "sb %[p0_l_f1], -1(%[s2]) \n\t"
@ -710,26 +641,22 @@ void vpx_lpf_vertical_16_dspr2(uint8_t *s,
"sb %[q2_l_f1], +2(%[s2]) \n\t" "sb %[q2_l_f1], +2(%[s2]) \n\t"
: :
: [p2_l_f1] "r" (p2_l_f1), [p1_l_f1] "r" (p1_l_f1), : [p2_l_f1] "r"(p2_l_f1), [p1_l_f1] "r"(p1_l_f1),
[p0_l_f1] "r" (p0_l_f1), [q0_l_f1] "r" (q0_l_f1), [p0_l_f1] "r"(p0_l_f1), [q0_l_f1] "r"(q0_l_f1),
[q1_l_f1] "r" (q1_l_f1), [q2_l_f1] "r" (q2_l_f1), [q1_l_f1] "r"(q1_l_f1), [q2_l_f1] "r"(q2_l_f1), [s2] "r"(s2));
[s2] "r" (s2)
);
} else if (mask & 0x00FF0000) { } else if (mask & 0x00FF0000) {
__asm__ __volatile__ ( __asm__ __volatile__(
"sb %[p1_f0], -2(%[s2]) \n\t" "sb %[p1_f0], -2(%[s2]) \n\t"
"sb %[p0_f0], -1(%[s2]) \n\t" "sb %[p0_f0], -1(%[s2]) \n\t"
"sb %[q0_f0], (%[s2]) \n\t" "sb %[q0_f0], (%[s2]) \n\t"
"sb %[q1_f0], +1(%[s2]) \n\t" "sb %[q1_f0], +1(%[s2]) \n\t"
: :
: [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0), : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
[q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0), [q1_f0] "r"(q1_f0), [s2] "r"(s2));
[s2] "r" (s2)
);
} }
__asm__ __volatile__ ( __asm__ __volatile__(
"srl %[p6_l], %[p6_l], 16 \n\t" "srl %[p6_l], %[p6_l], 16 \n\t"
"srl %[p5_l], %[p5_l], 16 \n\t" "srl %[p5_l], %[p5_l], 16 \n\t"
"srl %[p4_l], %[p4_l], 16 \n\t" "srl %[p4_l], %[p4_l], 16 \n\t"
@ -745,15 +672,14 @@ void vpx_lpf_vertical_16_dspr2(uint8_t *s,
"srl %[q5_l], %[q5_l], 16 \n\t" "srl %[q5_l], %[q5_l], 16 \n\t"
"srl %[q6_l], %[q6_l], 16 \n\t" "srl %[q6_l], %[q6_l], 16 \n\t"
: [q0_l] "+r" (q0_l), [q1_l] "+r" (q1_l), [q2_l] "+r" (q2_l), : [q0_l] "+r"(q0_l), [q1_l] "+r"(q1_l), [q2_l] "+r"(q2_l),
[q3_l] "+r" (q3_l), [q4_l] "+r" (q4_l), [q5_l] "+r" (q5_l), [q3_l] "+r"(q3_l), [q4_l] "+r"(q4_l), [q5_l] "+r"(q5_l),
[q6_l] "+r" (q6_l), [p6_l] "+r" (p6_l), [p5_l] "+r" (p5_l), [q6_l] "+r"(q6_l), [p6_l] "+r"(p6_l), [p5_l] "+r"(p5_l),
[p4_l] "+r" (p4_l), [p3_l] "+r" (p3_l), [p2_l] "+r" (p2_l), [p4_l] "+r"(p4_l), [p3_l] "+r"(p3_l), [p2_l] "+r"(p2_l),
[p1_l] "+r" (p1_l), [p0_l] "+r" (p0_l) [p1_l] "+r"(p1_l), [p0_l] "+r"(p0_l)
: :);
);
__asm__ __volatile__ ( __asm__ __volatile__(
"srl %[p2_l_f1], %[p2_l_f1], 16 \n\t" "srl %[p2_l_f1], %[p2_l_f1], 16 \n\t"
"srl %[p1_l_f1], %[p1_l_f1], 16 \n\t" "srl %[p1_l_f1], %[p1_l_f1], 16 \n\t"
"srl %[p0_l_f1], %[p0_l_f1], 16 \n\t" "srl %[p0_l_f1], %[p0_l_f1], 16 \n\t"
@ -765,16 +691,15 @@ void vpx_lpf_vertical_16_dspr2(uint8_t *s,
"srl %[q0_f0], %[q0_f0], 8 \n\t" "srl %[q0_f0], %[q0_f0], 8 \n\t"
"srl %[q1_f0], %[q1_f0], 8 \n\t" "srl %[q1_f0], %[q1_f0], 8 \n\t"
: [p2_l_f1] "+r" (p2_l_f1), [p1_l_f1] "+r" (p1_l_f1), : [p2_l_f1] "+r"(p2_l_f1), [p1_l_f1] "+r"(p1_l_f1),
[p0_l_f1] "+r" (p0_l_f1), [q0_l_f1] "+r" (q0_l_f1), [p0_l_f1] "+r"(p0_l_f1), [q0_l_f1] "+r"(q0_l_f1),
[q1_l_f1] "+r" (q1_l_f1), [q2_l_f1] "+r" (q2_l_f1), [q1_l_f1] "+r"(q1_l_f1), [q2_l_f1] "+r"(q2_l_f1),
[p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0), [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0),
[q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0) [q1_f0] "+r"(q1_f0)
: :);
);
if (mask & flat & flat2 & 0xFF000000) { if (mask & flat & flat2 & 0xFF000000) {
__asm__ __volatile__ ( __asm__ __volatile__(
"sb %[p6_l], -7(%[s1]) \n\t" "sb %[p6_l], -7(%[s1]) \n\t"
"sb %[p5_l], -6(%[s1]) \n\t" "sb %[p5_l], -6(%[s1]) \n\t"
"sb %[p4_l], -5(%[s1]) \n\t" "sb %[p4_l], -5(%[s1]) \n\t"
@ -784,13 +709,11 @@ void vpx_lpf_vertical_16_dspr2(uint8_t *s,
"sb %[p0_l], -1(%[s1]) \n\t" "sb %[p0_l], -1(%[s1]) \n\t"
: :
: [p6_l] "r" (p6_l), [p5_l] "r" (p5_l), [p4_l] "r" (p4_l), : [p6_l] "r"(p6_l), [p5_l] "r"(p5_l), [p4_l] "r"(p4_l),
[p3_l] "r" (p3_l), [p2_l] "r" (p2_l), [p1_l] "r" (p1_l), [p3_l] "r"(p3_l), [p2_l] "r"(p2_l), [p1_l] "r"(p1_l),
[p0_l] "r" (p0_l), [p0_l] "r"(p0_l), [s1] "r"(s1));
[s1] "r" (s1)
);
__asm__ __volatile__ ( __asm__ __volatile__(
"sb %[q0_l], (%[s1]) \n\t" "sb %[q0_l], (%[s1]) \n\t"
"sb %[q1_l], 1(%[s1]) \n\t" "sb %[q1_l], 1(%[s1]) \n\t"
"sb %[q2_l], 2(%[s1]) \n\t" "sb %[q2_l], 2(%[s1]) \n\t"
@ -800,13 +723,11 @@ void vpx_lpf_vertical_16_dspr2(uint8_t *s,
"sb %[q6_l], 6(%[s1]) \n\t" "sb %[q6_l], 6(%[s1]) \n\t"
: :
: [q0_l] "r" (q0_l), [q1_l] "r" (q1_l), [q2_l] "r" (q2_l), : [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l),
[q3_l] "r" (q3_l), [q4_l] "r" (q4_l), [q5_l] "r" (q5_l), [q3_l] "r"(q3_l), [q4_l] "r"(q4_l), [q5_l] "r"(q5_l),
[q6_l] "r" (q6_l), [q6_l] "r"(q6_l), [s1] "r"(s1));
[s1] "r" (s1)
);
} else if (mask & flat & 0xFF000000) { } else if (mask & flat & 0xFF000000) {
__asm__ __volatile__ ( __asm__ __volatile__(
"sb %[p2_l_f1], -3(%[s1]) \n\t" "sb %[p2_l_f1], -3(%[s1]) \n\t"
"sb %[p1_l_f1], -2(%[s1]) \n\t" "sb %[p1_l_f1], -2(%[s1]) \n\t"
"sb %[p0_l_f1], -1(%[s1]) \n\t" "sb %[p0_l_f1], -1(%[s1]) \n\t"
@ -815,23 +736,19 @@ void vpx_lpf_vertical_16_dspr2(uint8_t *s,
"sb %[q2_l_f1], +2(%[s1]) \n\t" "sb %[q2_l_f1], +2(%[s1]) \n\t"
: :
: [p2_l_f1] "r" (p2_l_f1), [p1_l_f1] "r" (p1_l_f1), : [p2_l_f1] "r"(p2_l_f1), [p1_l_f1] "r"(p1_l_f1),
[p0_l_f1] "r" (p0_l_f1), [q0_l_f1] "r" (q0_l_f1), [p0_l_f1] "r"(p0_l_f1), [q0_l_f1] "r"(q0_l_f1),
[q1_l_f1] "r" (q1_l_f1), [q2_l_f1] "r" (q2_l_f1), [q1_l_f1] "r"(q1_l_f1), [q2_l_f1] "r"(q2_l_f1), [s1] "r"(s1));
[s1] "r" (s1)
);
} else if (mask & 0xFF000000) { } else if (mask & 0xFF000000) {
__asm__ __volatile__ ( __asm__ __volatile__(
"sb %[p1_f0], -2(%[s1]) \n\t" "sb %[p1_f0], -2(%[s1]) \n\t"
"sb %[p0_f0], -1(%[s1]) \n\t" "sb %[p0_f0], -1(%[s1]) \n\t"
"sb %[q0_f0], (%[s1]) \n\t" "sb %[q0_f0], (%[s1]) \n\t"
"sb %[q1_f0], +1(%[s1]) \n\t" "sb %[q1_f0], +1(%[s1]) \n\t"
: :
: [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0), : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
[q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0), [q1_f0] "r"(q1_f0), [s1] "r"(s1));
[s1] "r" (s1)
);
} }
} }
} }

View File

@ -14,7 +14,8 @@
#include "vpx_dsp/mips/macros_msa.h" #include "vpx_dsp/mips/macros_msa.h"
#define VP9_LPF_FILTER4_8W(p1_in, p0_in, q0_in, q1_in, mask_in, hev_in, \ #define VP9_LPF_FILTER4_8W(p1_in, p0_in, q0_in, q1_in, mask_in, hev_in, \
p1_out, p0_out, q0_out, q1_out) { \ p1_out, p0_out, q0_out, q1_out) \
{ \
v16i8 p1_m, p0_m, q0_m, q1_m, q0_sub_p0, filt_sign; \ v16i8 p1_m, p0_m, q0_m, q1_m, q0_sub_p0, filt_sign; \
v16i8 filt, filt1, filt2, cnst4b, cnst3b; \ v16i8 filt, filt1, filt2, cnst4b, cnst3b; \
v8i16 q0_sub_p0_r, filt_r, cnst3h; \ v8i16 q0_sub_p0_r, filt_r, cnst3h; \
@ -61,10 +62,11 @@
q1_out = __msa_xori_b((v16u8)q1_m, 0x80); \ q1_out = __msa_xori_b((v16u8)q1_m, 0x80); \
p1_m = __msa_adds_s_b(p1_m, filt); \ p1_m = __msa_adds_s_b(p1_m, filt); \
p1_out = __msa_xori_b((v16u8)p1_m, 0x80); \ p1_out = __msa_xori_b((v16u8)p1_m, 0x80); \
} }
#define VP9_LPF_FILTER4_4W(p1_in, p0_in, q0_in, q1_in, mask_in, hev_in, \ #define VP9_LPF_FILTER4_4W(p1_in, p0_in, q0_in, q1_in, mask_in, hev_in, \
p1_out, p0_out, q0_out, q1_out) { \ p1_out, p0_out, q0_out, q1_out) \
{ \
v16i8 p1_m, p0_m, q0_m, q1_m, q0_sub_p0, filt_sign; \ v16i8 p1_m, p0_m, q0_m, q1_m, q0_sub_p0, filt_sign; \
v16i8 filt, filt1, filt2, cnst4b, cnst3b; \ v16i8 filt, filt1, filt2, cnst4b, cnst3b; \
v8i16 q0_sub_p0_r, q0_sub_p0_l, filt_l, filt_r, cnst3h; \ v8i16 q0_sub_p0_r, q0_sub_p0_l, filt_l, filt_r, cnst3h; \
@ -118,9 +120,10 @@
q1_out = __msa_xori_b((v16u8)q1_m, 0x80); \ q1_out = __msa_xori_b((v16u8)q1_m, 0x80); \
p1_m = __msa_adds_s_b(p1_m, filt); \ p1_m = __msa_adds_s_b(p1_m, filt); \
p1_out = __msa_xori_b((v16u8)p1_m, 0x80); \ p1_out = __msa_xori_b((v16u8)p1_m, 0x80); \
} }
#define VP9_FLAT4(p3_in, p2_in, p0_in, q0_in, q2_in, q3_in, flat_out) { \ #define VP9_FLAT4(p3_in, p2_in, p0_in, q0_in, q2_in, q3_in, flat_out) \
{ \
v16u8 tmp, p2_a_sub_p0, q2_a_sub_q0, p3_a_sub_p0, q3_a_sub_q0; \ v16u8 tmp, p2_a_sub_p0, q2_a_sub_q0, p3_a_sub_p0, q3_a_sub_q0; \
v16u8 zero_in = { 0 }; \ v16u8 zero_in = { 0 }; \
\ \
@ -138,10 +141,11 @@
flat_out = (tmp < (v16u8)flat_out); \ flat_out = (tmp < (v16u8)flat_out); \
flat_out = __msa_xori_b(flat_out, 0xff); \ flat_out = __msa_xori_b(flat_out, 0xff); \
flat_out = flat_out & (mask); \ flat_out = flat_out & (mask); \
} }
#define VP9_FLAT5(p7_in, p6_in, p5_in, p4_in, p0_in, q0_in, q4_in, \ #define VP9_FLAT5(p7_in, p6_in, p5_in, p4_in, p0_in, q0_in, q4_in, q5_in, \
q5_in, q6_in, q7_in, flat_in, flat2_out) { \ q6_in, q7_in, flat_in, flat2_out) \
{ \
v16u8 tmp, zero_in = { 0 }; \ v16u8 tmp, zero_in = { 0 }; \
v16u8 p4_a_sub_p0, q4_a_sub_q0, p5_a_sub_p0, q5_a_sub_q0; \ v16u8 p4_a_sub_p0, q4_a_sub_q0, p5_a_sub_p0, q5_a_sub_q0; \
v16u8 p6_a_sub_p0, q6_a_sub_q0, p7_a_sub_p0, q7_a_sub_q0; \ v16u8 p6_a_sub_p0, q6_a_sub_q0, p7_a_sub_p0, q7_a_sub_q0; \
@ -167,12 +171,12 @@
flat2_out = (tmp < (v16u8)flat2_out); \ flat2_out = (tmp < (v16u8)flat2_out); \
flat2_out = __msa_xori_b(flat2_out, 0xff); \ flat2_out = __msa_xori_b(flat2_out, 0xff); \
flat2_out = flat2_out & flat_in; \ flat2_out = flat2_out & flat_in; \
} }
#define VP9_FILTER8(p3_in, p2_in, p1_in, p0_in, \ #define VP9_FILTER8(p3_in, p2_in, p1_in, p0_in, q0_in, q1_in, q2_in, q3_in, \
q0_in, q1_in, q2_in, q3_in, \ p2_filt8_out, p1_filt8_out, p0_filt8_out, q0_filt8_out, \
p2_filt8_out, p1_filt8_out, p0_filt8_out, \ q1_filt8_out, q2_filt8_out) \
q0_filt8_out, q1_filt8_out, q2_filt8_out) { \ { \
v8u16 tmp0, tmp1, tmp2; \ v8u16 tmp0, tmp1, tmp2; \
\ \
tmp2 = p2_in + p1_in + p0_in; \ tmp2 = p2_in + p1_in + p0_in; \
@ -205,12 +209,12 @@
tmp0 = q1_in + q3_in; \ tmp0 = q1_in + q3_in; \
tmp1 = tmp0 + tmp1; \ tmp1 = tmp0 + tmp1; \
q1_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp1, 3); \ q1_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp1, 3); \
} }
#define LPF_MASK_HEV(p3_in, p2_in, p1_in, p0_in, \ #define LPF_MASK_HEV(p3_in, p2_in, p1_in, p0_in, q0_in, q1_in, q2_in, q3_in, \
q0_in, q1_in, q2_in, q3_in, \ limit_in, b_limit_in, thresh_in, hev_out, mask_out, \
limit_in, b_limit_in, thresh_in, \ flat_out) \
hev_out, mask_out, flat_out) { \ { \
v16u8 p3_asub_p2_m, p2_asub_p1_m, p1_asub_p0_m, q1_asub_q0_m; \ v16u8 p3_asub_p2_m, p2_asub_p1_m, p1_asub_p0_m, q1_asub_q0_m; \
v16u8 p1_asub_q1_m, p0_asub_q0_m, q3_asub_q2_m, q2_asub_q1_m; \ v16u8 p1_asub_q1_m, p0_asub_q0_m, q3_asub_q2_m, q2_asub_q1_m; \
\ \
@ -242,5 +246,5 @@
\ \
mask_out = limit_in < (v16u8)mask_out; \ mask_out = limit_in < (v16u8)mask_out; \
mask_out = __msa_xori_b(mask_out, 0xff); \ mask_out = __msa_xori_b(mask_out, 0xff); \
} }
#endif /* VPX_DSP_LOOPFILTER_MSA_H_ */ #endif /* VPX_DSP_LOOPFILTER_MSA_H_ */

File diff suppressed because it is too large Load Diff

View File

@ -11,12 +11,13 @@
#include "./vpx_dsp_rtcd.h" #include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/mips/macros_msa.h" #include "vpx_dsp/mips/macros_msa.h"
#define SAD_INSVE_W4(RTYPE, in0, in1, in2, in3, out) { \ #define SAD_INSVE_W4(RTYPE, in0, in1, in2, in3, out) \
{ \
out = (RTYPE)__msa_insve_w((v4i32)out, 0, (v4i32)in0); \ out = (RTYPE)__msa_insve_w((v4i32)out, 0, (v4i32)in0); \
out = (RTYPE)__msa_insve_w((v4i32)out, 1, (v4i32)in1); \ out = (RTYPE)__msa_insve_w((v4i32)out, 1, (v4i32)in1); \
out = (RTYPE)__msa_insve_w((v4i32)out, 2, (v4i32)in2); \ out = (RTYPE)__msa_insve_w((v4i32)out, 2, (v4i32)in2); \
out = (RTYPE)__msa_insve_w((v4i32)out, 3, (v4i32)in3); \ out = (RTYPE)__msa_insve_w((v4i32)out, 3, (v4i32)in3); \
} }
#define SAD_INSVE_W4_UB(...) SAD_INSVE_W4(v16u8, __VA_ARGS__) #define SAD_INSVE_W4_UB(...) SAD_INSVE_W4(v16u8, __VA_ARGS__)
static uint32_t sad_4width_msa(const uint8_t *src_ptr, int32_t src_stride, static uint32_t sad_4width_msa(const uint8_t *src_ptr, int32_t src_stride,
@ -58,8 +59,8 @@ static uint32_t sad_8width_msa(const uint8_t *src, int32_t src_stride,
LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3); LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3);
ref += (4 * ref_stride); ref += (4 * ref_stride);
PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2, PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2, src0, src1,
src0, src1, ref0, ref1); ref0, ref1);
sad += SAD_UB2_UH(src0, src1, ref0, ref1); sad += SAD_UB2_UH(src0, src1, ref0, ref1);
} }
@ -214,8 +215,8 @@ static void sad_8width_x3_msa(const uint8_t *src, int32_t src_stride,
src += (4 * src_stride); src += (4 * src_stride);
LD_UB4(ref, ref_stride, ref00, ref11, ref22, ref33); LD_UB4(ref, ref_stride, ref00, ref11, ref22, ref33);
ref += (4 * ref_stride); ref += (4 * ref_stride);
PCKEV_D4_UB(src1, src0, src3, src2, ref11, ref00, ref33, ref22, PCKEV_D4_UB(src1, src0, src3, src2, ref11, ref00, ref33, ref22, src0, src1,
src0, src1, ref0, ref1); ref0, ref1);
sad0 += SAD_UB2_UH(src0, src1, ref0, ref1); sad0 += SAD_UB2_UH(src0, src1, ref0, ref1);
SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1); SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
@ -473,8 +474,8 @@ static void sad_8width_x8_msa(const uint8_t *src, int32_t src_stride,
src += (4 * src_stride); src += (4 * src_stride);
LD_UB4(ref, ref_stride, ref00, ref11, ref22, ref33); LD_UB4(ref, ref_stride, ref00, ref11, ref22, ref33);
ref += (4 * ref_stride); ref += (4 * ref_stride);
PCKEV_D4_UB(src1, src0, src3, src2, ref11, ref00, ref33, ref22, PCKEV_D4_UB(src1, src0, src3, src2, ref11, ref00, ref33, ref22, src0, src1,
src0, src1, ref0, ref1); ref0, ref1);
sad0 += SAD_UB2_UH(src0, src1, ref0, ref1); sad0 += SAD_UB2_UH(src0, src1, ref0, ref1);
SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1); SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
@ -793,9 +794,9 @@ static void sad_64width_x8_msa(const uint8_t *src, int32_t src_stride,
} }
static void sad_4width_x4d_msa(const uint8_t *src_ptr, int32_t src_stride, static void sad_4width_x4d_msa(const uint8_t *src_ptr, int32_t src_stride,
const uint8_t * const aref_ptr[], const uint8_t *const aref_ptr[],
int32_t ref_stride, int32_t ref_stride, int32_t height,
int32_t height, uint32_t *sad_array) { uint32_t *sad_array) {
const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr; const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
int32_t ht_cnt; int32_t ht_cnt;
uint32_t src0, src1, src2, src3; uint32_t src0, src1, src2, src3;
@ -854,9 +855,9 @@ static void sad_4width_x4d_msa(const uint8_t *src_ptr, int32_t src_stride,
} }
static void sad_8width_x4d_msa(const uint8_t *src_ptr, int32_t src_stride, static void sad_8width_x4d_msa(const uint8_t *src_ptr, int32_t src_stride,
const uint8_t * const aref_ptr[], const uint8_t *const aref_ptr[],
int32_t ref_stride, int32_t ref_stride, int32_t height,
int32_t height, uint32_t *sad_array) { uint32_t *sad_array) {
int32_t ht_cnt; int32_t ht_cnt;
const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr; const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
v16u8 src0, src1, src2, src3; v16u8 src0, src1, src2, src3;
@ -905,9 +906,9 @@ static void sad_8width_x4d_msa(const uint8_t *src_ptr, int32_t src_stride,
} }
static void sad_16width_x4d_msa(const uint8_t *src_ptr, int32_t src_stride, static void sad_16width_x4d_msa(const uint8_t *src_ptr, int32_t src_stride,
const uint8_t * const aref_ptr[], const uint8_t *const aref_ptr[],
int32_t ref_stride, int32_t ref_stride, int32_t height,
int32_t height, uint32_t *sad_array) { uint32_t *sad_array) {
int32_t ht_cnt; int32_t ht_cnt;
const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr; const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
v16u8 src, ref0, ref1, ref2, ref3, diff; v16u8 src, ref0, ref1, ref2, ref3, diff;
@ -970,9 +971,9 @@ static void sad_16width_x4d_msa(const uint8_t *src_ptr, int32_t src_stride,
} }
static void sad_32width_x4d_msa(const uint8_t *src, int32_t src_stride, static void sad_32width_x4d_msa(const uint8_t *src, int32_t src_stride,
const uint8_t * const aref_ptr[], const uint8_t *const aref_ptr[],
int32_t ref_stride, int32_t ref_stride, int32_t height,
int32_t height, uint32_t *sad_array) { uint32_t *sad_array) {
const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr; const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
int32_t ht_cnt; int32_t ht_cnt;
v16u8 src0, src1, ref0, ref1; v16u8 src0, src1, ref0, ref1;
@ -1014,9 +1015,9 @@ static void sad_32width_x4d_msa(const uint8_t *src, int32_t src_stride,
} }
static void sad_64width_x4d_msa(const uint8_t *src, int32_t src_stride, static void sad_64width_x4d_msa(const uint8_t *src, int32_t src_stride,
const uint8_t * const aref_ptr[], const uint8_t *const aref_ptr[],
int32_t ref_stride, int32_t ref_stride, int32_t height,
int32_t height, uint32_t *sad_array) { uint32_t *sad_array) {
const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr; const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
int32_t ht_cnt; int32_t ht_cnt;
v16u8 src0, src1, src2, src3; v16u8 src0, src1, src2, src3;
@ -1114,8 +1115,8 @@ static uint32_t avgsad_8width_msa(const uint8_t *src, int32_t src_stride,
ref += (4 * ref_stride); ref += (4 * ref_stride);
LD_UB2(sec_pred, 16, pred0, pred1); LD_UB2(sec_pred, 16, pred0, pred1);
sec_pred += 32; sec_pred += 32;
PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2, PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2, src0, src1,
src0, src1, ref0, ref1); ref0, ref1);
AVER_UB2_UB(pred0, ref0, pred1, ref1, diff0, diff1); AVER_UB2_UB(pred0, ref0, pred1, ref1, diff0, diff1);
sad += SAD_UB2_UH(src0, src1, diff0, diff1); sad += SAD_UB2_UH(src0, src1, diff0, diff1);
} }
@ -1213,8 +1214,8 @@ static uint32_t avgsad_64width_msa(const uint8_t *src, int32_t src_stride,
ref += ref_stride; ref += ref_stride;
LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3); LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
sec_pred += 64; sec_pred += 64;
AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3, AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3, comp0,
comp0, comp1, comp2, comp3); comp1, comp2, comp3);
sad0 += SAD_UB2_UH(src0, src1, comp0, comp1); sad0 += SAD_UB2_UH(src0, src1, comp0, comp1);
sad1 += SAD_UB2_UH(src2, src3, comp2, comp3); sad1 += SAD_UB2_UH(src2, src3, comp2, comp3);
@ -1224,8 +1225,8 @@ static uint32_t avgsad_64width_msa(const uint8_t *src, int32_t src_stride,
ref += ref_stride; ref += ref_stride;
LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3); LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
sec_pred += 64; sec_pred += 64;
AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3, AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3, comp0,
comp0, comp1, comp2, comp3); comp1, comp2, comp3);
sad0 += SAD_UB2_UH(src0, src1, comp0, comp1); sad0 += SAD_UB2_UH(src0, src1, comp0, comp1);
sad1 += SAD_UB2_UH(src2, src3, comp2, comp3); sad1 += SAD_UB2_UH(src2, src3, comp2, comp3);
@ -1235,8 +1236,8 @@ static uint32_t avgsad_64width_msa(const uint8_t *src, int32_t src_stride,
ref += ref_stride; ref += ref_stride;
LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3); LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
sec_pred += 64; sec_pred += 64;
AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3, AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3, comp0,
comp0, comp1, comp2, comp3); comp1, comp2, comp3);
sad0 += SAD_UB2_UH(src0, src1, comp0, comp1); sad0 += SAD_UB2_UH(src0, src1, comp0, comp1);
sad1 += SAD_UB2_UH(src2, src3, comp2, comp3); sad1 += SAD_UB2_UH(src2, src3, comp2, comp3);
@ -1246,8 +1247,8 @@ static uint32_t avgsad_64width_msa(const uint8_t *src, int32_t src_stride,
ref += ref_stride; ref += ref_stride;
LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3); LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
sec_pred += 64; sec_pred += 64;
AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3, AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3, comp0,
comp0, comp1, comp2, comp3); comp1, comp2, comp3);
sad0 += SAD_UB2_UH(src0, src1, comp0, comp1); sad0 += SAD_UB2_UH(src0, src1, comp0, comp1);
sad1 += SAD_UB2_UH(src2, src3, comp2, comp3); sad1 += SAD_UB2_UH(src2, src3, comp2, comp3);
} }
@ -1259,179 +1260,179 @@ static uint32_t avgsad_64width_msa(const uint8_t *src, int32_t src_stride,
} }
#define VPX_SAD_4xHEIGHT_MSA(height) \ #define VPX_SAD_4xHEIGHT_MSA(height) \
uint32_t vpx_sad4x##height##_msa(const uint8_t *src, int32_t src_stride, \ uint32_t vpx_sad4x##height##_msa(const uint8_t *src, int32_t src_stride, \
const uint8_t *ref, int32_t ref_stride) { \ const uint8_t *ref, int32_t ref_stride) { \
return sad_4width_msa(src, src_stride, ref, ref_stride, height); \ return sad_4width_msa(src, src_stride, ref, ref_stride, height); \
} }
#define VPX_SAD_8xHEIGHT_MSA(height) \ #define VPX_SAD_8xHEIGHT_MSA(height) \
uint32_t vpx_sad8x##height##_msa(const uint8_t *src, int32_t src_stride, \ uint32_t vpx_sad8x##height##_msa(const uint8_t *src, int32_t src_stride, \
const uint8_t *ref, int32_t ref_stride) { \ const uint8_t *ref, int32_t ref_stride) { \
return sad_8width_msa(src, src_stride, ref, ref_stride, height); \ return sad_8width_msa(src, src_stride, ref, ref_stride, height); \
} }
#define VPX_SAD_16xHEIGHT_MSA(height) \ #define VPX_SAD_16xHEIGHT_MSA(height) \
uint32_t vpx_sad16x##height##_msa(const uint8_t *src, int32_t src_stride, \ uint32_t vpx_sad16x##height##_msa(const uint8_t *src, int32_t src_stride, \
const uint8_t *ref, int32_t ref_stride) { \ const uint8_t *ref, int32_t ref_stride) { \
return sad_16width_msa(src, src_stride, ref, ref_stride, height); \ return sad_16width_msa(src, src_stride, ref, ref_stride, height); \
} }
#define VPX_SAD_32xHEIGHT_MSA(height) \ #define VPX_SAD_32xHEIGHT_MSA(height) \
uint32_t vpx_sad32x##height##_msa(const uint8_t *src, int32_t src_stride, \ uint32_t vpx_sad32x##height##_msa(const uint8_t *src, int32_t src_stride, \
const uint8_t *ref, int32_t ref_stride) { \ const uint8_t *ref, int32_t ref_stride) { \
return sad_32width_msa(src, src_stride, ref, ref_stride, height); \ return sad_32width_msa(src, src_stride, ref, ref_stride, height); \
} }
#define VPX_SAD_64xHEIGHT_MSA(height) \ #define VPX_SAD_64xHEIGHT_MSA(height) \
uint32_t vpx_sad64x##height##_msa(const uint8_t *src, int32_t src_stride, \ uint32_t vpx_sad64x##height##_msa(const uint8_t *src, int32_t src_stride, \
const uint8_t *ref, int32_t ref_stride) { \ const uint8_t *ref, int32_t ref_stride) { \
return sad_64width_msa(src, src_stride, ref, ref_stride, height); \ return sad_64width_msa(src, src_stride, ref, ref_stride, height); \
} }
#define VPX_SAD_4xHEIGHTx3_MSA(height) \ #define VPX_SAD_4xHEIGHTx3_MSA(height) \
void vpx_sad4x##height##x3_msa(const uint8_t *src, int32_t src_stride, \ void vpx_sad4x##height##x3_msa(const uint8_t *src, int32_t src_stride, \
const uint8_t *ref, int32_t ref_stride, \ const uint8_t *ref, int32_t ref_stride, \
uint32_t *sads) { \ uint32_t *sads) { \
sad_4width_x3_msa(src, src_stride, ref, ref_stride, height, sads); \ sad_4width_x3_msa(src, src_stride, ref, ref_stride, height, sads); \
} }
#define VPX_SAD_8xHEIGHTx3_MSA(height) \ #define VPX_SAD_8xHEIGHTx3_MSA(height) \
void vpx_sad8x##height##x3_msa(const uint8_t *src, int32_t src_stride, \ void vpx_sad8x##height##x3_msa(const uint8_t *src, int32_t src_stride, \
const uint8_t *ref, int32_t ref_stride, \ const uint8_t *ref, int32_t ref_stride, \
uint32_t *sads) { \ uint32_t *sads) { \
sad_8width_x3_msa(src, src_stride, ref, ref_stride, height, sads); \ sad_8width_x3_msa(src, src_stride, ref, ref_stride, height, sads); \
} }
#define VPX_SAD_16xHEIGHTx3_MSA(height) \ #define VPX_SAD_16xHEIGHTx3_MSA(height) \
void vpx_sad16x##height##x3_msa(const uint8_t *src, int32_t src_stride, \ void vpx_sad16x##height##x3_msa(const uint8_t *src, int32_t src_stride, \
const uint8_t *ref, int32_t ref_stride, \ const uint8_t *ref, int32_t ref_stride, \
uint32_t *sads) { \ uint32_t *sads) { \
sad_16width_x3_msa(src, src_stride, ref, ref_stride, height, sads); \ sad_16width_x3_msa(src, src_stride, ref, ref_stride, height, sads); \
} }
#define VPX_SAD_32xHEIGHTx3_MSA(height) \ #define VPX_SAD_32xHEIGHTx3_MSA(height) \
void vpx_sad32x##height##x3_msa(const uint8_t *src, int32_t src_stride, \ void vpx_sad32x##height##x3_msa(const uint8_t *src, int32_t src_stride, \
const uint8_t *ref, int32_t ref_stride, \ const uint8_t *ref, int32_t ref_stride, \
uint32_t *sads) { \ uint32_t *sads) { \
sad_32width_x3_msa(src, src_stride, ref, ref_stride, height, sads); \ sad_32width_x3_msa(src, src_stride, ref, ref_stride, height, sads); \
} }
#define VPX_SAD_64xHEIGHTx3_MSA(height) \ #define VPX_SAD_64xHEIGHTx3_MSA(height) \
void vpx_sad64x##height##x3_msa(const uint8_t *src, int32_t src_stride, \ void vpx_sad64x##height##x3_msa(const uint8_t *src, int32_t src_stride, \
const uint8_t *ref, int32_t ref_stride, \ const uint8_t *ref, int32_t ref_stride, \
uint32_t *sads) { \ uint32_t *sads) { \
sad_64width_x3_msa(src, src_stride, ref, ref_stride, height, sads); \ sad_64width_x3_msa(src, src_stride, ref, ref_stride, height, sads); \
} }
#define VPX_SAD_4xHEIGHTx8_MSA(height) \ #define VPX_SAD_4xHEIGHTx8_MSA(height) \
void vpx_sad4x##height##x8_msa(const uint8_t *src, int32_t src_stride, \ void vpx_sad4x##height##x8_msa(const uint8_t *src, int32_t src_stride, \
const uint8_t *ref, int32_t ref_stride, \ const uint8_t *ref, int32_t ref_stride, \
uint32_t *sads) { \ uint32_t *sads) { \
sad_4width_x8_msa(src, src_stride, ref, ref_stride, height, sads); \ sad_4width_x8_msa(src, src_stride, ref, ref_stride, height, sads); \
} }
#define VPX_SAD_8xHEIGHTx8_MSA(height) \ #define VPX_SAD_8xHEIGHTx8_MSA(height) \
void vpx_sad8x##height##x8_msa(const uint8_t *src, int32_t src_stride, \ void vpx_sad8x##height##x8_msa(const uint8_t *src, int32_t src_stride, \
const uint8_t *ref, int32_t ref_stride, \ const uint8_t *ref, int32_t ref_stride, \
uint32_t *sads) { \ uint32_t *sads) { \
sad_8width_x8_msa(src, src_stride, ref, ref_stride, height, sads); \ sad_8width_x8_msa(src, src_stride, ref, ref_stride, height, sads); \
} }
#define VPX_SAD_16xHEIGHTx8_MSA(height) \ #define VPX_SAD_16xHEIGHTx8_MSA(height) \
void vpx_sad16x##height##x8_msa(const uint8_t *src, int32_t src_stride, \ void vpx_sad16x##height##x8_msa(const uint8_t *src, int32_t src_stride, \
const uint8_t *ref, int32_t ref_stride, \ const uint8_t *ref, int32_t ref_stride, \
uint32_t *sads) { \ uint32_t *sads) { \
sad_16width_x8_msa(src, src_stride, ref, ref_stride, height, sads); \ sad_16width_x8_msa(src, src_stride, ref, ref_stride, height, sads); \
} }
#define VPX_SAD_32xHEIGHTx8_MSA(height) \ #define VPX_SAD_32xHEIGHTx8_MSA(height) \
void vpx_sad32x##height##x8_msa(const uint8_t *src, int32_t src_stride, \ void vpx_sad32x##height##x8_msa(const uint8_t *src, int32_t src_stride, \
const uint8_t *ref, int32_t ref_stride, \ const uint8_t *ref, int32_t ref_stride, \
uint32_t *sads) { \ uint32_t *sads) { \
sad_32width_x8_msa(src, src_stride, ref, ref_stride, height, sads); \ sad_32width_x8_msa(src, src_stride, ref, ref_stride, height, sads); \
} }
#define VPX_SAD_64xHEIGHTx8_MSA(height) \ #define VPX_SAD_64xHEIGHTx8_MSA(height) \
void vpx_sad64x##height##x8_msa(const uint8_t *src, int32_t src_stride, \ void vpx_sad64x##height##x8_msa(const uint8_t *src, int32_t src_stride, \
const uint8_t *ref, int32_t ref_stride, \ const uint8_t *ref, int32_t ref_stride, \
uint32_t *sads) { \ uint32_t *sads) { \
sad_64width_x8_msa(src, src_stride, ref, ref_stride, height, sads); \ sad_64width_x8_msa(src, src_stride, ref, ref_stride, height, sads); \
} }
#define VPX_SAD_4xHEIGHTx4D_MSA(height) \ #define VPX_SAD_4xHEIGHTx4D_MSA(height) \
void vpx_sad4x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \ void vpx_sad4x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \
const uint8_t *const refs[], \ const uint8_t *const refs[], \
int32_t ref_stride, uint32_t *sads) { \ int32_t ref_stride, uint32_t *sads) { \
sad_4width_x4d_msa(src, src_stride, refs, ref_stride, height, sads); \ sad_4width_x4d_msa(src, src_stride, refs, ref_stride, height, sads); \
} }
#define VPX_SAD_8xHEIGHTx4D_MSA(height) \ #define VPX_SAD_8xHEIGHTx4D_MSA(height) \
void vpx_sad8x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \ void vpx_sad8x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \
const uint8_t *const refs[], \ const uint8_t *const refs[], \
int32_t ref_stride, uint32_t *sads) { \ int32_t ref_stride, uint32_t *sads) { \
sad_8width_x4d_msa(src, src_stride, refs, ref_stride, height, sads); \ sad_8width_x4d_msa(src, src_stride, refs, ref_stride, height, sads); \
} }
#define VPX_SAD_16xHEIGHTx4D_MSA(height) \ #define VPX_SAD_16xHEIGHTx4D_MSA(height) \
void vpx_sad16x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \ void vpx_sad16x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \
const uint8_t *const refs[], \ const uint8_t *const refs[], \
int32_t ref_stride, uint32_t *sads) { \ int32_t ref_stride, uint32_t *sads) { \
sad_16width_x4d_msa(src, src_stride, refs, ref_stride, height, sads); \ sad_16width_x4d_msa(src, src_stride, refs, ref_stride, height, sads); \
} }
#define VPX_SAD_32xHEIGHTx4D_MSA(height) \ #define VPX_SAD_32xHEIGHTx4D_MSA(height) \
void vpx_sad32x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \ void vpx_sad32x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \
const uint8_t *const refs[], \ const uint8_t *const refs[], \
int32_t ref_stride, uint32_t *sads) { \ int32_t ref_stride, uint32_t *sads) { \
sad_32width_x4d_msa(src, src_stride, refs, ref_stride, height, sads); \ sad_32width_x4d_msa(src, src_stride, refs, ref_stride, height, sads); \
} }
#define VPX_SAD_64xHEIGHTx4D_MSA(height) \ #define VPX_SAD_64xHEIGHTx4D_MSA(height) \
void vpx_sad64x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \ void vpx_sad64x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \
const uint8_t *const refs[], \ const uint8_t *const refs[], \
int32_t ref_stride, uint32_t *sads) { \ int32_t ref_stride, uint32_t *sads) { \
sad_64width_x4d_msa(src, src_stride, refs, ref_stride, height, sads); \ sad_64width_x4d_msa(src, src_stride, refs, ref_stride, height, sads); \
} }
#define VPX_AVGSAD_4xHEIGHT_MSA(height) \ #define VPX_AVGSAD_4xHEIGHT_MSA(height) \
uint32_t vpx_sad4x##height##_avg_msa(const uint8_t *src, int32_t src_stride, \ uint32_t vpx_sad4x##height##_avg_msa(const uint8_t *src, int32_t src_stride, \
const uint8_t *ref, int32_t ref_stride, \ const uint8_t *ref, int32_t ref_stride, \
const uint8_t *second_pred) { \ const uint8_t *second_pred) { \
return avgsad_4width_msa(src, src_stride, ref, ref_stride, \ return avgsad_4width_msa(src, src_stride, ref, ref_stride, height, \
height, second_pred); \ second_pred); \
} }
#define VPX_AVGSAD_8xHEIGHT_MSA(height) \ #define VPX_AVGSAD_8xHEIGHT_MSA(height) \
uint32_t vpx_sad8x##height##_avg_msa(const uint8_t *src, int32_t src_stride, \ uint32_t vpx_sad8x##height##_avg_msa(const uint8_t *src, int32_t src_stride, \
const uint8_t *ref, int32_t ref_stride, \ const uint8_t *ref, int32_t ref_stride, \
const uint8_t *second_pred) { \ const uint8_t *second_pred) { \
return avgsad_8width_msa(src, src_stride, ref, ref_stride, \ return avgsad_8width_msa(src, src_stride, ref, ref_stride, height, \
height, second_pred); \ second_pred); \
} }
#define VPX_AVGSAD_16xHEIGHT_MSA(height) \ #define VPX_AVGSAD_16xHEIGHT_MSA(height) \
uint32_t vpx_sad16x##height##_avg_msa(const uint8_t *src, int32_t src_stride, \ uint32_t vpx_sad16x##height##_avg_msa( \
const uint8_t *ref, int32_t ref_stride, \ const uint8_t *src, int32_t src_stride, const uint8_t *ref, \
const uint8_t *second_pred) { \ int32_t ref_stride, const uint8_t *second_pred) { \
return avgsad_16width_msa(src, src_stride, ref, ref_stride, \ return avgsad_16width_msa(src, src_stride, ref, ref_stride, height, \
height, second_pred); \ second_pred); \
} }
#define VPX_AVGSAD_32xHEIGHT_MSA(height) \ #define VPX_AVGSAD_32xHEIGHT_MSA(height) \
uint32_t vpx_sad32x##height##_avg_msa(const uint8_t *src, int32_t src_stride, \ uint32_t vpx_sad32x##height##_avg_msa( \
const uint8_t *ref, int32_t ref_stride, \ const uint8_t *src, int32_t src_stride, const uint8_t *ref, \
const uint8_t *second_pred) { \ int32_t ref_stride, const uint8_t *second_pred) { \
return avgsad_32width_msa(src, src_stride, ref, ref_stride, \ return avgsad_32width_msa(src, src_stride, ref, ref_stride, height, \
height, second_pred); \ second_pred); \
} }
#define VPX_AVGSAD_64xHEIGHT_MSA(height) \ #define VPX_AVGSAD_64xHEIGHT_MSA(height) \
uint32_t vpx_sad64x##height##_avg_msa(const uint8_t *src, int32_t src_stride, \ uint32_t vpx_sad64x##height##_avg_msa( \
const uint8_t *ref, int32_t ref_stride, \ const uint8_t *src, int32_t src_stride, const uint8_t *ref, \
const uint8_t *second_pred) { \ int32_t ref_stride, const uint8_t *second_pred) { \
return avgsad_64width_msa(src, src_stride, ref, ref_stride, \ return avgsad_64width_msa(src, src_stride, ref, ref_stride, height, \
height, second_pred); \ second_pred); \
} }
// 64x64 // 64x64
VPX_SAD_64xHEIGHT_MSA(64); VPX_SAD_64xHEIGHT_MSA(64);

File diff suppressed because it is too large Load Diff

View File

@ -68,8 +68,8 @@ static void sub_blk_16x16_msa(const uint8_t *src, int32_t src_stride,
LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
src += (8 * src_stride); src += (8 * src_stride);
LD_SB8(pred, pred_stride, LD_SB8(pred, pred_stride, pred0, pred1, pred2, pred3, pred4, pred5, pred6,
pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7); pred7);
pred += (8 * pred_stride); pred += (8 * pred_stride);
ILVRL_B2_UB(src0, pred0, src_l0, src_l1); ILVRL_B2_UB(src0, pred0, src_l0, src_l1);
@ -226,31 +226,31 @@ static void sub_blk_64x64_msa(const uint8_t *src, int32_t src_stride,
} }
} }
void vpx_subtract_block_msa(int32_t rows, int32_t cols, void vpx_subtract_block_msa(int32_t rows, int32_t cols, int16_t *diff_ptr,
int16_t *diff_ptr, ptrdiff_t diff_stride, ptrdiff_t diff_stride, const uint8_t *src_ptr,
const uint8_t *src_ptr, ptrdiff_t src_stride, ptrdiff_t src_stride, const uint8_t *pred_ptr,
const uint8_t *pred_ptr, ptrdiff_t pred_stride) { ptrdiff_t pred_stride) {
if (rows == cols) { if (rows == cols) {
switch (rows) { switch (rows) {
case 4: case 4:
sub_blk_4x4_msa(src_ptr, src_stride, pred_ptr, pred_stride, sub_blk_4x4_msa(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr,
diff_ptr, diff_stride); diff_stride);
break; break;
case 8: case 8:
sub_blk_8x8_msa(src_ptr, src_stride, pred_ptr, pred_stride, sub_blk_8x8_msa(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr,
diff_ptr, diff_stride); diff_stride);
break; break;
case 16: case 16:
sub_blk_16x16_msa(src_ptr, src_stride, pred_ptr, pred_stride, sub_blk_16x16_msa(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr,
diff_ptr, diff_stride); diff_stride);
break; break;
case 32: case 32:
sub_blk_32x32_msa(src_ptr, src_stride, pred_ptr, pred_stride, sub_blk_32x32_msa(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr,
diff_ptr, diff_stride); diff_stride);
break; break;
case 64: case 64:
sub_blk_64x64_msa(src_ptr, src_stride, pred_ptr, pred_stride, sub_blk_64x64_msa(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr,
diff_ptr, diff_stride); diff_stride);
break; break;
default: default:
vpx_subtract_block_c(rows, cols, diff_ptr, diff_stride, src_ptr, vpx_subtract_block_c(rows, cols, diff_ptr, diff_stride, src_ptr,

View File

@ -13,7 +13,8 @@
#include "vpx_dsp/mips/macros_msa.h" #include "vpx_dsp/mips/macros_msa.h"
#define DOTP_CONST_PAIR(reg0, reg1, cnst0, cnst1, out0, out1) { \ #define DOTP_CONST_PAIR(reg0, reg1, cnst0, cnst1, out0, out1) \
{ \
v8i16 k0_m = __msa_fill_h(cnst0); \ v8i16 k0_m = __msa_fill_h(cnst0); \
v4i32 s0_m, s1_m, s2_m, s3_m; \ v4i32 s0_m, s1_m, s2_m, s3_m; \
\ \
@ -29,26 +30,28 @@
DOTP_SH2_SW(s3_m, s2_m, k0_m, k0_m, s1_m, s0_m); \ DOTP_SH2_SW(s3_m, s2_m, k0_m, k0_m, s1_m, s0_m); \
SRARI_W2_SW(s1_m, s0_m, DCT_CONST_BITS); \ SRARI_W2_SW(s1_m, s0_m, DCT_CONST_BITS); \
out1 = __msa_pckev_h((v8i16)s0_m, (v8i16)s1_m); \ out1 = __msa_pckev_h((v8i16)s0_m, (v8i16)s1_m); \
} }
#define DOT_ADD_SUB_SRARI_PCK(in0, in1, in2, in3, in4, in5, in6, in7, \ #define DOT_ADD_SUB_SRARI_PCK(in0, in1, in2, in3, in4, in5, in6, in7, dst0, \
dst0, dst1, dst2, dst3) { \ dst1, dst2, dst3) \
{ \
v4i32 tp0_m, tp1_m, tp2_m, tp3_m, tp4_m; \ v4i32 tp0_m, tp1_m, tp2_m, tp3_m, tp4_m; \
v4i32 tp5_m, tp6_m, tp7_m, tp8_m, tp9_m; \ v4i32 tp5_m, tp6_m, tp7_m, tp8_m, tp9_m; \
\ \
DOTP_SH4_SW(in0, in1, in0, in1, in4, in4, in5, in5, \ DOTP_SH4_SW(in0, in1, in0, in1, in4, in4, in5, in5, tp0_m, tp2_m, tp3_m, \
tp0_m, tp2_m, tp3_m, tp4_m); \ tp4_m); \
DOTP_SH4_SW(in2, in3, in2, in3, in6, in6, in7, in7, \ DOTP_SH4_SW(in2, in3, in2, in3, in6, in6, in7, in7, tp5_m, tp6_m, tp7_m, \
tp5_m, tp6_m, tp7_m, tp8_m); \ tp8_m); \
BUTTERFLY_4(tp0_m, tp3_m, tp7_m, tp5_m, tp1_m, tp9_m, tp7_m, tp5_m); \ BUTTERFLY_4(tp0_m, tp3_m, tp7_m, tp5_m, tp1_m, tp9_m, tp7_m, tp5_m); \
BUTTERFLY_4(tp2_m, tp4_m, tp8_m, tp6_m, tp3_m, tp0_m, tp4_m, tp2_m); \ BUTTERFLY_4(tp2_m, tp4_m, tp8_m, tp6_m, tp3_m, tp0_m, tp4_m, tp2_m); \
SRARI_W4_SW(tp1_m, tp9_m, tp7_m, tp5_m, DCT_CONST_BITS); \ SRARI_W4_SW(tp1_m, tp9_m, tp7_m, tp5_m, DCT_CONST_BITS); \
SRARI_W4_SW(tp3_m, tp0_m, tp4_m, tp2_m, DCT_CONST_BITS); \ SRARI_W4_SW(tp3_m, tp0_m, tp4_m, tp2_m, DCT_CONST_BITS); \
PCKEV_H4_SH(tp1_m, tp3_m, tp9_m, tp0_m, tp7_m, tp4_m, tp5_m, tp2_m, \ PCKEV_H4_SH(tp1_m, tp3_m, tp9_m, tp0_m, tp7_m, tp4_m, tp5_m, tp2_m, dst0, \
dst0, dst1, dst2, dst3); \ dst1, dst2, dst3); \
} }
#define DOT_SHIFT_RIGHT_PCK_H(in0, in1, in2) ({ \ #define DOT_SHIFT_RIGHT_PCK_H(in0, in1, in2) \
({ \
v8i16 dst_m; \ v8i16 dst_m; \
v4i32 tp0_m, tp1_m; \ v4i32 tp0_m, tp1_m; \
\ \
@ -57,37 +60,37 @@
dst_m = __msa_pckev_h((v8i16)tp1_m, (v8i16)tp0_m); \ dst_m = __msa_pckev_h((v8i16)tp1_m, (v8i16)tp0_m); \
\ \
dst_m; \ dst_m; \
}) })
#define MADD_SHORT(m0, m1, c0, c1, res0, res1) { \ #define MADD_SHORT(m0, m1, c0, c1, res0, res1) \
{ \
v4i32 madd0_m, madd1_m, madd2_m, madd3_m; \ v4i32 madd0_m, madd1_m, madd2_m, madd3_m; \
v8i16 madd_s0_m, madd_s1_m; \ v8i16 madd_s0_m, madd_s1_m; \
\ \
ILVRL_H2_SH(m1, m0, madd_s0_m, madd_s1_m); \ ILVRL_H2_SH(m1, m0, madd_s0_m, madd_s1_m); \
DOTP_SH4_SW(madd_s0_m, madd_s1_m, madd_s0_m, madd_s1_m, \ DOTP_SH4_SW(madd_s0_m, madd_s1_m, madd_s0_m, madd_s1_m, c0, c0, c1, c1, \
c0, c0, c1, c1, madd0_m, madd1_m, madd2_m, madd3_m); \ madd0_m, madd1_m, madd2_m, madd3_m); \
SRARI_W4_SW(madd0_m, madd1_m, madd2_m, madd3_m, DCT_CONST_BITS); \ SRARI_W4_SW(madd0_m, madd1_m, madd2_m, madd3_m, DCT_CONST_BITS); \
PCKEV_H2_SH(madd1_m, madd0_m, madd3_m, madd2_m, res0, res1); \ PCKEV_H2_SH(madd1_m, madd0_m, madd3_m, madd2_m, res0, res1); \
} }
#define MADD_BF(inp0, inp1, inp2, inp3, cst0, cst1, cst2, cst3, \ #define MADD_BF(inp0, inp1, inp2, inp3, cst0, cst1, cst2, cst3, out0, out1, \
out0, out1, out2, out3) { \ out2, out3) \
{ \
v8i16 madd_s0_m, madd_s1_m, madd_s2_m, madd_s3_m; \ v8i16 madd_s0_m, madd_s1_m, madd_s2_m, madd_s3_m; \
v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m, m4_m, m5_m; \ v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m, m4_m, m5_m; \
\ \
ILVRL_H2_SH(inp1, inp0, madd_s0_m, madd_s1_m); \ ILVRL_H2_SH(inp1, inp0, madd_s0_m, madd_s1_m); \
ILVRL_H2_SH(inp3, inp2, madd_s2_m, madd_s3_m); \ ILVRL_H2_SH(inp3, inp2, madd_s2_m, madd_s3_m); \
DOTP_SH4_SW(madd_s0_m, madd_s1_m, madd_s2_m, madd_s3_m, \ DOTP_SH4_SW(madd_s0_m, madd_s1_m, madd_s2_m, madd_s3_m, cst0, cst0, cst2, \
cst0, cst0, cst2, cst2, tmp0_m, tmp1_m, tmp2_m, tmp3_m); \ cst2, tmp0_m, tmp1_m, tmp2_m, tmp3_m); \
BUTTERFLY_4(tmp0_m, tmp1_m, tmp3_m, tmp2_m, \ BUTTERFLY_4(tmp0_m, tmp1_m, tmp3_m, tmp2_m, m4_m, m5_m, tmp3_m, tmp2_m); \
m4_m, m5_m, tmp3_m, tmp2_m); \
SRARI_W4_SW(m4_m, m5_m, tmp2_m, tmp3_m, DCT_CONST_BITS); \ SRARI_W4_SW(m4_m, m5_m, tmp2_m, tmp3_m, DCT_CONST_BITS); \
PCKEV_H2_SH(m5_m, m4_m, tmp3_m, tmp2_m, out0, out1); \ PCKEV_H2_SH(m5_m, m4_m, tmp3_m, tmp2_m, out0, out1); \
DOTP_SH4_SW(madd_s0_m, madd_s1_m, madd_s2_m, madd_s3_m, \ DOTP_SH4_SW(madd_s0_m, madd_s1_m, madd_s2_m, madd_s3_m, cst1, cst1, cst3, \
cst1, cst1, cst3, cst3, tmp0_m, tmp1_m, tmp2_m, tmp3_m); \ cst3, tmp0_m, tmp1_m, tmp2_m, tmp3_m); \
BUTTERFLY_4(tmp0_m, tmp1_m, tmp3_m, tmp2_m, \ BUTTERFLY_4(tmp0_m, tmp1_m, tmp3_m, tmp2_m, m4_m, m5_m, tmp3_m, tmp2_m); \
m4_m, m5_m, tmp3_m, tmp2_m); \
SRARI_W4_SW(m4_m, m5_m, tmp2_m, tmp3_m, DCT_CONST_BITS); \ SRARI_W4_SW(m4_m, m5_m, tmp2_m, tmp3_m, DCT_CONST_BITS); \
PCKEV_H2_SH(m5_m, m4_m, tmp3_m, tmp2_m, out2, out3); \ PCKEV_H2_SH(m5_m, m4_m, tmp3_m, tmp2_m, out2, out3); \
} }
#endif // VPX_DSP_MIPS_TXFM_MACROS_MIPS_MSA_H_ #endif // VPX_DSP_MIPS_TXFM_MACROS_MIPS_MSA_H_

View File

@ -11,16 +11,18 @@
#include "./vpx_dsp_rtcd.h" #include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/mips/macros_msa.h" #include "vpx_dsp/mips/macros_msa.h"
#define CALC_MSE_B(src, ref, var) { \ #define CALC_MSE_B(src, ref, var) \
{ \
v16u8 src_l0_m, src_l1_m; \ v16u8 src_l0_m, src_l1_m; \
v8i16 res_l0_m, res_l1_m; \ v8i16 res_l0_m, res_l1_m; \
\ \
ILVRL_B2_UB(src, ref, src_l0_m, src_l1_m); \ ILVRL_B2_UB(src, ref, src_l0_m, src_l1_m); \
HSUB_UB2_SH(src_l0_m, src_l1_m, res_l0_m, res_l1_m); \ HSUB_UB2_SH(src_l0_m, src_l1_m, res_l0_m, res_l1_m); \
DPADD_SH2_SW(res_l0_m, res_l1_m, res_l0_m, res_l1_m, var, var); \ DPADD_SH2_SW(res_l0_m, res_l1_m, res_l0_m, res_l1_m, var, var); \
} }
#define CALC_MSE_AVG_B(src, ref, var, sub) { \ #define CALC_MSE_AVG_B(src, ref, var, sub) \
{ \
v16u8 src_l0_m, src_l1_m; \ v16u8 src_l0_m, src_l1_m; \
v8i16 res_l0_m, res_l1_m; \ v8i16 res_l0_m, res_l1_m; \
\ \
@ -29,10 +31,9 @@
DPADD_SH2_SW(res_l0_m, res_l1_m, res_l0_m, res_l1_m, var, var); \ DPADD_SH2_SW(res_l0_m, res_l1_m, res_l0_m, res_l1_m, var, var); \
\ \
sub += res_l0_m + res_l1_m; \ sub += res_l0_m + res_l1_m; \
} }
#define VARIANCE_WxH(sse, diff, shift) \ #define VARIANCE_WxH(sse, diff, shift) sse - (((uint32_t)diff * diff) >> shift)
sse - (((uint32_t)diff * diff) >> shift)
#define VARIANCE_LARGE_WxH(sse, diff, shift) \ #define VARIANCE_LARGE_WxH(sse, diff, shift) \
sse - (((int64_t)diff * diff) >> shift) sse - (((int64_t)diff * diff) >> shift)
@ -80,8 +81,8 @@ static uint32_t sse_diff_8width_msa(const uint8_t *src_ptr, int32_t src_stride,
LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3); LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
ref_ptr += (4 * ref_stride); ref_ptr += (4 * ref_stride);
PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2, PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2, src0, src1,
src0, src1, ref0, ref1); ref0, ref1);
CALC_MSE_AVG_B(src0, ref0, var, avg); CALC_MSE_AVG_B(src0, ref0, var, avg);
CALC_MSE_AVG_B(src1, ref1, var, avg); CALC_MSE_AVG_B(src1, ref1, var, avg);
} }
@ -370,8 +371,8 @@ static uint32_t sse_8width_msa(const uint8_t *src_ptr, int32_t src_stride,
LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3); LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
ref_ptr += (4 * ref_stride); ref_ptr += (4 * ref_stride);
PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2, PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2, src0, src1,
src0, src1, ref0, ref1); ref0, ref1);
CALC_MSE_B(src0, ref0, var); CALC_MSE_B(src0, ref0, var);
CALC_MSE_B(src1, ref1, var); CALC_MSE_B(src1, ref1, var);
} }
@ -527,18 +528,16 @@ uint32_t vpx_get4x4sse_cs_msa(const uint8_t *src_ptr, int32_t src_stride,
#define VARIANCE_64Wx64H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 12); #define VARIANCE_64Wx64H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 12);
#define VPX_VARIANCE_WDXHT_MSA(wd, ht) \ #define VPX_VARIANCE_WDXHT_MSA(wd, ht) \
uint32_t vpx_variance##wd##x##ht##_msa(const uint8_t *src, \ uint32_t vpx_variance##wd##x##ht##_msa( \
int32_t src_stride, \ const uint8_t *src, int32_t src_stride, const uint8_t *ref, \
const uint8_t *ref, \ int32_t ref_stride, uint32_t *sse) { \
int32_t ref_stride, \
uint32_t *sse) { \
int32_t diff; \ int32_t diff; \
\ \
*sse = sse_diff_##wd##width_msa(src, src_stride, ref, ref_stride, \ *sse = \
ht, &diff); \ sse_diff_##wd##width_msa(src, src_stride, ref, ref_stride, ht, &diff); \
\ \
return VARIANCE_##wd##Wx##ht##H(*sse, diff); \ return VARIANCE_##wd##Wx##ht##H(*sse, diff); \
} }
VPX_VARIANCE_WDXHT_MSA(4, 4); VPX_VARIANCE_WDXHT_MSA(4, 4);
VPX_VARIANCE_WDXHT_MSA(4, 8); VPX_VARIANCE_WDXHT_MSA(4, 8);
@ -585,8 +584,7 @@ uint32_t vpx_variance64x64_msa(const uint8_t *src, int32_t src_stride,
} }
uint32_t vpx_mse8x8_msa(const uint8_t *src, int32_t src_stride, uint32_t vpx_mse8x8_msa(const uint8_t *src, int32_t src_stride,
const uint8_t *ref, int32_t ref_stride, const uint8_t *ref, int32_t ref_stride, uint32_t *sse) {
uint32_t *sse) {
*sse = sse_8width_msa(src, src_stride, ref, ref_stride, 8); *sse = sse_8width_msa(src, src_stride, ref, ref_stride, 8);
return *sse; return *sse;
@ -617,17 +615,15 @@ uint32_t vpx_mse16x16_msa(const uint8_t *src, int32_t src_stride,
} }
void vpx_get8x8var_msa(const uint8_t *src, int32_t src_stride, void vpx_get8x8var_msa(const uint8_t *src, int32_t src_stride,
const uint8_t *ref, int32_t ref_stride, const uint8_t *ref, int32_t ref_stride, uint32_t *sse,
uint32_t *sse, int32_t *sum) { int32_t *sum) {
*sse = sse_diff_8width_msa(src, src_stride, ref, ref_stride, 8, sum); *sse = sse_diff_8width_msa(src, src_stride, ref, ref_stride, 8, sum);
} }
void vpx_get16x16var_msa(const uint8_t *src, int32_t src_stride, void vpx_get16x16var_msa(const uint8_t *src, int32_t src_stride,
const uint8_t *ref, int32_t ref_stride, const uint8_t *ref, int32_t ref_stride, uint32_t *sse,
uint32_t *sse, int32_t *sum) { int32_t *sum) {
*sse = sse_diff_16width_msa(src, src_stride, ref, ref_stride, 16, sum); *sse = sse_diff_16width_msa(src, src_stride, ref, ref_stride, 16, sum);
} }
uint32_t vpx_get_mb_ss_msa(const int16_t *src) { uint32_t vpx_get_mb_ss_msa(const int16_t *src) { return get_mb_ss_msa(src); }
return get_mb_ss_msa(src);
}

View File

@ -13,8 +13,7 @@
#include "vpx_dsp/mips/vpx_convolve_msa.h" #include "vpx_dsp/mips/vpx_convolve_msa.h"
static void common_hz_8t_and_aver_dst_4x4_msa(const uint8_t *src, static void common_hz_8t_and_aver_dst_4x4_msa(const uint8_t *src,
int32_t src_stride, int32_t src_stride, uint8_t *dst,
uint8_t *dst,
int32_t dst_stride, int32_t dst_stride,
int8_t *filter) { int8_t *filter) {
v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
@ -48,8 +47,7 @@ static void common_hz_8t_and_aver_dst_4x4_msa(const uint8_t *src,
} }
static void common_hz_8t_and_aver_dst_4x8_msa(const uint8_t *src, static void common_hz_8t_and_aver_dst_4x8_msa(const uint8_t *src,
int32_t src_stride, int32_t src_stride, uint8_t *dst,
uint8_t *dst,
int32_t dst_stride, int32_t dst_stride,
int8_t *filter) { int8_t *filter) {
v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
@ -92,10 +90,8 @@ static void common_hz_8t_and_aver_dst_4x8_msa(const uint8_t *src,
} }
static void common_hz_8t_and_aver_dst_4w_msa(const uint8_t *src, static void common_hz_8t_and_aver_dst_4w_msa(const uint8_t *src,
int32_t src_stride, int32_t src_stride, uint8_t *dst,
uint8_t *dst, int32_t dst_stride, int8_t *filter,
int32_t dst_stride,
int8_t *filter,
int32_t height) { int32_t height) {
if (4 == height) { if (4 == height) {
common_hz_8t_and_aver_dst_4x4_msa(src, src_stride, dst, dst_stride, filter); common_hz_8t_and_aver_dst_4x4_msa(src, src_stride, dst, dst_stride, filter);
@ -105,10 +101,8 @@ static void common_hz_8t_and_aver_dst_4w_msa(const uint8_t *src,
} }
static void common_hz_8t_and_aver_dst_8w_msa(const uint8_t *src, static void common_hz_8t_and_aver_dst_8w_msa(const uint8_t *src,
int32_t src_stride, int32_t src_stride, uint8_t *dst,
uint8_t *dst, int32_t dst_stride, int8_t *filter,
int32_t dst_stride,
int8_t *filter,
int32_t height) { int32_t height) {
int32_t loop_cnt; int32_t loop_cnt;
v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
@ -136,18 +130,16 @@ static void common_hz_8t_and_aver_dst_8w_msa(const uint8_t *src,
LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS); SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
SAT_SH4_SH(out0, out1, out2, out3, 7); SAT_SH4_SH(out0, out1, out2, out3, 7);
CONVERT_UB_AVG_ST8x4_UB(out0, out1, out2, out3, dst0, dst1, dst2, dst3, CONVERT_UB_AVG_ST8x4_UB(out0, out1, out2, out3, dst0, dst1, dst2, dst3, dst,
dst, dst_stride); dst_stride);
dst += (4 * dst_stride); dst += (4 * dst_stride);
} }
} }
static void common_hz_8t_and_aver_dst_16w_msa(const uint8_t *src, static void common_hz_8t_and_aver_dst_16w_msa(const uint8_t *src,
int32_t src_stride, int32_t src_stride, uint8_t *dst,
uint8_t *dst,
int32_t dst_stride, int32_t dst_stride,
int8_t *filter, int8_t *filter, int32_t height) {
int32_t height) {
int32_t loop_cnt; int32_t loop_cnt;
v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
v16u8 mask0, mask1, mask2, mask3, dst0, dst1; v16u8 mask0, mask1, mask2, mask3, dst0, dst1;
@ -199,11 +191,9 @@ static void common_hz_8t_and_aver_dst_16w_msa(const uint8_t *src,
} }
static void common_hz_8t_and_aver_dst_32w_msa(const uint8_t *src, static void common_hz_8t_and_aver_dst_32w_msa(const uint8_t *src,
int32_t src_stride, int32_t src_stride, uint8_t *dst,
uint8_t *dst,
int32_t dst_stride, int32_t dst_stride,
int8_t *filter, int8_t *filter, int32_t height) {
int32_t height) {
uint32_t loop_cnt; uint32_t loop_cnt;
v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
v16u8 dst1, dst2, mask0, mask1, mask2, mask3; v16u8 dst1, dst2, mask0, mask1, mask2, mask3;
@ -256,11 +246,9 @@ static void common_hz_8t_and_aver_dst_32w_msa(const uint8_t *src,
} }
static void common_hz_8t_and_aver_dst_64w_msa(const uint8_t *src, static void common_hz_8t_and_aver_dst_64w_msa(const uint8_t *src,
int32_t src_stride, int32_t src_stride, uint8_t *dst,
uint8_t *dst,
int32_t dst_stride, int32_t dst_stride,
int8_t *filter, int8_t *filter, int32_t height) {
int32_t height) {
uint32_t loop_cnt, cnt; uint32_t loop_cnt, cnt;
v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
v16u8 dst1, dst2, mask0, mask1, mask2, mask3; v16u8 dst1, dst2, mask0, mask1, mask2, mask3;
@ -318,8 +306,7 @@ static void common_hz_8t_and_aver_dst_64w_msa(const uint8_t *src,
} }
static void common_hz_2t_and_aver_dst_4x4_msa(const uint8_t *src, static void common_hz_2t_and_aver_dst_4x4_msa(const uint8_t *src,
int32_t src_stride, int32_t src_stride, uint8_t *dst,
uint8_t *dst,
int32_t dst_stride, int32_t dst_stride,
int8_t *filter) { int8_t *filter) {
v16i8 src0, src1, src2, src3, mask; v16i8 src0, src1, src2, src3, mask;
@ -344,8 +331,7 @@ static void common_hz_2t_and_aver_dst_4x4_msa(const uint8_t *src,
} }
static void common_hz_2t_and_aver_dst_4x8_msa(const uint8_t *src, static void common_hz_2t_and_aver_dst_4x8_msa(const uint8_t *src,
int32_t src_stride, int32_t src_stride, uint8_t *dst,
uint8_t *dst,
int32_t dst_stride, int32_t dst_stride,
int8_t *filter) { int8_t *filter) {
v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
@ -378,10 +364,8 @@ static void common_hz_2t_and_aver_dst_4x8_msa(const uint8_t *src,
} }
static void common_hz_2t_and_aver_dst_4w_msa(const uint8_t *src, static void common_hz_2t_and_aver_dst_4w_msa(const uint8_t *src,
int32_t src_stride, int32_t src_stride, uint8_t *dst,
uint8_t *dst, int32_t dst_stride, int8_t *filter,
int32_t dst_stride,
int8_t *filter,
int32_t height) { int32_t height) {
if (4 == height) { if (4 == height) {
common_hz_2t_and_aver_dst_4x4_msa(src, src_stride, dst, dst_stride, filter); common_hz_2t_and_aver_dst_4x4_msa(src, src_stride, dst, dst_stride, filter);
@ -391,8 +375,7 @@ static void common_hz_2t_and_aver_dst_4w_msa(const uint8_t *src,
} }
static void common_hz_2t_and_aver_dst_8x4_msa(const uint8_t *src, static void common_hz_2t_and_aver_dst_8x4_msa(const uint8_t *src,
int32_t src_stride, int32_t src_stride, uint8_t *dst,
uint8_t *dst,
int32_t dst_stride, int32_t dst_stride,
int8_t *filter) { int8_t *filter) {
v16i8 src0, src1, src2, src3, mask; v16i8 src0, src1, src2, src3, mask;
@ -412,16 +395,13 @@ static void common_hz_2t_and_aver_dst_8x4_msa(const uint8_t *src,
vec2, vec3); vec2, vec3);
SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, dst,
dst, dst_stride); dst_stride);
} }
static void common_hz_2t_and_aver_dst_8x8mult_msa(const uint8_t *src, static void common_hz_2t_and_aver_dst_8x8mult_msa(
int32_t src_stride, const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
uint8_t *dst, int8_t *filter, int32_t height) {
int32_t dst_stride,
int8_t *filter,
int32_t height) {
v16i8 src0, src1, src2, src3, mask; v16i8 src0, src1, src2, src3, mask;
v16u8 filt0, dst0, dst1, dst2, dst3; v16u8 filt0, dst0, dst1, dst2, dst3;
v8u16 vec0, vec1, vec2, vec3, filt; v8u16 vec0, vec1, vec2, vec3, filt;
@ -442,8 +422,8 @@ static void common_hz_2t_and_aver_dst_8x8mult_msa(const uint8_t *src,
LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
LD_SB4(src, src_stride, src0, src1, src2, src3); LD_SB4(src, src_stride, src0, src1, src2, src3);
src += (4 * src_stride); src += (4 * src_stride);
PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, dst,
dst, dst_stride); dst_stride);
dst += (4 * dst_stride); dst += (4 * dst_stride);
VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
@ -452,8 +432,8 @@ static void common_hz_2t_and_aver_dst_8x8mult_msa(const uint8_t *src,
vec2, vec3); vec2, vec3);
SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, dst,
dst, dst_stride); dst_stride);
dst += (4 * dst_stride); dst += (4 * dst_stride);
if (16 == height) { if (16 == height) {
@ -467,8 +447,8 @@ static void common_hz_2t_and_aver_dst_8x8mult_msa(const uint8_t *src,
SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
LD_SB4(src, src_stride, src0, src1, src2, src3); LD_SB4(src, src_stride, src0, src1, src2, src3);
PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, dst,
dst, dst_stride); dst_stride);
dst += (4 * dst_stride); dst += (4 * dst_stride);
VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
@ -477,16 +457,14 @@ static void common_hz_2t_and_aver_dst_8x8mult_msa(const uint8_t *src,
vec2, vec3); vec2, vec3);
SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, dst,
dst, dst_stride); dst_stride);
} }
} }
static void common_hz_2t_and_aver_dst_8w_msa(const uint8_t *src, static void common_hz_2t_and_aver_dst_8w_msa(const uint8_t *src,
int32_t src_stride, int32_t src_stride, uint8_t *dst,
uint8_t *dst, int32_t dst_stride, int8_t *filter,
int32_t dst_stride,
int8_t *filter,
int32_t height) { int32_t height) {
if (4 == height) { if (4 == height) {
common_hz_2t_and_aver_dst_8x4_msa(src, src_stride, dst, dst_stride, filter); common_hz_2t_and_aver_dst_8x4_msa(src, src_stride, dst, dst_stride, filter);
@ -497,11 +475,9 @@ static void common_hz_2t_and_aver_dst_8w_msa(const uint8_t *src,
} }
static void common_hz_2t_and_aver_dst_16w_msa(const uint8_t *src, static void common_hz_2t_and_aver_dst_16w_msa(const uint8_t *src,
int32_t src_stride, int32_t src_stride, uint8_t *dst,
uint8_t *dst,
int32_t dst_stride, int32_t dst_stride,
int8_t *filter, int8_t *filter, int32_t height) {
int32_t height) {
uint32_t loop_cnt; uint32_t loop_cnt;
v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
v16u8 filt0, dst0, dst1, dst2, dst3; v16u8 filt0, dst0, dst1, dst2, dst3;
@ -566,11 +542,9 @@ static void common_hz_2t_and_aver_dst_16w_msa(const uint8_t *src,
} }
static void common_hz_2t_and_aver_dst_32w_msa(const uint8_t *src, static void common_hz_2t_and_aver_dst_32w_msa(const uint8_t *src,
int32_t src_stride, int32_t src_stride, uint8_t *dst,
uint8_t *dst,
int32_t dst_stride, int32_t dst_stride,
int8_t *filter, int8_t *filter, int32_t height) {
int32_t height) {
uint32_t loop_cnt; uint32_t loop_cnt;
v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
v16u8 filt0, dst0, dst1, dst2, dst3; v16u8 filt0, dst0, dst1, dst2, dst3;
@ -617,11 +591,9 @@ static void common_hz_2t_and_aver_dst_32w_msa(const uint8_t *src,
} }
static void common_hz_2t_and_aver_dst_64w_msa(const uint8_t *src, static void common_hz_2t_and_aver_dst_64w_msa(const uint8_t *src,
int32_t src_stride, int32_t src_stride, uint8_t *dst,
uint8_t *dst,
int32_t dst_stride, int32_t dst_stride,
int8_t *filter, int8_t *filter, int32_t height) {
int32_t height) {
uint32_t loop_cnt; uint32_t loop_cnt;
v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
v16u8 filt0, dst0, dst1, dst2, dst3; v16u8 filt0, dst0, dst1, dst2, dst3;
@ -662,8 +634,8 @@ static void common_hz_2t_and_aver_dst_64w_msa(const uint8_t *src,
void vpx_convolve8_avg_horiz_msa(const uint8_t *src, ptrdiff_t src_stride, void vpx_convolve8_avg_horiz_msa(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride, uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4, const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4, const int16_t *filter_y, int y_step_q4, int w,
int w, int h) { int h) {
int8_t cnt, filt_hor[8]; int8_t cnt, filt_hor[8];
assert(x_step_q4 == 16); assert(x_step_q4 == 16);
@ -676,67 +648,55 @@ void vpx_convolve8_avg_horiz_msa(const uint8_t *src, ptrdiff_t src_stride,
if (((const int32_t *)filter_x)[0] == 0) { if (((const int32_t *)filter_x)[0] == 0) {
switch (w) { switch (w) {
case 4: case 4:
common_hz_2t_and_aver_dst_4w_msa(src, (int32_t)src_stride, common_hz_2t_and_aver_dst_4w_msa(src, (int32_t)src_stride, dst,
dst, (int32_t)dst_stride, (int32_t)dst_stride, &filt_hor[3], h);
&filt_hor[3], h);
break; break;
case 8: case 8:
common_hz_2t_and_aver_dst_8w_msa(src, (int32_t)src_stride, common_hz_2t_and_aver_dst_8w_msa(src, (int32_t)src_stride, dst,
dst, (int32_t)dst_stride, (int32_t)dst_stride, &filt_hor[3], h);
&filt_hor[3], h);
break; break;
case 16: case 16:
common_hz_2t_and_aver_dst_16w_msa(src, (int32_t)src_stride, common_hz_2t_and_aver_dst_16w_msa(src, (int32_t)src_stride, dst,
dst, (int32_t)dst_stride, (int32_t)dst_stride, &filt_hor[3], h);
&filt_hor[3], h);
break; break;
case 32: case 32:
common_hz_2t_and_aver_dst_32w_msa(src, (int32_t)src_stride, common_hz_2t_and_aver_dst_32w_msa(src, (int32_t)src_stride, dst,
dst, (int32_t)dst_stride, (int32_t)dst_stride, &filt_hor[3], h);
&filt_hor[3], h);
break; break;
case 64: case 64:
common_hz_2t_and_aver_dst_64w_msa(src, (int32_t)src_stride, common_hz_2t_and_aver_dst_64w_msa(src, (int32_t)src_stride, dst,
dst, (int32_t)dst_stride, (int32_t)dst_stride, &filt_hor[3], h);
&filt_hor[3], h);
break; break;
default: default:
vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter_x,
filter_x, x_step_q4, filter_y, y_step_q4, x_step_q4, filter_y, y_step_q4, w, h);
w, h);
break; break;
} }
} else { } else {
switch (w) { switch (w) {
case 4: case 4:
common_hz_8t_and_aver_dst_4w_msa(src, (int32_t)src_stride, common_hz_8t_and_aver_dst_4w_msa(src, (int32_t)src_stride, dst,
dst, (int32_t)dst_stride, (int32_t)dst_stride, filt_hor, h);
filt_hor, h);
break; break;
case 8: case 8:
common_hz_8t_and_aver_dst_8w_msa(src, (int32_t)src_stride, common_hz_8t_and_aver_dst_8w_msa(src, (int32_t)src_stride, dst,
dst, (int32_t)dst_stride, (int32_t)dst_stride, filt_hor, h);
filt_hor, h);
break; break;
case 16: case 16:
common_hz_8t_and_aver_dst_16w_msa(src, (int32_t)src_stride, common_hz_8t_and_aver_dst_16w_msa(src, (int32_t)src_stride, dst,
dst, (int32_t)dst_stride, (int32_t)dst_stride, filt_hor, h);
filt_hor, h);
break; break;
case 32: case 32:
common_hz_8t_and_aver_dst_32w_msa(src, (int32_t)src_stride, common_hz_8t_and_aver_dst_32w_msa(src, (int32_t)src_stride, dst,
dst, (int32_t)dst_stride, (int32_t)dst_stride, filt_hor, h);
filt_hor, h);
break; break;
case 64: case 64:
common_hz_8t_and_aver_dst_64w_msa(src, (int32_t)src_stride, common_hz_8t_and_aver_dst_64w_msa(src, (int32_t)src_stride, dst,
dst, (int32_t)dst_stride, (int32_t)dst_stride, filt_hor, h);
filt_hor, h);
break; break;
default: default:
vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter_x,
filter_x, x_step_q4, filter_y, y_step_q4, x_step_q4, filter_y, y_step_q4, w, h);
w, h);
break; break;
} }
} }

View File

@ -12,13 +12,9 @@
#include "./vpx_dsp_rtcd.h" #include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/mips/vpx_convolve_msa.h" #include "vpx_dsp/mips/vpx_convolve_msa.h"
static void common_hv_8ht_8vt_and_aver_dst_4w_msa(const uint8_t *src, static void common_hv_8ht_8vt_and_aver_dst_4w_msa(
int32_t src_stride, const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
uint8_t *dst, int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
int32_t dst_stride,
int8_t *filter_horiz,
int8_t *filter_vert,
int32_t height) {
uint32_t loop_cnt; uint32_t loop_cnt;
v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
v16u8 dst0, dst1, dst2, dst3, mask0, mask1, mask2, mask3, tmp0, tmp1; v16u8 dst0, dst1, dst2, dst3, mask0, mask1, mask2, mask3, tmp0, tmp1;
@ -64,15 +60,15 @@ static void common_hv_8ht_8vt_and_aver_dst_4w_msa(const uint8_t *src,
src += (4 * src_stride); src += (4 * src_stride);
LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
hz_out7 = HORIZ_8TAP_FILT(src7, src8, mask0, mask1, mask2, mask3, hz_out7 = HORIZ_8TAP_FILT(src7, src8, mask0, mask1, mask2, mask3, filt_hz0,
filt_hz0, filt_hz1, filt_hz2, filt_hz3); filt_hz1, filt_hz2, filt_hz3);
hz_out6 = (v8i16)__msa_sldi_b((v16i8)hz_out7, (v16i8)hz_out5, 8); hz_out6 = (v8i16)__msa_sldi_b((v16i8)hz_out7, (v16i8)hz_out5, 8);
vec3 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6); vec3 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6);
res0 = FILT_8TAP_DPADD_S_H(vec0, vec1, vec2, vec3, filt_vt0, filt_vt1, res0 = FILT_8TAP_DPADD_S_H(vec0, vec1, vec2, vec3, filt_vt0, filt_vt1,
filt_vt2, filt_vt3); filt_vt2, filt_vt3);
hz_out9 = HORIZ_8TAP_FILT(src9, src10, mask0, mask1, mask2, mask3, hz_out9 = HORIZ_8TAP_FILT(src9, src10, mask0, mask1, mask2, mask3, filt_hz0,
filt_hz0, filt_hz1, filt_hz2, filt_hz3); filt_hz1, filt_hz2, filt_hz3);
hz_out8 = (v8i16)__msa_sldi_b((v16i8)hz_out9, (v16i8)hz_out7, 8); hz_out8 = (v8i16)__msa_sldi_b((v16i8)hz_out9, (v16i8)hz_out7, 8);
vec4 = (v8i16)__msa_ilvev_b((v16i8)hz_out9, (v16i8)hz_out8); vec4 = (v8i16)__msa_ilvev_b((v16i8)hz_out9, (v16i8)hz_out8);
res1 = FILT_8TAP_DPADD_S_H(vec1, vec2, vec3, vec4, filt_vt0, filt_vt1, res1 = FILT_8TAP_DPADD_S_H(vec1, vec2, vec3, vec4, filt_vt0, filt_vt1,
@ -94,13 +90,9 @@ static void common_hv_8ht_8vt_and_aver_dst_4w_msa(const uint8_t *src,
} }
} }
static void common_hv_8ht_8vt_and_aver_dst_8w_msa(const uint8_t *src, static void common_hv_8ht_8vt_and_aver_dst_8w_msa(
int32_t src_stride, const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
uint8_t *dst, int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
int32_t dst_stride,
int8_t *filter_horiz,
int8_t *filter_vert,
int32_t height) {
uint32_t loop_cnt; uint32_t loop_cnt;
v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3; v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3;
@ -154,20 +146,20 @@ static void common_hv_8ht_8vt_and_aver_dst_8w_msa(const uint8_t *src,
LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
hz_out7 = HORIZ_8TAP_FILT(src7, src7, mask0, mask1, mask2, mask3, hz_out7 = HORIZ_8TAP_FILT(src7, src7, mask0, mask1, mask2, mask3, filt_hz0,
filt_hz0, filt_hz1, filt_hz2, filt_hz3); filt_hz1, filt_hz2, filt_hz3);
out3 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6); out3 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6);
tmp0 = FILT_8TAP_DPADD_S_H(out0, out1, out2, out3, filt_vt0, filt_vt1, tmp0 = FILT_8TAP_DPADD_S_H(out0, out1, out2, out3, filt_vt0, filt_vt1,
filt_vt2, filt_vt3); filt_vt2, filt_vt3);
hz_out8 = HORIZ_8TAP_FILT(src8, src8, mask0, mask1, mask2, mask3, hz_out8 = HORIZ_8TAP_FILT(src8, src8, mask0, mask1, mask2, mask3, filt_hz0,
filt_hz0, filt_hz1, filt_hz2, filt_hz3); filt_hz1, filt_hz2, filt_hz3);
out7 = (v8i16)__msa_ilvev_b((v16i8)hz_out8, (v16i8)hz_out7); out7 = (v8i16)__msa_ilvev_b((v16i8)hz_out8, (v16i8)hz_out7);
tmp1 = FILT_8TAP_DPADD_S_H(out4, out5, out6, out7, filt_vt0, filt_vt1, tmp1 = FILT_8TAP_DPADD_S_H(out4, out5, out6, out7, filt_vt0, filt_vt1,
filt_vt2, filt_vt3); filt_vt2, filt_vt3);
hz_out9 = HORIZ_8TAP_FILT(src9, src9, mask0, mask1, mask2, mask3, hz_out9 = HORIZ_8TAP_FILT(src9, src9, mask0, mask1, mask2, mask3, filt_hz0,
filt_hz0, filt_hz1, filt_hz2, filt_hz3); filt_hz1, filt_hz2, filt_hz3);
out8 = (v8i16)__msa_ilvev_b((v16i8)hz_out9, (v16i8)hz_out8); out8 = (v8i16)__msa_ilvev_b((v16i8)hz_out9, (v16i8)hz_out8);
tmp2 = FILT_8TAP_DPADD_S_H(out1, out2, out3, out8, filt_vt0, filt_vt1, tmp2 = FILT_8TAP_DPADD_S_H(out1, out2, out3, out8, filt_vt0, filt_vt1,
filt_vt2, filt_vt3); filt_vt2, filt_vt3);
@ -180,8 +172,8 @@ static void common_hv_8ht_8vt_and_aver_dst_8w_msa(const uint8_t *src,
SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7); SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
CONVERT_UB_AVG_ST8x4_UB(tmp0, tmp1, tmp2, tmp3, dst0, dst1, dst2, dst3, CONVERT_UB_AVG_ST8x4_UB(tmp0, tmp1, tmp2, tmp3, dst0, dst1, dst2, dst3, dst,
dst, dst_stride); dst_stride);
dst += (4 * dst_stride); dst += (4 * dst_stride);
hz_out6 = hz_out10; hz_out6 = hz_out10;
@ -194,13 +186,9 @@ static void common_hv_8ht_8vt_and_aver_dst_8w_msa(const uint8_t *src,
} }
} }
static void common_hv_8ht_8vt_and_aver_dst_16w_msa(const uint8_t *src, static void common_hv_8ht_8vt_and_aver_dst_16w_msa(
int32_t src_stride, const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
uint8_t *dst, int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
int32_t dst_stride,
int8_t *filter_horiz,
int8_t *filter_vert,
int32_t height) {
int32_t multiple8_cnt; int32_t multiple8_cnt;
for (multiple8_cnt = 2; multiple8_cnt--;) { for (multiple8_cnt = 2; multiple8_cnt--;) {
common_hv_8ht_8vt_and_aver_dst_8w_msa(src, src_stride, dst, dst_stride, common_hv_8ht_8vt_and_aver_dst_8w_msa(src, src_stride, dst, dst_stride,
@ -210,13 +198,9 @@ static void common_hv_8ht_8vt_and_aver_dst_16w_msa(const uint8_t *src,
} }
} }
static void common_hv_8ht_8vt_and_aver_dst_32w_msa(const uint8_t *src, static void common_hv_8ht_8vt_and_aver_dst_32w_msa(
int32_t src_stride, const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
uint8_t *dst, int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
int32_t dst_stride,
int8_t *filter_horiz,
int8_t *filter_vert,
int32_t height) {
int32_t multiple8_cnt; int32_t multiple8_cnt;
for (multiple8_cnt = 4; multiple8_cnt--;) { for (multiple8_cnt = 4; multiple8_cnt--;) {
common_hv_8ht_8vt_and_aver_dst_8w_msa(src, src_stride, dst, dst_stride, common_hv_8ht_8vt_and_aver_dst_8w_msa(src, src_stride, dst, dst_stride,
@ -226,13 +210,9 @@ static void common_hv_8ht_8vt_and_aver_dst_32w_msa(const uint8_t *src,
} }
} }
static void common_hv_8ht_8vt_and_aver_dst_64w_msa(const uint8_t *src, static void common_hv_8ht_8vt_and_aver_dst_64w_msa(
int32_t src_stride, const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
uint8_t *dst, int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
int32_t dst_stride,
int8_t *filter_horiz,
int8_t *filter_vert,
int32_t height) {
int32_t multiple8_cnt; int32_t multiple8_cnt;
for (multiple8_cnt = 8; multiple8_cnt--;) { for (multiple8_cnt = 8; multiple8_cnt--;) {
common_hv_8ht_8vt_and_aver_dst_8w_msa(src, src_stride, dst, dst_stride, common_hv_8ht_8vt_and_aver_dst_8w_msa(src, src_stride, dst, dst_stride,
@ -242,12 +222,9 @@ static void common_hv_8ht_8vt_and_aver_dst_64w_msa(const uint8_t *src,
} }
} }
static void common_hv_2ht_2vt_and_aver_dst_4x4_msa(const uint8_t *src, static void common_hv_2ht_2vt_and_aver_dst_4x4_msa(
int32_t src_stride, const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
uint8_t *dst, int8_t *filter_horiz, int8_t *filter_vert) {
int32_t dst_stride,
int8_t *filter_horiz,
int8_t *filter_vert) {
v16i8 src0, src1, src2, src3, src4, mask; v16i8 src0, src1, src2, src3, src4, mask;
v16u8 filt_hz, filt_vt, vec0, vec1; v16u8 filt_hz, filt_vt, vec0, vec1;
v16u8 dst0, dst1, dst2, dst3, res0, res1; v16u8 dst0, dst1, dst2, dst3, res0, res1;
@ -280,12 +257,9 @@ static void common_hv_2ht_2vt_and_aver_dst_4x4_msa(const uint8_t *src,
ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride); ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
} }
static void common_hv_2ht_2vt_and_aver_dst_4x8_msa(const uint8_t *src, static void common_hv_2ht_2vt_and_aver_dst_4x8_msa(
int32_t src_stride, const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
uint8_t *dst, int8_t *filter_horiz, int8_t *filter_vert) {
int32_t dst_stride,
int8_t *filter_horiz,
int8_t *filter_vert) {
v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask; v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask;
v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3, res0, res1, res2, res3; v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3, res0, res1, res2, res3;
v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
@ -316,29 +290,25 @@ static void common_hv_2ht_2vt_and_aver_dst_4x8_msa(const uint8_t *src,
hz_out7 = (v8u16)__msa_pckod_d((v2i64)hz_out8, (v2i64)hz_out6); hz_out7 = (v8u16)__msa_pckod_d((v2i64)hz_out8, (v2i64)hz_out6);
LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7); LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
ILVR_W4_UB(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, dst0, dst2, ILVR_W4_UB(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, dst0, dst2, dst4,
dst4, dst6); dst6);
ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
ILVEV_B2_UB(hz_out4, hz_out5, hz_out6, hz_out7, vec2, vec3); ILVEV_B2_UB(hz_out4, hz_out5, hz_out6, hz_out7, vec2, vec3);
DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt_vt, filt_vt, filt_vt, filt_vt, DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt_vt, filt_vt, filt_vt, filt_vt, tmp0,
tmp0, tmp1, tmp2, tmp3); tmp1, tmp2, tmp3);
SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
PCKEV_B4_UB(tmp0, tmp0, tmp1, tmp1, tmp2, tmp2, tmp3, tmp3, res0, res1, PCKEV_B4_UB(tmp0, tmp0, tmp1, tmp1, tmp2, tmp2, tmp3, tmp3, res0, res1, res2,
res2, res3); res3);
AVER_UB4_UB(res0, dst0, res1, dst2, res2, dst4, res3, dst6, res0, res1, AVER_UB4_UB(res0, dst0, res1, dst2, res2, dst4, res3, dst6, res0, res1, res2,
res2, res3); res3);
ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride); ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
dst += (4 * dst_stride); dst += (4 * dst_stride);
ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride); ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride);
} }
static void common_hv_2ht_2vt_and_aver_dst_4w_msa(const uint8_t *src, static void common_hv_2ht_2vt_and_aver_dst_4w_msa(
int32_t src_stride, const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
uint8_t *dst, int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
int32_t dst_stride,
int8_t *filter_horiz,
int8_t *filter_vert,
int32_t height) {
if (4 == height) { if (4 == height) {
common_hv_2ht_2vt_and_aver_dst_4x4_msa(src, src_stride, dst, dst_stride, common_hv_2ht_2vt_and_aver_dst_4x4_msa(src, src_stride, dst, dst_stride,
filter_horiz, filter_vert); filter_horiz, filter_vert);
@ -348,12 +318,9 @@ static void common_hv_2ht_2vt_and_aver_dst_4w_msa(const uint8_t *src,
} }
} }
static void common_hv_2ht_2vt_and_aver_dst_8x4_msa(const uint8_t *src, static void common_hv_2ht_2vt_and_aver_dst_8x4_msa(
int32_t src_stride, const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
uint8_t *dst, int8_t *filter_horiz, int8_t *filter_vert) {
int32_t dst_stride,
int8_t *filter_horiz,
int8_t *filter_vert) {
v16i8 src0, src1, src2, src3, src4, mask; v16i8 src0, src1, src2, src3, src4, mask;
v16u8 filt_hz, filt_vt, dst0, dst1, dst2, dst3, vec0, vec1, vec2, vec3; v16u8 filt_hz, filt_vt, dst0, dst1, dst2, dst3, vec0, vec1, vec2, vec3;
v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3; v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
@ -390,17 +357,13 @@ static void common_hv_2ht_2vt_and_aver_dst_8x4_msa(const uint8_t *src,
tmp3 = __msa_dotp_u_h(vec3, filt_vt); tmp3 = __msa_dotp_u_h(vec3, filt_vt);
SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
PCKEV_AVG_ST8x4_UB(tmp0, dst0, tmp1, dst1, tmp2, dst2, tmp3, dst3, PCKEV_AVG_ST8x4_UB(tmp0, dst0, tmp1, dst1, tmp2, dst2, tmp3, dst3, dst,
dst, dst_stride); dst_stride);
} }
static void common_hv_2ht_2vt_and_aver_dst_8x8mult_msa(const uint8_t *src, static void common_hv_2ht_2vt_and_aver_dst_8x8mult_msa(
int32_t src_stride, const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
uint8_t *dst, int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
int32_t dst_stride,
int8_t *filter_horiz,
int8_t *filter_vert,
int32_t height) {
uint32_t loop_cnt; uint32_t loop_cnt;
v16i8 src0, src1, src2, src3, src4, mask; v16i8 src0, src1, src2, src3, src4, mask;
v16u8 filt_hz, filt_vt, vec0, dst0, dst1, dst2, dst3; v16u8 filt_hz, filt_vt, vec0, dst0, dst1, dst2, dst3;
@ -445,36 +408,27 @@ static void common_hv_2ht_2vt_and_aver_dst_8x8mult_msa(const uint8_t *src,
SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
PCKEV_AVG_ST8x4_UB(tmp0, dst0, tmp1, dst1, tmp2, dst2, tmp3, dst3, PCKEV_AVG_ST8x4_UB(tmp0, dst0, tmp1, dst1, tmp2, dst2, tmp3, dst3, dst,
dst, dst_stride); dst_stride);
dst += (4 * dst_stride); dst += (4 * dst_stride);
} }
} }
static void common_hv_2ht_2vt_and_aver_dst_8w_msa(const uint8_t *src, static void common_hv_2ht_2vt_and_aver_dst_8w_msa(
int32_t src_stride, const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
uint8_t *dst, int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
int32_t dst_stride,
int8_t *filter_horiz,
int8_t *filter_vert,
int32_t height) {
if (4 == height) { if (4 == height) {
common_hv_2ht_2vt_and_aver_dst_8x4_msa(src, src_stride, dst, dst_stride, common_hv_2ht_2vt_and_aver_dst_8x4_msa(src, src_stride, dst, dst_stride,
filter_horiz, filter_vert); filter_horiz, filter_vert);
} else { } else {
common_hv_2ht_2vt_and_aver_dst_8x8mult_msa(src, src_stride, dst, dst_stride, common_hv_2ht_2vt_and_aver_dst_8x8mult_msa(
filter_horiz, filter_vert, src, src_stride, dst, dst_stride, filter_horiz, filter_vert, height);
height);
} }
} }
static void common_hv_2ht_2vt_and_aver_dst_16w_msa(const uint8_t *src, static void common_hv_2ht_2vt_and_aver_dst_16w_msa(
int32_t src_stride, const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
uint8_t *dst, int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
int32_t dst_stride,
int8_t *filter_horiz,
int8_t *filter_vert,
int32_t height) {
uint32_t loop_cnt; uint32_t loop_cnt;
v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
v16u8 filt_hz, filt_vt, vec0, vec1, dst0, dst1, dst2, dst3; v16u8 filt_hz, filt_vt, vec0, vec1, dst0, dst1, dst2, dst3;
@ -536,13 +490,9 @@ static void common_hv_2ht_2vt_and_aver_dst_16w_msa(const uint8_t *src,
} }
} }
static void common_hv_2ht_2vt_and_aver_dst_32w_msa(const uint8_t *src, static void common_hv_2ht_2vt_and_aver_dst_32w_msa(
int32_t src_stride, const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
uint8_t *dst, int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
int32_t dst_stride,
int8_t *filter_horiz,
int8_t *filter_vert,
int32_t height) {
int32_t multiple8_cnt; int32_t multiple8_cnt;
for (multiple8_cnt = 2; multiple8_cnt--;) { for (multiple8_cnt = 2; multiple8_cnt--;) {
common_hv_2ht_2vt_and_aver_dst_16w_msa(src, src_stride, dst, dst_stride, common_hv_2ht_2vt_and_aver_dst_16w_msa(src, src_stride, dst, dst_stride,
@ -552,13 +502,9 @@ static void common_hv_2ht_2vt_and_aver_dst_32w_msa(const uint8_t *src,
} }
} }
static void common_hv_2ht_2vt_and_aver_dst_64w_msa(const uint8_t *src, static void common_hv_2ht_2vt_and_aver_dst_64w_msa(
int32_t src_stride, const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
uint8_t *dst, int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
int32_t dst_stride,
int8_t *filter_horiz,
int8_t *filter_vert,
int32_t height) {
int32_t multiple8_cnt; int32_t multiple8_cnt;
for (multiple8_cnt = 4; multiple8_cnt--;) { for (multiple8_cnt = 4; multiple8_cnt--;) {
common_hv_2ht_2vt_and_aver_dst_16w_msa(src, src_stride, dst, dst_stride, common_hv_2ht_2vt_and_aver_dst_16w_msa(src, src_stride, dst, dst_stride,
@ -571,8 +517,8 @@ static void common_hv_2ht_2vt_and_aver_dst_64w_msa(const uint8_t *src,
void vpx_convolve8_avg_msa(const uint8_t *src, ptrdiff_t src_stride, void vpx_convolve8_avg_msa(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride, uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4, const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4, const int16_t *filter_y, int y_step_q4, int w,
int w, int h) { int h) {
int8_t cnt, filt_hor[8], filt_ver[8]; int8_t cnt, filt_hor[8], filt_ver[8];
assert(x_step_q4 == 16); assert(x_step_q4 == 16);
@ -589,72 +535,69 @@ void vpx_convolve8_avg_msa(const uint8_t *src, ptrdiff_t src_stride,
((const int32_t *)filter_y)[0] == 0) { ((const int32_t *)filter_y)[0] == 0) {
switch (w) { switch (w) {
case 4: case 4:
common_hv_2ht_2vt_and_aver_dst_4w_msa(src, (int32_t)src_stride, common_hv_2ht_2vt_and_aver_dst_4w_msa(src, (int32_t)src_stride, dst,
dst, (int32_t)dst_stride, (int32_t)dst_stride, &filt_hor[3],
&filt_hor[3], &filt_ver[3], h); &filt_ver[3], h);
break; break;
case 8: case 8:
common_hv_2ht_2vt_and_aver_dst_8w_msa(src, (int32_t)src_stride, common_hv_2ht_2vt_and_aver_dst_8w_msa(src, (int32_t)src_stride, dst,
dst, (int32_t)dst_stride, (int32_t)dst_stride, &filt_hor[3],
&filt_hor[3], &filt_ver[3], h); &filt_ver[3], h);
break; break;
case 16: case 16:
common_hv_2ht_2vt_and_aver_dst_16w_msa(src, (int32_t)src_stride, common_hv_2ht_2vt_and_aver_dst_16w_msa(src, (int32_t)src_stride, dst,
dst, (int32_t)dst_stride, (int32_t)dst_stride,
&filt_hor[3], &filt_ver[3], h); &filt_hor[3], &filt_ver[3], h);
break; break;
case 32: case 32:
common_hv_2ht_2vt_and_aver_dst_32w_msa(src, (int32_t)src_stride, common_hv_2ht_2vt_and_aver_dst_32w_msa(src, (int32_t)src_stride, dst,
dst, (int32_t)dst_stride, (int32_t)dst_stride,
&filt_hor[3], &filt_ver[3], h); &filt_hor[3], &filt_ver[3], h);
break; break;
case 64: case 64:
common_hv_2ht_2vt_and_aver_dst_64w_msa(src, (int32_t)src_stride, common_hv_2ht_2vt_and_aver_dst_64w_msa(src, (int32_t)src_stride, dst,
dst, (int32_t)dst_stride, (int32_t)dst_stride,
&filt_hor[3], &filt_ver[3], h); &filt_hor[3], &filt_ver[3], h);
break; break;
default: default:
vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter_x,
filter_x, x_step_q4, filter_y, y_step_q4, x_step_q4, filter_y, y_step_q4, w, h);
w, h);
break; break;
} }
} else if (((const int32_t *)filter_x)[0] == 0 || } else if (((const int32_t *)filter_x)[0] == 0 ||
((const int32_t *)filter_y)[0] == 0) { ((const int32_t *)filter_y)[0] == 0) {
vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4,
filter_x, x_step_q4, filter_y, y_step_q4, filter_y, y_step_q4, w, h);
w, h);
} else { } else {
switch (w) { switch (w) {
case 4: case 4:
common_hv_8ht_8vt_and_aver_dst_4w_msa(src, (int32_t)src_stride, common_hv_8ht_8vt_and_aver_dst_4w_msa(src, (int32_t)src_stride, dst,
dst, (int32_t)dst_stride, (int32_t)dst_stride, filt_hor,
filt_hor, filt_ver, h); filt_ver, h);
break; break;
case 8: case 8:
common_hv_8ht_8vt_and_aver_dst_8w_msa(src, (int32_t)src_stride, common_hv_8ht_8vt_and_aver_dst_8w_msa(src, (int32_t)src_stride, dst,
dst, (int32_t)dst_stride, (int32_t)dst_stride, filt_hor,
filt_hor, filt_ver, h); filt_ver, h);
break; break;
case 16: case 16:
common_hv_8ht_8vt_and_aver_dst_16w_msa(src, (int32_t)src_stride, common_hv_8ht_8vt_and_aver_dst_16w_msa(src, (int32_t)src_stride, dst,
dst, (int32_t)dst_stride, (int32_t)dst_stride, filt_hor,
filt_hor, filt_ver, h); filt_ver, h);
break; break;
case 32: case 32:
common_hv_8ht_8vt_and_aver_dst_32w_msa(src, (int32_t)src_stride, common_hv_8ht_8vt_and_aver_dst_32w_msa(src, (int32_t)src_stride, dst,
dst, (int32_t)dst_stride, (int32_t)dst_stride, filt_hor,
filt_hor, filt_ver, h); filt_ver, h);
break; break;
case 64: case 64:
common_hv_8ht_8vt_and_aver_dst_64w_msa(src, (int32_t)src_stride, common_hv_8ht_8vt_and_aver_dst_64w_msa(src, (int32_t)src_stride, dst,
dst, (int32_t)dst_stride, (int32_t)dst_stride, filt_hor,
filt_hor, filt_ver, h); filt_ver, h);
break; break;
default: default:
vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter_x,
filter_x, x_step_q4, filter_y, y_step_q4, x_step_q4, filter_y, y_step_q4, w, h);
w, h);
break; break;
} }
} }

View File

@ -13,10 +13,8 @@
#include "vpx_dsp/mips/vpx_convolve_msa.h" #include "vpx_dsp/mips/vpx_convolve_msa.h"
static void common_vt_8t_and_aver_dst_4w_msa(const uint8_t *src, static void common_vt_8t_and_aver_dst_4w_msa(const uint8_t *src,
int32_t src_stride, int32_t src_stride, uint8_t *dst,
uint8_t *dst, int32_t dst_stride, int8_t *filter,
int32_t dst_stride,
int8_t *filter,
int32_t height) { int32_t height) {
uint32_t loop_cnt; uint32_t loop_cnt;
v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
@ -73,10 +71,8 @@ static void common_vt_8t_and_aver_dst_4w_msa(const uint8_t *src,
} }
static void common_vt_8t_and_aver_dst_8w_msa(const uint8_t *src, static void common_vt_8t_and_aver_dst_8w_msa(const uint8_t *src,
int32_t src_stride, int32_t src_stride, uint8_t *dst,
uint8_t *dst, int32_t dst_stride, int8_t *filter,
int32_t dst_stride,
int8_t *filter,
int32_t height) { int32_t height) {
uint32_t loop_cnt; uint32_t loop_cnt;
v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
@ -106,18 +102,18 @@ static void common_vt_8t_and_aver_dst_8w_msa(const uint8_t *src,
XORI_B4_128_SB(src7, src8, src9, src10); XORI_B4_128_SB(src7, src8, src9, src10);
ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r, ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
src87_r, src98_r, src109_r); src87_r, src98_r, src109_r);
out0 = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0, out0 = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0, filt1,
filt1, filt2, filt3); filt2, filt3);
out1 = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0, out1 = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0, filt1,
filt1, filt2, filt3); filt2, filt3);
out2 = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0, out2 = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0, filt1,
filt1, filt2, filt3); filt2, filt3);
out3 = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, filt0, out3 = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, filt0,
filt1, filt2, filt3); filt1, filt2, filt3);
SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS); SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
SAT_SH4_SH(out0, out1, out2, out3, 7); SAT_SH4_SH(out0, out1, out2, out3, 7);
CONVERT_UB_AVG_ST8x4_UB(out0, out1, out2, out3, dst0, dst1, dst2, dst3, CONVERT_UB_AVG_ST8x4_UB(out0, out1, out2, out3, dst0, dst1, dst2, dst3, dst,
dst, dst_stride); dst_stride);
dst += (4 * dst_stride); dst += (4 * dst_stride);
src10_r = src54_r; src10_r = src54_r;
@ -130,13 +126,9 @@ static void common_vt_8t_and_aver_dst_8w_msa(const uint8_t *src,
} }
} }
static void common_vt_8t_and_aver_dst_16w_mult_msa(const uint8_t *src, static void common_vt_8t_and_aver_dst_16w_mult_msa(
int32_t src_stride, const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
uint8_t *dst, int8_t *filter, int32_t height, int32_t width) {
int32_t dst_stride,
int8_t *filter,
int32_t height,
int32_t width) {
const uint8_t *src_tmp; const uint8_t *src_tmp;
uint8_t *dst_tmp; uint8_t *dst_tmp;
uint32_t loop_cnt, cnt; uint32_t loop_cnt, cnt;
@ -227,38 +219,31 @@ static void common_vt_8t_and_aver_dst_16w_mult_msa(const uint8_t *src,
} }
static void common_vt_8t_and_aver_dst_16w_msa(const uint8_t *src, static void common_vt_8t_and_aver_dst_16w_msa(const uint8_t *src,
int32_t src_stride, int32_t src_stride, uint8_t *dst,
uint8_t *dst,
int32_t dst_stride, int32_t dst_stride,
int8_t *filter, int8_t *filter, int32_t height) {
int32_t height) {
common_vt_8t_and_aver_dst_16w_mult_msa(src, src_stride, dst, dst_stride, common_vt_8t_and_aver_dst_16w_mult_msa(src, src_stride, dst, dst_stride,
filter, height, 16); filter, height, 16);
} }
static void common_vt_8t_and_aver_dst_32w_msa(const uint8_t *src, static void common_vt_8t_and_aver_dst_32w_msa(const uint8_t *src,
int32_t src_stride, int32_t src_stride, uint8_t *dst,
uint8_t *dst,
int32_t dst_stride, int32_t dst_stride,
int8_t *filter, int8_t *filter, int32_t height) {
int32_t height) {
common_vt_8t_and_aver_dst_16w_mult_msa(src, src_stride, dst, dst_stride, common_vt_8t_and_aver_dst_16w_mult_msa(src, src_stride, dst, dst_stride,
filter, height, 32); filter, height, 32);
} }
static void common_vt_8t_and_aver_dst_64w_msa(const uint8_t *src, static void common_vt_8t_and_aver_dst_64w_msa(const uint8_t *src,
int32_t src_stride, int32_t src_stride, uint8_t *dst,
uint8_t *dst,
int32_t dst_stride, int32_t dst_stride,
int8_t *filter, int8_t *filter, int32_t height) {
int32_t height) {
common_vt_8t_and_aver_dst_16w_mult_msa(src, src_stride, dst, dst_stride, common_vt_8t_and_aver_dst_16w_mult_msa(src, src_stride, dst, dst_stride,
filter, height, 64); filter, height, 64);
} }
static void common_vt_2t_and_aver_dst_4x4_msa(const uint8_t *src, static void common_vt_2t_and_aver_dst_4x4_msa(const uint8_t *src,
int32_t src_stride, int32_t src_stride, uint8_t *dst,
uint8_t *dst,
int32_t dst_stride, int32_t dst_stride,
int8_t *filter) { int8_t *filter) {
v16i8 src0, src1, src2, src3, src4; v16i8 src0, src1, src2, src3, src4;
@ -292,8 +277,7 @@ static void common_vt_2t_and_aver_dst_4x4_msa(const uint8_t *src,
} }
static void common_vt_2t_and_aver_dst_4x8_msa(const uint8_t *src, static void common_vt_2t_and_aver_dst_4x8_msa(const uint8_t *src,
int32_t src_stride, int32_t src_stride, uint8_t *dst,
uint8_t *dst,
int32_t dst_stride, int32_t dst_stride,
int8_t *filter) { int8_t *filter) {
v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
@ -311,15 +295,15 @@ static void common_vt_2t_and_aver_dst_4x8_msa(const uint8_t *src,
src8 = LD_SB(src); src8 = LD_SB(src);
LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7); LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
ILVR_W4_UB(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, dst0, dst1, ILVR_W4_UB(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, dst0, dst1, dst2,
dst2, dst3); dst3);
ILVR_D2_UB(dst1, dst0, dst3, dst2, dst0, dst1); ILVR_D2_UB(dst1, dst0, dst3, dst2, dst0, dst1);
ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r, ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
src32_r, src43_r); src32_r, src43_r);
ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r, ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
src76_r, src87_r); src76_r, src87_r);
ILVR_D4_UB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, ILVR_D4_UB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, src87_r,
src87_r, src76_r, src2110, src4332, src6554, src8776); src76_r, src2110, src4332, src6554, src8776);
DOTP_UB4_UH(src2110, src4332, src6554, src8776, filt0, filt0, filt0, filt0, DOTP_UB4_UH(src2110, src4332, src6554, src8776, filt0, filt0, filt0, filt0,
tmp0, tmp1, tmp2, tmp3); tmp0, tmp1, tmp2, tmp3);
SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
@ -331,10 +315,8 @@ static void common_vt_2t_and_aver_dst_4x8_msa(const uint8_t *src,
} }
static void common_vt_2t_and_aver_dst_4w_msa(const uint8_t *src, static void common_vt_2t_and_aver_dst_4w_msa(const uint8_t *src,
int32_t src_stride, int32_t src_stride, uint8_t *dst,
uint8_t *dst, int32_t dst_stride, int8_t *filter,
int32_t dst_stride,
int8_t *filter,
int32_t height) { int32_t height) {
if (4 == height) { if (4 == height) {
common_vt_2t_and_aver_dst_4x4_msa(src, src_stride, dst, dst_stride, filter); common_vt_2t_and_aver_dst_4x4_msa(src, src_stride, dst, dst_stride, filter);
@ -344,8 +326,7 @@ static void common_vt_2t_and_aver_dst_4w_msa(const uint8_t *src,
} }
static void common_vt_2t_and_aver_dst_8x4_msa(const uint8_t *src, static void common_vt_2t_and_aver_dst_8x4_msa(const uint8_t *src,
int32_t src_stride, int32_t src_stride, uint8_t *dst,
uint8_t *dst,
int32_t dst_stride, int32_t dst_stride,
int8_t *filter) { int8_t *filter) {
v16u8 src0, src1, src2, src3, src4; v16u8 src0, src1, src2, src3, src4;
@ -364,16 +345,13 @@ static void common_vt_2t_and_aver_dst_8x4_msa(const uint8_t *src,
DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1, DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1,
tmp2, tmp3); tmp2, tmp3);
SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
PCKEV_AVG_ST8x4_UB(tmp0, dst0, tmp1, dst1, tmp2, dst2, tmp3, dst3, PCKEV_AVG_ST8x4_UB(tmp0, dst0, tmp1, dst1, tmp2, dst2, tmp3, dst3, dst,
dst, dst_stride); dst_stride);
} }
static void common_vt_2t_and_aver_dst_8x8mult_msa(const uint8_t *src, static void common_vt_2t_and_aver_dst_8x8mult_msa(
int32_t src_stride, const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
uint8_t *dst, int8_t *filter, int32_t height) {
int32_t dst_stride,
int8_t *filter,
int32_t height) {
uint32_t loop_cnt; uint32_t loop_cnt;
v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8; v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
v16u8 dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8; v16u8 dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
@ -393,22 +371,22 @@ static void common_vt_2t_and_aver_dst_8x8mult_msa(const uint8_t *src,
src += (8 * src_stride); src += (8 * src_stride);
LD_UB8(dst, dst_stride, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8); LD_UB8(dst, dst_stride, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8);
ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, vec0, vec1, ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, vec0, vec1, vec2,
vec2, vec3); vec3);
ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7, vec4, vec5, ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7, vec4, vec5, vec6,
vec6, vec7); vec7);
DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1, DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1,
tmp2, tmp3); tmp2, tmp3);
SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
PCKEV_AVG_ST8x4_UB(tmp0, dst1, tmp1, dst2, tmp2, dst3, tmp3, dst4, PCKEV_AVG_ST8x4_UB(tmp0, dst1, tmp1, dst2, tmp2, dst3, tmp3, dst4, dst,
dst, dst_stride); dst_stride);
dst += (4 * dst_stride); dst += (4 * dst_stride);
DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, tmp0, tmp1, DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, tmp0, tmp1,
tmp2, tmp3); tmp2, tmp3);
SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
PCKEV_AVG_ST8x4_UB(tmp0, dst5, tmp1, dst6, tmp2, dst7, tmp3, dst8, PCKEV_AVG_ST8x4_UB(tmp0, dst5, tmp1, dst6, tmp2, dst7, tmp3, dst8, dst,
dst, dst_stride); dst_stride);
dst += (4 * dst_stride); dst += (4 * dst_stride);
src0 = src8; src0 = src8;
@ -416,10 +394,8 @@ static void common_vt_2t_and_aver_dst_8x8mult_msa(const uint8_t *src,
} }
static void common_vt_2t_and_aver_dst_8w_msa(const uint8_t *src, static void common_vt_2t_and_aver_dst_8w_msa(const uint8_t *src,
int32_t src_stride, int32_t src_stride, uint8_t *dst,
uint8_t *dst, int32_t dst_stride, int8_t *filter,
int32_t dst_stride,
int8_t *filter,
int32_t height) { int32_t height) {
if (4 == height) { if (4 == height) {
common_vt_2t_and_aver_dst_8x4_msa(src, src_stride, dst, dst_stride, filter); common_vt_2t_and_aver_dst_8x4_msa(src, src_stride, dst, dst_stride, filter);
@ -430,11 +406,9 @@ static void common_vt_2t_and_aver_dst_8w_msa(const uint8_t *src,
} }
static void common_vt_2t_and_aver_dst_16w_msa(const uint8_t *src, static void common_vt_2t_and_aver_dst_16w_msa(const uint8_t *src,
int32_t src_stride, int32_t src_stride, uint8_t *dst,
uint8_t *dst,
int32_t dst_stride, int32_t dst_stride,
int8_t *filter, int8_t *filter, int32_t height) {
int32_t height) {
uint32_t loop_cnt; uint32_t loop_cnt;
v16u8 src0, src1, src2, src3, src4, dst0, dst1, dst2, dst3, filt0; v16u8 src0, src1, src2, src3, src4, dst0, dst1, dst2, dst3, filt0;
v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
@ -481,11 +455,9 @@ static void common_vt_2t_and_aver_dst_16w_msa(const uint8_t *src,
} }
static void common_vt_2t_and_aver_dst_32w_msa(const uint8_t *src, static void common_vt_2t_and_aver_dst_32w_msa(const uint8_t *src,
int32_t src_stride, int32_t src_stride, uint8_t *dst,
uint8_t *dst,
int32_t dst_stride, int32_t dst_stride,
int8_t *filter, int8_t *filter, int32_t height) {
int32_t height) {
uint32_t loop_cnt; uint32_t loop_cnt;
v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9; v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9;
v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
@ -554,11 +526,9 @@ static void common_vt_2t_and_aver_dst_32w_msa(const uint8_t *src,
} }
static void common_vt_2t_and_aver_dst_64w_msa(const uint8_t *src, static void common_vt_2t_and_aver_dst_64w_msa(const uint8_t *src,
int32_t src_stride, int32_t src_stride, uint8_t *dst,
uint8_t *dst,
int32_t dst_stride, int32_t dst_stride,
int8_t *filter, int8_t *filter, int32_t height) {
int32_t height) {
uint32_t loop_cnt; uint32_t loop_cnt;
v16u8 src0, src1, src2, src3, src4, src5; v16u8 src0, src1, src2, src3, src4, src5;
v16u8 src6, src7, src8, src9, src10, src11, filt0; v16u8 src6, src7, src8, src9, src10, src11, filt0;
@ -636,8 +606,8 @@ static void common_vt_2t_and_aver_dst_64w_msa(const uint8_t *src,
void vpx_convolve8_avg_vert_msa(const uint8_t *src, ptrdiff_t src_stride, void vpx_convolve8_avg_vert_msa(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride, uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4, const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4, const int16_t *filter_y, int y_step_q4, int w,
int w, int h) { int h) {
int8_t cnt, filt_ver[8]; int8_t cnt, filt_ver[8];
assert(y_step_q4 == 16); assert(y_step_q4 == 16);
@ -650,68 +620,56 @@ void vpx_convolve8_avg_vert_msa(const uint8_t *src, ptrdiff_t src_stride,
if (((const int32_t *)filter_y)[0] == 0) { if (((const int32_t *)filter_y)[0] == 0) {
switch (w) { switch (w) {
case 4: case 4:
common_vt_2t_and_aver_dst_4w_msa(src, (int32_t)src_stride, common_vt_2t_and_aver_dst_4w_msa(src, (int32_t)src_stride, dst,
dst, (int32_t)dst_stride, (int32_t)dst_stride, &filt_ver[3], h);
&filt_ver[3], h);
break; break;
case 8: case 8:
common_vt_2t_and_aver_dst_8w_msa(src, (int32_t)src_stride, common_vt_2t_and_aver_dst_8w_msa(src, (int32_t)src_stride, dst,
dst, (int32_t)dst_stride, (int32_t)dst_stride, &filt_ver[3], h);
&filt_ver[3], h);
break; break;
case 16: case 16:
common_vt_2t_and_aver_dst_16w_msa(src, (int32_t)src_stride, common_vt_2t_and_aver_dst_16w_msa(src, (int32_t)src_stride, dst,
dst, (int32_t)dst_stride, (int32_t)dst_stride, &filt_ver[3], h);
&filt_ver[3], h);
break; break;
case 32: case 32:
common_vt_2t_and_aver_dst_32w_msa(src, (int32_t)src_stride, common_vt_2t_and_aver_dst_32w_msa(src, (int32_t)src_stride, dst,
dst, (int32_t)dst_stride, (int32_t)dst_stride, &filt_ver[3], h);
&filt_ver[3], h);
break; break;
case 64: case 64:
common_vt_2t_and_aver_dst_64w_msa(src, (int32_t)src_stride, common_vt_2t_and_aver_dst_64w_msa(src, (int32_t)src_stride, dst,
dst, (int32_t)dst_stride, (int32_t)dst_stride, &filt_ver[3], h);
&filt_ver[3], h);
break; break;
default: default:
vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter_x,
filter_x, x_step_q4, filter_y, y_step_q4, x_step_q4, filter_y, y_step_q4, w, h);
w, h);
break; break;
} }
} else { } else {
switch (w) { switch (w) {
case 4: case 4:
common_vt_8t_and_aver_dst_4w_msa(src, (int32_t)src_stride, common_vt_8t_and_aver_dst_4w_msa(src, (int32_t)src_stride, dst,
dst, (int32_t)dst_stride, (int32_t)dst_stride, filt_ver, h);
filt_ver, h);
break; break;
case 8: case 8:
common_vt_8t_and_aver_dst_8w_msa(src, (int32_t)src_stride, common_vt_8t_and_aver_dst_8w_msa(src, (int32_t)src_stride, dst,
dst, (int32_t)dst_stride, (int32_t)dst_stride, filt_ver, h);
filt_ver, h);
break; break;
case 16: case 16:
common_vt_8t_and_aver_dst_16w_msa(src, (int32_t)src_stride, common_vt_8t_and_aver_dst_16w_msa(src, (int32_t)src_stride, dst,
dst, (int32_t)dst_stride, (int32_t)dst_stride, filt_ver, h);
filt_ver, h);
break; break;
case 32: case 32:
common_vt_8t_and_aver_dst_32w_msa(src, (int32_t)src_stride, common_vt_8t_and_aver_dst_32w_msa(src, (int32_t)src_stride, dst,
dst, (int32_t)dst_stride, (int32_t)dst_stride, filt_ver, h);
filt_ver, h);
break; break;
case 64: case 64:
common_vt_8t_and_aver_dst_64w_msa(src, (int32_t)src_stride, common_vt_8t_and_aver_dst_64w_msa(src, (int32_t)src_stride, dst,
dst, (int32_t)dst_stride, (int32_t)dst_stride, filt_ver, h);
filt_ver, h);
break; break;
default: default:
vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter_x,
filter_x, x_step_q4, filter_y, y_step_q4, x_step_q4, filter_y, y_step_q4, w, h);
w, h);
break; break;
} }
} }

View File

@ -325,7 +325,7 @@ static void common_hz_2t_4x4_msa(const uint8_t *src, int32_t src_stride,
/* rearranging filter */ /* rearranging filter */
filt = LD_UH(filter); filt = LD_UH(filter);
filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0); filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
LD_SB4(src, src_stride, src0, src1, src2, src3); LD_SB4(src, src_stride, src0, src1, src2, src3);
VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1); VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
@ -347,7 +347,7 @@ static void common_hz_2t_4x8_msa(const uint8_t *src, int32_t src_stride,
/* rearranging filter */ /* rearranging filter */
filt = LD_UH(filter); filt = LD_UH(filter);
filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0); filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1); VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
@ -355,8 +355,8 @@ static void common_hz_2t_4x8_msa(const uint8_t *src, int32_t src_stride,
DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec4, vec5, DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec4, vec5,
vec6, vec7); vec6, vec7);
SRARI_H4_UH(vec4, vec5, vec6, vec7, FILTER_BITS); SRARI_H4_UH(vec4, vec5, vec6, vec7, FILTER_BITS);
PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, res0, res1, PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, res0, res1, res2,
res2, res3); res3);
ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride); ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
dst += (4 * dst_stride); dst += (4 * dst_stride);
ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride); ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride);
@ -383,7 +383,7 @@ static void common_hz_2t_8x4_msa(const uint8_t *src, int32_t src_stride,
/* rearranging filter */ /* rearranging filter */
filt = LD_UH(filter); filt = LD_UH(filter);
filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0); filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
LD_SB4(src, src_stride, src0, src1, src2, src3); LD_SB4(src, src_stride, src0, src1, src2, src3);
VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
@ -406,7 +406,7 @@ static void common_hz_2t_8x8mult_msa(const uint8_t *src, int32_t src_stride,
/* rearranging filter */ /* rearranging filter */
filt = LD_UH(filter); filt = LD_UH(filter);
filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0); filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
LD_SB4(src, src_stride, src0, src1, src2, src3); LD_SB4(src, src_stride, src0, src1, src2, src3);
src += (4 * src_stride); src += (4 * src_stride);
@ -482,7 +482,7 @@ static void common_hz_2t_16w_msa(const uint8_t *src, int32_t src_stride,
/* rearranging filter */ /* rearranging filter */
filt = LD_UH(filter); filt = LD_UH(filter);
filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0); filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
LD_SB4(src, src_stride, src0, src2, src4, src6); LD_SB4(src, src_stride, src0, src2, src4, src6);
LD_SB4(src + 8, src_stride, src1, src3, src5, src7); LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
@ -545,7 +545,7 @@ static void common_hz_2t_32w_msa(const uint8_t *src, int32_t src_stride,
/* rearranging filter */ /* rearranging filter */
filt = LD_UH(filter); filt = LD_UH(filter);
filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0); filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
for (loop_cnt = height >> 1; loop_cnt--;) { for (loop_cnt = height >> 1; loop_cnt--;) {
src0 = LD_SB(src); src0 = LD_SB(src);
@ -590,7 +590,7 @@ static void common_hz_2t_64w_msa(const uint8_t *src, int32_t src_stride,
/* rearranging filter */ /* rearranging filter */
filt = LD_UH(filter); filt = LD_UH(filter);
filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0); filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
for (loop_cnt = height; loop_cnt--;) { for (loop_cnt = height; loop_cnt--;) {
src0 = LD_SB(src); src0 = LD_SB(src);
@ -622,8 +622,8 @@ static void common_hz_2t_64w_msa(const uint8_t *src, int32_t src_stride,
void vpx_convolve8_horiz_msa(const uint8_t *src, ptrdiff_t src_stride, void vpx_convolve8_horiz_msa(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride, uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4, const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4, const int16_t *filter_y, int y_step_q4, int w,
int w, int h) { int h) {
int8_t cnt, filt_hor[8]; int8_t cnt, filt_hor[8];
assert(x_step_q4 == 16); assert(x_step_q4 == 16);
@ -636,67 +636,55 @@ void vpx_convolve8_horiz_msa(const uint8_t *src, ptrdiff_t src_stride,
if (((const int32_t *)filter_x)[0] == 0) { if (((const int32_t *)filter_x)[0] == 0) {
switch (w) { switch (w) {
case 4: case 4:
common_hz_2t_4w_msa(src, (int32_t)src_stride, common_hz_2t_4w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
dst, (int32_t)dst_stride,
&filt_hor[3], h); &filt_hor[3], h);
break; break;
case 8: case 8:
common_hz_2t_8w_msa(src, (int32_t)src_stride, common_hz_2t_8w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
dst, (int32_t)dst_stride,
&filt_hor[3], h); &filt_hor[3], h);
break; break;
case 16: case 16:
common_hz_2t_16w_msa(src, (int32_t)src_stride, common_hz_2t_16w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
dst, (int32_t)dst_stride,
&filt_hor[3], h); &filt_hor[3], h);
break; break;
case 32: case 32:
common_hz_2t_32w_msa(src, (int32_t)src_stride, common_hz_2t_32w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
dst, (int32_t)dst_stride,
&filt_hor[3], h); &filt_hor[3], h);
break; break;
case 64: case 64:
common_hz_2t_64w_msa(src, (int32_t)src_stride, common_hz_2t_64w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
dst, (int32_t)dst_stride,
&filt_hor[3], h); &filt_hor[3], h);
break; break;
default: default:
vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter_x,
filter_x, x_step_q4, filter_y, y_step_q4, x_step_q4, filter_y, y_step_q4, w, h);
w, h);
break; break;
} }
} else { } else {
switch (w) { switch (w) {
case 4: case 4:
common_hz_8t_4w_msa(src, (int32_t)src_stride, common_hz_8t_4w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
dst, (int32_t)dst_stride,
filt_hor, h); filt_hor, h);
break; break;
case 8: case 8:
common_hz_8t_8w_msa(src, (int32_t)src_stride, common_hz_8t_8w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
dst, (int32_t)dst_stride,
filt_hor, h); filt_hor, h);
break; break;
case 16: case 16:
common_hz_8t_16w_msa(src, (int32_t)src_stride, common_hz_8t_16w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
dst, (int32_t)dst_stride,
filt_hor, h); filt_hor, h);
break; break;
case 32: case 32:
common_hz_8t_32w_msa(src, (int32_t)src_stride, common_hz_8t_32w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
dst, (int32_t)dst_stride,
filt_hor, h); filt_hor, h);
break; break;
case 64: case 64:
common_hz_8t_64w_msa(src, (int32_t)src_stride, common_hz_8t_64w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
dst, (int32_t)dst_stride,
filt_hor, h); filt_hor, h);
break; break;
default: default:
vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter_x,
filter_x, x_step_q4, filter_y, y_step_q4, x_step_q4, filter_y, y_step_q4, w, h);
w, h);
break; break;
} }
} }

View File

@ -69,15 +69,15 @@ static void common_hv_8ht_8vt_4w_msa(const uint8_t *src, int32_t src_stride,
XORI_B4_128_SB(src7, src8, src9, src10); XORI_B4_128_SB(src7, src8, src9, src10);
src += (4 * src_stride); src += (4 * src_stride);
hz_out7 = HORIZ_8TAP_FILT(src7, src8, mask0, mask1, mask2, mask3, hz_out7 = HORIZ_8TAP_FILT(src7, src8, mask0, mask1, mask2, mask3, filt_hz0,
filt_hz0, filt_hz1, filt_hz2, filt_hz3); filt_hz1, filt_hz2, filt_hz3);
hz_out6 = (v8i16)__msa_sldi_b((v16i8)hz_out7, (v16i8)hz_out5, 8); hz_out6 = (v8i16)__msa_sldi_b((v16i8)hz_out7, (v16i8)hz_out5, 8);
out3 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6); out3 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6);
tmp0 = FILT_8TAP_DPADD_S_H(out0, out1, out2, out3, filt_vt0, filt_vt1, tmp0 = FILT_8TAP_DPADD_S_H(out0, out1, out2, out3, filt_vt0, filt_vt1,
filt_vt2, filt_vt3); filt_vt2, filt_vt3);
hz_out9 = HORIZ_8TAP_FILT(src9, src10, mask0, mask1, mask2, mask3, hz_out9 = HORIZ_8TAP_FILT(src9, src10, mask0, mask1, mask2, mask3, filt_hz0,
filt_hz0, filt_hz1, filt_hz2, filt_hz3); filt_hz1, filt_hz2, filt_hz3);
hz_out8 = (v8i16)__msa_sldi_b((v16i8)hz_out9, (v16i8)hz_out7, 8); hz_out8 = (v8i16)__msa_sldi_b((v16i8)hz_out9, (v16i8)hz_out7, 8);
out4 = (v8i16)__msa_ilvev_b((v16i8)hz_out9, (v16i8)hz_out8); out4 = (v8i16)__msa_ilvev_b((v16i8)hz_out9, (v16i8)hz_out8);
tmp1 = FILT_8TAP_DPADD_S_H(out1, out2, out3, out4, filt_vt0, filt_vt1, tmp1 = FILT_8TAP_DPADD_S_H(out1, out2, out3, out4, filt_vt0, filt_vt1,
@ -151,20 +151,20 @@ static void common_hv_8ht_8vt_8w_msa(const uint8_t *src, int32_t src_stride,
XORI_B4_128_SB(src7, src8, src9, src10); XORI_B4_128_SB(src7, src8, src9, src10);
hz_out7 = HORIZ_8TAP_FILT(src7, src7, mask0, mask1, mask2, mask3, hz_out7 = HORIZ_8TAP_FILT(src7, src7, mask0, mask1, mask2, mask3, filt_hz0,
filt_hz0, filt_hz1, filt_hz2, filt_hz3); filt_hz1, filt_hz2, filt_hz3);
out3 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6); out3 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6);
tmp0 = FILT_8TAP_DPADD_S_H(out0, out1, out2, out3, filt_vt0, filt_vt1, tmp0 = FILT_8TAP_DPADD_S_H(out0, out1, out2, out3, filt_vt0, filt_vt1,
filt_vt2, filt_vt3); filt_vt2, filt_vt3);
hz_out8 = HORIZ_8TAP_FILT(src8, src8, mask0, mask1, mask2, mask3, hz_out8 = HORIZ_8TAP_FILT(src8, src8, mask0, mask1, mask2, mask3, filt_hz0,
filt_hz0, filt_hz1, filt_hz2, filt_hz3); filt_hz1, filt_hz2, filt_hz3);
out7 = (v8i16)__msa_ilvev_b((v16i8)hz_out8, (v16i8)hz_out7); out7 = (v8i16)__msa_ilvev_b((v16i8)hz_out8, (v16i8)hz_out7);
tmp1 = FILT_8TAP_DPADD_S_H(out4, out5, out6, out7, filt_vt0, filt_vt1, tmp1 = FILT_8TAP_DPADD_S_H(out4, out5, out6, out7, filt_vt0, filt_vt1,
filt_vt2, filt_vt3); filt_vt2, filt_vt3);
hz_out9 = HORIZ_8TAP_FILT(src9, src9, mask0, mask1, mask2, mask3, hz_out9 = HORIZ_8TAP_FILT(src9, src9, mask0, mask1, mask2, mask3, filt_hz0,
filt_hz0, filt_hz1, filt_hz2, filt_hz3); filt_hz1, filt_hz2, filt_hz3);
out8 = (v8i16)__msa_ilvev_b((v16i8)hz_out9, (v16i8)hz_out8); out8 = (v8i16)__msa_ilvev_b((v16i8)hz_out9, (v16i8)hz_out8);
tmp2 = FILT_8TAP_DPADD_S_H(out1, out2, out3, out8, filt_vt0, filt_vt1, tmp2 = FILT_8TAP_DPADD_S_H(out1, out2, out3, out8, filt_vt0, filt_vt1,
filt_vt2, filt_vt3); filt_vt2, filt_vt3);
@ -295,11 +295,11 @@ static void common_hv_2ht_2vt_4x8_msa(const uint8_t *src, int32_t src_stride,
ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
ILVEV_B2_UB(hz_out4, hz_out5, hz_out6, hz_out7, vec2, vec3); ILVEV_B2_UB(hz_out4, hz_out5, hz_out6, hz_out7, vec2, vec3);
DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt_vt, filt_vt, filt_vt, filt_vt, DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt_vt, filt_vt, filt_vt, filt_vt, vec4,
vec4, vec5, vec6, vec7); vec5, vec6, vec7);
SRARI_H4_UH(vec4, vec5, vec6, vec7, FILTER_BITS); SRARI_H4_UH(vec4, vec5, vec6, vec7, FILTER_BITS);
PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, res0, res1, PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, res0, res1, res2,
res2, res3); res3);
ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride); ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
dst += (4 * dst_stride); dst += (4 * dst_stride);
ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride); ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride);
@ -361,12 +361,10 @@ static void common_hv_2ht_2vt_8x4_msa(const uint8_t *src, int32_t src_stride,
} }
static void common_hv_2ht_2vt_8x8mult_msa(const uint8_t *src, static void common_hv_2ht_2vt_8x8mult_msa(const uint8_t *src,
int32_t src_stride, int32_t src_stride, uint8_t *dst,
uint8_t *dst,
int32_t dst_stride, int32_t dst_stride,
int8_t *filter_horiz, int8_t *filter_horiz,
int8_t *filter_vert, int8_t *filter_vert, int32_t height) {
int32_t height) {
uint32_t loop_cnt; uint32_t loop_cnt;
v16i8 src0, src1, src2, src3, src4, mask, out0, out1; v16i8 src0, src1, src2, src3, src4, mask, out0, out1;
v16u8 filt_hz, filt_vt, vec0; v16u8 filt_hz, filt_vt, vec0;
@ -542,11 +540,10 @@ static void common_hv_2ht_2vt_64w_msa(const uint8_t *src, int32_t src_stride,
} }
} }
void vpx_convolve8_msa(const uint8_t *src, ptrdiff_t src_stride, void vpx_convolve8_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
uint8_t *dst, ptrdiff_t dst_stride, ptrdiff_t dst_stride, const int16_t *filter_x,
const int16_t *filter_x, int32_t x_step_q4, int32_t x_step_q4, const int16_t *filter_y,
const int16_t *filter_y, int32_t y_step_q4, int32_t y_step_q4, int32_t w, int32_t h) {
int32_t w, int32_t h) {
int8_t cnt, filt_hor[8], filt_ver[8]; int8_t cnt, filt_hor[8], filt_ver[8];
assert(x_step_q4 == 16); assert(x_step_q4 == 16);
@ -563,72 +560,69 @@ void vpx_convolve8_msa(const uint8_t *src, ptrdiff_t src_stride,
((const int32_t *)filter_y)[0] == 0) { ((const int32_t *)filter_y)[0] == 0) {
switch (w) { switch (w) {
case 4: case 4:
common_hv_2ht_2vt_4w_msa(src, (int32_t)src_stride, common_hv_2ht_2vt_4w_msa(src, (int32_t)src_stride, dst,
dst, (int32_t)dst_stride, (int32_t)dst_stride, &filt_hor[3],
&filt_hor[3], &filt_ver[3], (int32_t)h); &filt_ver[3], (int32_t)h);
break; break;
case 8: case 8:
common_hv_2ht_2vt_8w_msa(src, (int32_t)src_stride, common_hv_2ht_2vt_8w_msa(src, (int32_t)src_stride, dst,
dst, (int32_t)dst_stride, (int32_t)dst_stride, &filt_hor[3],
&filt_hor[3], &filt_ver[3], (int32_t)h); &filt_ver[3], (int32_t)h);
break; break;
case 16: case 16:
common_hv_2ht_2vt_16w_msa(src, (int32_t)src_stride, common_hv_2ht_2vt_16w_msa(src, (int32_t)src_stride, dst,
dst, (int32_t)dst_stride, (int32_t)dst_stride, &filt_hor[3],
&filt_hor[3], &filt_ver[3], (int32_t)h); &filt_ver[3], (int32_t)h);
break; break;
case 32: case 32:
common_hv_2ht_2vt_32w_msa(src, (int32_t)src_stride, common_hv_2ht_2vt_32w_msa(src, (int32_t)src_stride, dst,
dst, (int32_t)dst_stride, (int32_t)dst_stride, &filt_hor[3],
&filt_hor[3], &filt_ver[3], (int32_t)h); &filt_ver[3], (int32_t)h);
break; break;
case 64: case 64:
common_hv_2ht_2vt_64w_msa(src, (int32_t)src_stride, common_hv_2ht_2vt_64w_msa(src, (int32_t)src_stride, dst,
dst, (int32_t)dst_stride, (int32_t)dst_stride, &filt_hor[3],
&filt_hor[3], &filt_ver[3], (int32_t)h); &filt_ver[3], (int32_t)h);
break; break;
default: default:
vpx_convolve8_c(src, src_stride, dst, dst_stride, vpx_convolve8_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4,
filter_x, x_step_q4, filter_y, y_step_q4, filter_y, y_step_q4, w, h);
w, h);
break; break;
} }
} else if (((const int32_t *)filter_x)[0] == 0 || } else if (((const int32_t *)filter_x)[0] == 0 ||
((const int32_t *)filter_y)[0] == 0) { ((const int32_t *)filter_y)[0] == 0) {
vpx_convolve8_c(src, src_stride, dst, dst_stride, vpx_convolve8_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4,
filter_x, x_step_q4, filter_y, y_step_q4, filter_y, y_step_q4, w, h);
w, h);
} else { } else {
switch (w) { switch (w) {
case 4: case 4:
common_hv_8ht_8vt_4w_msa(src, (int32_t)src_stride, common_hv_8ht_8vt_4w_msa(src, (int32_t)src_stride, dst,
dst, (int32_t)dst_stride, (int32_t)dst_stride, filt_hor, filt_ver,
filt_hor, filt_ver, (int32_t)h); (int32_t)h);
break; break;
case 8: case 8:
common_hv_8ht_8vt_8w_msa(src, (int32_t)src_stride, common_hv_8ht_8vt_8w_msa(src, (int32_t)src_stride, dst,
dst, (int32_t)dst_stride, (int32_t)dst_stride, filt_hor, filt_ver,
filt_hor, filt_ver, (int32_t)h); (int32_t)h);
break; break;
case 16: case 16:
common_hv_8ht_8vt_16w_msa(src, (int32_t)src_stride, common_hv_8ht_8vt_16w_msa(src, (int32_t)src_stride, dst,
dst, (int32_t)dst_stride, (int32_t)dst_stride, filt_hor, filt_ver,
filt_hor, filt_ver, (int32_t)h); (int32_t)h);
break; break;
case 32: case 32:
common_hv_8ht_8vt_32w_msa(src, (int32_t)src_stride, common_hv_8ht_8vt_32w_msa(src, (int32_t)src_stride, dst,
dst, (int32_t)dst_stride, (int32_t)dst_stride, filt_hor, filt_ver,
filt_hor, filt_ver, (int32_t)h); (int32_t)h);
break; break;
case 64: case 64:
common_hv_8ht_8vt_64w_msa(src, (int32_t)src_stride, common_hv_8ht_8vt_64w_msa(src, (int32_t)src_stride, dst,
dst, (int32_t)dst_stride, (int32_t)dst_stride, filt_hor, filt_ver,
filt_hor, filt_ver, (int32_t)h); (int32_t)h);
break; break;
default: default:
vpx_convolve8_c(src, src_stride, dst, dst_stride, vpx_convolve8_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4,
filter_x, x_step_q4, filter_y, y_step_q4, filter_y, y_step_q4, w, h);
w, h);
break; break;
} }
} }

View File

@ -222,11 +222,11 @@ static void common_vt_8t_16w_mult_msa(const uint8_t *src, int32_t src_stride,
LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6); LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
src_tmp += (7 * src_stride); src_tmp += (7 * src_stride);
ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
src32_r, src54_r, src21_r); src54_r, src21_r);
ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r); ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l, ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l, src32_l,
src32_l, src54_l, src21_l); src54_l, src21_l);
ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l); ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
for (loop_cnt = (height >> 2); loop_cnt--;) { for (loop_cnt = (height >> 2); loop_cnt--;) {
@ -344,8 +344,8 @@ static void common_vt_2t_4x8_msa(const uint8_t *src, int32_t src_stride,
src32_r, src43_r); src32_r, src43_r);
ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r, ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
src76_r, src87_r); src76_r, src87_r);
ILVR_D4_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, ILVR_D4_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, src87_r,
src87_r, src76_r, src2110, src4332, src6554, src8776); src76_r, src2110, src4332, src6554, src8776);
DOTP_UB4_UH(src2110, src4332, src6554, src8776, filt0, filt0, filt0, filt0, DOTP_UB4_UH(src2110, src4332, src6554, src8776, filt0, filt0, filt0, filt0,
tmp0, tmp1, tmp2, tmp3); tmp0, tmp1, tmp2, tmp3);
SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
@ -407,10 +407,10 @@ static void common_vt_2t_8x8mult_msa(const uint8_t *src, int32_t src_stride,
LD_UB8(src, src_stride, src1, src2, src3, src4, src5, src6, src7, src8); LD_UB8(src, src_stride, src1, src2, src3, src4, src5, src6, src7, src8);
src += (8 * src_stride); src += (8 * src_stride);
ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, vec0, vec1, ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, vec0, vec1, vec2,
vec2, vec3); vec3);
ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7, vec4, vec5, ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7, vec4, vec5, vec6,
vec6, vec7); vec7);
DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1, DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1,
tmp2, tmp3); tmp2, tmp3);
SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
@ -629,8 +629,8 @@ static void common_vt_2t_64w_msa(const uint8_t *src, int32_t src_stride,
void vpx_convolve8_vert_msa(const uint8_t *src, ptrdiff_t src_stride, void vpx_convolve8_vert_msa(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride, uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4, const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4, const int16_t *filter_y, int y_step_q4, int w,
int w, int h) { int h) {
int8_t cnt, filt_ver[8]; int8_t cnt, filt_ver[8];
assert(y_step_q4 == 16); assert(y_step_q4 == 16);
@ -643,67 +643,55 @@ void vpx_convolve8_vert_msa(const uint8_t *src, ptrdiff_t src_stride,
if (((const int32_t *)filter_y)[0] == 0) { if (((const int32_t *)filter_y)[0] == 0) {
switch (w) { switch (w) {
case 4: case 4:
common_vt_2t_4w_msa(src, (int32_t)src_stride, common_vt_2t_4w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
dst, (int32_t)dst_stride,
&filt_ver[3], h); &filt_ver[3], h);
break; break;
case 8: case 8:
common_vt_2t_8w_msa(src, (int32_t)src_stride, common_vt_2t_8w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
dst, (int32_t)dst_stride,
&filt_ver[3], h); &filt_ver[3], h);
break; break;
case 16: case 16:
common_vt_2t_16w_msa(src, (int32_t)src_stride, common_vt_2t_16w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
dst, (int32_t)dst_stride,
&filt_ver[3], h); &filt_ver[3], h);
break; break;
case 32: case 32:
common_vt_2t_32w_msa(src, (int32_t)src_stride, common_vt_2t_32w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
dst, (int32_t)dst_stride,
&filt_ver[3], h); &filt_ver[3], h);
break; break;
case 64: case 64:
common_vt_2t_64w_msa(src, (int32_t)src_stride, common_vt_2t_64w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
dst, (int32_t)dst_stride,
&filt_ver[3], h); &filt_ver[3], h);
break; break;
default: default:
vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x,
filter_x, x_step_q4, filter_y, y_step_q4, x_step_q4, filter_y, y_step_q4, w, h);
w, h);
break; break;
} }
} else { } else {
switch (w) { switch (w) {
case 4: case 4:
common_vt_8t_4w_msa(src, (int32_t)src_stride, common_vt_8t_4w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
dst, (int32_t)dst_stride,
filt_ver, h); filt_ver, h);
break; break;
case 8: case 8:
common_vt_8t_8w_msa(src, (int32_t)src_stride, common_vt_8t_8w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
dst, (int32_t)dst_stride,
filt_ver, h); filt_ver, h);
break; break;
case 16: case 16:
common_vt_8t_16w_msa(src, (int32_t)src_stride, common_vt_8t_16w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
dst, (int32_t)dst_stride,
filt_ver, h); filt_ver, h);
break; break;
case 32: case 32:
common_vt_8t_32w_msa(src, (int32_t)src_stride, common_vt_8t_32w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
dst, (int32_t)dst_stride,
filt_ver, h); filt_ver, h);
break; break;
case 64: case 64:
common_vt_8t_64w_msa(src, (int32_t)src_stride, common_vt_8t_64w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
dst, (int32_t)dst_stride,
filt_ver, h); filt_ver, h);
break; break;
default: default:
vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x,
filter_x, x_step_q4, filter_y, y_step_q4, x_step_q4, filter_y, y_step_q4, w, h);
w, h);
break; break;
} }
} }

View File

@ -10,8 +10,8 @@
#include "vpx_dsp/mips/macros_msa.h" #include "vpx_dsp/mips/macros_msa.h"
static void avg_width4_msa(const uint8_t *src, int32_t src_stride, static void avg_width4_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst,
uint8_t *dst, int32_t dst_stride, int32_t height) { int32_t dst_stride, int32_t height) {
int32_t cnt; int32_t cnt;
uint32_t out0, out1, out2, out3; uint32_t out0, out1, out2, out3;
v16u8 src0, src1, src2, src3; v16u8 src0, src1, src2, src3;
@ -24,8 +24,8 @@ static void avg_width4_msa(const uint8_t *src, int32_t src_stride,
LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, dst0, dst1,
dst0, dst1, dst2, dst3); dst2, dst3);
out0 = __msa_copy_u_w((v4i32)dst0, 0); out0 = __msa_copy_u_w((v4i32)dst0, 0);
out1 = __msa_copy_u_w((v4i32)dst1, 0); out1 = __msa_copy_u_w((v4i32)dst1, 0);
@ -53,8 +53,8 @@ static void avg_width4_msa(const uint8_t *src, int32_t src_stride,
} }
} }
static void avg_width8_msa(const uint8_t *src, int32_t src_stride, static void avg_width8_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst,
uint8_t *dst, int32_t dst_stride, int32_t height) { int32_t dst_stride, int32_t height) {
int32_t cnt; int32_t cnt;
uint64_t out0, out1, out2, out3; uint64_t out0, out1, out2, out3;
v16u8 src0, src1, src2, src3; v16u8 src0, src1, src2, src3;
@ -65,8 +65,8 @@ static void avg_width8_msa(const uint8_t *src, int32_t src_stride,
src += (4 * src_stride); src += (4 * src_stride);
LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, dst0, dst1,
dst0, dst1, dst2, dst3); dst2, dst3);
out0 = __msa_copy_u_d((v2i64)dst0, 0); out0 = __msa_copy_u_d((v2i64)dst0, 0);
out1 = __msa_copy_u_d((v2i64)dst1, 0); out1 = __msa_copy_u_d((v2i64)dst1, 0);
@ -88,10 +88,10 @@ static void avg_width16_msa(const uint8_t *src, int32_t src_stride,
src += (8 * src_stride); src += (8 * src_stride);
LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7); LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, dst0, dst1,
dst0, dst1, dst2, dst3); dst2, dst3);
AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7, AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7, dst4, dst5,
dst4, dst5, dst6, dst7); dst6, dst7);
ST_UB8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst, dst_stride); ST_UB8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst, dst_stride);
dst += (8 * dst_stride); dst += (8 * dst_stride);
} }
@ -120,14 +120,14 @@ static void avg_width32_msa(const uint8_t *src, int32_t src_stride,
LD_UB4(dst_dup + 16, dst_stride, dst9, dst11, dst13, dst15); LD_UB4(dst_dup + 16, dst_stride, dst9, dst11, dst13, dst15);
dst_dup += (4 * dst_stride); dst_dup += (4 * dst_stride);
AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, dst0, dst1,
dst0, dst1, dst2, dst3); dst2, dst3);
AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7, AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7, dst4, dst5,
dst4, dst5, dst6, dst7); dst6, dst7);
AVER_UB4_UB(src8, dst8, src9, dst9, src10, dst10, src11, dst11, AVER_UB4_UB(src8, dst8, src9, dst9, src10, dst10, src11, dst11, dst8, dst9,
dst8, dst9, dst10, dst11); dst10, dst11);
AVER_UB4_UB(src12, dst12, src13, dst13, src14, dst14, src15, dst15, AVER_UB4_UB(src12, dst12, src13, dst13, src14, dst14, src15, dst15, dst12,
dst12, dst13, dst14, dst15); dst13, dst14, dst15);
ST_UB4(dst0, dst2, dst4, dst6, dst, dst_stride); ST_UB4(dst0, dst2, dst4, dst6, dst, dst_stride);
ST_UB4(dst1, dst3, dst5, dst7, dst + 16, dst_stride); ST_UB4(dst1, dst3, dst5, dst7, dst + 16, dst_stride);
@ -166,14 +166,14 @@ static void avg_width64_msa(const uint8_t *src, int32_t src_stride,
LD_UB4(dst_dup, 16, dst12, dst13, dst14, dst15); LD_UB4(dst_dup, 16, dst12, dst13, dst14, dst15);
dst_dup += dst_stride; dst_dup += dst_stride;
AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, dst0, dst1,
dst0, dst1, dst2, dst3); dst2, dst3);
AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7, AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7, dst4, dst5,
dst4, dst5, dst6, dst7); dst6, dst7);
AVER_UB4_UB(src8, dst8, src9, dst9, src10, dst10, src11, dst11, AVER_UB4_UB(src8, dst8, src9, dst9, src10, dst10, src11, dst11, dst8, dst9,
dst8, dst9, dst10, dst11); dst10, dst11);
AVER_UB4_UB(src12, dst12, src13, dst13, src14, dst14, src15, dst15, AVER_UB4_UB(src12, dst12, src13, dst13, src14, dst14, src15, dst15, dst12,
dst12, dst13, dst14, dst15); dst13, dst14, dst15);
ST_UB4(dst0, dst1, dst2, dst3, dst, 16); ST_UB4(dst0, dst1, dst2, dst3, dst, 16);
dst += dst_stride; dst += dst_stride;

View File

@ -105,12 +105,12 @@ static void copy_16multx8mult_msa(const uint8_t *src, int32_t src_stride,
dst_tmp = dst; dst_tmp = dst;
for (loop_cnt = (height >> 3); loop_cnt--;) { for (loop_cnt = (height >> 3); loop_cnt--;) {
LD_UB8(src_tmp, src_stride, LD_UB8(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6,
src0, src1, src2, src3, src4, src5, src6, src7); src7);
src_tmp += (8 * src_stride); src_tmp += (8 * src_stride);
ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst_tmp,
dst_tmp, dst_stride); dst_stride);
dst_tmp += (8 * dst_stride); dst_tmp += (8 * dst_stride);
} }

Some files were not shown because too many files have changed in this diff Show More