Update vpx_lpf_vertical_16_dual_neon() intrinsics

Process 16 samples together.

Change-Id: If6ee8e3377aa2786417f2fc411ba7d87ea8b6799
Author: Linfeng Zhang, 2016-08-29 15:20:09 -07:00
parent 129814fcb4
commit f7cbfed682
2 changed files with 509 additions and 117 deletions
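
The dual filter now keeps all 16 columns in q registers and folds its 16-lane flat/flat2 masks into a single scalar before choosing a store path, so one branch covers both 8-wide halves. A minimal sketch of that reduction, assuming only <arm_neon.h> (the helper name is illustrative, not part of the patch):

#include <arm_neon.h>
#include <stdint.h>

/* Illustrative helper: fold a 16-lane 0x00/0xFF mask into one scalar that is
 * nonzero iff any lane is set, mirroring the vadd_u64()/vget_lane_u64()
 * sequence used by vpx_lpf_vertical_16_dual_neon() below. */
static uint64_t flat_mask_to_u64(const uint8x16_t flat) {
  const uint64x1_t sum = vadd_u64(vreinterpret_u64_u8(vget_low_u8(flat)),
                                  vreinterpret_u64_u8(vget_high_u8(flat)));
  return vget_lane_u64(sum, 0);
}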


@@ -633,147 +633,347 @@ void vpx_lpf_horizontal_edge_16_neon(uint8_t *s, int p, const uint8_t *blimit,
oq4, oq5, oq6, flat_u64, flat2_u64);
}

void vpx_lpf_vertical_16_neon(uint8_t *s, int p, const uint8_t *blimit,
                              const uint8_t *limit, const uint8_t *thresh) {
  const uint8x8_t blimit_u8x8 = vld1_dup_u8(blimit);
  const uint8x8_t limit_u8x8 = vld1_dup_u8(limit);
  const uint8x8_t thresh_u8x8 = vld1_dup_u8(thresh);
  uint8_t *d;
  uint8x16_t t0, t1, t2, t3, t4, t5, t6, t7;
  uint8x8_t p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7, op6,
      op5, op4, op3, op2, op1, op0, oq0, oq1, oq2, oq3, oq4, oq5, oq6, flat,
      hev, mask, flat2;
  uint64_t flat_u64, flat2_u64;

  s -= 8;
  d = s;
  t0 = vld1q_u8(s);
  s += p;
  t1 = vld1q_u8(s);
  s += p;
  t2 = vld1q_u8(s);
  s += p;
  t3 = vld1q_u8(s);
  s += p;
  t4 = vld1q_u8(s);
  s += p;
  t5 = vld1q_u8(s);
  s += p;
  t6 = vld1q_u8(s);
  s += p;
  t7 = vld1q_u8(s);

  transpose_u8_16x8(t0, t1, t2, t3, t4, t5, t6, t7, &p7, &p6, &p5, &p4, &p3,
                    &p2, &p1, &p0, &q0, &q1, &q2, &q3, &q4, &q5, &q6, &q7);

  mask = filter_mask_8(limit_u8x8, blimit_u8x8, thresh_u8x8, p3, p2, p1, p0, q0,
                       q1, q2, q3, &flat, &hev);
  flat2 = flat_mask5_8(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7);
  flat = vand_u8(flat, mask);
  flat2 = vand_u8(flat2, flat);
  flat_u64 = vget_lane_u64(vreinterpret_u64_u8(flat), 0);
  flat2_u64 = vget_lane_u64(vreinterpret_u64_u8(flat2), 0);

  filter16_8(mask, flat, flat_u64, flat2, flat2_u64, hev, p7, p6, p5, p4, p3,
             p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7, &op6, &op5, &op4, &op3,
             &op2, &op1, &op0, &oq0, &oq1, &oq2, &oq3, &oq4, &oq5, &oq6);

  if (flat_u64) {
    if (flat2_u64) {
      uint8x16_t o0, o1, o2, o3, o4, o5, o6, o7;
      transpose_u8_8x16(p7, op6, op5, op4, op3, op2, op1, op0, oq0, oq1, oq2,
                        oq3, oq4, oq5, oq6, q7, &o0, &o1, &o2, &o3, &o4, &o5,
                        &o6, &o7);

      vst1q_u8(d, o0);
      d += p;
      vst1q_u8(d, o1);
      d += p;
      vst1q_u8(d, o2);
      d += p;
      vst1q_u8(d, o3);
      d += p;
      vst1q_u8(d, o4);
      d += p;
      vst1q_u8(d, o5);
      d += p;
      vst1q_u8(d, o6);
      d += p;
      vst1q_u8(d, o7);
    } else {
      uint8x8x3_t o0, o1;

      d += 8;
      o0.val[0] = op2;
      o0.val[1] = op1;
      o0.val[2] = op0;
      o1.val[0] = oq0;
      o1.val[1] = oq1;
      o1.val[2] = oq2;
      vst3_lane_u8(d - 3, o0, 0);
      vst3_lane_u8(d + 0, o1, 0);
      d += p;
      vst3_lane_u8(d - 3, o0, 1);
      vst3_lane_u8(d + 0, o1, 1);
      d += p;
      vst3_lane_u8(d - 3, o0, 2);
      vst3_lane_u8(d + 0, o1, 2);
      d += p;
      vst3_lane_u8(d - 3, o0, 3);
      vst3_lane_u8(d + 0, o1, 3);
      d += p;
      vst3_lane_u8(d - 3, o0, 4);
      vst3_lane_u8(d + 0, o1, 4);
      d += p;
      vst3_lane_u8(d - 3, o0, 5);
      vst3_lane_u8(d + 0, o1, 5);
      d += p;
      vst3_lane_u8(d - 3, o0, 6);
      vst3_lane_u8(d + 0, o1, 6);
      d += p;
      vst3_lane_u8(d - 3, o0, 7);
      vst3_lane_u8(d + 0, o1, 7);
    }
  } else {
    uint8x8x4_t o;

    d += 6;
    o.val[0] = op1;
    o.val[1] = op0;
    o.val[2] = oq0;
    o.val[3] = oq1;
    vst4_lane_u8(d, o, 0);
    d += p;
    vst4_lane_u8(d, o, 1);
    d += p;
    vst4_lane_u8(d, o, 2);
    d += p;
    vst4_lane_u8(d, o, 3);
    d += p;
    vst4_lane_u8(d, o, 4);
    d += p;
    vst4_lane_u8(d, o, 5);
    d += p;
    vst4_lane_u8(d, o, 6);
    d += p;
    vst4_lane_u8(d, o, 7);
  }
}

void vpx_lpf_vertical_16_dual_neon(uint8_t *s, int p, const uint8_t *blimit,
const uint8_t *limit,
const uint8_t *thresh) {
const uint8x16_t blimit_u8x16 = vld1q_dup_u8(blimit);
const uint8x16_t limit_u8x16 = vld1q_dup_u8(limit);
const uint8x16_t thresh_u8x16 = vld1q_dup_u8(thresh);
uint8_t *d;
uint8x16_t t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14,
t15;
uint8x16_t p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7,
op6, op5, op4, op3, op2, op1, op0, oq0, oq1, oq2, oq3, oq4, oq5, oq6,
flat, hev, mask, flat2;
uint64x1_t flat_u64x1, flat2_u64x1;
uint64_t flat_u64, flat2_u64;
s -= 8;
d = s;
t0 = vld1q_u8(s);
s += p;
t1 = vld1q_u8(s);
s += p;
t2 = vld1q_u8(s);
s += p;
t3 = vld1q_u8(s);
s += p;
t4 = vld1q_u8(s);
s += p;
t5 = vld1q_u8(s);
s += p;
t6 = vld1q_u8(s);
s += p;
t7 = vld1q_u8(s);
s += p;
t8 = vld1q_u8(s);
s += p;
t9 = vld1q_u8(s);
s += p;
t10 = vld1q_u8(s);
s += p;
t11 = vld1q_u8(s);
s += p;
t12 = vld1q_u8(s);
s += p;
t13 = vld1q_u8(s);
s += p;
t14 = vld1q_u8(s);
s += p;
t15 = vld1q_u8(s);
transpose_u8_16x16(t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13,
t14, t15, &p7, &p6, &p5, &p4, &p3, &p2, &p1, &p0, &q0, &q1,
&q2, &q3, &q4, &q5, &q6, &q7);
mask = filter_mask_16(limit_u8x16, blimit_u8x16, thresh_u8x16, p3, p2, p1, p0,
q0, q1, q2, q3, &flat, &hev);
flat2 = flat_mask5_16(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7);
flat = vandq_u8(flat, mask);
flat2 = vandq_u8(flat2, flat);
flat_u64x1 = vadd_u64(vreinterpret_u64_u8(vget_low_u8(flat)),
vreinterpret_u64_u8(vget_high_u8(flat)));
flat2_u64x1 = vadd_u64(vreinterpret_u64_u8(vget_low_u8(flat2)),
vreinterpret_u64_u8(vget_high_u8(flat2)));
flat_u64 = vget_lane_u64(flat_u64x1, 0);
flat2_u64 = vget_lane_u64(flat2_u64x1, 0);
filter16_16(mask, flat, flat_u64, flat2, flat2_u64, hev, p7, p6, p5, p4, p3,
p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7, &op6, &op5, &op4,
&op3, &op2, &op1, &op0, &oq0, &oq1, &oq2, &oq3, &oq4, &oq5, &oq6);
if (flat_u64) {
if (flat2_u64) {
uint8x16_t o0, o1, o2, o3, o4, o5, o6, o7, o8, o9, o10, o11, o12, o13,
o14, o15;
transpose_u8_16x16(p7, op6, op5, op4, op3, op2, op1, op0, oq0, oq1, oq2,
oq3, oq4, oq5, oq6, q7, &o0, &o1, &o2, &o3, &o4, &o5,
&o6, &o7, &o8, &o9, &o10, &o11, &o12, &o13, &o14,
&o15);
vst1q_u8(d, o0);
d += p;
vst1q_u8(d, o1);
d += p;
vst1q_u8(d, o2);
d += p;
vst1q_u8(d, o3);
d += p;
vst1q_u8(d, o4);
d += p;
vst1q_u8(d, o5);
d += p;
vst1q_u8(d, o6);
d += p;
vst1q_u8(d, o7);
d += p;
vst1q_u8(d, o8);
d += p;
vst1q_u8(d, o9);
d += p;
vst1q_u8(d, o10);
d += p;
vst1q_u8(d, o11);
d += p;
vst1q_u8(d, o12);
d += p;
vst1q_u8(d, o13);
d += p;
vst1q_u8(d, o14);
d += p;
vst1q_u8(d, o15);
} else {
uint8x8x3_t o0, o1;
d += 8;
o0.val[0] = vget_low_u8(op2);
o0.val[1] = vget_low_u8(op1);
o0.val[2] = vget_low_u8(op0);
o1.val[0] = vget_low_u8(oq0);
o1.val[1] = vget_low_u8(oq1);
o1.val[2] = vget_low_u8(oq2);
vst3_lane_u8(d - 3, o0, 0);
vst3_lane_u8(d + 0, o1, 0);
d += p;
vst3_lane_u8(d - 3, o0, 1);
vst3_lane_u8(d + 0, o1, 1);
d += p;
vst3_lane_u8(d - 3, o0, 2);
vst3_lane_u8(d + 0, o1, 2);
d += p;
vst3_lane_u8(d - 3, o0, 3);
vst3_lane_u8(d + 0, o1, 3);
d += p;
vst3_lane_u8(d - 3, o0, 4);
vst3_lane_u8(d + 0, o1, 4);
d += p;
vst3_lane_u8(d - 3, o0, 5);
vst3_lane_u8(d + 0, o1, 5);
d += p;
vst3_lane_u8(d - 3, o0, 6);
vst3_lane_u8(d + 0, o1, 6);
d += p;
vst3_lane_u8(d - 3, o0, 7);
vst3_lane_u8(d + 0, o1, 7);
d += p;
o0.val[0] = vget_high_u8(op2);
o0.val[1] = vget_high_u8(op1);
o0.val[2] = vget_high_u8(op0);
o1.val[0] = vget_high_u8(oq0);
o1.val[1] = vget_high_u8(oq1);
o1.val[2] = vget_high_u8(oq2);
vst3_lane_u8(d - 3, o0, 0);
vst3_lane_u8(d + 0, o1, 0);
d += p;
vst3_lane_u8(d - 3, o0, 1);
vst3_lane_u8(d + 0, o1, 1);
d += p;
vst3_lane_u8(d - 3, o0, 2);
vst3_lane_u8(d + 0, o1, 2);
d += p;
vst3_lane_u8(d - 3, o0, 3);
vst3_lane_u8(d + 0, o1, 3);
d += p;
vst3_lane_u8(d - 3, o0, 4);
vst3_lane_u8(d + 0, o1, 4);
d += p;
vst3_lane_u8(d - 3, o0, 5);
vst3_lane_u8(d + 0, o1, 5);
d += p;
vst3_lane_u8(d - 3, o0, 6);
vst3_lane_u8(d + 0, o1, 6);
d += p;
vst3_lane_u8(d - 3, o0, 7);
vst3_lane_u8(d + 0, o1, 7);
}
} else {
uint8x8x4_t o;
d += 6;
o.val[0] = vget_low_u8(op1);
o.val[1] = vget_low_u8(op0);
o.val[2] = vget_low_u8(oq0);
o.val[3] = vget_low_u8(oq1);
vst4_lane_u8(d, o, 0);
d += p;
vst4_lane_u8(d, o, 1);
d += p;
vst4_lane_u8(d, o, 2);
d += p;
vst4_lane_u8(d, o, 3);
d += p;
vst4_lane_u8(d, o, 4);
d += p;
vst4_lane_u8(d, o, 5);
d += p;
vst4_lane_u8(d, o, 6);
d += p;
vst4_lane_u8(d, o, 7);
d += p;
o.val[0] = vget_high_u8(op1);
o.val[1] = vget_high_u8(op0);
o.val[2] = vget_high_u8(oq0);
o.val[3] = vget_high_u8(oq1);
vst4_lane_u8(d, o, 0);
d += p;
vst4_lane_u8(d, o, 1);
d += p;
vst4_lane_u8(d, o, 2);
d += p;
vst4_lane_u8(d, o, 3);
d += p;
vst4_lane_u8(d, o, 4);
d += p;
vst4_lane_u8(d, o, 5);
d += p;
vst4_lane_u8(d, o, 6);
d += p;
vst4_lane_u8(d, o, 7);
}
}
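
When the full 15-tap path is not taken, the patch writes only the modified pixels of each row with lane stores instead of rewriting whole 16-byte rows. A minimal sketch of that pattern for two rows, assuming a dst pointer at the q0 byte of row 0 and a byte stride (the function name is hypothetical); the lane index of vst3_lane_u8() must be a compile-time constant, which is one reason the rows above are written out unrolled:

#include <arm_neon.h>
#include <stdint.h>

/* Hypothetical sketch: store p2 p1 p0 | q0 q1 q2 for rows 0 and 1 only,
 * leaving the outer pixels of each row untouched. */
static void store_six_per_row_2rows(uint8_t *dst, int stride,
                                    const uint8x8x3_t left,
                                    const uint8x8x3_t right) {
  vst3_lane_u8(dst - 3, left, 0);  /* row 0: p2 p1 p0 */
  vst3_lane_u8(dst + 0, right, 0); /* row 0: q0 q1 q2 */
  dst += stride;
  vst3_lane_u8(dst - 3, left, 1);  /* row 1: p2 p1 p0 */
  vst3_lane_u8(dst + 0, right, 1); /* row 1: q0 q1 q2 */
}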


@@ -30,6 +30,15 @@ static INLINE int16x8x2_t vpx_vtrnq_s64(int32x4_t a0, int32x4_t a1) {
return b0;
}
static INLINE uint8x16x2_t vpx_vtrnq_u64(uint32x4_t a0, uint32x4_t a1) {
uint8x16x2_t b0;
b0.val[0] = vcombine_u8(vreinterpret_u8_u32(vget_low_u32(a0)),
vreinterpret_u8_u32(vget_low_u32(a1)));
b0.val[1] = vcombine_u8(vreinterpret_u8_u32(vget_high_u32(a0)),
vreinterpret_u8_u32(vget_high_u32(a1)));
return b0;
}
static INLINE void transpose_s16_8x8(int16x8_t *a0, int16x8_t *a1,
int16x8_t *a2, int16x8_t *a3,
int16x8_t *a4, int16x8_t *a5,
@@ -316,4 +325,187 @@ static INLINE void transpose_u8_8x16(
*o7 = vreinterpretq_u8_u32(d3.val[1]);
}
static INLINE void transpose_u8_16x16(
const uint8x16_t i0, const uint8x16_t i1, const uint8x16_t i2,
const uint8x16_t i3, const uint8x16_t i4, const uint8x16_t i5,
const uint8x16_t i6, const uint8x16_t i7, const uint8x16_t i8,
const uint8x16_t i9, const uint8x16_t i10, const uint8x16_t i11,
const uint8x16_t i12, const uint8x16_t i13, const uint8x16_t i14,
const uint8x16_t i15, uint8x16_t *o0, uint8x16_t *o1, uint8x16_t *o2,
uint8x16_t *o3, uint8x16_t *o4, uint8x16_t *o5, uint8x16_t *o6,
uint8x16_t *o7, uint8x16_t *o8, uint8x16_t *o9, uint8x16_t *o10,
uint8x16_t *o11, uint8x16_t *o12, uint8x16_t *o13, uint8x16_t *o14,
uint8x16_t *o15) {
// Swap 8 bit elements. Goes from:
// i0: 00 01 02 03 04 05 06 07 08 09 0A 0B 0C 0D 0E 0F
// i1: 10 11 12 13 14 15 16 17 18 19 1A 1B 1C 1D 1E 1F
// i2: 20 21 22 23 24 25 26 27 28 29 2A 2B 2C 2D 2E 2F
// i3: 30 31 32 33 34 35 36 37 38 39 3A 3B 3C 3D 3E 3F
// i4: 40 41 42 43 44 45 46 47 48 49 4A 4B 4C 4D 4E 4F
// i5: 50 51 52 53 54 55 56 57 58 59 5A 5B 5C 5D 5E 5F
// i6: 60 61 62 63 64 65 66 67 68 69 6A 6B 6C 6D 6E 6F
// i7: 70 71 72 73 74 75 76 77 78 79 7A 7B 7C 7D 7E 7F
// i8: 80 81 82 83 84 85 86 87 88 89 8A 8B 8C 8D 8E 8F
// i9: 90 91 92 93 94 95 96 97 98 99 9A 9B 9C 9D 9E 9F
// i10: A0 A1 A2 A3 A4 A5 A6 A7 A8 A9 AA AB AC AD AE AF
// i11: B0 B1 B2 B3 B4 B5 B6 B7 B8 B9 BA BB BC BD BE BF
// i12: C0 C1 C2 C3 C4 C5 C6 C7 C8 C9 CA CB CC CD CE CF
// i13: D0 D1 D2 D3 D4 D5 D6 D7 D8 D9 DA DB DC DD DE DF
// i14: E0 E1 E2 E3 E4 E5 E6 E7 E8 E9 EA EB EC ED EE EF
// i15: F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 FA FB FC FD FE FF
// to:
// b0.val[0]: 00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E
// b0.val[1]: 01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F
// b1.val[0]: 20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E
// b1.val[1]: 21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F
// b2.val[0]: 40 50 42 52 44 54 46 56 48 58 4A 5A 4C 5C 4E 5E
// b2.val[1]: 41 51 43 53 45 55 47 57 49 59 4B 5B 4D 5D 4F 5F
// b3.val[0]: 60 70 62 72 64 74 66 76 68 78 6A 7A 6C 7C 6E 7E
// b3.val[1]: 61 71 63 73 65 75 67 77 69 79 6B 7B 6D 7D 6F 7F
// b4.val[0]: 80 90 82 92 84 94 86 96 88 98 8A 9A 8C 9C 8E 9E
// b4.val[1]: 81 91 83 93 85 95 87 97 89 99 8B 9B 8D 9D 8F 9F
// b5.val[0]: A0 B0 A2 B2 A4 B4 A6 B6 A8 B8 AA BA AC BC AE BE
// b5.val[1]: A1 B1 A3 B3 A5 B5 A7 B7 A9 B9 AB BB AD BD AF BF
// b6.val[0]: C0 D0 C2 D2 C4 D4 C6 D6 C8 D8 CA DA CC DC CE DE
// b6.val[1]: C1 D1 C3 D3 C5 D5 C7 D7 C9 D9 CB DB CD DD CF DF
// b7.val[0]: E0 F0 E2 F2 E4 F4 E6 F6 E8 F8 EA FA EC FC EE FE
// b7.val[1]: E1 F1 E3 F3 E5 F5 E7 F7 E9 F9 EB FB ED FD EF FF
const uint8x16x2_t b0 = vtrnq_u8(i0, i1);
const uint8x16x2_t b1 = vtrnq_u8(i2, i3);
const uint8x16x2_t b2 = vtrnq_u8(i4, i5);
const uint8x16x2_t b3 = vtrnq_u8(i6, i7);
const uint8x16x2_t b4 = vtrnq_u8(i8, i9);
const uint8x16x2_t b5 = vtrnq_u8(i10, i11);
const uint8x16x2_t b6 = vtrnq_u8(i12, i13);
const uint8x16x2_t b7 = vtrnq_u8(i14, i15);
// Swap 16 bit elements resulting in:
// c0.val[0]: 00 10 20 30 04 14 24 34 08 18 28 38 0C 1C 2C 3C
// c0.val[1]: 02 12 22 32 06 16 26 36 0A 1A 2A 3A 0E 1E 2E 3E
// c1.val[0]: 01 11 21 31 05 15 25 35 09 19 29 39 0D 1D 2D 3D
// c1.val[1]: 03 13 23 33 07 17 27 37 0B 1B 2B 3B 0F 1F 2F 3F
// c2.val[0]: 40 50 60 70 44 54 64 74 48 58 68 78 4C 5C 6C 7C
// c2.val[1]: 42 52 62 72 46 56 66 76 4A 5A 6A 7A 4E 5E 6E 7E
// c3.val[0]: 41 51 61 71 45 55 65 75 49 59 69 79 4D 5D 6D 7D
// c3.val[1]: 43 53 63 73 47 57 67 77 4B 5B 6B 7B 4F 5F 6F 7F
// c4.val[0]: 80 90 A0 B0 84 94 A4 B4 88 98 A8 B8 8C 9C AC BC
// c4.val[1]: 82 92 A2 B2 86 96 A6 B6 8A 9A AA BA 8E 9E AE BE
// c5.val[0]: 81 91 A1 B1 85 95 A5 B5 89 99 A9 B9 8D 9D AD BD
// c5.val[1]: 83 93 A3 B3 87 97 A7 B7 8B 9B AB BB 8F 9F AF BF
// c6.val[0]: C0 D0 E0 F0 C4 D4 E4 F4 C8 D8 E8 F8 CC DC EC FC
// c6.val[1]: C2 D2 E2 F2 C6 D6 E6 F6 CA DA EA FA CE DE EE FE
// c7.val[0]: C1 D1 E1 F1 C5 D5 E5 F5 C9 D9 E9 F9 CD DD ED FD
// c7.val[1]: C3 D3 E3 F3 C7 D7 E7 F7 CB DB EB FB CF DF EF FF
const uint16x8x2_t c0 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[0]),
vreinterpretq_u16_u8(b1.val[0]));
const uint16x8x2_t c1 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[1]),
vreinterpretq_u16_u8(b1.val[1]));
const uint16x8x2_t c2 = vtrnq_u16(vreinterpretq_u16_u8(b2.val[0]),
vreinterpretq_u16_u8(b3.val[0]));
const uint16x8x2_t c3 = vtrnq_u16(vreinterpretq_u16_u8(b2.val[1]),
vreinterpretq_u16_u8(b3.val[1]));
const uint16x8x2_t c4 = vtrnq_u16(vreinterpretq_u16_u8(b4.val[0]),
vreinterpretq_u16_u8(b5.val[0]));
const uint16x8x2_t c5 = vtrnq_u16(vreinterpretq_u16_u8(b4.val[1]),
vreinterpretq_u16_u8(b5.val[1]));
const uint16x8x2_t c6 = vtrnq_u16(vreinterpretq_u16_u8(b6.val[0]),
vreinterpretq_u16_u8(b7.val[0]));
const uint16x8x2_t c7 = vtrnq_u16(vreinterpretq_u16_u8(b6.val[1]),
vreinterpretq_u16_u8(b7.val[1]));
// Swap 32 bit elements resulting in:
// d0.val[0]: 00 10 20 30 40 50 60 70 08 18 28 38 48 58 68 78
// d0.val[1]: 04 14 24 34 44 54 64 74 0C 1C 2C 3C 4C 5C 6C 7C
// d1.val[0]: 02 12 22 32 42 52 62 72 0A 1A 2A 3A 4A 5A 6A 7A
// d1.val[1]: 06 16 26 36 46 56 66 76 0E 1E 2E 3E 4E 5E 6E 7E
// d2.val[0]: 01 11 21 31 41 51 61 71 09 19 29 39 49 59 69 79
// d2.val[1]: 05 15 25 35 45 55 65 75 0D 1D 2D 3D 4D 5D 6D 7D
// d3.val[0]: 03 13 23 33 43 53 63 73 0B 1B 2B 3B 4B 5B 6B 7B
// d3.val[1]: 07 17 27 37 47 57 67 77 0F 1F 2F 3F 4F 5F 6F 7F
// d4.val[0]: 80 90 A0 B0 C0 D0 E0 F0 88 98 A8 B8 C8 D8 E8 F8
// d4.val[1]: 84 94 A4 B4 C4 D4 E4 F4 8C 9C AC BC CC DC EC FC
// d5.val[0]: 82 92 A2 B2 C2 D2 E2 F2 8A 9A AA BA CA DA EA FA
// d5.val[1]: 86 96 A6 B6 C6 D6 E6 F6 8E 9E AE BE CE DE EE FE
// d6.val[0]: 81 91 A1 B1 C1 D1 E1 F1 89 99 A9 B9 C9 D9 E9 F9
// d6.val[1]: 85 95 A5 B5 C5 D5 E5 F5 8D 9D AD BD CD DD ED FD
// d7.val[0]: 83 93 A3 B3 C3 D3 E3 F3 8B 9B AB BB CB DB EB FB
// d7.val[1]: 87 97 A7 B7 C7 D7 E7 F7 8F 9F AF BF CF DF EF FF
const uint32x4x2_t d0 = vtrnq_u32(vreinterpretq_u32_u16(c0.val[0]),
vreinterpretq_u32_u16(c2.val[0]));
const uint32x4x2_t d1 = vtrnq_u32(vreinterpretq_u32_u16(c0.val[1]),
vreinterpretq_u32_u16(c2.val[1]));
const uint32x4x2_t d2 = vtrnq_u32(vreinterpretq_u32_u16(c1.val[0]),
vreinterpretq_u32_u16(c3.val[0]));
const uint32x4x2_t d3 = vtrnq_u32(vreinterpretq_u32_u16(c1.val[1]),
vreinterpretq_u32_u16(c3.val[1]));
const uint32x4x2_t d4 = vtrnq_u32(vreinterpretq_u32_u16(c4.val[0]),
vreinterpretq_u32_u16(c6.val[0]));
const uint32x4x2_t d5 = vtrnq_u32(vreinterpretq_u32_u16(c4.val[1]),
vreinterpretq_u32_u16(c6.val[1]));
const uint32x4x2_t d6 = vtrnq_u32(vreinterpretq_u32_u16(c5.val[0]),
vreinterpretq_u32_u16(c7.val[0]));
const uint32x4x2_t d7 = vtrnq_u32(vreinterpretq_u32_u16(c5.val[1]),
vreinterpretq_u32_u16(c7.val[1]));
// Swap 64 bit elements resulting in:
// e0.val[0]: 00 10 20 30 40 50 60 70 80 90 A0 B0 C0 D0 E0 F0
// e0.val[1]: 08 18 28 38 48 58 68 78 88 98 A8 B8 C8 D8 E8 F8
// e1.val[0]: 01 11 21 31 41 51 61 71 81 91 A1 B1 C1 D1 E1 F1
// e1.val[1]: 09 19 29 39 49 59 69 79 89 99 A9 B9 C9 D9 E9 F9
// e2.val[0]: 02 12 22 32 42 52 62 72 82 92 A2 B2 C2 D2 E2 F2
// e2.val[1]: 0A 1A 2A 3A 4A 5A 6A 7A 8A 9A AA BA CA DA EA FA
// e3.val[0]: 03 13 23 33 43 53 63 73 83 93 A3 B3 C3 D3 E3 F3
// e3.val[1]: 0B 1B 2B 3B 4B 5B 6B 7B 8B 9B AB BB CB DB EB FB
// e4.val[0]: 04 14 24 34 44 54 64 74 84 94 A4 B4 C4 D4 E4 F4
// e4.val[1]: 0C 1C 2C 3C 4C 5C 6C 7C 8C 9C AC BC CC DC EC FC
// e5.val[0]: 05 15 25 35 45 55 65 75 85 95 A5 B5 C5 D5 E5 F5
// e5.val[1]: 0D 1D 2D 3D 4D 5D 6D 7D 8D 9D AD BD CD DD ED FD
// e6.val[0]: 06 16 26 36 46 56 66 76 86 96 A6 B6 C6 D6 E6 F6
// e6.val[1]: 0E 1E 2E 3E 4E 5E 6E 7E 8E 9E AE BE CE DE EE FE
// e7.val[0]: 07 17 27 37 47 57 67 77 87 97 A7 B7 C7 D7 E7 F7
// e7.val[1]: 0F 1F 2F 3F 4F 5F 6F 7F 8F 9F AF BF CF DF EF FF
const uint8x16x2_t e0 = vpx_vtrnq_u64(d0.val[0], d4.val[0]);
const uint8x16x2_t e1 = vpx_vtrnq_u64(d2.val[0], d6.val[0]);
const uint8x16x2_t e2 = vpx_vtrnq_u64(d1.val[0], d5.val[0]);
const uint8x16x2_t e3 = vpx_vtrnq_u64(d3.val[0], d7.val[0]);
const uint8x16x2_t e4 = vpx_vtrnq_u64(d0.val[1], d4.val[1]);
const uint8x16x2_t e5 = vpx_vtrnq_u64(d2.val[1], d6.val[1]);
const uint8x16x2_t e6 = vpx_vtrnq_u64(d1.val[1], d5.val[1]);
const uint8x16x2_t e7 = vpx_vtrnq_u64(d3.val[1], d7.val[1]);
// Output:
// o0 : 00 10 20 30 40 50 60 70 80 90 A0 B0 C0 D0 E0 F0
// o1 : 01 11 21 31 41 51 61 71 81 91 A1 B1 C1 D1 E1 F1
// o2 : 02 12 22 32 42 52 62 72 82 92 A2 B2 C2 D2 E2 F2
// o3 : 03 13 23 33 43 53 63 73 83 93 A3 B3 C3 D3 E3 F3
// o4 : 04 14 24 34 44 54 64 74 84 94 A4 B4 C4 D4 E4 F4
// o5 : 05 15 25 35 45 55 65 75 85 95 A5 B5 C5 D5 E5 F5
// o6 : 06 16 26 36 46 56 66 76 86 96 A6 B6 C6 D6 E6 F6
// o7 : 07 17 27 37 47 57 67 77 87 97 A7 B7 C7 D7 E7 F7
// o8 : 08 18 28 38 48 58 68 78 88 98 A8 B8 C8 D8 E8 F8
// o9 : 09 19 29 39 49 59 69 79 89 99 A9 B9 C9 D9 E9 F9
// o10: 0A 1A 2A 3A 4A 5A 6A 7A 8A 9A AA BA CA DA EA FA
// o11: 0B 1B 2B 3B 4B 5B 6B 7B 8B 9B AB BB CB DB EB FB
// o12: 0C 1C 2C 3C 4C 5C 6C 7C 8C 9C AC BC CC DC EC FC
// o13: 0D 1D 2D 3D 4D 5D 6D 7D 8D 9D AD BD CD DD ED FD
// o14: 0E 1E 2E 3E 4E 5E 6E 7E 8E 9E AE BE CE DE EE FE
// o15: 0F 1F 2F 3F 4F 5F 6F 7F 8F 9F AF BF CF DF EF FF
*o0 = e0.val[0];
*o1 = e1.val[0];
*o2 = e2.val[0];
*o3 = e3.val[0];
*o4 = e4.val[0];
*o5 = e5.val[0];
*o6 = e6.val[0];
*o7 = e7.val[0];
*o8 = e0.val[1];
*o9 = e1.val[1];
*o10 = e2.val[1];
*o11 = e3.val[1];
*o12 = e4.val[1];
*o13 = e5.val[1];
*o14 = e6.val[1];
*o15 = e7.val[1];
}
#endif // VPX_DSP_ARM_TRANSPOSE_NEON_H_