Merge changes Ie15e301e,Ib070c79b

* changes:
  vp9_reconintra_neon: cosmetics: reindent
  vp9_reconintra_neon: cosmetics: drop unneeded returns
This commit is contained in:
James Zern 2015-05-23 17:47:51 +00:00 committed by Gerrit Code Review
commit 02fda6582c

View File

@ -11,463 +11,415 @@
#include <stddef.h> #include <stddef.h>
#include <arm_neon.h> #include <arm_neon.h>
void vp9_v_predictor_4x4_neon( void vp9_v_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride,
uint8_t *dst, const uint8_t *above, const uint8_t *left) {
ptrdiff_t y_stride, int i;
const uint8_t *above, uint32x2_t d0u32 = vdup_n_u32(0);
const uint8_t *left) { (void)left;
int i;
uint32x2_t d0u32 = vdup_n_u32(0);
(void)left;
d0u32 = vld1_lane_u32((const uint32_t *)above, d0u32, 0); d0u32 = vld1_lane_u32((const uint32_t *)above, d0u32, 0);
for (i = 0; i < 4; i++, dst += y_stride) for (i = 0; i < 4; i++, dst += y_stride)
vst1_lane_u32((uint32_t *)dst, d0u32, 0); vst1_lane_u32((uint32_t *)dst, d0u32, 0);
return;
} }
void vp9_v_predictor_8x8_neon( void vp9_v_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride,
uint8_t *dst, const uint8_t *above, const uint8_t *left) {
ptrdiff_t y_stride, int i;
const uint8_t *above, uint8x8_t d0u8 = vdup_n_u8(0);
const uint8_t *left) { (void)left;
int i;
uint8x8_t d0u8 = vdup_n_u8(0);
(void)left;
d0u8 = vld1_u8(above); d0u8 = vld1_u8(above);
for (i = 0; i < 8; i++, dst += y_stride) for (i = 0; i < 8; i++, dst += y_stride)
vst1_u8(dst, d0u8); vst1_u8(dst, d0u8);
return;
} }
void vp9_v_predictor_16x16_neon( void vp9_v_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride,
uint8_t *dst, const uint8_t *above, const uint8_t *left) {
ptrdiff_t y_stride, int i;
const uint8_t *above, uint8x16_t q0u8 = vdupq_n_u8(0);
const uint8_t *left) { (void)left;
int i;
uint8x16_t q0u8 = vdupq_n_u8(0);
(void)left;
q0u8 = vld1q_u8(above); q0u8 = vld1q_u8(above);
for (i = 0; i < 16; i++, dst += y_stride) for (i = 0; i < 16; i++, dst += y_stride)
vst1q_u8(dst, q0u8); vst1q_u8(dst, q0u8);
return;
} }
void vp9_v_predictor_32x32_neon( void vp9_v_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride,
uint8_t *dst, const uint8_t *above, const uint8_t *left) {
ptrdiff_t y_stride, int i;
const uint8_t *above, uint8x16_t q0u8 = vdupq_n_u8(0);
const uint8_t *left) { uint8x16_t q1u8 = vdupq_n_u8(0);
int i; (void)left;
uint8x16_t q0u8 = vdupq_n_u8(0);
uint8x16_t q1u8 = vdupq_n_u8(0);
(void)left;
q0u8 = vld1q_u8(above); q0u8 = vld1q_u8(above);
q1u8 = vld1q_u8(above + 16); q1u8 = vld1q_u8(above + 16);
for (i = 0; i < 32; i++, dst += y_stride) { for (i = 0; i < 32; i++, dst += y_stride) {
vst1q_u8(dst, q0u8); vst1q_u8(dst, q0u8);
vst1q_u8(dst + 16, q1u8); vst1q_u8(dst + 16, q1u8);
} }
return;
} }
void vp9_h_predictor_4x4_neon( void vp9_h_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride,
uint8_t *dst, const uint8_t *above, const uint8_t *left) {
ptrdiff_t y_stride, uint8x8_t d0u8 = vdup_n_u8(0);
const uint8_t *above, uint32x2_t d1u32 = vdup_n_u32(0);
const uint8_t *left) { (void)above;
uint8x8_t d0u8 = vdup_n_u8(0);
uint32x2_t d1u32 = vdup_n_u32(0);
(void)above;
d1u32 = vld1_lane_u32((const uint32_t *)left, d1u32, 0); d1u32 = vld1_lane_u32((const uint32_t *)left, d1u32, 0);
d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 0); d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 0);
vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0); vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
dst += y_stride; dst += y_stride;
d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 1); d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 1);
vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0); vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
dst += y_stride; dst += y_stride;
d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 2); d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 2);
vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0); vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
dst += y_stride; dst += y_stride;
d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 3); d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 3);
vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0); vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
return;
} }
void vp9_h_predictor_8x8_neon( void vp9_h_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride,
uint8_t *dst, const uint8_t *above, const uint8_t *left) {
ptrdiff_t y_stride, uint8x8_t d0u8 = vdup_n_u8(0);
const uint8_t *above, uint64x1_t d1u64 = vdup_n_u64(0);
const uint8_t *left) { (void)above;
uint8x8_t d0u8 = vdup_n_u8(0);
uint64x1_t d1u64 = vdup_n_u64(0);
(void)above;
d1u64 = vld1_u64((const uint64_t *)left); d1u64 = vld1_u64((const uint64_t *)left);
d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 0); d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 0);
vst1_u8(dst, d0u8); vst1_u8(dst, d0u8);
dst += y_stride; dst += y_stride;
d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 1); d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 1);
vst1_u8(dst, d0u8); vst1_u8(dst, d0u8);
dst += y_stride; dst += y_stride;
d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 2); d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 2);
vst1_u8(dst, d0u8); vst1_u8(dst, d0u8);
dst += y_stride; dst += y_stride;
d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 3); d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 3);
vst1_u8(dst, d0u8); vst1_u8(dst, d0u8);
dst += y_stride; dst += y_stride;
d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 4); d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 4);
vst1_u8(dst, d0u8); vst1_u8(dst, d0u8);
dst += y_stride; dst += y_stride;
d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 5); d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 5);
vst1_u8(dst, d0u8); vst1_u8(dst, d0u8);
dst += y_stride; dst += y_stride;
d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 6); d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 6);
vst1_u8(dst, d0u8); vst1_u8(dst, d0u8);
dst += y_stride; dst += y_stride;
d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 7); d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 7);
vst1_u8(dst, d0u8); vst1_u8(dst, d0u8);
return;
} }
void vp9_h_predictor_16x16_neon( void vp9_h_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride,
uint8_t *dst, const uint8_t *above, const uint8_t *left) {
ptrdiff_t y_stride, int j;
const uint8_t *above, uint8x8_t d2u8 = vdup_n_u8(0);
const uint8_t *left) { uint8x16_t q0u8 = vdupq_n_u8(0);
int j; uint8x16_t q1u8 = vdupq_n_u8(0);
uint8x8_t d2u8 = vdup_n_u8(0); (void)above;
uint8x16_t q0u8 = vdupq_n_u8(0);
uint8x16_t q1u8 = vdupq_n_u8(0);
(void)above;
q1u8 = vld1q_u8(left);
d2u8 = vget_low_u8(q1u8);
for (j = 0; j < 2; j++, d2u8 = vget_high_u8(q1u8)) {
q0u8 = vdupq_lane_u8(d2u8, 0);
vst1q_u8(dst, q0u8);
dst += y_stride;
q0u8 = vdupq_lane_u8(d2u8, 1);
vst1q_u8(dst, q0u8);
dst += y_stride;
q0u8 = vdupq_lane_u8(d2u8, 2);
vst1q_u8(dst, q0u8);
dst += y_stride;
q0u8 = vdupq_lane_u8(d2u8, 3);
vst1q_u8(dst, q0u8);
dst += y_stride;
q0u8 = vdupq_lane_u8(d2u8, 4);
vst1q_u8(dst, q0u8);
dst += y_stride;
q0u8 = vdupq_lane_u8(d2u8, 5);
vst1q_u8(dst, q0u8);
dst += y_stride;
q0u8 = vdupq_lane_u8(d2u8, 6);
vst1q_u8(dst, q0u8);
dst += y_stride;
q0u8 = vdupq_lane_u8(d2u8, 7);
vst1q_u8(dst, q0u8);
dst += y_stride;
}
}
void vp9_h_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride,
const uint8_t *above, const uint8_t *left) {
int j, k;
uint8x8_t d2u8 = vdup_n_u8(0);
uint8x16_t q0u8 = vdupq_n_u8(0);
uint8x16_t q1u8 = vdupq_n_u8(0);
(void)above;
for (k = 0; k < 2; k++, left += 16) {
q1u8 = vld1q_u8(left); q1u8 = vld1q_u8(left);
d2u8 = vget_low_u8(q1u8); d2u8 = vget_low_u8(q1u8);
for (j = 0; j < 2; j++, d2u8 = vget_high_u8(q1u8)) { for (j = 0; j < 2; j++, d2u8 = vget_high_u8(q1u8)) {
q0u8 = vdupq_lane_u8(d2u8, 0); q0u8 = vdupq_lane_u8(d2u8, 0);
vst1q_u8(dst, q0u8); vst1q_u8(dst, q0u8);
dst += y_stride; vst1q_u8(dst + 16, q0u8);
q0u8 = vdupq_lane_u8(d2u8, 1); dst += y_stride;
vst1q_u8(dst, q0u8); q0u8 = vdupq_lane_u8(d2u8, 1);
dst += y_stride; vst1q_u8(dst, q0u8);
q0u8 = vdupq_lane_u8(d2u8, 2); vst1q_u8(dst + 16, q0u8);
vst1q_u8(dst, q0u8); dst += y_stride;
dst += y_stride; q0u8 = vdupq_lane_u8(d2u8, 2);
q0u8 = vdupq_lane_u8(d2u8, 3); vst1q_u8(dst, q0u8);
vst1q_u8(dst, q0u8); vst1q_u8(dst + 16, q0u8);
dst += y_stride; dst += y_stride;
q0u8 = vdupq_lane_u8(d2u8, 4); q0u8 = vdupq_lane_u8(d2u8, 3);
vst1q_u8(dst, q0u8); vst1q_u8(dst, q0u8);
dst += y_stride; vst1q_u8(dst + 16, q0u8);
q0u8 = vdupq_lane_u8(d2u8, 5); dst += y_stride;
vst1q_u8(dst, q0u8); q0u8 = vdupq_lane_u8(d2u8, 4);
dst += y_stride; vst1q_u8(dst, q0u8);
q0u8 = vdupq_lane_u8(d2u8, 6); vst1q_u8(dst + 16, q0u8);
vst1q_u8(dst, q0u8); dst += y_stride;
dst += y_stride; q0u8 = vdupq_lane_u8(d2u8, 5);
q0u8 = vdupq_lane_u8(d2u8, 7); vst1q_u8(dst, q0u8);
vst1q_u8(dst, q0u8); vst1q_u8(dst + 16, q0u8);
dst += y_stride; dst += y_stride;
q0u8 = vdupq_lane_u8(d2u8, 6);
vst1q_u8(dst, q0u8);
vst1q_u8(dst + 16, q0u8);
dst += y_stride;
q0u8 = vdupq_lane_u8(d2u8, 7);
vst1q_u8(dst, q0u8);
vst1q_u8(dst + 16, q0u8);
dst += y_stride;
} }
return; }
} }
void vp9_h_predictor_32x32_neon( void vp9_tm_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride,
uint8_t *dst, const uint8_t *above, const uint8_t *left) {
ptrdiff_t y_stride, int i;
const uint8_t *above, uint16x8_t q1u16, q3u16;
const uint8_t *left) { int16x8_t q1s16;
int j, k; uint8x8_t d0u8 = vdup_n_u8(0);
uint8x8_t d2u8 = vdup_n_u8(0); uint32x2_t d2u32 = vdup_n_u32(0);
uint8x16_t q0u8 = vdupq_n_u8(0);
uint8x16_t q1u8 = vdupq_n_u8(0);
(void)above;
for (k = 0; k < 2; k++, left += 16) { d0u8 = vdup_n_u8(above[-1]);
q1u8 = vld1q_u8(left); d2u32 = vld1_lane_u32((const uint32_t *)above, d2u32, 0);
d2u8 = vget_low_u8(q1u8); q3u16 = vsubl_u8(vreinterpret_u8_u32(d2u32), d0u8);
for (j = 0; j < 2; j++, d2u8 = vget_high_u8(q1u8)) { for (i = 0; i < 4; i++, dst += y_stride) {
q0u8 = vdupq_lane_u8(d2u8, 0); q1u16 = vdupq_n_u16((uint16_t)left[i]);
vst1q_u8(dst, q0u8); q1s16 = vaddq_s16(vreinterpretq_s16_u16(q1u16),
vst1q_u8(dst + 16, q0u8); vreinterpretq_s16_u16(q3u16));
dst += y_stride; d0u8 = vqmovun_s16(q1s16);
q0u8 = vdupq_lane_u8(d2u8, 1); vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
vst1q_u8(dst, q0u8); }
vst1q_u8(dst + 16, q0u8);
dst += y_stride;
q0u8 = vdupq_lane_u8(d2u8, 2);
vst1q_u8(dst, q0u8);
vst1q_u8(dst + 16, q0u8);
dst += y_stride;
q0u8 = vdupq_lane_u8(d2u8, 3);
vst1q_u8(dst, q0u8);
vst1q_u8(dst + 16, q0u8);
dst += y_stride;
q0u8 = vdupq_lane_u8(d2u8, 4);
vst1q_u8(dst, q0u8);
vst1q_u8(dst + 16, q0u8);
dst += y_stride;
q0u8 = vdupq_lane_u8(d2u8, 5);
vst1q_u8(dst, q0u8);
vst1q_u8(dst + 16, q0u8);
dst += y_stride;
q0u8 = vdupq_lane_u8(d2u8, 6);
vst1q_u8(dst, q0u8);
vst1q_u8(dst + 16, q0u8);
dst += y_stride;
q0u8 = vdupq_lane_u8(d2u8, 7);
vst1q_u8(dst, q0u8);
vst1q_u8(dst + 16, q0u8);
dst += y_stride;
}
}
return;
} }
void vp9_tm_predictor_4x4_neon( void vp9_tm_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride,
uint8_t *dst, const uint8_t *above, const uint8_t *left) {
ptrdiff_t y_stride, int j;
const uint8_t *above, uint16x8_t q0u16, q3u16, q10u16;
const uint8_t *left) { int16x8_t q0s16;
int i; uint16x4_t d20u16;
uint16x8_t q1u16, q3u16; uint8x8_t d0u8, d2u8, d30u8;
int16x8_t q1s16;
uint8x8_t d0u8 = vdup_n_u8(0);
uint32x2_t d2u32 = vdup_n_u32(0);
d0u8 = vdup_n_u8(above[-1]); d0u8 = vdup_n_u8(above[-1]);
d2u32 = vld1_lane_u32((const uint32_t *)above, d2u32, 0); d30u8 = vld1_u8(left);
q3u16 = vsubl_u8(vreinterpret_u8_u32(d2u32), d0u8); d2u8 = vld1_u8(above);
for (i = 0; i < 4; i++, dst += y_stride) { q10u16 = vmovl_u8(d30u8);
q1u16 = vdupq_n_u16((uint16_t)left[i]); q3u16 = vsubl_u8(d2u8, d0u8);
q1s16 = vaddq_s16(vreinterpretq_s16_u16(q1u16), d20u16 = vget_low_u16(q10u16);
vreinterpretq_s16_u16(q3u16)); for (j = 0; j < 2; j++, d20u16 = vget_high_u16(q10u16)) {
d0u8 = vqmovun_s16(q1s16); q0u16 = vdupq_lane_u16(d20u16, 0);
vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0); q0s16 = vaddq_s16(vreinterpretq_s16_u16(q3u16),
} vreinterpretq_s16_u16(q0u16));
return; d0u8 = vqmovun_s16(q0s16);
vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8));
dst += y_stride;
q0u16 = vdupq_lane_u16(d20u16, 1);
q0s16 = vaddq_s16(vreinterpretq_s16_u16(q3u16),
vreinterpretq_s16_u16(q0u16));
d0u8 = vqmovun_s16(q0s16);
vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8));
dst += y_stride;
q0u16 = vdupq_lane_u16(d20u16, 2);
q0s16 = vaddq_s16(vreinterpretq_s16_u16(q3u16),
vreinterpretq_s16_u16(q0u16));
d0u8 = vqmovun_s16(q0s16);
vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8));
dst += y_stride;
q0u16 = vdupq_lane_u16(d20u16, 3);
q0s16 = vaddq_s16(vreinterpretq_s16_u16(q3u16),
vreinterpretq_s16_u16(q0u16));
d0u8 = vqmovun_s16(q0s16);
vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8));
dst += y_stride;
}
} }
void vp9_tm_predictor_8x8_neon( void vp9_tm_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride,
uint8_t *dst, const uint8_t *above, const uint8_t *left) {
ptrdiff_t y_stride, int j, k;
const uint8_t *above, uint16x8_t q0u16, q2u16, q3u16, q8u16, q10u16;
const uint8_t *left) { uint8x16_t q0u8, q1u8;
int j; int16x8_t q0s16, q1s16, q8s16, q11s16;
uint16x8_t q0u16, q3u16, q10u16; uint16x4_t d20u16;
int16x8_t q0s16; uint8x8_t d2u8, d3u8, d18u8, d22u8, d23u8;
uint16x4_t d20u16;
uint8x8_t d0u8, d2u8, d30u8;
d0u8 = vdup_n_u8(above[-1]); q0u8 = vdupq_n_u8(above[-1]);
d30u8 = vld1_u8(left); q1u8 = vld1q_u8(above);
d2u8 = vld1_u8(above); q2u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q0u8));
q10u16 = vmovl_u8(d30u8); q3u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q0u8));
q3u16 = vsubl_u8(d2u8, d0u8); for (k = 0; k < 2; k++, left += 8) {
d18u8 = vld1_u8(left);
q10u16 = vmovl_u8(d18u8);
d20u16 = vget_low_u16(q10u16); d20u16 = vget_low_u16(q10u16);
for (j = 0; j < 2; j++, d20u16 = vget_high_u16(q10u16)) { for (j = 0; j < 2; j++, d20u16 = vget_high_u16(q10u16)) {
q0u16 = vdupq_lane_u16(d20u16, 0); q0u16 = vdupq_lane_u16(d20u16, 0);
q0s16 = vaddq_s16(vreinterpretq_s16_u16(q3u16), q8u16 = vdupq_lane_u16(d20u16, 1);
vreinterpretq_s16_u16(q0u16)); q1s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
d0u8 = vqmovun_s16(q0s16); vreinterpretq_s16_u16(q2u16));
vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8)); q0s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
dst += y_stride; vreinterpretq_s16_u16(q3u16));
q0u16 = vdupq_lane_u16(d20u16, 1); q11s16 = vaddq_s16(vreinterpretq_s16_u16(q8u16),
q0s16 = vaddq_s16(vreinterpretq_s16_u16(q3u16), vreinterpretq_s16_u16(q2u16));
vreinterpretq_s16_u16(q0u16)); q8s16 = vaddq_s16(vreinterpretq_s16_u16(q8u16),
d0u8 = vqmovun_s16(q0s16); vreinterpretq_s16_u16(q3u16));
vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8)); d2u8 = vqmovun_s16(q1s16);
dst += y_stride; d3u8 = vqmovun_s16(q0s16);
q0u16 = vdupq_lane_u16(d20u16, 2); d22u8 = vqmovun_s16(q11s16);
q0s16 = vaddq_s16(vreinterpretq_s16_u16(q3u16), d23u8 = vqmovun_s16(q8s16);
vreinterpretq_s16_u16(q0u16)); vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d2u8));
d0u8 = vqmovun_s16(q0s16); vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d3u8));
vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8)); dst += y_stride;
dst += y_stride; vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d22u8));
q0u16 = vdupq_lane_u16(d20u16, 3); vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d23u8));
q0s16 = vaddq_s16(vreinterpretq_s16_u16(q3u16), dst += y_stride;
vreinterpretq_s16_u16(q0u16));
d0u8 = vqmovun_s16(q0s16); q0u16 = vdupq_lane_u16(d20u16, 2);
vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8)); q8u16 = vdupq_lane_u16(d20u16, 3);
dst += y_stride; q1s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
vreinterpretq_s16_u16(q2u16));
q0s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
vreinterpretq_s16_u16(q3u16));
q11s16 = vaddq_s16(vreinterpretq_s16_u16(q8u16),
vreinterpretq_s16_u16(q2u16));
q8s16 = vaddq_s16(vreinterpretq_s16_u16(q8u16),
vreinterpretq_s16_u16(q3u16));
d2u8 = vqmovun_s16(q1s16);
d3u8 = vqmovun_s16(q0s16);
d22u8 = vqmovun_s16(q11s16);
d23u8 = vqmovun_s16(q8s16);
vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d2u8));
vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d3u8));
dst += y_stride;
vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d22u8));
vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d23u8));
dst += y_stride;
} }
return; }
} }
void vp9_tm_predictor_16x16_neon( void vp9_tm_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride,
uint8_t *dst, const uint8_t *above, const uint8_t *left) {
ptrdiff_t y_stride, int j, k;
const uint8_t *above, uint16x8_t q0u16, q3u16, q8u16, q9u16, q10u16, q11u16;
const uint8_t *left) { uint8x16_t q0u8, q1u8, q2u8;
int j, k; int16x8_t q12s16, q13s16, q14s16, q15s16;
uint16x8_t q0u16, q2u16, q3u16, q8u16, q10u16; uint16x4_t d6u16;
uint8x16_t q0u8, q1u8; uint8x8_t d0u8, d1u8, d2u8, d3u8, d26u8;
int16x8_t q0s16, q1s16, q8s16, q11s16;
uint16x4_t d20u16;
uint8x8_t d2u8, d3u8, d18u8, d22u8, d23u8;
q0u8 = vdupq_n_u8(above[-1]); q0u8 = vdupq_n_u8(above[-1]);
q1u8 = vld1q_u8(above); q1u8 = vld1q_u8(above);
q2u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q0u8)); q2u8 = vld1q_u8(above + 16);
q3u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q0u8)); q8u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q0u8));
for (k = 0; k < 2; k++, left += 8) { q9u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q0u8));
d18u8 = vld1_u8(left); q10u16 = vsubl_u8(vget_low_u8(q2u8), vget_low_u8(q0u8));
q10u16 = vmovl_u8(d18u8); q11u16 = vsubl_u8(vget_high_u8(q2u8), vget_high_u8(q0u8));
d20u16 = vget_low_u16(q10u16); for (k = 0; k < 4; k++, left += 8) {
for (j = 0; j < 2; j++, d20u16 = vget_high_u16(q10u16)) { d26u8 = vld1_u8(left);
q0u16 = vdupq_lane_u16(d20u16, 0); q3u16 = vmovl_u8(d26u8);
q8u16 = vdupq_lane_u16(d20u16, 1); d6u16 = vget_low_u16(q3u16);
q1s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), for (j = 0; j < 2; j++, d6u16 = vget_high_u16(q3u16)) {
vreinterpretq_s16_u16(q2u16)); q0u16 = vdupq_lane_u16(d6u16, 0);
q0s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), q12s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
vreinterpretq_s16_u16(q3u16)); vreinterpretq_s16_u16(q8u16));
q11s16 = vaddq_s16(vreinterpretq_s16_u16(q8u16), q13s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
vreinterpretq_s16_u16(q2u16)); vreinterpretq_s16_u16(q9u16));
q8s16 = vaddq_s16(vreinterpretq_s16_u16(q8u16), q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
vreinterpretq_s16_u16(q3u16)); vreinterpretq_s16_u16(q10u16));
d2u8 = vqmovun_s16(q1s16); q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
d3u8 = vqmovun_s16(q0s16); vreinterpretq_s16_u16(q11u16));
d22u8 = vqmovun_s16(q11s16); d0u8 = vqmovun_s16(q12s16);
d23u8 = vqmovun_s16(q8s16); d1u8 = vqmovun_s16(q13s16);
vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d2u8)); d2u8 = vqmovun_s16(q14s16);
vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d3u8)); d3u8 = vqmovun_s16(q15s16);
dst += y_stride; q0u8 = vcombine_u8(d0u8, d1u8);
vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d22u8)); q1u8 = vcombine_u8(d2u8, d3u8);
vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d23u8)); vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8));
dst += y_stride; vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8));
dst += y_stride;
q0u16 = vdupq_lane_u16(d20u16, 2); q0u16 = vdupq_lane_u16(d6u16, 1);
q8u16 = vdupq_lane_u16(d20u16, 3); q12s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
q1s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q8u16));
vreinterpretq_s16_u16(q2u16)); q13s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
q0s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q9u16));
vreinterpretq_s16_u16(q3u16)); q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
q11s16 = vaddq_s16(vreinterpretq_s16_u16(q8u16), vreinterpretq_s16_u16(q10u16));
vreinterpretq_s16_u16(q2u16)); q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
q8s16 = vaddq_s16(vreinterpretq_s16_u16(q8u16), vreinterpretq_s16_u16(q11u16));
vreinterpretq_s16_u16(q3u16)); d0u8 = vqmovun_s16(q12s16);
d2u8 = vqmovun_s16(q1s16); d1u8 = vqmovun_s16(q13s16);
d3u8 = vqmovun_s16(q0s16); d2u8 = vqmovun_s16(q14s16);
d22u8 = vqmovun_s16(q11s16); d3u8 = vqmovun_s16(q15s16);
d23u8 = vqmovun_s16(q8s16); q0u8 = vcombine_u8(d0u8, d1u8);
vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d2u8)); q1u8 = vcombine_u8(d2u8, d3u8);
vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d3u8)); vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8));
dst += y_stride; vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8));
vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d22u8)); dst += y_stride;
vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d23u8));
dst += y_stride; q0u16 = vdupq_lane_u16(d6u16, 2);
} q12s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
vreinterpretq_s16_u16(q8u16));
q13s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
vreinterpretq_s16_u16(q9u16));
q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
vreinterpretq_s16_u16(q10u16));
q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
vreinterpretq_s16_u16(q11u16));
d0u8 = vqmovun_s16(q12s16);
d1u8 = vqmovun_s16(q13s16);
d2u8 = vqmovun_s16(q14s16);
d3u8 = vqmovun_s16(q15s16);
q0u8 = vcombine_u8(d0u8, d1u8);
q1u8 = vcombine_u8(d2u8, d3u8);
vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8));
vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8));
dst += y_stride;
q0u16 = vdupq_lane_u16(d6u16, 3);
q12s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
vreinterpretq_s16_u16(q8u16));
q13s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
vreinterpretq_s16_u16(q9u16));
q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
vreinterpretq_s16_u16(q10u16));
q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
vreinterpretq_s16_u16(q11u16));
d0u8 = vqmovun_s16(q12s16);
d1u8 = vqmovun_s16(q13s16);
d2u8 = vqmovun_s16(q14s16);
d3u8 = vqmovun_s16(q15s16);
q0u8 = vcombine_u8(d0u8, d1u8);
q1u8 = vcombine_u8(d2u8, d3u8);
vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8));
vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8));
dst += y_stride;
} }
return; }
}
void vp9_tm_predictor_32x32_neon(
uint8_t *dst,
ptrdiff_t y_stride,
const uint8_t *above,
const uint8_t *left) {
int j, k;
uint16x8_t q0u16, q3u16, q8u16, q9u16, q10u16, q11u16;
uint8x16_t q0u8, q1u8, q2u8;
int16x8_t q12s16, q13s16, q14s16, q15s16;
uint16x4_t d6u16;
uint8x8_t d0u8, d1u8, d2u8, d3u8, d26u8;
q0u8 = vdupq_n_u8(above[-1]);
q1u8 = vld1q_u8(above);
q2u8 = vld1q_u8(above + 16);
q8u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q0u8));
q9u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q0u8));
q10u16 = vsubl_u8(vget_low_u8(q2u8), vget_low_u8(q0u8));
q11u16 = vsubl_u8(vget_high_u8(q2u8), vget_high_u8(q0u8));
for (k = 0; k < 4; k++, left += 8) {
d26u8 = vld1_u8(left);
q3u16 = vmovl_u8(d26u8);
d6u16 = vget_low_u16(q3u16);
for (j = 0; j < 2; j++, d6u16 = vget_high_u16(q3u16)) {
q0u16 = vdupq_lane_u16(d6u16, 0);
q12s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
vreinterpretq_s16_u16(q8u16));
q13s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
vreinterpretq_s16_u16(q9u16));
q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
vreinterpretq_s16_u16(q10u16));
q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
vreinterpretq_s16_u16(q11u16));
d0u8 = vqmovun_s16(q12s16);
d1u8 = vqmovun_s16(q13s16);
d2u8 = vqmovun_s16(q14s16);
d3u8 = vqmovun_s16(q15s16);
q0u8 = vcombine_u8(d0u8, d1u8);
q1u8 = vcombine_u8(d2u8, d3u8);
vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8));
vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8));
dst += y_stride;
q0u16 = vdupq_lane_u16(d6u16, 1);
q12s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
vreinterpretq_s16_u16(q8u16));
q13s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
vreinterpretq_s16_u16(q9u16));
q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
vreinterpretq_s16_u16(q10u16));
q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
vreinterpretq_s16_u16(q11u16));
d0u8 = vqmovun_s16(q12s16);
d1u8 = vqmovun_s16(q13s16);
d2u8 = vqmovun_s16(q14s16);
d3u8 = vqmovun_s16(q15s16);
q0u8 = vcombine_u8(d0u8, d1u8);
q1u8 = vcombine_u8(d2u8, d3u8);
vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8));
vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8));
dst += y_stride;
q0u16 = vdupq_lane_u16(d6u16, 2);
q12s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
vreinterpretq_s16_u16(q8u16));
q13s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
vreinterpretq_s16_u16(q9u16));
q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
vreinterpretq_s16_u16(q10u16));
q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
vreinterpretq_s16_u16(q11u16));
d0u8 = vqmovun_s16(q12s16);
d1u8 = vqmovun_s16(q13s16);
d2u8 = vqmovun_s16(q14s16);
d3u8 = vqmovun_s16(q15s16);
q0u8 = vcombine_u8(d0u8, d1u8);
q1u8 = vcombine_u8(d2u8, d3u8);
vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8));
vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8));
dst += y_stride;
q0u16 = vdupq_lane_u16(d6u16, 3);
q12s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
vreinterpretq_s16_u16(q8u16));
q13s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
vreinterpretq_s16_u16(q9u16));
q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
vreinterpretq_s16_u16(q10u16));
q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
vreinterpretq_s16_u16(q11u16));
d0u8 = vqmovun_s16(q12s16);
d1u8 = vqmovun_s16(q13s16);
d2u8 = vqmovun_s16(q14s16);
d3u8 = vqmovun_s16(q15s16);
q0u8 = vcombine_u8(d0u8, d1u8);
q1u8 = vcombine_u8(d2u8, d3u8);
vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8));
vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8));
dst += y_stride;
}
}
return;
} }