sad4d neon: 16x[8,16,32]
Rewrite 16x16. Use half the accumulator registers.

BUG=webm:1425

Change-Id: I44b48512b1e3629505d83c2645e800f53878ccc2
This commit is contained in:
parent
8152b0904d
commit
807ce8fb1e
@ -677,7 +677,9 @@ INSTANTIATE_TEST_CASE_P(NEON, SADavgTest, ::testing::ValuesIn(avg_neon_tests));
|
||||
const SadMxNx4Param x4d_neon_tests[] = {
|
||||
SadMxNx4Param(64, 64, &vpx_sad64x64x4d_neon),
|
||||
SadMxNx4Param(32, 32, &vpx_sad32x32x4d_neon),
|
||||
SadMxNx4Param(16, 32, &vpx_sad16x32x4d_neon),
|
||||
SadMxNx4Param(16, 16, &vpx_sad16x16x4d_neon),
|
||||
SadMxNx4Param(16, 8, &vpx_sad16x8x4d_neon),
|
||||
SadMxNx4Param(8, 16, &vpx_sad8x16x4d_neon),
|
||||
SadMxNx4Param(8, 8, &vpx_sad8x8x4d_neon),
|
||||
SadMxNx4Param(8, 4, &vpx_sad8x4x4d_neon),
|
||||
|
@ -88,6 +88,48 @@ void vpx_sad8x16x4d_neon(const uint8_t *src, int src_stride,
|
||||
sad8x_4d(src, src_stride, ref, ref_stride, res, 16);
|
||||
}
|
||||
|
||||
static INLINE void sad16x_4d(const uint8_t *a, int a_stride,
|
||||
const uint8_t *const b[4], int b_stride,
|
||||
uint32_t *result, const int height) {
|
||||
int i, j;
|
||||
uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
|
||||
vdupq_n_u16(0) };
|
||||
const uint8_t *b_loop[4] = { b[0], b[1], b[2], b[3] };
|
||||
|
||||
for (i = 0; i < height; ++i) {
|
||||
const uint8x16_t a_u8 = vld1q_u8(a);
|
||||
a += a_stride;
|
||||
for (j = 0; j < 4; ++j) {
|
||||
const uint8x16_t b_u8 = vld1q_u8(b_loop[j]);
|
||||
b_loop[j] += b_stride;
|
||||
sum[j] = vabal_u8(sum[j], vget_low_u8(a_u8), vget_low_u8(b_u8));
|
||||
sum[j] = vabal_u8(sum[j], vget_high_u8(a_u8), vget_high_u8(b_u8));
|
||||
}
|
||||
}
|
||||
|
||||
for (j = 0; j < 4; ++j) {
|
||||
result[j] = vget_lane_u32(horizontal_add_uint16x8(sum[j]), 0);
|
||||
}
|
||||
}
|
||||
|
||||
// 16x8 entry point: forwards to sad16x_4d() with a fixed height of 8 rows,
// filling res[0..3] with one total per reference block.
void vpx_sad16x8x4d_neon(const uint8_t *src, int src_stride,
                         const uint8_t *const ref[4], int ref_stride,
                         uint32_t *res) {
  const int height = 8;
  sad16x_4d(src, src_stride, ref, ref_stride, res, height);
}
|
||||
|
||||
// 16x16 entry point: forwards to sad16x_4d() with a fixed height of 16 rows,
// filling res[0..3] with one total per reference block.
void vpx_sad16x16x4d_neon(const uint8_t *src, int src_stride,
                          const uint8_t *const ref[4], int ref_stride,
                          uint32_t *res) {
  const int height = 16;
  sad16x_4d(src, src_stride, ref, ref_stride, res, height);
}
|
||||
|
||||
// 16x32 entry point: forwards to sad16x_4d() with a fixed height of 32 rows,
// filling res[0..3] with one total per reference block.
void vpx_sad16x32x4d_neon(const uint8_t *src, int src_stride,
                          const uint8_t *const ref[4], int ref_stride,
                          uint32_t *res) {
  const int height = 32;
  sad16x_4d(src, src_stride, ref, ref_stride, res, height);
}
|
||||
|
||||
static INLINE unsigned int horizontal_long_add_16x8(const uint16x8_t vec_lo,
|
||||
const uint16x8_t vec_hi) {
|
||||
const uint32x4_t vec_l_lo =
|
||||
@ -241,58 +283,3 @@ void vpx_sad32x32x4d_neon(const uint8_t *src, int src_stride,
|
||||
res[2] = horizontal_long_add_16x8(vec_sum_ref2_lo, vec_sum_ref2_hi);
|
||||
res[3] = horizontal_long_add_16x8(vec_sum_ref3_lo, vec_sum_ref3_hi);
|
||||
}
|
||||
|
||||
void vpx_sad16x16x4d_neon(const uint8_t *src, int src_stride,
|
||||
const uint8_t *const ref[4], int ref_stride,
|
||||
uint32_t *res) {
|
||||
int i;
|
||||
uint16x8_t vec_sum_ref0_lo = vdupq_n_u16(0);
|
||||
uint16x8_t vec_sum_ref0_hi = vdupq_n_u16(0);
|
||||
uint16x8_t vec_sum_ref1_lo = vdupq_n_u16(0);
|
||||
uint16x8_t vec_sum_ref1_hi = vdupq_n_u16(0);
|
||||
uint16x8_t vec_sum_ref2_lo = vdupq_n_u16(0);
|
||||
uint16x8_t vec_sum_ref2_hi = vdupq_n_u16(0);
|
||||
uint16x8_t vec_sum_ref3_lo = vdupq_n_u16(0);
|
||||
uint16x8_t vec_sum_ref3_hi = vdupq_n_u16(0);
|
||||
const uint8_t *ref0, *ref1, *ref2, *ref3;
|
||||
ref0 = ref[0];
|
||||
ref1 = ref[1];
|
||||
ref2 = ref[2];
|
||||
ref3 = ref[3];
|
||||
|
||||
for (i = 0; i < 16; ++i) {
|
||||
const uint8x16_t vec_src = vld1q_u8(src);
|
||||
const uint8x16_t vec_ref0 = vld1q_u8(ref0);
|
||||
const uint8x16_t vec_ref1 = vld1q_u8(ref1);
|
||||
const uint8x16_t vec_ref2 = vld1q_u8(ref2);
|
||||
const uint8x16_t vec_ref3 = vld1q_u8(ref3);
|
||||
|
||||
vec_sum_ref0_lo =
|
||||
vabal_u8(vec_sum_ref0_lo, vget_low_u8(vec_src), vget_low_u8(vec_ref0));
|
||||
vec_sum_ref0_hi = vabal_u8(vec_sum_ref0_hi, vget_high_u8(vec_src),
|
||||
vget_high_u8(vec_ref0));
|
||||
vec_sum_ref1_lo =
|
||||
vabal_u8(vec_sum_ref1_lo, vget_low_u8(vec_src), vget_low_u8(vec_ref1));
|
||||
vec_sum_ref1_hi = vabal_u8(vec_sum_ref1_hi, vget_high_u8(vec_src),
|
||||
vget_high_u8(vec_ref1));
|
||||
vec_sum_ref2_lo =
|
||||
vabal_u8(vec_sum_ref2_lo, vget_low_u8(vec_src), vget_low_u8(vec_ref2));
|
||||
vec_sum_ref2_hi = vabal_u8(vec_sum_ref2_hi, vget_high_u8(vec_src),
|
||||
vget_high_u8(vec_ref2));
|
||||
vec_sum_ref3_lo =
|
||||
vabal_u8(vec_sum_ref3_lo, vget_low_u8(vec_src), vget_low_u8(vec_ref3));
|
||||
vec_sum_ref3_hi = vabal_u8(vec_sum_ref3_hi, vget_high_u8(vec_src),
|
||||
vget_high_u8(vec_ref3));
|
||||
|
||||
src += src_stride;
|
||||
ref0 += ref_stride;
|
||||
ref1 += ref_stride;
|
||||
ref2 += ref_stride;
|
||||
ref3 += ref_stride;
|
||||
}
|
||||
|
||||
res[0] = horizontal_long_add_16x8(vec_sum_ref0_lo, vec_sum_ref0_hi);
|
||||
res[1] = horizontal_long_add_16x8(vec_sum_ref1_lo, vec_sum_ref1_hi);
|
||||
res[2] = horizontal_long_add_16x8(vec_sum_ref2_lo, vec_sum_ref2_hi);
|
||||
res[3] = horizontal_long_add_16x8(vec_sum_ref3_lo, vec_sum_ref3_hi);
|
||||
}
|
||||
|
@ -872,13 +872,13 @@ add_proto qw/void vpx_sad32x16x4d/, "const uint8_t *src_ptr, int src_stride, con
|
||||
specialize qw/vpx_sad32x16x4d msa sse2 vsx/;
|
||||
|
||||
add_proto qw/void vpx_sad16x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
|
||||
specialize qw/vpx_sad16x32x4d msa sse2 vsx/;
|
||||
specialize qw/vpx_sad16x32x4d neon msa sse2 vsx/;
|
||||
|
||||
add_proto qw/void vpx_sad16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
|
||||
specialize qw/vpx_sad16x16x4d neon msa sse2 vsx/;
|
||||
|
||||
add_proto qw/void vpx_sad16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
|
||||
specialize qw/vpx_sad16x8x4d msa sse2 vsx/;
|
||||
specialize qw/vpx_sad16x8x4d neon msa sse2 vsx/;
|
||||
|
||||
add_proto qw/void vpx_sad8x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
|
||||
specialize qw/vpx_sad8x16x4d neon msa sse2/;
|
||||
|
Loading…
x
Reference in New Issue
Block a user