postproc: vpx_mbpost_proc_across_ip_neon
The speedup is pretty poor. I would be concerned, except that the SSE2 version is worse:
Existing SSE2 improvement: 22%
New NEON improvement: 35%
BUG=webm:1320
Change-Id: Ied598a261134aa6cbe69f96f58589d2bae17bf62
This commit is contained in:
parent
e7c453b613
commit
4dca923454
@ -595,6 +595,9 @@ INSTANTIATE_TEST_CASE_P(SSE2, VpxMbPostProcDownTest,
|
||||
INSTANTIATE_TEST_CASE_P(
|
||||
NEON, VpxPostProcDownAndAcrossMbRowTest,
|
||||
::testing::Values(vpx_post_proc_down_and_across_mb_row_neon));
|
||||
|
||||
INSTANTIATE_TEST_CASE_P(NEON, VpxMbPostProcAcrossIpTest,
|
||||
::testing::Values(vpx_mbpost_proc_across_ip_neon));
|
||||
#endif // HAVE_NEON
|
||||
|
||||
#if HAVE_MSA
|
||||
|
@ -257,3 +257,130 @@ void vpx_post_proc_down_and_across_mb_row_neon(uint8_t *src_ptr,
|
||||
dst_ptr += 8 * dst_stride;
|
||||
}
|
||||
}
|
||||
|
||||
// sum += x;
|
||||
// sumsq += x * y;
|
||||
static void accumulate_sum_sumsq(const int16x4_t x, const int32x4_t xy,
|
||||
int16x4_t *const sum, int32x4_t *const sumsq) {
|
||||
const int16x4_t zero = vdup_n_s16(0);
|
||||
const int32x4_t zeroq = vdupq_n_s32(0);
|
||||
|
||||
// Add in the first set because vext doesn't work with '0'.
|
||||
*sum = vadd_s16(*sum, x);
|
||||
*sumsq = vaddq_s32(*sumsq, xy);
|
||||
|
||||
// Shift x and xy to the right and sum. vext requires an immediate.
|
||||
*sum = vadd_s16(*sum, vext_s16(zero, x, 1));
|
||||
*sumsq = vaddq_s32(*sumsq, vextq_s32(zeroq, xy, 1));
|
||||
|
||||
*sum = vadd_s16(*sum, vext_s16(zero, x, 2));
|
||||
*sumsq = vaddq_s32(*sumsq, vextq_s32(zeroq, xy, 2));
|
||||
|
||||
*sum = vadd_s16(*sum, vext_s16(zero, x, 3));
|
||||
*sumsq = vaddq_s32(*sumsq, vextq_s32(zeroq, xy, 3));
|
||||
}
|
||||
|
||||
// Generate mask based on (sumsq * 15 - sum * sum < flimit)
|
||||
static uint16x4_t calculate_mask(const int16x4_t sum, const int32x4_t sumsq,
|
||||
const int32x4_t f, const int32x4_t fifteen) {
|
||||
const int32x4_t a = vmulq_s32(sumsq, fifteen);
|
||||
const int32x4_t b = vmlsl_s16(a, sum, sum);
|
||||
const uint32x4_t mask32 = vcltq_s32(b, f);
|
||||
return vmovn_u32(mask32);
|
||||
}
|
||||
|
||||
static uint8x8_t combine_mask(const int16x4_t sum_low, const int16x4_t sum_high,
|
||||
const int32x4_t sumsq_low,
|
||||
const int32x4_t sumsq_high, const int32x4_t f) {
|
||||
const int32x4_t fifteen = vdupq_n_s32(15);
|
||||
const uint16x4_t mask16_low = calculate_mask(sum_low, sumsq_low, f, fifteen);
|
||||
const uint16x4_t mask16_high =
|
||||
calculate_mask(sum_high, sumsq_high, f, fifteen);
|
||||
return vmovn_u16(vcombine_u16(mask16_low, mask16_high));
|
||||
}
|
||||
|
||||
// Apply filter of (8 + sum + s[c]) >> 4.
|
||||
static uint8x8_t filter_pixels(const int16x8_t sum, const uint8x8_t s) {
|
||||
const int16x8_t s16 = vreinterpretq_s16_u16(vmovl_u8(s));
|
||||
const int16x8_t sum_s = vaddq_s16(sum, s16);
|
||||
|
||||
return vqrshrun_n_s16(sum_s, 4);
|
||||
}
|
||||
|
||||
void vpx_mbpost_proc_across_ip_neon(uint8_t *src, int pitch, int rows, int cols,
|
||||
int flimit) {
|
||||
int row, col;
|
||||
const int32x4_t f = vdupq_n_s32(flimit);
|
||||
|
||||
assert(cols % 8 == 0);
|
||||
|
||||
for (row = 0; row < rows; ++row) {
|
||||
// Sum the first 8 elements, which are extended from s[0].
|
||||
// sumsq gets primed with +16.
|
||||
int sumsq = src[0] * src[0] * 9 + 16;
|
||||
int sum = src[0] * 9;
|
||||
|
||||
uint8x8_t left_context, s, right_context;
|
||||
int16x4_t sum_low, sum_high;
|
||||
int32x4_t sumsq_low, sumsq_high;
|
||||
|
||||
// Sum (+square) the next 6 elements.
|
||||
// Skip [0] because it's included above.
|
||||
for (col = 1; col <= 6; ++col) {
|
||||
sumsq += src[col] * src[col];
|
||||
sum += src[col];
|
||||
}
|
||||
|
||||
// Prime the sums. Later the loop uses the _high values to prime the new
|
||||
// vectors.
|
||||
sumsq_high = vdupq_n_s32(sumsq);
|
||||
sum_high = vdup_n_s16(sum);
|
||||
|
||||
// Manually extend the left border.
|
||||
left_context = vdup_n_u8(src[0]);
|
||||
|
||||
for (col = 0; col < cols; col += 8) {
|
||||
uint8x8_t mask, output;
|
||||
int16x8_t x, y;
|
||||
int32x4_t xy_low, xy_high;
|
||||
|
||||
s = vld1_u8(src + col);
|
||||
|
||||
if (col + 8 == cols) {
|
||||
// Last row. Extend border.
|
||||
right_context = vdup_n_u8(src[col + 7]);
|
||||
} else {
|
||||
right_context = vld1_u8(src + col + 7);
|
||||
}
|
||||
|
||||
x = vreinterpretq_s16_u16(vsubl_u8(right_context, left_context));
|
||||
y = vreinterpretq_s16_u16(vaddl_u8(right_context, left_context));
|
||||
xy_low = vmull_s16(vget_low_s16(x), vget_low_s16(y));
|
||||
xy_high = vmull_s16(vget_high_s16(x), vget_high_s16(y));
|
||||
|
||||
// Catch up to the last sum'd value.
|
||||
sum_low = vdup_lane_s16(sum_high, 3);
|
||||
sumsq_low = vdupq_lane_s32(vget_high_s32(sumsq_high), 1);
|
||||
|
||||
accumulate_sum_sumsq(vget_low_s16(x), xy_low, &sum_low, &sumsq_low);
|
||||
|
||||
// Need to do this sequentially because we need the max value from
|
||||
// sum_low.
|
||||
sum_high = vdup_lane_s16(sum_low, 3);
|
||||
sumsq_high = vdupq_lane_s32(vget_high_s32(sumsq_low), 1);
|
||||
|
||||
accumulate_sum_sumsq(vget_high_s16(x), xy_high, &sum_high, &sumsq_high);
|
||||
|
||||
mask = combine_mask(sum_low, sum_high, sumsq_low, sumsq_high, f);
|
||||
|
||||
output = filter_pixels(vcombine_s16(sum_low, sum_high), s);
|
||||
output = vbsl_u8(mask, output, s);
|
||||
|
||||
vst1_u8(src + col, output);
|
||||
|
||||
left_context = s;
|
||||
}
|
||||
|
||||
src += pitch;
|
||||
}
|
||||
}
|
||||
|
@ -1753,7 +1753,7 @@ if (vpx_config("CONFIG_POSTPROC") eq "yes" || vpx_config("CONFIG_VP9_POSTPROC")
|
||||
specialize qw/vpx_mbpost_proc_down sse2 msa/;
|
||||
|
||||
add_proto qw/void vpx_mbpost_proc_across_ip/, "unsigned char *dst, int pitch, int rows, int cols,int flimit";
|
||||
specialize qw/vpx_mbpost_proc_across_ip sse2 msa/;
|
||||
specialize qw/vpx_mbpost_proc_across_ip sse2 neon msa/;
|
||||
|
||||
add_proto qw/void vpx_post_proc_down_and_across_mb_row/, "unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size";
|
||||
specialize qw/vpx_post_proc_down_and_across_mb_row sse2 neon msa/;
|
||||
|
Loading…
x
Reference in New Issue
Block a user