vp9_reconintra_neon: add DC 8x8 predictors
~90% faster over 20M pixels Change-Id: Iab791510cc57c8332c2f9a5da0ed50702e5f5763
This commit is contained in:
parent
bbea7c95d8
commit
e97b849219
@ -248,9 +248,11 @@ INTRA_PRED_TEST(DSPR2, TestIntraPred8, vp9_dc_predictor_8x8_dspr2, NULL, NULL,
|
||||
#endif // HAVE_DSPR2
|
||||
|
||||
#if HAVE_NEON
|
||||
INTRA_PRED_TEST(NEON, TestIntraPred8, NULL, NULL, NULL, NULL,
|
||||
vp9_v_predictor_8x8_neon, vp9_h_predictor_8x8_neon, NULL, NULL,
|
||||
NULL, NULL, NULL, NULL, vp9_tm_predictor_8x8_neon)
|
||||
INTRA_PRED_TEST(NEON, TestIntraPred8, vp9_dc_predictor_8x8_neon,
|
||||
vp9_dc_left_predictor_8x8_neon, vp9_dc_top_predictor_8x8_neon,
|
||||
vp9_dc_128_predictor_8x8_neon, vp9_v_predictor_8x8_neon,
|
||||
vp9_h_predictor_8x8_neon, NULL, NULL, NULL, NULL, NULL, NULL,
|
||||
vp9_tm_predictor_8x8_neon)
|
||||
|
||||
#endif // HAVE_NEON
|
||||
|
||||
|
@ -8,9 +8,85 @@
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#include <stddef.h>
|
||||
#include <arm_neon.h>
|
||||
|
||||
#include "./vp9_rtcd.h"
|
||||
#include "./vpx_config.h"
|
||||
#include "vpx/vpx_integer.h"
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// DC 8x8
|
||||
|
||||
// 'do_above' and 'do_left' facilitate branch removal when inlined.
|
||||
static INLINE void dc_8x8(uint8_t *dst, ptrdiff_t stride,
|
||||
const uint8_t *above, const uint8_t *left,
|
||||
int do_above, int do_left) {
|
||||
uint16x8_t sum_top;
|
||||
uint16x8_t sum_left;
|
||||
uint8x8_t dc0;
|
||||
|
||||
if (do_above) {
|
||||
const uint8x8_t A = vld1_u8(above); // top row
|
||||
const uint16x4_t p0 = vpaddl_u8(A); // cascading summation of the top
|
||||
const uint16x4_t p1 = vpadd_u16(p0, p0);
|
||||
const uint16x4_t p2 = vpadd_u16(p1, p1);
|
||||
sum_top = vcombine_u16(p2, p2);
|
||||
}
|
||||
|
||||
if (do_left) {
|
||||
const uint8x8_t L = vld1_u8(left); // left border
|
||||
const uint16x4_t p0 = vpaddl_u8(L); // cascading summation of the left
|
||||
const uint16x4_t p1 = vpadd_u16(p0, p0);
|
||||
const uint16x4_t p2 = vpadd_u16(p1, p1);
|
||||
sum_left = vcombine_u16(p2, p2);
|
||||
}
|
||||
|
||||
if (do_above && do_left) {
|
||||
const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
|
||||
dc0 = vrshrn_n_u16(sum, 4);
|
||||
} else if (do_above) {
|
||||
dc0 = vrshrn_n_u16(sum_top, 3);
|
||||
} else if (do_left) {
|
||||
dc0 = vrshrn_n_u16(sum_left, 3);
|
||||
} else {
|
||||
dc0 = vdup_n_u8(0x80);
|
||||
}
|
||||
|
||||
{
|
||||
const uint8x8_t dc = vdup_lane_u8(dc0, 0);
|
||||
int i;
|
||||
for (i = 0; i < 8; ++i) {
|
||||
vst1_u32((uint32_t*)(dst + i * stride), vreinterpret_u32_u8(dc));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void vp9_dc_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
|
||||
const uint8_t *above, const uint8_t *left) {
|
||||
dc_8x8(dst, stride, above, left, 1, 1);
|
||||
}
|
||||
|
||||
void vp9_dc_left_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
|
||||
const uint8_t *above, const uint8_t *left) {
|
||||
(void)above;
|
||||
dc_8x8(dst, stride, NULL, left, 0, 1);
|
||||
}
|
||||
|
||||
void vp9_dc_top_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
|
||||
const uint8_t *above, const uint8_t *left) {
|
||||
(void)left;
|
||||
dc_8x8(dst, stride, above, NULL, 1, 0);
|
||||
}
|
||||
|
||||
void vp9_dc_128_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
|
||||
const uint8_t *above, const uint8_t *left) {
|
||||
(void)above;
|
||||
(void)left;
|
||||
dc_8x8(dst, stride, NULL, NULL, 0, 0);
|
||||
}
|
||||
|
||||
#if !HAVE_NEON_ASM
|
||||
|
||||
void vp9_v_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride,
|
||||
const uint8_t *above, const uint8_t *left) {
|
||||
int i;
|
||||
@ -423,3 +499,4 @@ void vp9_tm_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride,
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif // !HAVE_NEON_ASM
|
||||
|
@ -123,16 +123,16 @@ add_proto qw/void vp9_tm_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, cons
|
||||
specialize qw/vp9_tm_predictor_8x8 neon dspr2/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/void vp9_dc_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
|
||||
specialize qw/vp9_dc_predictor_8x8 dspr2/, "$sse_x86inc";
|
||||
specialize qw/vp9_dc_predictor_8x8 dspr2 neon/, "$sse_x86inc";
|
||||
|
||||
add_proto qw/void vp9_dc_top_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
|
||||
specialize qw/vp9_dc_top_predictor_8x8/, "$sse_x86inc";
|
||||
specialize qw/vp9_dc_top_predictor_8x8 neon/, "$sse_x86inc";
|
||||
|
||||
add_proto qw/void vp9_dc_left_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
|
||||
specialize qw/vp9_dc_left_predictor_8x8/, "$sse_x86inc";
|
||||
specialize qw/vp9_dc_left_predictor_8x8 neon/, "$sse_x86inc";
|
||||
|
||||
add_proto qw/void vp9_dc_128_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
|
||||
specialize qw/vp9_dc_128_predictor_8x8/, "$sse_x86inc";
|
||||
specialize qw/vp9_dc_128_predictor_8x8 neon/, "$sse_x86inc";
|
||||
|
||||
add_proto qw/void vp9_d207_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
|
||||
specialize qw/vp9_d207_predictor_16x16/, "$ssse3_x86inc";
|
||||
|
@ -199,8 +199,9 @@ VP9_COMMON_SRCS-yes += common/arm/neon/vp9_loopfilter_4_neon.c
|
||||
# TODO(johannkoenig): re-enable when chromium build is fixed
|
||||
# # https://code.google.com/p/chromium/issues/detail?id=443839
|
||||
#VP9_COMMON_SRCS-yes += common/arm/neon/vp9_loopfilter_8_neon.c
|
||||
VP9_COMMON_SRCS-yes += common/arm/neon/vp9_reconintra_neon.c
|
||||
endif # HAVE_NEON
|
||||
endif # HAVE_NEON_ASM
|
||||
|
||||
VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_reconintra_neon.c
|
||||
|
||||
$(eval $(call rtcd_h_template,vp9_rtcd,vp9/common/vp9_rtcd_defs.pl))
|
||||
|
Loading…
x
Reference in New Issue
Block a user