From ec1d8387e11ff95b5766a32c8e33c7f078d8bde4 Mon Sep 17 00:00:00 2001 From: Frank Galligan Date: Tue, 13 Jan 2015 23:01:06 -0800 Subject: [PATCH] Add 64x64 sub_pel_variance Neon function On Nexus 7, speeds -5, -6, -7, and -8 saw about a 15% increase in perf for 480p. Speeds -5, -6, -7, and -8 saw about a 10% increase in perf for 720p. Tested on Nexus 7, built with ndk r10d, gcc 4.9. Change-Id: I2fa5315845e3021c9a6e2ea47e52e68b398d8334 --- test/variance_test.cc | 5 ++++- vp9/common/vp9_rtcd_defs.pl | 2 +- vp9/encoder/arm/neon/vp9_variance_neon.c | 19 +++++++++++++++++++ 3 files changed, 24 insertions(+), 2 deletions(-) diff --git a/test/variance_test.cc b/test/variance_test.cc index e7517a7d8..a8dd7de13 100644 --- a/test/variance_test.cc +++ b/test/variance_test.cc @@ -1932,11 +1932,14 @@ const vp9_subpixvariance_fn_t subpel_variance16x16_neon = vp9_sub_pixel_variance16x16_neon; const vp9_subpixvariance_fn_t subpel_variance32x32_neon = vp9_sub_pixel_variance32x32_neon; +const vp9_subpixvariance_fn_t subpel_variance64x64_neon = + vp9_sub_pixel_variance64x64_neon; INSTANTIATE_TEST_CASE_P( NEON, VP9SubpelVarianceTest, ::testing::Values(make_tuple(3, 3, subpel_variance8x8_neon, 0), make_tuple(4, 4, subpel_variance16x16_neon, 0), - make_tuple(5, 5, subpel_variance32x32_neon, 0))); + make_tuple(5, 5, subpel_variance32x32_neon, 0), + make_tuple(6, 6, subpel_variance64x64_neon, 0))); #endif // HAVE_NEON #endif // CONFIG_VP9_ENCODER diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl index b59e6ebe7..11df21f07 100644 --- a/vp9/common/vp9_rtcd_defs.pl +++ b/vp9/common/vp9_rtcd_defs.pl @@ -837,7 +837,7 @@ add_proto qw/unsigned int vp9_variance4x4/, "const uint8_t *src_ptr, int source_ specialize qw/vp9_variance4x4/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; -specialize qw/vp9_sub_pixel_variance64x64 
avx2/, "$sse2_x86inc", "$ssse3_x86inc"; +specialize qw/vp9_sub_pixel_variance64x64 avx2 neon/, "$sse2_x86inc", "$ssse3_x86inc"; add_proto qw/unsigned int vp9_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; specialize qw/vp9_sub_pixel_avg_variance64x64 avx2/, "$sse2_x86inc", "$ssse3_x86inc"; diff --git a/vp9/encoder/arm/neon/vp9_variance_neon.c b/vp9/encoder/arm/neon/vp9_variance_neon.c index 567b7deb1..3e1b3de8c 100644 --- a/vp9/encoder/arm/neon/vp9_variance_neon.c +++ b/vp9/encoder/arm/neon/vp9_variance_neon.c @@ -31,6 +31,7 @@ enum { kHeight32 = 32 }; enum { kHeight32PlusOne = 33 }; enum { kWidth64 = 64 }; enum { kHeight64 = 64 }; +enum { kHeight64PlusOne = 65 }; enum { kPixelStepOne = 1 }; enum { kAlign16 = 16 }; @@ -252,3 +253,21 @@ unsigned int vp9_sub_pixel_variance32x32_neon(const uint8_t *src, kWidth32, BILINEAR_FILTERS_2TAP(yoffset)); return vp9_variance32x32_neon(temp2, kWidth32, dst, dst_stride, sse); } + +unsigned int vp9_sub_pixel_variance64x64_neon(const uint8_t *src, + int src_stride, + int xoffset, + int yoffset, + const uint8_t *dst, + int dst_stride, + unsigned int *sse) { + DECLARE_ALIGNED_ARRAY(kAlign16, uint8_t, temp2, kHeight64 * kWidth64); + DECLARE_ALIGNED_ARRAY(kAlign16, uint8_t, fdata3, kHeight64PlusOne * kWidth64); + + var_filter_block2d_bil_w16(src, fdata3, src_stride, kPixelStepOne, + kHeight64PlusOne, kWidth64, + BILINEAR_FILTERS_2TAP(xoffset)); + var_filter_block2d_bil_w16(fdata3, temp2, kWidth64, kWidth64, kHeight64, + kWidth64, BILINEAR_FILTERS_2TAP(yoffset)); + return vp9_variance64x64_neon(temp2, kWidth64, dst, dst_stride, sse); +}