vpx/vpx_dsp/arm/subpel_variance_neon.c

/*
 *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <arm_neon.h>
#include "./vpx_dsp_rtcd.h"
#include "./vpx_config.h"

#include "vpx_ports/mem.h"
#include "vpx/vpx_integer.h"

#include "vpx_dsp/variance.h"

// Two-tap bilinear filter weights, one pair per sub-pixel offset 0..7.
// Row k is { 128 - 16 * k, 16 * k }: the two weights always sum to 128, and
// offset 0 passes the first sample through unchanged.
static const uint8_t bilinear_filters[8][2] = {
  { 128, 0 },   // offset 0
  { 112, 16 },  // offset 1
  { 96, 32 },   // offset 2
  { 80, 48 },   // offset 3
  { 64, 64 },   // offset 4
  { 48, 80 },   // offset 5
  { 32, 96 },   // offset 6
  { 16, 112 },  // offset 7
};

// Process a block exactly 8 wide and any height.
static void var_filter_block2d_bil_w8(const uint8_t *src_ptr,
                                      uint8_t *output_ptr,
                                      unsigned int src_pixels_per_line,
                                      int pixel_step,
                                      unsigned int output_height,
                                      const uint8_t *filter) {
  const uint8x8_t f0 = vmov_n_u8(filter[0]);
  const uint8x8_t f1 = vmov_n_u8(filter[1]);
  unsigned int i;
  for (i = 0; i < output_height; ++i) {
    const uint8x8_t src_0 = vld1_u8(&src_ptr[0]);
    const uint8x8_t src_1 = vld1_u8(&src_ptr[pixel_step]);
    const uint16x8_t a = vmull_u8(src_0, f0);
    const uint16x8_t b = vmlal_u8(a, src_1, f1);
    const uint8x8_t out = vrshrn_n_u16(b, FILTER_BITS);
    vst1_u8(&output_ptr[0], out);
    // Next row...
    src_ptr += src_pixels_per_line;
    output_ptr += 8;
  }
}

// Process a block which is a mutiple of 16 wide and any height.
static void var_filter_block2d_bil_w16(const uint8_t *src_ptr,
                                       uint8_t *output_ptr,
                                       unsigned int src_pixels_per_line,
                                       int pixel_step,
                                       unsigned int output_height,
                                       unsigned int output_width,
                                       const uint8_t *filter) {
  const uint8x8_t f0 = vmov_n_u8(filter[0]);
  const uint8x8_t f1 = vmov_n_u8(filter[1]);
  unsigned int i, j;
  for (i = 0; i < output_height; ++i) {
    for (j = 0; j < output_width; j += 16) {
      const uint8x16_t src_0 = vld1q_u8(&src_ptr[j]);
      const uint8x16_t src_1 = vld1q_u8(&src_ptr[j + pixel_step]);
      const uint16x8_t a = vmull_u8(vget_low_u8(src_0), f0);
      const uint16x8_t b = vmlal_u8(a, vget_low_u8(src_1), f1);
      const uint8x8_t out_lo = vrshrn_n_u16(b, FILTER_BITS);
      const uint16x8_t c = vmull_u8(vget_high_u8(src_0), f0);
      const uint16x8_t d = vmlal_u8(c, vget_high_u8(src_1), f1);
      const uint8x8_t out_hi = vrshrn_n_u16(d, FILTER_BITS);
      vst1q_u8(&output_ptr[j], vcombine_u8(out_lo, out_hi));
    }
    // Next row...
    src_ptr += src_pixels_per_line;
    output_ptr += output_width;
  }
}

// TODO(johannkoenig): support 4xM block sizes.
//
// Defines vpx_sub_pixel_variance<n>x<m>_neon() for an n-wide, m-high block.
//
// The generated function runs two bilinear passes and then a variance call:
//   1. Horizontal pass (pixel_step 1) over m + 1 source rows, selected by
//      bilinear_filters[xoffset], written to fdata3 with a row stride of n.
//      The extra row exists because the vertical pass reads row i and
//      row i + 1 for every output row i.
//   2. Vertical pass (pixel_step n, i.e. one fdata3 row) producing m rows,
//      selected by bilinear_filters[yoffset], written to temp2.
//   3. vpx_variance<n>x<m>(temp2, n, dst, dst_stride, sse), whose result is
//      returned as-is.
//
// xoffset and yoffset index the 8-entry bilinear_filters table directly and
// are not range checked here, so callers must pass values in 0..7.
// n == 8 selects the 8-wide helper; every other width goes through the
// helper for widths that are a multiple of 16.
#define sub_pixel_varianceNxM(n, m)                                          \
  unsigned int vpx_sub_pixel_variance##n##x##m##_neon(                       \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,          \
      const uint8_t *dst, int dst_stride, unsigned int *sse) {               \
    DECLARE_ALIGNED(16, uint8_t, fdata3[(n) * ((m) + 1)]);                   \
    DECLARE_ALIGNED(16, uint8_t, temp2[(n) * (m)]);                          \
                                                                             \
    if ((n) == 8) {                                                          \
      var_filter_block2d_bil_w8(src, fdata3, src_stride, 1, ((m) + 1),       \
                                bilinear_filters[xoffset]);                  \
      var_filter_block2d_bil_w8(fdata3, temp2, (n), (n), (m),                \
                                bilinear_filters[yoffset]);                  \
    } else {                                                                 \
      var_filter_block2d_bil_w16(src, fdata3, src_stride, 1, ((m) + 1), (n), \
                                 bilinear_filters[xoffset]);                 \
      var_filter_block2d_bil_w16(fdata3, temp2, (n), (n), (m), (n),          \
                                 bilinear_filters[yoffset]);                 \
    }                                                                        \
    return vpx_variance##n##x##m(temp2, (n), dst, dst_stride, sse);          \
  }

// Each invocation expands to a complete function definition, so it takes no
// trailing semicolon: a lone ';' at file scope is not valid ISO C and is
// flagged by compilers in pedantic mode.
sub_pixel_varianceNxM(8, 4)
sub_pixel_varianceNxM(8, 8)
sub_pixel_varianceNxM(8, 16)
sub_pixel_varianceNxM(16, 8)
sub_pixel_varianceNxM(16, 16)
sub_pixel_varianceNxM(16, 32)
sub_pixel_varianceNxM(32, 16)
sub_pixel_varianceNxM(32, 32)
sub_pixel_varianceNxM(32, 64)
sub_pixel_varianceNxM(64, 32)
sub_pixel_varianceNxM(64, 64)
Neon version of vp9_sub_pixel_variance16x16(), vp9_variance16x16(), and vp9_get16x16var(). On a Nexus 7, vpxenc (in realtime mode, speed -12) reported a performance improvement of ~16.7%. Change-Id: Ib163aa99f56e680194aabe00dacdd7f0899a4ecb 2014-07-29 16:47:34 -07:00			`/*`
			`* Copyright (c) 2014 The WebM project authors. All Rights Reserved.`
			`*`
			`* Use of this source code is governed by a BSD-style license`
			`* that can be found in the LICENSE file in the root of the source`
			`* tree. An additional intellectual property rights grant can be found`
			`* in the file PATENTS. All contributing project authors may`
			`* be found in the AUTHORS file in the root of the source tree.`
			`*/`

			`#include <arm_neon.h>`
Move variance functions to vpx_dsp subpel functions will be moved in another patch. Change-Id: Idb2e049bad0b9b32ac42cc7731cd6903de2826ce 2015-05-15 11:52:03 -07:00			`#include "./vpx_dsp_rtcd.h"`
Add 64x variance Neon functions Add optimized Neon functions of: vp9_variance32x64 vp9_variance64x32 vp9_variance64x64 On Nexus 7 speed -5 and -6 saw about a 4% increase in perf. Speeds -7 and -8 saw about a 6% increase in perf. Tested on Nexus 7, built with ndk r10d, gcc 4.9. Change-Id: I5a81f13c9897eb927fa39662530f5524a0f768fa 2015-01-13 11:15:24 -08:00			`#include "./vpx_config.h"`
Neon version of vp9_sub_pixel_variance16x16(), vp9_variance16x16(), and vp9_get16x16var(). On a Nexus 7, vpxenc (in realtime mode, speed -12) reported a performance improvement of ~16.7%. Change-Id: Ib163aa99f56e680194aabe00dacdd7f0899a4ecb 2014-07-29 16:47:34 -07:00
			`#include "vpx_ports/mem.h"`
			`#include "vpx/vpx_integer.h"`

Move sub pixel variance to vpx_dsp Change-Id: I66bf6720c396c89aa2d1fd26d5d52bf5d5e3dff1 2015-06-05 09:54:19 -07:00			`#include "vpx_dsp/variance.h"`
Neon version of vp9_sub_pixel_variance16x16(), vp9_variance16x16(), and vp9_get16x16var(). On a Nexus 7, vpxenc (in realtime mode, speed -12) reported a performance improvement of ~16.7%. Change-Id: Ib163aa99f56e680194aabe00dacdd7f0899a4ecb 2014-07-29 16:47:34 -07:00
vp9_variance*.c: make static tables const Change-Id: Ia5044d13c09685c401191fe87fbf90d36203aadd 2015-07-06 15:04:37 -07:00			`static const uint8_t bilinear_filters[8][2] = {`
vpx_dsp: apply clang-format Change-Id: I3ea3e77364879928bd916f2b0a7838073ade5975 2016-07-22 20:07:03 -07:00			`{ 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 },`
			`{ 64, 64 }, { 48, 80 }, { 32, 96 }, { 16, 112 },`
Make vp9 subpixel match vp8 The only difference between the two was that the vp9 function allowed for every step in the bilinear filter (16 steps) while vp8 only allowed for half of those. Since all the call sites in vp9 (<< 1) the input, it only ever used the same steps as vp8. This will allow moving the subpel variance to vpx_dsp with the rest of the variance functions. Change-Id: I6fa2509350a2dc610c46b3e15bde98a15a084b75 2015-05-26 11:30:25 -07:00			`};`
Neon version of vp9_sub_pixel_variance16x16(), vp9_variance16x16(), and vp9_get16x16var(). On a Nexus 7, vpxenc (in realtime mode, speed -12) reported a performance improvement of ~16.7%. Change-Id: Ib163aa99f56e680194aabe00dacdd7f0899a4ecb 2014-07-29 16:47:34 -07:00
sub pixel variance neon: use generic variance When a neon version is available it will be called. This allows decoupling the variance implementations and has no real downside. For most configurations, the call will be #define'd to the neon implementation. Change-Id: Ibb2afe4e156c5610e89488504d366b3e6d1ba712 2017-05-02 10:25:37 -07:00			`// Process a block exactly 8 wide and any height.`
Neon version of vp9_sub_pixel_variance8x8(), vp9_variance8x8(), and vp9_get8x8var(). On a Nexus 7, vpxenc (in realtime mode, speed -12) reported a performance improvement of ~1.2%. Change-Id: I8a66ac2a0f550b407caa27816833bdc563395102 2014-08-01 11:35:55 -07:00			`static void var_filter_block2d_bil_w8(const uint8_t *src_ptr,`
			`uint8_t *output_ptr,`
			`unsigned int src_pixels_per_line,`
			`int pixel_step,`
			`unsigned int output_height,`
Move sub pixel variance to vpx_dsp Change-Id: I66bf6720c396c89aa2d1fd26d5d52bf5d5e3dff1 2015-06-05 09:54:19 -07:00			`const uint8_t *filter) {`
			`const uint8x8_t f0 = vmov_n_u8(filter[0]);`
			`const uint8x8_t f1 = vmov_n_u8(filter[1]);`
Neon version of vp9_sub_pixel_variance8x8(), vp9_variance8x8(), and vp9_get8x8var(). On a Nexus 7, vpxenc (in realtime mode, speed -12) reported a performance improvement of ~1.2%. Change-Id: I8a66ac2a0f550b407caa27816833bdc563395102 2014-08-01 11:35:55 -07:00			`unsigned int i;`
			`for (i = 0; i < output_height; ++i) {`
			`const uint8x8_t src_0 = vld1_u8(&src_ptr[0]);`
			`const uint8x8_t src_1 = vld1_u8(&src_ptr[pixel_step]);`
			`const uint16x8_t a = vmull_u8(src_0, f0);`
			`const uint16x8_t b = vmlal_u8(a, src_1, f1);`
			`const uint8x8_t out = vrshrn_n_u16(b, FILTER_BITS);`
			`vst1_u8(&output_ptr[0], out);`
			`// Next row...`
			`src_ptr += src_pixels_per_line;`
subpel variance neon: add mixed sizes Add support for everything except block sizes of 4. Performance is better but numbers will improve again when the variance optimizations land. BUG=webm:1423 Change-Id: I92eb4312b20be423fa2fe6fdb18167a604ff4d80 2017-05-03 12:06:29 -07:00			`output_ptr += 8;`
Neon version of vp9_sub_pixel_variance8x8(), vp9_variance8x8(), and vp9_get8x8var(). On a Nexus 7, vpxenc (in realtime mode, speed -12) reported a performance improvement of ~1.2%. Change-Id: I8a66ac2a0f550b407caa27816833bdc563395102 2014-08-01 11:35:55 -07:00			`}`
			`}`

sub pixel variance neon: use generic variance When a neon version is available it will be called. This allows decoupling the variance implementations and has no real downside. For most configurations, the call will be #define'd to the neon implementation. Change-Id: Ibb2afe4e156c5610e89488504d366b3e6d1ba712 2017-05-02 10:25:37 -07:00			`// Process a block which is a multiple of 16 wide and any height.`
Neon version of vp9_sub_pixel_variance16x16(), vp9_variance16x16(), and vp9_get16x16var(). On a Nexus 7, vpxenc (in realtime mode, speed -12) reported a performance improvement of ~16.7%. Change-Id: Ib163aa99f56e680194aabe00dacdd7f0899a4ecb 2014-07-29 16:47:34 -07:00			`static void var_filter_block2d_bil_w16(const uint8_t *src_ptr,`
			`uint8_t *output_ptr,`
			`unsigned int src_pixels_per_line,`
			`int pixel_step,`
			`unsigned int output_height,`
			`unsigned int output_width,`
Move sub pixel variance to vpx_dsp Change-Id: I66bf6720c396c89aa2d1fd26d5d52bf5d5e3dff1 2015-06-05 09:54:19 -07:00			`const uint8_t *filter) {`
			`const uint8x8_t f0 = vmov_n_u8(filter[0]);`
			`const uint8x8_t f1 = vmov_n_u8(filter[1]);`
Neon version of vp9_sub_pixel_variance32x32(), vp9_variance32x32(), and vp9_get32x32var(). Change-Id: I8137e2540e50984744da59ae3a41e94f8af4a548 2014-07-31 08:00:36 -07:00			`unsigned int i, j;`
Neon version of vp9_sub_pixel_variance16x16(), vp9_variance16x16(), and vp9_get16x16var(). On a Nexus 7, vpxenc (in realtime mode, speed -12) reported a performance improvement of ~16.7%. Change-Id: Ib163aa99f56e680194aabe00dacdd7f0899a4ecb 2014-07-29 16:47:34 -07:00			`for (i = 0; i < output_height; ++i) {`
Neon version of vp9_sub_pixel_variance32x32(), vp9_variance32x32(), and vp9_get32x32var(). Change-Id: I8137e2540e50984744da59ae3a41e94f8af4a548 2014-07-31 08:00:36 -07:00			`for (j = 0; j < output_width; j += 16) {`
			`const uint8x16_t src_0 = vld1q_u8(&src_ptr[j]);`
			`const uint8x16_t src_1 = vld1q_u8(&src_ptr[j + pixel_step]);`
			`const uint16x8_t a = vmull_u8(vget_low_u8(src_0), f0);`
			`const uint16x8_t b = vmlal_u8(a, vget_low_u8(src_1), f1);`
			`const uint8x8_t out_lo = vrshrn_n_u16(b, FILTER_BITS);`
			`const uint16x8_t c = vmull_u8(vget_high_u8(src_0), f0);`
			`const uint16x8_t d = vmlal_u8(c, vget_high_u8(src_1), f1);`
			`const uint8x8_t out_hi = vrshrn_n_u16(d, FILTER_BITS);`
			`vst1q_u8(&output_ptr[j], vcombine_u8(out_lo, out_hi));`
			`}`
Neon version of vp9_sub_pixel_variance16x16(), vp9_variance16x16(), and vp9_get16x16var(). On a Nexus 7, vpxenc (in realtime mode, speed -12) reported a performance improvement of ~16.7%. Change-Id: Ib163aa99f56e680194aabe00dacdd7f0899a4ecb 2014-07-29 16:47:34 -07:00			`// Next row...`
			`src_ptr += src_pixels_per_line;`
			`output_ptr += output_width;`
			`}`
			`}`

subpel variance neon: add mixed sizes Add support for everything except block sizes of 4. Performance is better but numbers will improve again when the variance optimizations land. BUG=webm:1423 Change-Id: I92eb4312b20be423fa2fe6fdb18167a604ff4d80 2017-05-03 12:06:29 -07:00			`// TODO(johannkoenig): support 4xM block sizes.`
			`#define sub_pixel_varianceNxM(n, m) \`
			`unsigned int vpx_sub_pixel_variance##n##x##m##_neon( \`
			`const uint8_t *src, int src_stride, int xoffset, int yoffset, \`
			`const uint8_t *dst, int dst_stride, unsigned int *sse) { \`
			`DECLARE_ALIGNED(16, uint8_t, fdata3[n * (m + 1)]); \`
			`DECLARE_ALIGNED(16, uint8_t, temp2[n * m]); \`
			`\`
			`if (n == 8) { \`
			`var_filter_block2d_bil_w8(src, fdata3, src_stride, 1, (m + 1), \`
			`bilinear_filters[xoffset]); \`
			`var_filter_block2d_bil_w8(fdata3, temp2, n, n, m, \`
			`bilinear_filters[yoffset]); \`
			`} else { \`
			`var_filter_block2d_bil_w16(src, fdata3, src_stride, 1, (m + 1), n, \`
			`bilinear_filters[xoffset]); \`
			`var_filter_block2d_bil_w16(fdata3, temp2, n, n, m, n, \`
			`bilinear_filters[yoffset]); \`
			`} \`
			`return vpx_variance##n##x##m(temp2, n, dst, dst_stride, sse); \`
			`}`
Add 64x64 sub_pel_variance Neon function On Nexus 7 speed -5, -6, -7, and -8 saw about a 15% increase in perf for 480p. Speeds -5, -6, -7, and -8 saw about a 10% increase in perf for 720p. Tested on Nexus 7, built with ndk r10d, gcc 4.9. Change-Id: I2fa5315845e3021c9a6e2ea47e52e68b398d8334 2015-01-13 23:01:06 -08:00
subpel variance neon: add mixed sizes Add support for everything except block sizes of 4. Performance is better but numbers will improve again when the variance optimizations land. BUG=webm:1423 Change-Id: I92eb4312b20be423fa2fe6fdb18167a604ff4d80 2017-05-03 12:06:29 -07:00			`sub_pixel_varianceNxM(8, 4);`
			`sub_pixel_varianceNxM(8, 8);`
			`sub_pixel_varianceNxM(8, 16);`
			`sub_pixel_varianceNxM(16, 8);`
			`sub_pixel_varianceNxM(16, 16);`
			`sub_pixel_varianceNxM(16, 32);`
			`sub_pixel_varianceNxM(32, 16);`
			`sub_pixel_varianceNxM(32, 32);`
			`sub_pixel_varianceNxM(32, 64);`
			`sub_pixel_varianceNxM(64, 32);`
			`sub_pixel_varianceNxM(64, 64);`