vpx/vpx_dsp/arm/transpose_neon.h
Johann 377cfa31f0 Extract neon transpose for re-use
Change-Id: I5e1c7f4c80d1c6f7fd582ac468c6eaaa3603a06c
2016-08-04 19:04:25 +00:00

105 lines
3.8 KiB
C

/*
* Copyright (c) 2016 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef VPX_DSP_ARM_TRANSPOSE_NEON_H_
#define VPX_DSP_ARM_TRANSPOSE_NEON_H_
#include <arm_neon.h>
#include "./vpx_config.h"
// Exchanges the 64 bit halves of two vectors. The inputs are typed as
// int32x4_t but the result is handed back as 16 bit lanes. Shown in 16 bit
// lanes:
//   a0:        00 01 02 03 04 05 06 07
//   a1:        16 17 18 19 20 21 22 23
// becomes
//   val[0]:    00 01 02 03 16 17 18 19   (low half of a0, low half of a1)
//   val[1]:    04 05 06 07 20 21 22 23   (high half of a0, high half of a1)
static INLINE int16x8x2_t vpx_vtrnq_s64(int32x4_t a0, int32x4_t a1) {
  // Split each input into its 64 bit halves, viewed as four 16 bit lanes.
  const int16x4_t a0_lo = vreinterpret_s16_s32(vget_low_s32(a0));
  const int16x4_t a1_lo = vreinterpret_s16_s32(vget_low_s32(a1));
  const int16x4_t a0_hi = vreinterpret_s16_s32(vget_high_s32(a0));
  const int16x4_t a1_hi = vreinterpret_s16_s32(vget_high_s32(a1));
  int16x8x2_t swapped;

  swapped.val[0] = vcombine_s16(a0_lo, a1_lo);
  swapped.val[1] = vcombine_s16(a0_hi, a1_hi);
  return swapped;
}
// Transposes an 8x8 block of 16 bit values in place. Each pointer supplies one
// row on entry and receives the matching column on exit (*a0 gets column 0,
// *a1 column 1, and so on). All eight rows are read before any is written.
static INLINE void transpose_s16_8x8(int16x8_t *a0, int16x8_t *a1,
                                     int16x8_t *a2, int16x8_t *a3,
                                     int16x8_t *a4, int16x8_t *a5,
                                     int16x8_t *a6, int16x8_t *a7) {
  // Stage 1: transpose 16 bit lanes within each pair of adjacent rows.
  // Input:
  //   a0: 00 01 02 03 04 05 06 07
  //   a1: 08 09 10 11 12 13 14 15
  //   a2: 16 17 18 19 20 21 22 23
  //   a3: 24 25 26 27 28 29 30 31
  //   a4: 32 33 34 35 36 37 38 39
  //   a5: 40 41 42 43 44 45 46 47
  //   a6: 48 49 50 51 52 53 54 55
  //   a7: 56 57 58 59 60 61 62 63
  // Output:
  //   rows01.val[0]: 00 08 02 10 04 12 06 14
  //   rows01.val[1]: 01 09 03 11 05 13 07 15
  //   rows23.val[0]: 16 24 18 26 20 28 22 30
  //   rows23.val[1]: 17 25 19 27 21 29 23 31
  //   rows45.val[0]: 32 40 34 42 36 44 38 46
  //   rows45.val[1]: 33 41 35 43 37 45 39 47
  //   rows67.val[0]: 48 56 50 58 52 60 54 62
  //   rows67.val[1]: 49 57 51 59 53 61 55 63
  const int16x8x2_t rows01 = vtrnq_s16(*a0, *a1);
  const int16x8x2_t rows23 = vtrnq_s16(*a2, *a3);
  const int16x8x2_t rows45 = vtrnq_s16(*a4, *a5);
  const int16x8x2_t rows67 = vtrnq_s16(*a6, *a7);

  // View the stage 1 results as 32 bit lanes so that pairs of 16 bit values
  // move together in the next stage.
  const int32x4_t rows01_even = vreinterpretq_s32_s16(rows01.val[0]);
  const int32x4_t rows01_odd = vreinterpretq_s32_s16(rows01.val[1]);
  const int32x4_t rows23_even = vreinterpretq_s32_s16(rows23.val[0]);
  const int32x4_t rows23_odd = vreinterpretq_s32_s16(rows23.val[1]);
  const int32x4_t rows45_even = vreinterpretq_s32_s16(rows45.val[0]);
  const int32x4_t rows45_odd = vreinterpretq_s32_s16(rows45.val[1]);
  const int32x4_t rows67_even = vreinterpretq_s32_s16(rows67.val[0]);
  const int32x4_t rows67_odd = vreinterpretq_s32_s16(rows67.val[1]);

  // Stage 2: transpose 32 bit lanes, giving:
  //   top_even.val[0]: 00 08 16 24 04 12 20 28
  //   top_even.val[1]: 02 10 18 26 06 14 22 30
  //   top_odd.val[0]:  01 09 17 25 05 13 21 29
  //   top_odd.val[1]:  03 11 19 27 07 15 23 31
  //   bot_even.val[0]: 32 40 48 56 36 44 52 60
  //   bot_even.val[1]: 34 42 50 58 38 46 54 62
  //   bot_odd.val[0]:  33 41 49 57 37 45 53 61
  //   bot_odd.val[1]:  35 43 51 59 39 47 55 63
  const int32x4x2_t top_even = vtrnq_s32(rows01_even, rows23_even);
  const int32x4x2_t top_odd = vtrnq_s32(rows01_odd, rows23_odd);
  const int32x4x2_t bot_even = vtrnq_s32(rows45_even, rows67_even);
  const int32x4x2_t bot_odd = vtrnq_s32(rows45_odd, rows67_odd);

  // Stage 3: exchange 64 bit halves between the top and bottom groups. Each
  // result holds two finished columns, four apart:
  //   cols04.val[0]: 00 08 16 24 32 40 48 56
  //   cols04.val[1]: 04 12 20 28 36 44 52 60
  //   cols15.val[0]: 01 09 17 25 33 41 49 57
  //   cols15.val[1]: 05 13 21 29 37 45 53 61
  //   cols26.val[0]: 02 10 18 26 34 42 50 58
  //   cols26.val[1]: 06 14 22 30 38 46 54 62
  //   cols37.val[0]: 03 11 19 27 35 43 51 59
  //   cols37.val[1]: 07 15 23 31 39 47 55 63
  const int16x8x2_t cols04 = vpx_vtrnq_s64(top_even.val[0], bot_even.val[0]);
  const int16x8x2_t cols15 = vpx_vtrnq_s64(top_odd.val[0], bot_odd.val[0]);
  const int16x8x2_t cols26 = vpx_vtrnq_s64(top_even.val[1], bot_even.val[1]);
  const int16x8x2_t cols37 = vpx_vtrnq_s64(top_odd.val[1], bot_odd.val[1]);

  // Write the columns back over the rows, in column order.
  *a0 = cols04.val[0];
  *a1 = cols15.val[0];
  *a2 = cols26.val[0];
  *a3 = cols37.val[0];
  *a4 = cols04.val[1];
  *a5 = cols15.val[1];
  *a6 = cols26.val[1];
  *a7 = cols37.val[1];
}
#endif // VPX_DSP_ARM_TRANSPOSE_NEON_H_