NEON 4-byte helper functions
When data is guaranteed to be aligned, use helper functions which assert that requirement. Change-Id: Ic4b188593aea0799d5bd8eda64f9858a1592a2a3
This commit is contained in:
parent
8739a182c8
commit
7498fe2e54
@ -20,7 +20,7 @@ void vpx_idct4x4_16_add_neon(const tran_low_t *input, uint8_t *dest,
|
||||
int stride) {
|
||||
const uint8_t *dst = dest;
|
||||
const int16x4_t cospis = vld1_s16(kCospi);
|
||||
uint32x2_t dest01_u32 = vdup_n_u32(0);
|
||||
uint8x8_t dest01_u8;
|
||||
uint32x2_t dest32_u32 = vdup_n_u32(0);
|
||||
int16x8_t a0, a1;
|
||||
uint8x8_t d01, d32;
|
||||
@ -40,25 +40,22 @@ void vpx_idct4x4_16_add_neon(const tran_low_t *input, uint8_t *dest,
|
||||
a0 = vrshrq_n_s16(a0, 4);
|
||||
a1 = vrshrq_n_s16(a1, 4);
|
||||
|
||||
dest01_u32 = vld1_lane_u32((const uint32_t *)dst, dest01_u32, 0);
|
||||
dst += stride;
|
||||
dest01_u32 = vld1_lane_u32((const uint32_t *)dst, dest01_u32, 1);
|
||||
dst += stride;
|
||||
dest01_u8 = load_u8(dst, stride);
|
||||
dst += 2 * stride;
|
||||
// The elements are loaded in reverse order.
|
||||
dest32_u32 = vld1_lane_u32((const uint32_t *)dst, dest32_u32, 1);
|
||||
dst += stride;
|
||||
dest32_u32 = vld1_lane_u32((const uint32_t *)dst, dest32_u32, 0);
|
||||
|
||||
d01_u16 =
|
||||
vaddw_u8(vreinterpretq_u16_s16(a0), vreinterpret_u8_u32(dest01_u32));
|
||||
d01_u16 = vaddw_u8(vreinterpretq_u16_s16(a0), dest01_u8);
|
||||
d32_u16 =
|
||||
vaddw_u8(vreinterpretq_u16_s16(a1), vreinterpret_u8_u32(dest32_u32));
|
||||
d01 = vqmovun_s16(vreinterpretq_s16_u16(d01_u16));
|
||||
d32 = vqmovun_s16(vreinterpretq_s16_u16(d32_u16));
|
||||
|
||||
vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d01), 0);
|
||||
dest += stride;
|
||||
vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d01), 1);
|
||||
dest += stride;
|
||||
store_u8(dest, stride, d01);
|
||||
dest += 2 * stride;
|
||||
// The elements are stored in reverse order.
|
||||
vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d32), 1);
|
||||
dest += stride;
|
||||
vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d32), 0);
|
||||
|
@ -68,4 +68,29 @@ static INLINE void store_s16q_to_tran_low(tran_low_t *buf, const int16x8_t a) {
|
||||
vst1q_s16(buf, a);
|
||||
#endif
|
||||
}
|
||||
|
||||
// Load 2 sets of 4 bytes when alignment is guaranteed.
|
||||
static INLINE uint8x8_t load_u8(const uint8_t *buf, int stride) {
|
||||
uint32x2_t a = vdup_n_u32(0);
|
||||
|
||||
assert(!((intptr_t)buf % sizeof(uint32_t)));
|
||||
assert(!(stride % sizeof(uint32_t)));
|
||||
|
||||
a = vld1_lane_u32((const uint32_t *)buf, a, 0);
|
||||
buf += stride;
|
||||
a = vld1_lane_u32((const uint32_t *)buf, a, 1);
|
||||
return vreinterpret_u8_u32(a);
|
||||
}
|
||||
|
||||
// Store 2 sets of 4 bytes when alignment is guaranteed.
|
||||
static INLINE void store_u8(uint8_t *buf, int stride, const uint8x8_t a) {
|
||||
uint32x2_t a_u32 = vreinterpret_u32_u8(a);
|
||||
|
||||
assert(!((intptr_t)buf % sizeof(uint32_t)));
|
||||
assert(!(stride % sizeof(uint32_t)));
|
||||
|
||||
vst1_lane_u32((uint32_t *)buf, a_u32, 0);
|
||||
buf += stride;
|
||||
vst1_lane_u32((uint32_t *)buf, a_u32, 1);
|
||||
}
|
||||
#endif // VPX_DSP_ARM_MEM_NEON_H_
|
||||
|
Loading…
x
Reference in New Issue
Block a user