Merge "Clean DC only idct NEON intrinsics"

This commit is contained in:
Linfeng Zhang 2017-01-06 01:16:18 +00:00 committed by Gerrit Code Review
commit 90f889a56d
5 changed files with 156 additions and 202 deletions

View File

@ -11,49 +11,66 @@
#include <arm_neon.h>
#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/arm/idct_neon.h"
#include "vpx_dsp/inv_txfm.h"
#include "vpx_ports/mem.h"
static INLINE void idct16x16_1_add_pos_kernel(uint8_t **dest, const int stride,
const uint8x16_t res) {
const uint8x16_t a = vld1q_u8(*dest);
const uint8x16_t b = vqaddq_u8(a, res);
vst1q_u8(*dest, b);
*dest += stride;
}
static INLINE void idct16x16_1_add_neg_kernel(uint8_t **dest, const int stride,
const uint8x16_t res) {
const uint8x16_t a = vld1q_u8(*dest);
const uint8x16_t b = vqsubq_u8(a, res);
vst1q_u8(*dest, b);
*dest += stride;
}
void vpx_idct16x16_1_add_neon(const tran_low_t *input, uint8_t *dest,
int stride) {
uint8x8_t d2u8, d3u8, d30u8, d31u8;
uint64x1_t d2u64, d3u64, d4u64, d5u64;
uint16x8_t q0u16, q9u16, q10u16, q11u16, q12u16;
int16x8_t q0s16;
uint8_t *d1, *d2;
int16_t i, j, a1;
int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
out = dct_const_round_shift(out * cospi_16_64);
a1 = ROUND_POWER_OF_TWO(out, 6);
const int16_t out0 = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
const int16_t out1 = WRAPLOW(dct_const_round_shift(out0 * cospi_16_64));
const int16_t a1 = ROUND_POWER_OF_TWO(out1, 6);
q0s16 = vdupq_n_s16(a1);
q0u16 = vreinterpretq_u16_s16(q0s16);
for (d1 = d2 = dest, i = 0; i < 4; i++) {
for (j = 0; j < 2; j++) {
d2u64 = vld1_u64((const uint64_t *)d1);
d3u64 = vld1_u64((const uint64_t *)(d1 + 8));
d1 += stride;
d4u64 = vld1_u64((const uint64_t *)d1);
d5u64 = vld1_u64((const uint64_t *)(d1 + 8));
d1 += stride;
q9u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d2u64));
q10u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d3u64));
q11u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d4u64));
q12u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d5u64));
d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16));
d30u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16));
d31u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16));
vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8));
vst1_u64((uint64_t *)(d2 + 8), vreinterpret_u64_u8(d3u8));
d2 += stride;
vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d30u8));
vst1_u64((uint64_t *)(d2 + 8), vreinterpret_u64_u8(d31u8));
d2 += stride;
}
if (a1 >= 0) {
const uint8x16_t dc = create_dcq(a1);
idct16x16_1_add_pos_kernel(&dest, stride, dc);
idct16x16_1_add_pos_kernel(&dest, stride, dc);
idct16x16_1_add_pos_kernel(&dest, stride, dc);
idct16x16_1_add_pos_kernel(&dest, stride, dc);
idct16x16_1_add_pos_kernel(&dest, stride, dc);
idct16x16_1_add_pos_kernel(&dest, stride, dc);
idct16x16_1_add_pos_kernel(&dest, stride, dc);
idct16x16_1_add_pos_kernel(&dest, stride, dc);
idct16x16_1_add_pos_kernel(&dest, stride, dc);
idct16x16_1_add_pos_kernel(&dest, stride, dc);
idct16x16_1_add_pos_kernel(&dest, stride, dc);
idct16x16_1_add_pos_kernel(&dest, stride, dc);
idct16x16_1_add_pos_kernel(&dest, stride, dc);
idct16x16_1_add_pos_kernel(&dest, stride, dc);
idct16x16_1_add_pos_kernel(&dest, stride, dc);
idct16x16_1_add_pos_kernel(&dest, stride, dc);
} else {
const uint8x16_t dc = create_dcq(-a1);
idct16x16_1_add_neg_kernel(&dest, stride, dc);
idct16x16_1_add_neg_kernel(&dest, stride, dc);
idct16x16_1_add_neg_kernel(&dest, stride, dc);
idct16x16_1_add_neg_kernel(&dest, stride, dc);
idct16x16_1_add_neg_kernel(&dest, stride, dc);
idct16x16_1_add_neg_kernel(&dest, stride, dc);
idct16x16_1_add_neg_kernel(&dest, stride, dc);
idct16x16_1_add_neg_kernel(&dest, stride, dc);
idct16x16_1_add_neg_kernel(&dest, stride, dc);
idct16x16_1_add_neg_kernel(&dest, stride, dc);
idct16x16_1_add_neg_kernel(&dest, stride, dc);
idct16x16_1_add_neg_kernel(&dest, stride, dc);
idct16x16_1_add_neg_kernel(&dest, stride, dc);
idct16x16_1_add_neg_kernel(&dest, stride, dc);
idct16x16_1_add_neg_kernel(&dest, stride, dc);
idct16x16_1_add_neg_kernel(&dest, stride, dc);
}
}

View File

@ -10,127 +10,48 @@
#include <arm_neon.h>
#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/arm/idct_neon.h"
#include "vpx_dsp/inv_txfm.h"
#include "vpx_ports/mem.h"
static INLINE void LD_16x8(uint8_t *d, int d_stride, uint8x16_t *q8u8,
uint8x16_t *q9u8, uint8x16_t *q10u8,
uint8x16_t *q11u8, uint8x16_t *q12u8,
uint8x16_t *q13u8, uint8x16_t *q14u8,
uint8x16_t *q15u8) {
*q8u8 = vld1q_u8(d);
d += d_stride;
*q9u8 = vld1q_u8(d);
d += d_stride;
*q10u8 = vld1q_u8(d);
d += d_stride;
*q11u8 = vld1q_u8(d);
d += d_stride;
*q12u8 = vld1q_u8(d);
d += d_stride;
*q13u8 = vld1q_u8(d);
d += d_stride;
*q14u8 = vld1q_u8(d);
d += d_stride;
*q15u8 = vld1q_u8(d);
static INLINE void idct32x32_1_add_pos_kernel(uint8_t **dest, const int stride,
const uint8x16_t res) {
const uint8x16_t a0 = vld1q_u8(*dest);
const uint8x16_t a1 = vld1q_u8(*dest + 16);
const uint8x16_t b0 = vqaddq_u8(a0, res);
const uint8x16_t b1 = vqaddq_u8(a1, res);
vst1q_u8(*dest, b0);
vst1q_u8(*dest + 16, b1);
*dest += stride;
}
static INLINE void ADD_DIFF_16x8(uint8x16_t qdiffu8, uint8x16_t *q8u8,
uint8x16_t *q9u8, uint8x16_t *q10u8,
uint8x16_t *q11u8, uint8x16_t *q12u8,
uint8x16_t *q13u8, uint8x16_t *q14u8,
uint8x16_t *q15u8) {
*q8u8 = vqaddq_u8(*q8u8, qdiffu8);
*q9u8 = vqaddq_u8(*q9u8, qdiffu8);
*q10u8 = vqaddq_u8(*q10u8, qdiffu8);
*q11u8 = vqaddq_u8(*q11u8, qdiffu8);
*q12u8 = vqaddq_u8(*q12u8, qdiffu8);
*q13u8 = vqaddq_u8(*q13u8, qdiffu8);
*q14u8 = vqaddq_u8(*q14u8, qdiffu8);
*q15u8 = vqaddq_u8(*q15u8, qdiffu8);
}
static INLINE void SUB_DIFF_16x8(uint8x16_t qdiffu8, uint8x16_t *q8u8,
uint8x16_t *q9u8, uint8x16_t *q10u8,
uint8x16_t *q11u8, uint8x16_t *q12u8,
uint8x16_t *q13u8, uint8x16_t *q14u8,
uint8x16_t *q15u8) {
*q8u8 = vqsubq_u8(*q8u8, qdiffu8);
*q9u8 = vqsubq_u8(*q9u8, qdiffu8);
*q10u8 = vqsubq_u8(*q10u8, qdiffu8);
*q11u8 = vqsubq_u8(*q11u8, qdiffu8);
*q12u8 = vqsubq_u8(*q12u8, qdiffu8);
*q13u8 = vqsubq_u8(*q13u8, qdiffu8);
*q14u8 = vqsubq_u8(*q14u8, qdiffu8);
*q15u8 = vqsubq_u8(*q15u8, qdiffu8);
}
static INLINE void ST_16x8(uint8_t *d, int d_stride, uint8x16_t *q8u8,
uint8x16_t *q9u8, uint8x16_t *q10u8,
uint8x16_t *q11u8, uint8x16_t *q12u8,
uint8x16_t *q13u8, uint8x16_t *q14u8,
uint8x16_t *q15u8) {
vst1q_u8(d, *q8u8);
d += d_stride;
vst1q_u8(d, *q9u8);
d += d_stride;
vst1q_u8(d, *q10u8);
d += d_stride;
vst1q_u8(d, *q11u8);
d += d_stride;
vst1q_u8(d, *q12u8);
d += d_stride;
vst1q_u8(d, *q13u8);
d += d_stride;
vst1q_u8(d, *q14u8);
d += d_stride;
vst1q_u8(d, *q15u8);
static INLINE void idct32x32_1_add_neg_kernel(uint8_t **dest, const int stride,
const uint8x16_t res) {
const uint8x16_t a0 = vld1q_u8(*dest);
const uint8x16_t a1 = vld1q_u8(*dest + 16);
const uint8x16_t b0 = vqsubq_u8(a0, res);
const uint8x16_t b1 = vqsubq_u8(a1, res);
vst1q_u8(*dest, b0);
vst1q_u8(*dest + 16, b1);
*dest += stride;
}
void vpx_idct32x32_1_add_neon(const tran_low_t *input, uint8_t *dest,
int stride) {
uint8x16_t q0u8, q8u8, q9u8, q10u8, q11u8, q12u8, q13u8, q14u8, q15u8;
int i, j, dest_stride8;
uint8_t *d;
int16_t a1;
int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
int i;
const int16_t out0 = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
const int16_t out1 = WRAPLOW(dct_const_round_shift(out0 * cospi_16_64));
const int16_t a1 = ROUND_POWER_OF_TWO(out1, 6);
out = dct_const_round_shift(out * cospi_16_64);
a1 = ROUND_POWER_OF_TWO(out, 6);
dest_stride8 = stride * 8;
if (a1 >= 0) { // diff_positive_32_32
a1 = a1 < 0 ? 0 : a1 > 255 ? 255 : a1;
q0u8 = vdupq_n_u8((uint8_t)a1);
for (i = 0; i < 2; i++, dest += 16) { // diff_positive_32_32_loop
d = dest;
for (j = 0; j < 4; j++) {
LD_16x8(d, stride, &q8u8, &q9u8, &q10u8, &q11u8, &q12u8, &q13u8, &q14u8,
&q15u8);
ADD_DIFF_16x8(q0u8, &q8u8, &q9u8, &q10u8, &q11u8, &q12u8, &q13u8,
&q14u8, &q15u8);
ST_16x8(d, stride, &q8u8, &q9u8, &q10u8, &q11u8, &q12u8, &q13u8, &q14u8,
&q15u8);
d += dest_stride8;
}
if (a1 >= 0) {
const uint8x16_t dc = create_dcq(a1);
for (i = 0; i < 32; i++) {
idct32x32_1_add_pos_kernel(&dest, stride, dc);
}
} else { // diff_negative_32_32
a1 = -a1;
a1 = a1 < 0 ? 0 : a1 > 255 ? 255 : a1;
q0u8 = vdupq_n_u8((uint8_t)a1);
for (i = 0; i < 2; i++, dest += 16) { // diff_negative_32_32_loop
d = dest;
for (j = 0; j < 4; j++) {
LD_16x8(d, stride, &q8u8, &q9u8, &q10u8, &q11u8, &q12u8, &q13u8, &q14u8,
&q15u8);
SUB_DIFF_16x8(q0u8, &q8u8, &q9u8, &q10u8, &q11u8, &q12u8, &q13u8,
&q14u8, &q15u8);
ST_16x8(d, stride, &q8u8, &q9u8, &q10u8, &q11u8, &q12u8, &q13u8, &q14u8,
&q15u8);
d += dest_stride8;
}
} else {
const uint8x16_t dc = create_dcq(-a1);
for (i = 0; i < 32; i++) {
idct32x32_1_add_neg_kernel(&dest, stride, dc);
}
}
}

View File

@ -14,28 +14,32 @@
#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/inv_txfm.h"
static INLINE void idct4x4_1_add_kernel(uint8_t **dest, const int stride,
const int16x8_t res,
uint32x2_t *const d) {
uint16x8_t a;
uint8x8_t b;
*d = vld1_lane_u32((const uint32_t *)*dest, *d, 0);
*d = vld1_lane_u32((const uint32_t *)(*dest + stride), *d, 1);
a = vaddw_u8(vreinterpretq_u16_s16(res), vreinterpret_u8_u32(*d));
b = vqmovun_s16(vreinterpretq_s16_u16(a));
vst1_lane_u32((uint32_t *)*dest, vreinterpret_u32_u8(b), 0);
*dest += stride;
vst1_lane_u32((uint32_t *)*dest, vreinterpret_u32_u8(b), 1);
*dest += stride;
}
void vpx_idct4x4_1_add_neon(const tran_low_t *input, uint8_t *dest,
int stride) {
int i;
const int16_t out0 = dct_const_round_shift((int16_t)input[0] * cospi_16_64);
const int16_t out1 = dct_const_round_shift(out0 * cospi_16_64);
const int16_t out0 = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
const int16_t out1 = WRAPLOW(dct_const_round_shift(out0 * cospi_16_64));
const int16_t a1 = ROUND_POWER_OF_TWO(out1, 4);
const int16x8_t dc = vdupq_n_s16(a1);
uint32x2_t d = vdup_n_u32(0);
uint16x8_t a;
uint8x8_t b;
assert(!((intptr_t)dest % sizeof(uint32_t)));
assert(!(stride % sizeof(uint32_t)));
for (i = 0; i < 2; i++) {
d = vld1_lane_u32((const uint32_t *)dest, d, 0);
d = vld1_lane_u32((const uint32_t *)(dest + stride), d, 1);
a = vaddw_u8(vreinterpretq_u16_s16(dc), vreinterpret_u8_u32(d));
b = vqmovun_s16(vreinterpretq_s16_u16(a));
vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(b), 0);
dest += stride;
vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(b), 1);
dest += stride;
}
idct4x4_1_add_kernel(&dest, stride, dc, &d);
idct4x4_1_add_kernel(&dest, stride, dc, &d);
}

View File

@ -12,47 +12,53 @@
#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/inv_txfm.h"
#include "vpx_ports/mem.h"
static INLINE uint8x8_t create_dcd(const int16_t dc) {
int16x8_t t = vdupq_n_s16(dc);
return vqmovun_s16(t);
}
static INLINE void idct8x8_1_add_pos_kernel(uint8_t **dest, const int stride,
const uint8x8_t res) {
const uint8x8_t a = vld1_u8(*dest);
const uint8x8_t b = vqadd_u8(a, res);
vst1_u8(*dest, b);
*dest += stride;
}
static INLINE void idct8x8_1_add_neg_kernel(uint8_t **dest, const int stride,
const uint8x8_t res) {
const uint8x8_t a = vld1_u8(*dest);
const uint8x8_t b = vqsub_u8(a, res);
vst1_u8(*dest, b);
*dest += stride;
}
void vpx_idct8x8_1_add_neon(const tran_low_t *input, uint8_t *dest,
int stride) {
int i;
const int16_t out0 = dct_const_round_shift(input[0] * cospi_16_64);
const int16_t out1 = dct_const_round_shift(out0 * cospi_16_64);
const int16_t out2 = ROUND_POWER_OF_TWO(out1, 5);
const int16x8_t dc = vdupq_n_s16(out2);
const uint16x8_t dc_u16 = vreinterpretq_u16_s16(dc);
const uint8_t *dst = dest;
uint8x8_t d0, d1, d2, d3;
uint16x8_t d0_u16, d1_u16, d2_u16, d3_u16;
const int16_t out0 = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
const int16_t out1 = WRAPLOW(dct_const_round_shift(out0 * cospi_16_64));
const int16_t a1 = ROUND_POWER_OF_TWO(out1, 5);
for (i = 0; i < 2; i++) {
d0 = vld1_u8(dst);
dst += stride;
d1 = vld1_u8(dst);
dst += stride;
d2 = vld1_u8(dst);
dst += stride;
d3 = vld1_u8(dst);
dst += stride;
d0_u16 = vaddw_u8(dc_u16, d0);
d1_u16 = vaddw_u8(dc_u16, d1);
d2_u16 = vaddw_u8(dc_u16, d2);
d3_u16 = vaddw_u8(dc_u16, d3);
d0 = vqmovun_s16(vreinterpretq_s16_u16(d0_u16));
d1 = vqmovun_s16(vreinterpretq_s16_u16(d1_u16));
d2 = vqmovun_s16(vreinterpretq_s16_u16(d2_u16));
d3 = vqmovun_s16(vreinterpretq_s16_u16(d3_u16));
vst1_u8(dest, d0);
dest += stride;
vst1_u8(dest, d1);
dest += stride;
vst1_u8(dest, d2);
dest += stride;
vst1_u8(dest, d3);
dest += stride;
if (a1 >= 0) {
const uint8x8_t dc = create_dcd(a1);
idct8x8_1_add_pos_kernel(&dest, stride, dc);
idct8x8_1_add_pos_kernel(&dest, stride, dc);
idct8x8_1_add_pos_kernel(&dest, stride, dc);
idct8x8_1_add_pos_kernel(&dest, stride, dc);
idct8x8_1_add_pos_kernel(&dest, stride, dc);
idct8x8_1_add_pos_kernel(&dest, stride, dc);
idct8x8_1_add_pos_kernel(&dest, stride, dc);
idct8x8_1_add_pos_kernel(&dest, stride, dc);
} else {
const uint8x8_t dc = create_dcd(-a1);
idct8x8_1_add_neg_kernel(&dest, stride, dc);
idct8x8_1_add_neg_kernel(&dest, stride, dc);
idct8x8_1_add_neg_kernel(&dest, stride, dc);
idct8x8_1_add_neg_kernel(&dest, stride, dc);
idct8x8_1_add_neg_kernel(&dest, stride, dc);
idct8x8_1_add_neg_kernel(&dest, stride, dc);
idct8x8_1_add_neg_kernel(&dest, stride, dc);
idct8x8_1_add_neg_kernel(&dest, stride, dc);
}
}

View File

@ -181,6 +181,12 @@ static INLINE void add_and_store_u8_s16(const int16x8_t a0, const int16x8_t a1,
vst1_u8(b, b7);
}
static INLINE uint8x16_t create_dcq(const int16_t dc) {
// Clip both sides and gcc may compile to assembly 'usat'.
const int16_t t = (dc < 0) ? 0 : ((dc > 255) ? 255 : dc);
return vdupq_n_u8((uint8_t)t);
}
static INLINE void idct4x4_16_kernel_bd8(const int16x4_t cospis,
int16x8_t *const a0,
int16x8_t *const a1) {