Refine 8-bit 16x16 idct NEON intrinsics

Speed tests show a 25% gain on vpx_idct16x16_256_add_neon(), and
vpx_idct16x16_10_add_neon() is roughly 3x as fast.

Change-Id: If8518d9b6a3efab74031297b8d40cd83c4a49541
Linfeng Zhang 2017-01-06 17:52:07 -08:00
parent c7e2bd6298
commit 6abdd31555
4 changed files with 853 additions and 1218 deletions

vpx_dsp/arm/highbd_idct4x4_add_neon.c

@@ -131,8 +131,6 @@ static INLINE void idct4x4_16_kernel_bd12(const int32x4_t cospis,
 
 void vpx_highbd_idct4x4_16_add_neon(const tran_low_t *input, uint8_t *dest8,
                                     int stride, int bd) {
-  DECLARE_ALIGNED(16, static const int32_t, kCospi32[4]) = { 0, 15137, 11585,
-                                                             6270 };
   const int16x8_t max = vdupq_n_s16((1 << bd) - 1);
   int32x4_t c0 = vld1q_s32(input);
   int32x4_t c1 = vld1q_s32(input + 4);

vpx_dsp/arm/idct16x16_add_neon.c (file diff suppressed because it is too large)

vpx_dsp/arm/idct_neon.h

@@ -17,17 +17,21 @@
 #include "vpx_dsp/arm/transpose_neon.h"
 #include "vpx_dsp/vpx_dsp_common.h"
 
-DECLARE_ALIGNED(16, static const int16_t, kCospi[8]) = {
-  16384 /* cospi_0_64 */, 15137 /* cospi_8_64 */,
-  11585 /* cospi_16_64 */, 6270 /* cospi_24_64 */,
-  16069 /* cospi_4_64 */, 13623 /* cospi_12_64 */,
-  -9102 /* -cospi_20_64 */, 3196 /* cospi_28_64 */
+DECLARE_ALIGNED(16, static const int16_t, kCospi[16]) = {
+  16384 /* cospi_0_64 */, 15137 /* cospi_8_64 */,
+  11585 /* cospi_16_64 */, 6270 /* cospi_24_64 */,
+  16069 /* cospi_4_64 */, 13623 /* cospi_12_64 */,
+  -9102 /* -cospi_20_64 */, 3196 /* cospi_28_64 */,
+  16305 /* cospi_2_64 */, 1606 /* cospi_30_64 */,
+  14449 /* cospi_10_64 */, 7723 /* cospi_22_64 */,
+  15679 /* cospi_6_64 */, -4756 /* -cospi_26_64 */,
+  12665 /* cospi_14_64 */, -10394 /* -cospi_18_64 */
 };
 
 DECLARE_ALIGNED(16, static const int32_t, kCospi32[8]) = {
   16384 /* cospi_0_64 */, 15137 /* cospi_8_64 */,
   11585 /* cospi_16_64 */, 6270 /* cospi_24_64 */,
   16069 /* cospi_4_64 */, 13623 /* cospi_12_64 */,
   -9102 /* -cospi_20_64 */, 3196 /* cospi_28_64 */
 };
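A note for reviewers (not part of the diff): each cospi_k_64 constant is cos(k*pi/64) in 14-bit fixed point, i.e. round(cos(k*pi/64) * 16384). A minimal scalar sketch, assuming that libvpx convention, which regenerates the table:

/* Reviewer sketch, not from the commit: regenerate the kCospi values,
 * assuming cospi_k_64 == round(cos(k*pi/64) * 2^14). */
#include <math.h>
#include <stdio.h>

int main(void) {
  const double pi = acos(-1.0);
  int k;
  for (k = 0; k < 32; k += 2) {
    /* 14-bit fixed point: scale by 2^14 = 16384. */
    printf("cospi_%d_64 = %d\n", k, (int)lround(cos(k * pi / 64) * 16384));
  }
  return 0;
}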
@@ -462,4 +466,79 @@ static INLINE void idct8x8_64_1d_bd8(const int16x4_t cospis0,
   *io7 = vsubq_s16(step1[0], step2[7]);
 }
 
+static INLINE void idct16x16_add_wrap_low_8x2(const int32x4_t *const t32,
+                                              int16x8_t *const d0,
+                                              int16x8_t *const d1) {
+  int16x4_t t16[4];
+
+  t16[0] = vrshrn_n_s32(t32[0], 14);
+  t16[1] = vrshrn_n_s32(t32[1], 14);
+  t16[2] = vrshrn_n_s32(t32[2], 14);
+  t16[3] = vrshrn_n_s32(t32[3], 14);
+  *d0 = vcombine_s16(t16[0], t16[1]);
+  *d1 = vcombine_s16(t16[2], t16[3]);
+}
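For reference (not part of the diff): vrshrn_n_s32(x, 14) is a rounding narrowing shift, matching libvpx's dct_const_round_shift() with DCT_CONST_BITS == 14. A scalar sketch of what one lane of idct16x16_add_wrap_low_8x2() computes:

/* Reviewer sketch, not from the commit: scalar equivalent of one lane.
 * The rounding narrowing shift is (x + 2^13) >> 14, truncated to 16 bits. */
#include <stdint.h>

static int16_t wrap_low(int32_t x) {
  return (int16_t)((x + (1 << 13)) >> 14);
}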
+static INLINE void idct_cospi_8_24_q_kernel(const int16x8_t s0,
+                                            const int16x8_t s1,
+                                            const int16x4_t cospi_0_8_16_24,
+                                            int32x4_t *const t32) {
+  t32[0] = vmull_lane_s16(vget_low_s16(s0), cospi_0_8_16_24, 3);
+  t32[1] = vmull_lane_s16(vget_high_s16(s0), cospi_0_8_16_24, 3);
+  t32[2] = vmull_lane_s16(vget_low_s16(s1), cospi_0_8_16_24, 3);
+  t32[3] = vmull_lane_s16(vget_high_s16(s1), cospi_0_8_16_24, 3);
+  t32[0] = vmlsl_lane_s16(t32[0], vget_low_s16(s1), cospi_0_8_16_24, 1);
+  t32[1] = vmlsl_lane_s16(t32[1], vget_high_s16(s1), cospi_0_8_16_24, 1);
+  t32[2] = vmlal_lane_s16(t32[2], vget_low_s16(s0), cospi_0_8_16_24, 1);
+  t32[3] = vmlal_lane_s16(t32[3], vget_high_s16(s0), cospi_0_8_16_24, 1);
+}
+
+static INLINE void idct_cospi_8_24_q(const int16x8_t s0, const int16x8_t s1,
+                                     const int16x4_t cospi_0_8_16_24,
+                                     int16x8_t *const d0, int16x8_t *const d1) {
+  int32x4_t t32[4];
+
+  idct_cospi_8_24_q_kernel(s0, s1, cospi_0_8_16_24, t32);
+  idct16x16_add_wrap_low_8x2(t32, d0, d1);
+}
+
+static INLINE void idct_cospi_8_24_neg_q(const int16x8_t s0, const int16x8_t s1,
+                                         const int16x4_t cospi_0_8_16_24,
+                                         int16x8_t *const d0,
+                                         int16x8_t *const d1) {
+  int32x4_t t32[4];
+
+  idct_cospi_8_24_q_kernel(s0, s1, cospi_0_8_16_24, t32);
+  t32[2] = vnegq_s32(t32[2]);
+  t32[3] = vnegq_s32(t32[3]);
+  idct16x16_add_wrap_low_8x2(t32, d0, d1);
+}
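For reference (not part of the diff): assuming cospi_0_8_16_24 holds { cospi_0_64, cospi_8_64, cospi_16_64, cospi_24_64 } in lanes 0-3 (as the name and kCospi layout suggest), the kernel and its two wrappers compute a standard idct butterfly. A scalar sketch:

/* Reviewer sketch, not from the commit: per-lane math of idct_cospi_8_24_q()
 * (lane 3 = cospi_24_64, lane 1 = cospi_8_64). idct_cospi_8_24_neg_q()
 * negates t32[2]/t32[3] first, so d1 is wrap_low(-t1). */
#include <stdint.h>

static int16_t wrap_low14(int32_t x) { return (int16_t)((x + (1 << 13)) >> 14); }

static void idct_cospi_8_24_scalar(int16_t s0, int16_t s1, int neg_d1,
                                   int16_t *d0, int16_t *d1) {
  const int32_t cospi_8_64 = 15137, cospi_24_64 = 6270;
  const int32_t t0 = (int32_t)s0 * cospi_24_64 - (int32_t)s1 * cospi_8_64;
  const int32_t t1 = (int32_t)s1 * cospi_24_64 + (int32_t)s0 * cospi_8_64;
  *d0 = wrap_low14(t0);
  *d1 = wrap_low14(neg_d1 ? -t1 : t1);
}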
+static INLINE void idct_cospi_16_16_q(const int16x8_t s0, const int16x8_t s1,
+                                      const int16x4_t cospi_0_8_16_24,
+                                      int16x8_t *const d0,
+                                      int16x8_t *const d1) {
+  int32x4_t t32[6];
+
+  t32[4] = vmull_lane_s16(vget_low_s16(s1), cospi_0_8_16_24, 2);
+  t32[5] = vmull_lane_s16(vget_high_s16(s1), cospi_0_8_16_24, 2);
+  t32[0] = vmlsl_lane_s16(t32[4], vget_low_s16(s0), cospi_0_8_16_24, 2);
+  t32[1] = vmlsl_lane_s16(t32[5], vget_high_s16(s0), cospi_0_8_16_24, 2);
+  t32[2] = vmlal_lane_s16(t32[4], vget_low_s16(s0), cospi_0_8_16_24, 2);
+  t32[3] = vmlal_lane_s16(t32[5], vget_high_s16(s0), cospi_0_8_16_24, 2);
+  idct16x16_add_wrap_low_8x2(t32, d0, d1);
+}
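Likewise (not part of the diff): idct_cospi_16_16_q() computes the s1 * cospi_16_64 products once (t32[4]/t32[5]) and reuses them for both outputs, one multiply-subtract and one multiply-add. Scalar sketch:

/* Reviewer sketch, not from the commit: per-lane math of idct_cospi_16_16_q()
 * (lane 2 = cospi_16_64). The shared product is subtracted from for d0 and
 * added to for d1, then rounded down by DCT_CONST_BITS. */
#include <stdint.h>

static void idct_cospi_16_16_scalar(int16_t s0, int16_t s1,
                                    int16_t *d0, int16_t *d1) {
  const int32_t cospi_16_64 = 11585;
  const int32_t shared = (int32_t)s1 * cospi_16_64;
  *d0 = (int16_t)((shared - (int32_t)s0 * cospi_16_64 + (1 << 13)) >> 14);
  *d1 = (int16_t)((shared + (int32_t)s0 * cospi_16_64 + (1 << 13)) >> 14);
}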
+static INLINE void idct16x16_add8x1(int16x8_t res, uint8_t **dest,
+                                    const int stride) {
+  uint8x8_t d = vld1_u8(*dest);
+  uint16x8_t q;
+
+  res = vrshrq_n_s16(res, 6);
+  q = vaddw_u8(vreinterpretq_u16_s16(res), d);
+  d = vqmovun_s16(vreinterpretq_s16_u16(q));
+  vst1_u8(*dest, d);
+  *dest += stride;
+}
+
 #endif  // VPX_DSP_ARM_IDCT_NEON_H_
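And the output stage (not part of the diff): idct16x16_add8x1() rounds the residual by 2^6, adds it to the 8 prediction pixels, saturates to [0, 255] via vqmovun_s16, and advances the destination pointer by one row. Scalar sketch for one pixel:

/* Reviewer sketch, not from the commit: per-pixel equivalent of
 * idct16x16_add8x1(). vrshrq_n_s16(res, 6) is (res + 32) >> 6, and
 * vqmovun_s16 is the saturating narrow to an unsigned byte. */
#include <stdint.h>

static uint8_t add_residual_pixel(int16_t res, uint8_t pred) {
  int32_t v = ((res + 32) >> 6) + pred;
  if (v < 0) v = 0;
  if (v > 255) v = 255;
  return (uint8_t)v;
}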

vpx_dsp/vpx_dsp.mk

@@ -227,6 +227,7 @@ DSP_SRCS-yes += arm/idct8x8_1_add_neon$(ASM)
 DSP_SRCS-yes += arm/idct8x8_add_neon$(ASM)
 DSP_SRCS-yes += arm/idct16x16_1_add_neon$(ASM)
 DSP_SRCS-yes += arm/idct16x16_add_neon$(ASM)
+DSP_SRCS-yes += arm/idct16x16_neon.c
 else
 DSP_SRCS-$(HAVE_NEON) += arm/idct4x4_1_add_neon.c
 DSP_SRCS-$(HAVE_NEON) += arm/idct4x4_add_neon.c
@@ -236,7 +237,6 @@ DSP_SRCS-$(HAVE_NEON) += arm/idct16x16_1_add_neon.c
 DSP_SRCS-$(HAVE_NEON) += arm/idct16x16_add_neon.c
 endif # HAVE_NEON_ASM
 DSP_SRCS-$(HAVE_NEON) += arm/idct_neon.h
-DSP_SRCS-$(HAVE_NEON) += arm/idct16x16_neon.c
 DSP_SRCS-$(HAVE_NEON) += arm/idct32x32_1_add_neon.c
 DSP_SRCS-$(HAVE_NEON) += arm/idct32x32_34_add_neon.c
 DSP_SRCS-$(HAVE_NEON) += arm/idct32x32_135_add_neon.c