Refine 8-bit 16x16 idct NEON intrinsics
Speed test shows 25% gain on vpx_idct16x16_256_add_neon(), and vpx_idct16x16_10_add_neon() got trippled. Change-Id: If8518d9b6a3efab74031297b8d40cd83c4a49541
This commit is contained in:
parent
c7e2bd6298
commit
6abdd31555
@ -131,8 +131,6 @@ static INLINE void idct4x4_16_kernel_bd12(const int32x4_t cospis,
|
||||
|
||||
void vpx_highbd_idct4x4_16_add_neon(const tran_low_t *input, uint8_t *dest8,
|
||||
int stride, int bd) {
|
||||
DECLARE_ALIGNED(16, static const int32_t, kCospi32[4]) = { 0, 15137, 11585,
|
||||
6270 };
|
||||
const int16x8_t max = vdupq_n_s16((1 << bd) - 1);
|
||||
int32x4_t c0 = vld1q_s32(input);
|
||||
int32x4_t c1 = vld1q_s32(input + 4);
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -17,17 +17,21 @@
|
||||
#include "vpx_dsp/arm/transpose_neon.h"
|
||||
#include "vpx_dsp/vpx_dsp_common.h"
|
||||
|
||||
DECLARE_ALIGNED(16, static const int16_t, kCospi[8]) = {
|
||||
16384 /* cospi_0_64 */, 15137 /* cospi_8_64 */,
|
||||
11585 /* cospi_16_64 */, 6270 /* cospi_24_64 */,
|
||||
16069 /* cospi_4_64 */, 13623 /* cospi_12_64 */,
|
||||
-9102 /* -cospi_20_64 */, 3196 /* cospi_28_64 */
|
||||
DECLARE_ALIGNED(16, static const int16_t, kCospi[16]) = {
|
||||
16384 /* cospi_0_64 */, 15137 /* cospi_8_64 */,
|
||||
11585 /* cospi_16_64 */, 6270 /* cospi_24_64 */,
|
||||
16069 /* cospi_4_64 */, 13623 /* cospi_12_64 */,
|
||||
-9102 /* -cospi_20_64 */, 3196 /* cospi_28_64 */,
|
||||
16305 /* cospi_2_64 */, 1606 /* cospi_30_64 */,
|
||||
14449 /* cospi_10_64 */, 7723 /* cospi_22_64 */,
|
||||
15679 /* cospi_6_64 */, -4756 /* -cospi_26_64 */,
|
||||
12665 /* cospi_14_64 */, -10394 /* -cospi_18_64 */
|
||||
};
|
||||
|
||||
DECLARE_ALIGNED(16, static const int32_t, kCospi32[8]) = {
|
||||
16384 /* cospi_0_64 */, 15137 /* cospi_8_64 */,
|
||||
16384 /* cospi_0_64 */, 15137 /* cospi_8_64 */,
|
||||
11585 /* cospi_16_64 */, 6270 /* cospi_24_64 */,
|
||||
16069 /* cospi_4_64 */, 13623 /* cospi_12_64 */,
|
||||
16069 /* cospi_4_64 */, 13623 /* cospi_12_64 */,
|
||||
-9102 /* -cospi_20_64 */, 3196 /* cospi_28_64 */
|
||||
};
|
||||
|
||||
@ -462,4 +466,79 @@ static INLINE void idct8x8_64_1d_bd8(const int16x4_t cospis0,
|
||||
*io7 = vsubq_s16(step1[0], step2[7]);
|
||||
}
|
||||
|
||||
static INLINE void idct16x16_add_wrap_low_8x2(const int32x4_t *const t32,
|
||||
int16x8_t *const d0,
|
||||
int16x8_t *const d1) {
|
||||
int16x4_t t16[4];
|
||||
|
||||
t16[0] = vrshrn_n_s32(t32[0], 14);
|
||||
t16[1] = vrshrn_n_s32(t32[1], 14);
|
||||
t16[2] = vrshrn_n_s32(t32[2], 14);
|
||||
t16[3] = vrshrn_n_s32(t32[3], 14);
|
||||
*d0 = vcombine_s16(t16[0], t16[1]);
|
||||
*d1 = vcombine_s16(t16[2], t16[3]);
|
||||
}
|
||||
|
||||
static INLINE void idct_cospi_8_24_q_kernel(const int16x8_t s0,
|
||||
const int16x8_t s1,
|
||||
const int16x4_t cospi_0_8_16_24,
|
||||
int32x4_t *const t32) {
|
||||
t32[0] = vmull_lane_s16(vget_low_s16(s0), cospi_0_8_16_24, 3);
|
||||
t32[1] = vmull_lane_s16(vget_high_s16(s0), cospi_0_8_16_24, 3);
|
||||
t32[2] = vmull_lane_s16(vget_low_s16(s1), cospi_0_8_16_24, 3);
|
||||
t32[3] = vmull_lane_s16(vget_high_s16(s1), cospi_0_8_16_24, 3);
|
||||
t32[0] = vmlsl_lane_s16(t32[0], vget_low_s16(s1), cospi_0_8_16_24, 1);
|
||||
t32[1] = vmlsl_lane_s16(t32[1], vget_high_s16(s1), cospi_0_8_16_24, 1);
|
||||
t32[2] = vmlal_lane_s16(t32[2], vget_low_s16(s0), cospi_0_8_16_24, 1);
|
||||
t32[3] = vmlal_lane_s16(t32[3], vget_high_s16(s0), cospi_0_8_16_24, 1);
|
||||
}
|
||||
|
||||
static INLINE void idct_cospi_8_24_q(const int16x8_t s0, const int16x8_t s1,
|
||||
const int16x4_t cospi_0_8_16_24,
|
||||
int16x8_t *const d0, int16x8_t *const d1) {
|
||||
int32x4_t t32[4];
|
||||
|
||||
idct_cospi_8_24_q_kernel(s0, s1, cospi_0_8_16_24, t32);
|
||||
idct16x16_add_wrap_low_8x2(t32, d0, d1);
|
||||
}
|
||||
|
||||
static INLINE void idct_cospi_8_24_neg_q(const int16x8_t s0, const int16x8_t s1,
|
||||
const int16x4_t cospi_0_8_16_24,
|
||||
int16x8_t *const d0,
|
||||
int16x8_t *const d1) {
|
||||
int32x4_t t32[4];
|
||||
|
||||
idct_cospi_8_24_q_kernel(s0, s1, cospi_0_8_16_24, t32);
|
||||
t32[2] = vnegq_s32(t32[2]);
|
||||
t32[3] = vnegq_s32(t32[3]);
|
||||
idct16x16_add_wrap_low_8x2(t32, d0, d1);
|
||||
}
|
||||
|
||||
static INLINE void idct_cospi_16_16_q(const int16x8_t s0, const int16x8_t s1,
|
||||
const int16x4_t cospi_0_8_16_24,
|
||||
int16x8_t *const d0,
|
||||
int16x8_t *const d1) {
|
||||
int32x4_t t32[6];
|
||||
|
||||
t32[4] = vmull_lane_s16(vget_low_s16(s1), cospi_0_8_16_24, 2);
|
||||
t32[5] = vmull_lane_s16(vget_high_s16(s1), cospi_0_8_16_24, 2);
|
||||
t32[0] = vmlsl_lane_s16(t32[4], vget_low_s16(s0), cospi_0_8_16_24, 2);
|
||||
t32[1] = vmlsl_lane_s16(t32[5], vget_high_s16(s0), cospi_0_8_16_24, 2);
|
||||
t32[2] = vmlal_lane_s16(t32[4], vget_low_s16(s0), cospi_0_8_16_24, 2);
|
||||
t32[3] = vmlal_lane_s16(t32[5], vget_high_s16(s0), cospi_0_8_16_24, 2);
|
||||
idct16x16_add_wrap_low_8x2(t32, d0, d1);
|
||||
}
|
||||
|
||||
static INLINE void idct16x16_add8x1(int16x8_t res, uint8_t **dest,
|
||||
const int stride) {
|
||||
uint8x8_t d = vld1_u8(*dest);
|
||||
uint16x8_t q;
|
||||
|
||||
res = vrshrq_n_s16(res, 6);
|
||||
q = vaddw_u8(vreinterpretq_u16_s16(res), d);
|
||||
d = vqmovun_s16(vreinterpretq_s16_u16(q));
|
||||
vst1_u8(*dest, d);
|
||||
*dest += stride;
|
||||
}
|
||||
|
||||
#endif // VPX_DSP_ARM_IDCT_NEON_H_
|
||||
|
@ -227,6 +227,7 @@ DSP_SRCS-yes += arm/idct8x8_1_add_neon$(ASM)
|
||||
DSP_SRCS-yes += arm/idct8x8_add_neon$(ASM)
|
||||
DSP_SRCS-yes += arm/idct16x16_1_add_neon$(ASM)
|
||||
DSP_SRCS-yes += arm/idct16x16_add_neon$(ASM)
|
||||
DSP_SRCS-yes += arm/idct16x16_neon.c
|
||||
else
|
||||
DSP_SRCS-$(HAVE_NEON) += arm/idct4x4_1_add_neon.c
|
||||
DSP_SRCS-$(HAVE_NEON) += arm/idct4x4_add_neon.c
|
||||
@ -236,7 +237,6 @@ DSP_SRCS-$(HAVE_NEON) += arm/idct16x16_1_add_neon.c
|
||||
DSP_SRCS-$(HAVE_NEON) += arm/idct16x16_add_neon.c
|
||||
endif # HAVE_NEON_ASM
|
||||
DSP_SRCS-$(HAVE_NEON) += arm/idct_neon.h
|
||||
DSP_SRCS-$(HAVE_NEON) += arm/idct16x16_neon.c
|
||||
DSP_SRCS-$(HAVE_NEON) += arm/idct32x32_1_add_neon.c
|
||||
DSP_SRCS-$(HAVE_NEON) += arm/idct32x32_34_add_neon.c
|
||||
DSP_SRCS-$(HAVE_NEON) += arm/idct32x32_135_add_neon.c
|
||||
|
Loading…
Reference in New Issue
Block a user