Merge "Clean DC only idct NEON intrinsics"
commit 90f889a56d
@@ -11,49 +11,66 @@
#include <arm_neon.h>

#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/arm/idct_neon.h"
#include "vpx_dsp/inv_txfm.h"
#include "vpx_ports/mem.h"

static INLINE void idct16x16_1_add_pos_kernel(uint8_t **dest, const int stride,
                                              const uint8x16_t res) {
  const uint8x16_t a = vld1q_u8(*dest);
  const uint8x16_t b = vqaddq_u8(a, res);
  vst1q_u8(*dest, b);
  *dest += stride;
}

static INLINE void idct16x16_1_add_neg_kernel(uint8_t **dest, const int stride,
                                              const uint8x16_t res) {
  const uint8x16_t a = vld1q_u8(*dest);
  const uint8x16_t b = vqsubq_u8(a, res);
  vst1q_u8(*dest, b);
  *dest += stride;
}

void vpx_idct16x16_1_add_neon(const tran_low_t *input, uint8_t *dest,
                              int stride) {
  uint8x8_t d2u8, d3u8, d30u8, d31u8;
  uint64x1_t d2u64, d3u64, d4u64, d5u64;
  uint16x8_t q0u16, q9u16, q10u16, q11u16, q12u16;
  int16x8_t q0s16;
  uint8_t *d1, *d2;
  int16_t i, j, a1;
  int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
  out = dct_const_round_shift(out * cospi_16_64);
  a1 = ROUND_POWER_OF_TWO(out, 6);
  const int16_t out0 = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
  const int16_t out1 = WRAPLOW(dct_const_round_shift(out0 * cospi_16_64));
  const int16_t a1 = ROUND_POWER_OF_TWO(out1, 6);

  q0s16 = vdupq_n_s16(a1);
  q0u16 = vreinterpretq_u16_s16(q0s16);

  for (d1 = d2 = dest, i = 0; i < 4; i++) {
    for (j = 0; j < 2; j++) {
      d2u64 = vld1_u64((const uint64_t *)d1);
      d3u64 = vld1_u64((const uint64_t *)(d1 + 8));
      d1 += stride;
      d4u64 = vld1_u64((const uint64_t *)d1);
      d5u64 = vld1_u64((const uint64_t *)(d1 + 8));
      d1 += stride;

      q9u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d2u64));
      q10u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d3u64));
      q11u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d4u64));
      q12u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d5u64));

      d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
      d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16));
      d30u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16));
      d31u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16));

      vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8));
      vst1_u64((uint64_t *)(d2 + 8), vreinterpret_u64_u8(d3u8));
      d2 += stride;
      vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d30u8));
      vst1_u64((uint64_t *)(d2 + 8), vreinterpret_u64_u8(d31u8));
      d2 += stride;
    }
  if (a1 >= 0) {
    const uint8x16_t dc = create_dcq(a1);
    idct16x16_1_add_pos_kernel(&dest, stride, dc);
    idct16x16_1_add_pos_kernel(&dest, stride, dc);
    idct16x16_1_add_pos_kernel(&dest, stride, dc);
    idct16x16_1_add_pos_kernel(&dest, stride, dc);
    idct16x16_1_add_pos_kernel(&dest, stride, dc);
    idct16x16_1_add_pos_kernel(&dest, stride, dc);
    idct16x16_1_add_pos_kernel(&dest, stride, dc);
    idct16x16_1_add_pos_kernel(&dest, stride, dc);
    idct16x16_1_add_pos_kernel(&dest, stride, dc);
    idct16x16_1_add_pos_kernel(&dest, stride, dc);
    idct16x16_1_add_pos_kernel(&dest, stride, dc);
    idct16x16_1_add_pos_kernel(&dest, stride, dc);
    idct16x16_1_add_pos_kernel(&dest, stride, dc);
    idct16x16_1_add_pos_kernel(&dest, stride, dc);
    idct16x16_1_add_pos_kernel(&dest, stride, dc);
    idct16x16_1_add_pos_kernel(&dest, stride, dc);
  } else {
    const uint8x16_t dc = create_dcq(-a1);
    idct16x16_1_add_neg_kernel(&dest, stride, dc);
    idct16x16_1_add_neg_kernel(&dest, stride, dc);
    idct16x16_1_add_neg_kernel(&dest, stride, dc);
    idct16x16_1_add_neg_kernel(&dest, stride, dc);
    idct16x16_1_add_neg_kernel(&dest, stride, dc);
    idct16x16_1_add_neg_kernel(&dest, stride, dc);
    idct16x16_1_add_neg_kernel(&dest, stride, dc);
    idct16x16_1_add_neg_kernel(&dest, stride, dc);
    idct16x16_1_add_neg_kernel(&dest, stride, dc);
    idct16x16_1_add_neg_kernel(&dest, stride, dc);
    idct16x16_1_add_neg_kernel(&dest, stride, dc);
    idct16x16_1_add_neg_kernel(&dest, stride, dc);
    idct16x16_1_add_neg_kernel(&dest, stride, dc);
    idct16x16_1_add_neg_kernel(&dest, stride, dc);
    idct16x16_1_add_neg_kernel(&dest, stride, dc);
    idct16x16_1_add_neg_kernel(&dest, stride, dc);
  }
}
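For reference, the computation all of these DC-only paths perform is small. The scalar sketch below is not part of the commit; it spells out the 16x16 case with simplified stand-ins for libvpx's dct_const_round_shift(), ROUND_POWER_OF_TWO() and clip_pixel() (11585 is the usual cospi_16_64 value). The only per-size difference is the final shift: 4 for 4x4, 5 for 8x8, and 6 for 16x16 and 32x32.

/* Illustrative scalar sketch -- not part of the commit. */
#include <stdint.h>

#define COSPI_16_64 11585 /* libvpx cospi_16_64 */
#define DCT_CONST_ROUND_SHIFT(x) (((x) + (1 << 13)) >> 14)
#define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n)-1))) >> (n))

static uint8_t clip_u8(int v) { return v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v; }

/* DC-only 16x16 inverse transform + add: every pixel gets the same offset a1. */
static void idct16x16_1_add_scalar_sketch(const int16_t *input, uint8_t *dest,
                                          int stride) {
  const int16_t out0 = (int16_t)DCT_CONST_ROUND_SHIFT(input[0] * COSPI_16_64);
  const int16_t out1 = (int16_t)DCT_CONST_ROUND_SHIFT(out0 * COSPI_16_64);
  const int16_t a1 = (int16_t)ROUND_POWER_OF_TWO(out1, 6);
  int r, c;
  for (r = 0; r < 16; ++r, dest += stride) {
    for (c = 0; c < 16; ++c) dest[c] = clip_u8(dest[c] + a1);
  }
}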
@@ -10,127 +10,48 @@

#include <arm_neon.h>

#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/arm/idct_neon.h"
#include "vpx_dsp/inv_txfm.h"
#include "vpx_ports/mem.h"

static INLINE void LD_16x8(uint8_t *d, int d_stride, uint8x16_t *q8u8,
                           uint8x16_t *q9u8, uint8x16_t *q10u8,
                           uint8x16_t *q11u8, uint8x16_t *q12u8,
                           uint8x16_t *q13u8, uint8x16_t *q14u8,
                           uint8x16_t *q15u8) {
  *q8u8 = vld1q_u8(d);
  d += d_stride;
  *q9u8 = vld1q_u8(d);
  d += d_stride;
  *q10u8 = vld1q_u8(d);
  d += d_stride;
  *q11u8 = vld1q_u8(d);
  d += d_stride;
  *q12u8 = vld1q_u8(d);
  d += d_stride;
  *q13u8 = vld1q_u8(d);
  d += d_stride;
  *q14u8 = vld1q_u8(d);
  d += d_stride;
  *q15u8 = vld1q_u8(d);
static INLINE void idct32x32_1_add_pos_kernel(uint8_t **dest, const int stride,
                                              const uint8x16_t res) {
  const uint8x16_t a0 = vld1q_u8(*dest);
  const uint8x16_t a1 = vld1q_u8(*dest + 16);
  const uint8x16_t b0 = vqaddq_u8(a0, res);
  const uint8x16_t b1 = vqaddq_u8(a1, res);
  vst1q_u8(*dest, b0);
  vst1q_u8(*dest + 16, b1);
  *dest += stride;
}

static INLINE void ADD_DIFF_16x8(uint8x16_t qdiffu8, uint8x16_t *q8u8,
                                 uint8x16_t *q9u8, uint8x16_t *q10u8,
                                 uint8x16_t *q11u8, uint8x16_t *q12u8,
                                 uint8x16_t *q13u8, uint8x16_t *q14u8,
                                 uint8x16_t *q15u8) {
  *q8u8 = vqaddq_u8(*q8u8, qdiffu8);
  *q9u8 = vqaddq_u8(*q9u8, qdiffu8);
  *q10u8 = vqaddq_u8(*q10u8, qdiffu8);
  *q11u8 = vqaddq_u8(*q11u8, qdiffu8);
  *q12u8 = vqaddq_u8(*q12u8, qdiffu8);
  *q13u8 = vqaddq_u8(*q13u8, qdiffu8);
  *q14u8 = vqaddq_u8(*q14u8, qdiffu8);
  *q15u8 = vqaddq_u8(*q15u8, qdiffu8);
}

static INLINE void SUB_DIFF_16x8(uint8x16_t qdiffu8, uint8x16_t *q8u8,
                                 uint8x16_t *q9u8, uint8x16_t *q10u8,
                                 uint8x16_t *q11u8, uint8x16_t *q12u8,
                                 uint8x16_t *q13u8, uint8x16_t *q14u8,
                                 uint8x16_t *q15u8) {
  *q8u8 = vqsubq_u8(*q8u8, qdiffu8);
  *q9u8 = vqsubq_u8(*q9u8, qdiffu8);
  *q10u8 = vqsubq_u8(*q10u8, qdiffu8);
  *q11u8 = vqsubq_u8(*q11u8, qdiffu8);
  *q12u8 = vqsubq_u8(*q12u8, qdiffu8);
  *q13u8 = vqsubq_u8(*q13u8, qdiffu8);
  *q14u8 = vqsubq_u8(*q14u8, qdiffu8);
  *q15u8 = vqsubq_u8(*q15u8, qdiffu8);
}

static INLINE void ST_16x8(uint8_t *d, int d_stride, uint8x16_t *q8u8,
                           uint8x16_t *q9u8, uint8x16_t *q10u8,
                           uint8x16_t *q11u8, uint8x16_t *q12u8,
                           uint8x16_t *q13u8, uint8x16_t *q14u8,
                           uint8x16_t *q15u8) {
  vst1q_u8(d, *q8u8);
  d += d_stride;
  vst1q_u8(d, *q9u8);
  d += d_stride;
  vst1q_u8(d, *q10u8);
  d += d_stride;
  vst1q_u8(d, *q11u8);
  d += d_stride;
  vst1q_u8(d, *q12u8);
  d += d_stride;
  vst1q_u8(d, *q13u8);
  d += d_stride;
  vst1q_u8(d, *q14u8);
  d += d_stride;
  vst1q_u8(d, *q15u8);
static INLINE void idct32x32_1_add_neg_kernel(uint8_t **dest, const int stride,
                                              const uint8x16_t res) {
  const uint8x16_t a0 = vld1q_u8(*dest);
  const uint8x16_t a1 = vld1q_u8(*dest + 16);
  const uint8x16_t b0 = vqsubq_u8(a0, res);
  const uint8x16_t b1 = vqsubq_u8(a1, res);
  vst1q_u8(*dest, b0);
  vst1q_u8(*dest + 16, b1);
  *dest += stride;
}

void vpx_idct32x32_1_add_neon(const tran_low_t *input, uint8_t *dest,
                              int stride) {
  uint8x16_t q0u8, q8u8, q9u8, q10u8, q11u8, q12u8, q13u8, q14u8, q15u8;
  int i, j, dest_stride8;
  uint8_t *d;
  int16_t a1;
  int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
  int i;
  const int16_t out0 = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
  const int16_t out1 = WRAPLOW(dct_const_round_shift(out0 * cospi_16_64));
  const int16_t a1 = ROUND_POWER_OF_TWO(out1, 6);

  out = dct_const_round_shift(out * cospi_16_64);
  a1 = ROUND_POWER_OF_TWO(out, 6);

  dest_stride8 = stride * 8;
  if (a1 >= 0) {  // diff_positive_32_32
    a1 = a1 < 0 ? 0 : a1 > 255 ? 255 : a1;
    q0u8 = vdupq_n_u8((uint8_t)a1);
    for (i = 0; i < 2; i++, dest += 16) {  // diff_positive_32_32_loop
      d = dest;
      for (j = 0; j < 4; j++) {
        LD_16x8(d, stride, &q8u8, &q9u8, &q10u8, &q11u8, &q12u8, &q13u8, &q14u8,
                &q15u8);
        ADD_DIFF_16x8(q0u8, &q8u8, &q9u8, &q10u8, &q11u8, &q12u8, &q13u8,
                      &q14u8, &q15u8);
        ST_16x8(d, stride, &q8u8, &q9u8, &q10u8, &q11u8, &q12u8, &q13u8, &q14u8,
                &q15u8);
        d += dest_stride8;
      }
  if (a1 >= 0) {
    const uint8x16_t dc = create_dcq(a1);
    for (i = 0; i < 32; i++) {
      idct32x32_1_add_pos_kernel(&dest, stride, dc);
    }
  } else {  // diff_negative_32_32
    a1 = -a1;
    a1 = a1 < 0 ? 0 : a1 > 255 ? 255 : a1;
    q0u8 = vdupq_n_u8((uint8_t)a1);
    for (i = 0; i < 2; i++, dest += 16) {  // diff_negative_32_32_loop
      d = dest;
      for (j = 0; j < 4; j++) {
        LD_16x8(d, stride, &q8u8, &q9u8, &q10u8, &q11u8, &q12u8, &q13u8, &q14u8,
                &q15u8);
        SUB_DIFF_16x8(q0u8, &q8u8, &q9u8, &q10u8, &q11u8, &q12u8, &q13u8,
                      &q14u8, &q15u8);
        ST_16x8(d, stride, &q8u8, &q9u8, &q10u8, &q11u8, &q12u8, &q13u8, &q14u8,
                &q15u8);
        d += dest_stride8;
      }
  } else {
    const uint8x16_t dc = create_dcq(-a1);
    for (i = 0; i < 32; i++) {
      idct32x32_1_add_neg_kernel(&dest, stride, dc);
    }
  }
}
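The structural change in the 32x32 path is easiest to see in isolation: each call of the new kernel handles one full 32-pixel row as two 16-byte vectors and then advances dest by one stride, so a flat 32-iteration loop replaces the old 2x4 tiling of LD_16x8 / ADD_DIFF_16x8 / ST_16x8 blocks. A minimal sketch of the positive-DC case, not part of the commit (the negative case mirrors it with vqsubq_u8, and the clamp here inlines what create_dcq() does):

#include <arm_neon.h>
#include <stdint.h>

/* Sketch only: add a non-negative DC offset a1 to a 32x32 block, row by row. */
static void add_dc_32x32_pos_sketch(uint8_t *dest, int stride, int16_t a1) {
  const uint8x16_t dc = vdupq_n_u8((uint8_t)(a1 > 255 ? 255 : a1));
  int i;
  for (i = 0; i < 32; ++i, dest += stride) {
    const uint8x16_t a0 = vld1q_u8(dest);        /* left 16 pixels of the row */
    const uint8x16_t a1v = vld1q_u8(dest + 16);  /* right 16 pixels */
    vst1q_u8(dest, vqaddq_u8(a0, dc));           /* saturating add, no overflow */
    vst1q_u8(dest + 16, vqaddq_u8(a1v, dc));
  }
}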
@@ -14,28 +14,32 @@
#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/inv_txfm.h"

static INLINE void idct4x4_1_add_kernel(uint8_t **dest, const int stride,
                                        const int16x8_t res,
                                        uint32x2_t *const d) {
  uint16x8_t a;
  uint8x8_t b;
  *d = vld1_lane_u32((const uint32_t *)*dest, *d, 0);
  *d = vld1_lane_u32((const uint32_t *)(*dest + stride), *d, 1);
  a = vaddw_u8(vreinterpretq_u16_s16(res), vreinterpret_u8_u32(*d));
  b = vqmovun_s16(vreinterpretq_s16_u16(a));
  vst1_lane_u32((uint32_t *)*dest, vreinterpret_u32_u8(b), 0);
  *dest += stride;
  vst1_lane_u32((uint32_t *)*dest, vreinterpret_u32_u8(b), 1);
  *dest += stride;
}

void vpx_idct4x4_1_add_neon(const tran_low_t *input, uint8_t *dest,
                            int stride) {
  int i;
  const int16_t out0 = dct_const_round_shift((int16_t)input[0] * cospi_16_64);
  const int16_t out1 = dct_const_round_shift(out0 * cospi_16_64);
  const int16_t out0 = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
  const int16_t out1 = WRAPLOW(dct_const_round_shift(out0 * cospi_16_64));
  const int16_t a1 = ROUND_POWER_OF_TWO(out1, 4);
  const int16x8_t dc = vdupq_n_s16(a1);
  uint32x2_t d = vdup_n_u32(0);
  uint16x8_t a;
  uint8x8_t b;

  assert(!((intptr_t)dest % sizeof(uint32_t)));
  assert(!(stride % sizeof(uint32_t)));

  for (i = 0; i < 2; i++) {
    d = vld1_lane_u32((const uint32_t *)dest, d, 0);
    d = vld1_lane_u32((const uint32_t *)(dest + stride), d, 1);
    a = vaddw_u8(vreinterpretq_u16_s16(dc), vreinterpret_u8_u32(d));
    b = vqmovun_s16(vreinterpretq_s16_u16(a));
    vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(b), 0);
    dest += stride;
    vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(b), 1);
    dest += stride;
  }
  idct4x4_1_add_kernel(&dest, stride, dc, &d);
  idct4x4_1_add_kernel(&dest, stride, dc, &d);
}
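Unlike the larger sizes, the 4x4 kernel keeps a signed DC in an int16x8_t and uses vaddw_u8 followed by vqmovun_s16, so a single kernel handles both signs and each call covers two 4-pixel rows via lane loads and stores, hence exactly two calls. The scalar check below is not part of the commit; it shows why widen-add then saturate-narrow matches clip_pixel(pixel + a1) for either sign, assuming two's-complement reinterpretation as on ARM.

#include <assert.h>
#include <stdint.h>

static uint8_t clip_u8(int v) { return v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v; }

/* One lane of: a = vaddw_u8(vreinterpretq_u16_s16(res), d); b = vqmovun_s16(a). */
static uint8_t widen_add_then_narrow(uint8_t pixel, int16_t a1) {
  const uint16_t sum_u16 = (uint16_t)((uint16_t)a1 + pixel); /* wraps mod 2^16 */
  const int16_t sum_s16 = (int16_t)sum_u16; /* reinterpret as signed (two's complement) */
  return sum_s16 < 0 ? 0 : sum_s16 > 255 ? 255 : (uint8_t)sum_s16;
}

int main(void) {
  int p, a1;
  for (p = 0; p < 256; ++p) {
    for (a1 = -1024; a1 <= 1024; ++a1) {
      assert(widen_add_then_narrow((uint8_t)p, (int16_t)a1) == clip_u8(p + a1));
    }
  }
  return 0;
}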
@@ -12,47 +12,53 @@

#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/inv_txfm.h"
#include "vpx_ports/mem.h"

static INLINE uint8x8_t create_dcd(const int16_t dc) {
  int16x8_t t = vdupq_n_s16(dc);
  return vqmovun_s16(t);
}

static INLINE void idct8x8_1_add_pos_kernel(uint8_t **dest, const int stride,
                                            const uint8x8_t res) {
  const uint8x8_t a = vld1_u8(*dest);
  const uint8x8_t b = vqadd_u8(a, res);
  vst1_u8(*dest, b);
  *dest += stride;
}

static INLINE void idct8x8_1_add_neg_kernel(uint8_t **dest, const int stride,
                                            const uint8x8_t res) {
  const uint8x8_t a = vld1_u8(*dest);
  const uint8x8_t b = vqsub_u8(a, res);
  vst1_u8(*dest, b);
  *dest += stride;
}

void vpx_idct8x8_1_add_neon(const tran_low_t *input, uint8_t *dest,
                            int stride) {
  int i;
  const int16_t out0 = dct_const_round_shift(input[0] * cospi_16_64);
  const int16_t out1 = dct_const_round_shift(out0 * cospi_16_64);
  const int16_t out2 = ROUND_POWER_OF_TWO(out1, 5);
  const int16x8_t dc = vdupq_n_s16(out2);
  const uint16x8_t dc_u16 = vreinterpretq_u16_s16(dc);
  const uint8_t *dst = dest;
  uint8x8_t d0, d1, d2, d3;
  uint16x8_t d0_u16, d1_u16, d2_u16, d3_u16;
  const int16_t out0 = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
  const int16_t out1 = WRAPLOW(dct_const_round_shift(out0 * cospi_16_64));
  const int16_t a1 = ROUND_POWER_OF_TWO(out1, 5);

  for (i = 0; i < 2; i++) {
    d0 = vld1_u8(dst);
    dst += stride;
    d1 = vld1_u8(dst);
    dst += stride;
    d2 = vld1_u8(dst);
    dst += stride;
    d3 = vld1_u8(dst);
    dst += stride;

    d0_u16 = vaddw_u8(dc_u16, d0);
    d1_u16 = vaddw_u8(dc_u16, d1);
    d2_u16 = vaddw_u8(dc_u16, d2);
    d3_u16 = vaddw_u8(dc_u16, d3);

    d0 = vqmovun_s16(vreinterpretq_s16_u16(d0_u16));
    d1 = vqmovun_s16(vreinterpretq_s16_u16(d1_u16));
    d2 = vqmovun_s16(vreinterpretq_s16_u16(d2_u16));
    d3 = vqmovun_s16(vreinterpretq_s16_u16(d3_u16));

    vst1_u8(dest, d0);
    dest += stride;
    vst1_u8(dest, d1);
    dest += stride;
    vst1_u8(dest, d2);
    dest += stride;
    vst1_u8(dest, d3);
    dest += stride;
  if (a1 >= 0) {
    const uint8x8_t dc = create_dcd(a1);
    idct8x8_1_add_pos_kernel(&dest, stride, dc);
    idct8x8_1_add_pos_kernel(&dest, stride, dc);
    idct8x8_1_add_pos_kernel(&dest, stride, dc);
    idct8x8_1_add_pos_kernel(&dest, stride, dc);
    idct8x8_1_add_pos_kernel(&dest, stride, dc);
    idct8x8_1_add_pos_kernel(&dest, stride, dc);
    idct8x8_1_add_pos_kernel(&dest, stride, dc);
    idct8x8_1_add_pos_kernel(&dest, stride, dc);
  } else {
    const uint8x8_t dc = create_dcd(-a1);
    idct8x8_1_add_neg_kernel(&dest, stride, dc);
    idct8x8_1_add_neg_kernel(&dest, stride, dc);
    idct8x8_1_add_neg_kernel(&dest, stride, dc);
    idct8x8_1_add_neg_kernel(&dest, stride, dc);
    idct8x8_1_add_neg_kernel(&dest, stride, dc);
    idct8x8_1_add_neg_kernel(&dest, stride, dc);
    idct8x8_1_add_neg_kernel(&dest, stride, dc);
    idct8x8_1_add_neg_kernel(&dest, stride, dc);
  }
}
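The 8x8, 16x16 and 32x32 kernels instead stay in unsigned 8-bit lanes, where only a non-negative offset can be added; that is why these functions branch on the sign of a1 and feed the clamped magnitude to vqadd_u8 or vqsub_u8. A small scalar check of that equivalence, not part of the commit:

#include <assert.h>
#include <stdint.h>

static uint8_t sat_add_u8(uint8_t a, uint8_t b) { /* models vqadd_u8 per lane */
  const int s = a + b;
  return s > 255 ? 255 : (uint8_t)s;
}
static uint8_t sat_sub_u8(uint8_t a, uint8_t b) { /* models vqsub_u8 per lane */
  return a > b ? (uint8_t)(a - b) : 0;
}
static uint8_t clip_u8(int v) { return v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v; }

int main(void) {
  int p, a1;
  for (p = 0; p < 256; ++p) {
    for (a1 = -1024; a1 <= 1024; ++a1) {
      const int mag = a1 >= 0 ? a1 : -a1;
      const uint8_t dc = (uint8_t)(mag > 255 ? 255 : mag); /* the create_dcd/create_dcq clamp */
      const uint8_t got =
          a1 >= 0 ? sat_add_u8((uint8_t)p, dc) : sat_sub_u8((uint8_t)p, dc);
      assert(got == clip_u8(p + a1));
    }
  }
  return 0;
}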
@@ -181,6 +181,12 @@ static INLINE void add_and_store_u8_s16(const int16x8_t a0, const int16x8_t a1,
  vst1_u8(b, b7);
}

static INLINE uint8x16_t create_dcq(const int16_t dc) {
  // Clip both sides and gcc may compile to assembly 'usat'.
  const int16_t t = (dc < 0) ? 0 : ((dc > 255) ? 255 : dc);
  return vdupq_n_u8((uint8_t)t);
}

static INLINE void idct4x4_16_kernel_bd8(const int16x4_t cospis,
                                         int16x8_t *const a0,
                                         int16x8_t *const a1) {
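create_dcq() is the shared helper the 16x16 and 32x32 paths use: it clamps the DC magnitude to [0, 255] and splats it into all sixteen lanes. A self-contained restatement with spot values, for illustration only (the name is changed so it does not collide with the real helper):

#include <arm_neon.h>
#include <stdint.h>

/* Same shape as create_dcq() above, repeated here so the example stands alone. */
static inline uint8x16_t create_dcq_sketch(const int16_t dc) {
  const int16_t t = (dc < 0) ? 0 : ((dc > 255) ? 255 : dc);
  return vdupq_n_u8((uint8_t)t);
}

/* create_dcq_sketch(37)  -> sixteen lanes of 37
 * create_dcq_sketch(300) -> sixteen lanes of 255 (clamped)
 * create_dcq_sketch(-7)  -> sixteen lanes of 0; callers instead pass -a1 and
 *                           use the vqsubq_u8 "neg" kernel for negative DC. */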