Replace DST1 with DST2 for ext-tx experiment

A small gain (0.1 - 0.2%) with this experiment on derflr/hevcmr.

The DST2 can be implemented very efficiently using sign flipping
of odd indexed inputs, followed by DCT, followed by reversal of
the output. This is how it is implemented in this patch.
SIMD optimization is pending.

Change-Id: Ic2fc211ce0e6b7c6702974d76d6573f55cc4da0e
This commit is contained in:
Debargha Mukherjee 2015-12-09 19:12:09 -08:00
parent d7eb423a72
commit e6790e30c5
4 changed files with 1284 additions and 387 deletions

File diff suppressed because it is too large Load Diff

View File

@ -82,27 +82,6 @@ static const tran_high_t sinpi_2_9 = 9929;
static const tran_high_t sinpi_3_9 = 13377;
static const tran_high_t sinpi_4_9 = 15212;
#if CONFIG_EXT_TX
// Scaled sine constants for the 4-point DST: sin(k * pi / 5) for k = 1, 2,
// multiplied by sqrt(2/5) * sqrt(2) and stored as fixed-point integers.
// The stored values correspond to a scale factor of 2^28.
static const int32_t dst_lookup4[] = {
// {sin(pi/5), sin(pi*2/5)} * sqrt(2/5) * sqrt(2)
// at precision of 2 * DCT_CONST_BITS bits
141124871, 228344838,
};
// Scaled sine constants for the 8-point DST: sin(k * pi / 9) for k = 1..4,
// multiplied by sqrt(2/9) * 2 and stored as fixed-point integers.
// The stored values correspond to a scale factor of 2^28.
static const int32_t dst_lookup8[] = {
// {sin(pi/9), sin(pi*2/9), ..., sin(pi*4/9)} * sqrt(2/9) * 2
// at precision of 2 * DCT_CONST_BITS bits
86559612, 162678858, 219176632, 249238470
};
// Scaled sine constants for the 16-point DST: sin(k * pi / 17) for k = 1..8,
// multiplied by sqrt(2/17) * 2 * sqrt(2) and stored as fixed-point integers.
// The stored values correspond to a scale factor of 2^28.
static const int32_t dst_lookup16[] = {
// {sin(pi/17), sin(pi*2/17), ..., sin(pi*8/17)} * sqrt(2/17) * 2 * sqrt(2)
// at precision of 2 * DCT_CONST_BITS bits
47852167, 94074787, 137093803, 175444254,
207820161, 233119001, 250479254, 259309736
};
#endif // CONFIG_EXT_TX
static INLINE tran_low_t check_range(tran_high_t input) {
#if CONFIG_VP9_HIGHBITDEPTH
// For valid highbitdepth VP9 streams, intermediate stage coefficients will
@ -169,6 +148,220 @@ static INLINE uint8_t clip_pixel_add(uint8_t dest, tran_high_t trans) {
return clip_pixel(WRAPLOW(dest + trans, 8));
}
#if CONFIG_EXT_TX
#define USE_DST2 1
#if USE_DST2
// 4x4 DST-II ("dst2") coefficient matrix stored row-major: row k is the k-th
// basis vector. vp9_fgentx4() dots each row with the input, vp9_igentx4()
// applies the transpose.
// The entries match round(2^14 * sin(pi * (2n + 1) * (k + 1) / 8)) for row k,
// column n, except the last row, which carries an extra 1/sqrt(2)
// (11585 ~= 2^14 / sqrt(2)).
// NOTE(review): the 2^14 scale presumably equals 1 << DCT_CONST_BITS - confirm.
static const tran_high_t Tx4[4 * 4] = {
// dst2
6270, 15137, 15137, 6270,
11585, 11585, -11585, -11585,
15137, -6270, -6270, 15137,
11585, -11585, 11585, -11585,
};
// 8x8 DST-II ("dst2") coefficient matrix stored row-major: row k is the k-th
// basis vector. vp9_fgentx8() dots each row with the input, vp9_igentx8()
// applies the transpose.
// The entries match round(2^14 * sin(pi * (2n + 1) * (k + 1) / 16)) for row
// k, column n, except the last row, which carries an extra 1/sqrt(2)
// (11585 ~= 2^14 / sqrt(2)).
// NOTE(review): the 2^14 scale presumably equals 1 << DCT_CONST_BITS - confirm.
static const tran_high_t Tx8[8 * 8] = {
// dst2
3196, 9102, 13623, 16069, 16069, 13623, 9102, 3196,
6270, 15137, 15137, 6270, -6270, -15137, -15137, -6270,
9102, 16069, 3196, -13623, -13623, 3196, 16069, 9102,
11585, 11585, -11585, -11585, 11585, 11585, -11585, -11585,
13623, 3196, -16069, 9102, 9102, -16069, 3196, 13623,
15137, -6270, -6270, 15137, -15137, 6270, 6270, -15137,
16069, -13623, 9102, -3196, -3196, 9102, -13623, 16069,
11585, -11585, 11585, -11585, 11585, -11585, 11585, -11585,
};
// 16x16 DST-II ("dst2") coefficient matrix stored row-major: each basis
// vector (one matrix row of 16 entries) is written across two source lines.
// vp9_fgentx16() dots each row with the input, vp9_igentx16() applies the
// transpose.
// The entries match round(2^14 * sin(pi * (2n + 1) * (k + 1) / 32)) for row
// k, column n, except the last row, which carries an extra 1/sqrt(2)
// (11585 ~= 2^14 / sqrt(2)).
// NOTE(review): the 2^14 scale presumably equals 1 << DCT_CONST_BITS - confirm.
static const tran_high_t Tx16[16 * 16] = {
// dst2
1606, 4756, 7723, 10394, 12665, 14449, 15679, 16305,
16305, 15679, 14449, 12665, 10394, 7723, 4756, 1606,
3196, 9102, 13623, 16069, 16069, 13623, 9102, 3196,
-3196, -9102, -13623, -16069, -16069, -13623, -9102, -3196,
4756, 12665, 16305, 14449, 7723, -1606, -10394, -15679,
-15679, -10394, -1606, 7723, 14449, 16305, 12665, 4756,
6270, 15137, 15137, 6270, -6270, -15137, -15137, -6270,
6270, 15137, 15137, 6270, -6270, -15137, -15137, -6270,
7723, 16305, 10394, -4756, -15679, -12665, 1606, 14449,
14449, 1606, -12665, -15679, -4756, 10394, 16305, 7723,
9102, 16069, 3196, -13623, -13623, 3196, 16069, 9102,
-9102, -16069, -3196, 13623, 13623, -3196, -16069, -9102,
10394, 14449, -4756, -16305, -1606, 15679, 7723, -12665,
-12665, 7723, 15679, -1606, -16305, -4756, 14449, 10394,
11585, 11585, -11585, -11585, 11585, 11585, -11585, -11585,
11585, 11585, -11585, -11585, 11585, 11585, -11585, -11585,
12665, 7723, -15679, -1606, 16305, -4756, -14449, 10394,
10394, -14449, -4756, 16305, -1606, -15679, 7723, 12665,
13623, 3196, -16069, 9102, 9102, -16069, 3196, 13623,
-13623, -3196, 16069, -9102, -9102, 16069, -3196, -13623,
14449, -1606, -12665, 15679, -4756, -10394, 16305, -7723,
-7723, 16305, -10394, -4756, 15679, -12665, -1606, 14449,
15137, -6270, -6270, 15137, -15137, 6270, 6270, -15137,
15137, -6270, -6270, 15137, -15137, 6270, 6270, -15137,
15679, -10394, 1606, 7723, -14449, 16305, -12665, 4756,
4756, -12665, 16305, -14449, 7723, 1606, -10394, 15679,
16069, -13623, 9102, -3196, -3196, 9102, -13623, 16069,
-16069, 13623, -9102, 3196, 3196, -9102, 13623, -16069,
16305, -15679, 14449, -12665, 10394, -7723, 4756, -1606,
-1606, 4756, -7723, 10394, -12665, 14449, -15679, 16305,
11585, -11585, 11585, -11585, 11585, -11585, 11585, -11585,
11585, -11585, 11585, -11585, 11585, -11585, 11585, -11585,
};
#endif // USE_DST2
// Generic forward 4-point transform: multiplies the 4x4 row-major matrix T by
// the input vector.
//   input:  4 source coefficients.
//   output: 4 results; output[r] is the dot product of matrix row r with
//           input, reduced by ROUND_POWER_OF_TWO(..., DCT_CONST_BITS).
//   T:      16 matrix entries, row r at T[4 * r .. 4 * r + 3].
// output[r] is stored before later rows read input, so output must not
// overlap input.
static INLINE void vp9_fgentx4(const tran_low_t *input, tran_low_t *output,
                               const tran_high_t *T) {
  int row, col;
  for (row = 0; row < 4; ++row) {
    const tran_high_t *const basis = T + 4 * row;
    tran_high_t acc = basis[0] * input[0];
    for (col = 1; col < 4; ++col) {
      acc += basis[col] * input[col];
    }
    output[row] = ROUND_POWER_OF_TWO(acc, DCT_CONST_BITS);
  }
}
// Generic forward 8-point transform: multiplies the 8x8 row-major matrix T by
// the input vector.
//   input:  8 source coefficients.
//   output: 8 results; output[r] is the dot product of matrix row r with
//           input, reduced by ROUND_POWER_OF_TWO(..., DCT_CONST_BITS).
//   T:      64 matrix entries, row r at T[8 * r .. 8 * r + 7].
// output[r] is stored before later rows read input, so output must not
// overlap input.
static INLINE void vp9_fgentx8(const tran_low_t *input, tran_low_t *output,
                               const tran_high_t *T) {
  int row, col;
  for (row = 0; row < 8; ++row) {
    const tran_high_t *const basis = T + 8 * row;
    tran_high_t acc = basis[0] * input[0];
    for (col = 1; col < 8; ++col) {
      acc += basis[col] * input[col];
    }
    output[row] = ROUND_POWER_OF_TWO(acc, DCT_CONST_BITS);
  }
}
// Generic forward 16-point transform: multiplies the 16x16 row-major matrix T
// by the input vector.
//   input:  16 source coefficients.
//   output: 16 results; output[r] is the dot product of matrix row r with
//           input, reduced by ROUND_POWER_OF_TWO(..., DCT_CONST_BITS).
//   T:      256 matrix entries, row r at T[16 * r .. 16 * r + 15].
// output[r] is stored before later rows read input, so output must not
// overlap input.
static INLINE void vp9_fgentx16(const tran_low_t *input, tran_low_t *output,
                                const tran_high_t *T) {
  int row, col;
  for (row = 0; row < 16; ++row) {
    const tran_high_t *const basis = T + 16 * row;
    tran_high_t acc = basis[0] * input[0];
    for (col = 1; col < 16; ++col) {
      acc += basis[col] * input[col];
    }
    output[row] = ROUND_POWER_OF_TWO(acc, DCT_CONST_BITS);
  }
}
// Generic inverse 4-point transform: multiplies the transpose of the 4x4
// row-major matrix T by the input vector.
//   input:  4 transform coefficients.
//   output: 4 results; output[c] = sum over r of T[4 * r + c] * input[r],
//           reduced by ROUND_POWER_OF_TWO(..., DCT_CONST_BITS) and passed
//           through WRAPLOW(..., 8).
//   T:      16 matrix entries in row-major order.
// Every sum is accumulated before the first store to output, so all reads of
// input finish before output is modified.
static INLINE void vp9_igentx4(const tran_low_t *input, tran_low_t *output,
                               const tran_high_t *T) {
  tran_high_t acc[4];
  int row, col;
  for (col = 0; col < 4; ++col) {
    acc[col] = T[col] * input[0];
    for (row = 1; row < 4; ++row) {
      acc[col] += T[4 * row + col] * input[row];
    }
  }
  for (col = 0; col < 4; ++col) {
    output[col] = WRAPLOW(ROUND_POWER_OF_TWO(acc[col], DCT_CONST_BITS), 8);
  }
}
// Generic inverse 8-point transform: multiplies the transpose of the 8x8
// row-major matrix T by the input vector.
//   input:  8 transform coefficients.
//   output: 8 results; output[c] = sum over r of T[8 * r + c] * input[r],
//           reduced by ROUND_POWER_OF_TWO(..., DCT_CONST_BITS) and passed
//           through WRAPLOW(..., 8).
//   T:      64 matrix entries in row-major order.
// Every sum is accumulated before the first store to output, so all reads of
// input finish before output is modified.
static INLINE void vp9_igentx8(const tran_low_t *input, tran_low_t *output,
                               const tran_high_t *T) {
  tran_high_t acc[8];
  int row, col;
  for (col = 0; col < 8; ++col) {
    acc[col] = T[col] * input[0];
    for (row = 1; row < 8; ++row) {
      acc[col] += T[8 * row + col] * input[row];
    }
  }
  for (col = 0; col < 8; ++col) {
    output[col] = WRAPLOW(ROUND_POWER_OF_TWO(acc[col], DCT_CONST_BITS), 8);
  }
}
// Generic inverse 16-point transform: multiplies the transpose of the 16x16
// row-major matrix T by the input vector.
//   input:  16 transform coefficients.
//   output: 16 results; output[c] = sum over r of T[16 * r + c] * input[r],
//           reduced by ROUND_POWER_OF_TWO(..., DCT_CONST_BITS) and passed
//           through WRAPLOW(..., 8).
//   T:      256 matrix entries in row-major order.
// Every sum is accumulated before the first store to output, so all reads of
// input finish before output is modified.
static INLINE void vp9_igentx16(const tran_low_t *input, tran_low_t *output,
                                const tran_high_t *T) {
  tran_high_t acc[16];
  int row, col;
  for (col = 0; col < 16; ++col) {
    acc[col] = T[col] * input[0];
    for (row = 1; row < 16; ++row) {
      acc[col] += T[16 * row + col] * input[row];
    }
  }
  for (col = 0; col < 16; ++col) {
    output[col] = WRAPLOW(ROUND_POWER_OF_TWO(acc[col], DCT_CONST_BITS), 8);
  }
}
#if CONFIG_VP9_HIGHBITDEPTH
// High-bitdepth generic inverse 4-point transform: multiplies the transpose
// of the 4x4 row-major matrix T by the input vector.
//   input:  4 transform coefficients.
//   output: 4 results; output[c] = sum over r of T[4 * r + c] * input[r],
//           reduced by ROUND_POWER_OF_TWO(..., DCT_CONST_BITS) and passed
//           through WRAPLOW(..., bd).
//   bd:     bit depth forwarded to WRAPLOW.
//   T:      16 matrix entries in row-major order.
// Every sum is accumulated before the first store to output, so all reads of
// input finish before output is modified.
static INLINE void vp9_highbd_igentx4(const tran_low_t *input,
                                      tran_low_t *output,
                                      int bd, const tran_high_t *T) {
  tran_high_t acc[4];
  int row, col;
  // Keeps bd referenced even if WRAPLOW expands without using it.
  (void) bd;
  for (col = 0; col < 4; ++col) {
    acc[col] = T[col] * input[0];
    for (row = 1; row < 4; ++row) {
      acc[col] += T[4 * row + col] * input[row];
    }
  }
  for (col = 0; col < 4; ++col) {
    output[col] = WRAPLOW(ROUND_POWER_OF_TWO(acc[col], DCT_CONST_BITS), bd);
  }
}
// High-bitdepth generic inverse 8-point transform: multiplies the transpose
// of the 8x8 row-major matrix T by the input vector.
//   input:  8 transform coefficients.
//   output: 8 results; output[c] = sum over r of T[8 * r + c] * input[r],
//           reduced by ROUND_POWER_OF_TWO(..., DCT_CONST_BITS) and passed
//           through WRAPLOW(..., bd).
//   bd:     bit depth forwarded to WRAPLOW.
//   T:      64 matrix entries in row-major order.
// Every sum is accumulated before the first store to output, so all reads of
// input finish before output is modified.
static INLINE void vp9_highbd_igentx8(const tran_low_t *input,
                                      tran_low_t *output,
                                      int bd, const tran_high_t *T) {
  tran_high_t acc[8];
  int row, col;
  // Keeps bd referenced even if WRAPLOW expands without using it.
  (void) bd;
  for (col = 0; col < 8; ++col) {
    acc[col] = T[col] * input[0];
    for (row = 1; row < 8; ++row) {
      acc[col] += T[8 * row + col] * input[row];
    }
  }
  for (col = 0; col < 8; ++col) {
    output[col] = WRAPLOW(ROUND_POWER_OF_TWO(acc[col], DCT_CONST_BITS), bd);
  }
}
// High-bitdepth generic inverse 16-point transform: multiplies the transpose
// of the 16x16 row-major matrix T by the input vector.
//   input:  16 transform coefficients.
//   output: 16 results; output[c] = sum over r of T[16 * r + c] * input[r],
//           reduced by ROUND_POWER_OF_TWO(..., DCT_CONST_BITS) and passed
//           through WRAPLOW(..., bd).
//   bd:     bit depth forwarded to WRAPLOW.
//   T:      256 matrix entries in row-major order.
// Every sum is accumulated before the first store to output, so all reads of
// input finish before output is modified.
static INLINE void vp9_highbd_igentx16(const tran_low_t *input,
                                       tran_low_t *output,
                                       int bd, const tran_high_t *T) {
  tran_high_t acc[16];
  int row, col;
  // Keeps bd referenced even if WRAPLOW expands without using it.
  (void) bd;
  for (col = 0; col < 16; ++col) {
    acc[col] = T[col] * input[0];
    for (row = 1; row < 16; ++row) {
      acc[col] += T[16 * row + col] * input[row];
    }
  }
  for (col = 0; col < 16; ++col) {
    output[col] = WRAPLOW(ROUND_POWER_OF_TWO(acc[col], DCT_CONST_BITS), bd);
  }
}
#endif // CONFIG_VP9_HIGHBITDEPTH
#endif // CONFIG_EXT_TX
void vp9_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
int eob);
void vp9_idct4x4_add(const tran_low_t *input, uint8_t *dest, int stride,

View File

@ -1874,9 +1874,6 @@ static void build_inter_predictors_single_buf(MACROBLOCKD *xd,
int is_global;
gm = &xd->global_motion[mi->mbmi.ref_frame[ref]][0];
#endif // CONFIG_GLOBAL_MOTION
#if CONFIG_INTRABC
assert(!is_intrabc || mi->mbmi.interp_filter == BILINEAR);
#endif // CONFIG_INTRABC
const struct scale_factors *const sf = &xd->block_refs[ref]->sf;
struct buf_2d *const dst_buf = &pd->dst;
@ -1910,6 +1907,9 @@ static void build_inter_predictors_single_buf(MACROBLOCKD *xd,
int xs, ys, subpel_x, subpel_y;
const int is_scaled = vp9_is_scaled(sf);
(void) dst_buf;
#if CONFIG_INTRABC
assert(!is_intrabc || mi->mbmi.interp_filter == BILINEAR);
#endif // CONFIG_INTRABC
#if CONFIG_GLOBAL_MOTION
is_global = (get_y_mode(mi, block) == ZEROMV &&
@ -2017,10 +2017,10 @@ static void build_wedge_inter_predictor_from_buf(MACROBLOCKD *xd, int plane,
struct macroblockd_plane *const pd = &xd->plane[plane];
const MODE_INFO *mi = xd->mi[0].src_mi;
const int is_compound = has_second_ref(&mi->mbmi);
int ref;
#if CONFIG_INTRABC
const int is_intrabc = is_intrabc_mode(mi->mbmi.mode);
#endif // CONFIG_INTRABC
int ref;
#if CONFIG_GLOBAL_MOTION
Global_Motion_Params *gm[2];
gm[0] = &xd->global_motion[mi->mbmi.ref_frame[0]][0];

View File

@ -28,7 +28,38 @@ static INLINE tran_high_t fdct_round_shift(tran_high_t input) {
}
#if CONFIG_EXT_TX
// Forward 4-point transform of input into output using the Tx4 matrix; a thin
// wrapper around vp9_fgentx4().
// NOTE(review): Tx4 appears to be defined only under "#if USE_DST2" and holds
// DST-II coefficients despite the "klt" name - confirm this is intended.
void vp9_fklt4(const tran_low_t *input, tran_low_t *output) {
vp9_fgentx4(input, output, Tx4);
}
// Forward 8-point transform of input into output using the Tx8 matrix; a thin
// wrapper around vp9_fgentx8().
// NOTE(review): Tx8 appears to be defined only under "#if USE_DST2" and holds
// DST-II coefficients despite the "klt" name - confirm this is intended.
void vp9_fklt8(const tran_low_t *input, tran_low_t *output) {
vp9_fgentx8(input, output, Tx8);
}
// Forward 16-point transform of input into output using the Tx16 matrix; a
// thin wrapper around vp9_fgentx16().
// NOTE(review): Tx16 appears to be defined only under "#if USE_DST2" and holds
// DST-II coefficients despite the "klt" name - confirm this is intended.
void vp9_fklt16(const tran_low_t *input, tran_low_t *output) {
vp9_fgentx16(input, output, Tx16);
}
// Forward 4-point DST of input into output.
// With USE_DST2 set, the transform is computed with a 4-point DCT butterfly
// in which input[1] and input[3] enter with flipped sign and the results are
// stored in reverse index order (output[3] down to output[0]).
// All four inputs are read into step[] before any output is written.
// The #else branch is the sine-lookup implementation; most of it lies outside
// the visible diff hunk.
void vp9_fdst4(const tran_low_t *input, tran_low_t *output) {
#if USE_DST2
// Matrix-multiply form, kept for reference (presumably equivalent up to
// rounding - TODO confirm):
// vp9_fgentx4(input, output, Tx4);
tran_high_t step[4];
tran_high_t temp1, temp2;
// Butterfly on the sign-flipped sequence {x0, -x1, x2, -x3}.
step[0] = input[0] - input[3];
step[1] = -input[1] + input[2];
step[2] = -input[1] - input[2];
step[3] = input[0] + input[3];
// Sum/difference pair; these would be DCT outputs 0 and 2, stored reversed.
temp1 = (step[0] + step[1]) * cospi_16_64;
temp2 = (step[0] - step[1]) * cospi_16_64;
output[3] = fdct_round_shift(temp1);
output[1] = fdct_round_shift(temp2);
// Rotation pair; these would be DCT outputs 1 and 3, stored reversed.
temp1 = step[2] * cospi_24_64 + step[3] * cospi_8_64;
temp2 = -step[2] * cospi_8_64 + step[3] * cospi_24_64;
output[2] = fdct_round_shift(temp1);
output[0] = fdct_round_shift(temp2);
#else
// {sin(pi/5), sin(pi*2/5)} * sqrt(2/5) * sqrt(2)
static const int32_t sinvalue_lookup[] = {
141124871, 228344838,
@ -46,9 +77,61 @ void vp9_fdst4(const tran_low_t *input, tran_low_t *output) {
output[2] = ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS));
sum = d03 * sinvalue_lookup[0] - d12 * sinvalue_lookup[1];
output[3] = ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS));
#endif
}
// Forward 8-point DST of input into output.
// With USE_DST2 set, the transform is computed with an 8-point DCT butterfly
// in which the odd-indexed inputs enter with flipped sign and the results are
// stored in reverse index order: the even-half butterfly fills the odd
// output slots (7, 5, 3, 1) and the odd-half butterfly fills the even slots
// (6, 4, 2, 0).
// All eight inputs are read into s0..s7 before any output is written.
// The #else branch is the sine-lookup implementation; most of it lies outside
// the visible diff hunk.
void vp9_fdst8(const tran_low_t *input, tran_low_t *output) {
#if USE_DST2
// Matrix-multiply form, kept for reference (presumably equivalent up to
// rounding - TODO confirm):
// vp9_fgentx8(input, output, Tx8);
tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; // canbe16
tran_high_t t0, t1, t2, t3; // needs32
tran_high_t x0, x1, x2, x3; // canbe16
// stage 1
// Butterfly on the sequence with input[1], [3], [5], [7] negated.
s0 = input[0] - input[7];
s1 = -input[1] + input[6];
s2 = input[2] - input[5];
s3 = -input[3] + input[4];
s4 = -input[3] - input[4];
s5 = input[2] + input[5];
s6 = -input[1] - input[6];
s7 = input[0] + input[7];
// 4-point butterfly on s0..s3.
x0 = s0 + s3;
x1 = s1 + s2;
x2 = s1 - s2;
x3 = s0 - s3;
t0 = (x0 + x1) * cospi_16_64;
t1 = (x0 - x1) * cospi_16_64;
t2 = x2 * cospi_24_64 + x3 * cospi_8_64;
t3 = -x2 * cospi_8_64 + x3 * cospi_24_64;
output[7] = fdct_round_shift(t0);
output[5] = fdct_round_shift(t2);
output[3] = fdct_round_shift(t1);
output[1] = fdct_round_shift(t3);
// Stage 2
t0 = (s6 - s5) * cospi_16_64;
t1 = (s6 + s5) * cospi_16_64;
t2 = fdct_round_shift(t0);
t3 = fdct_round_shift(t1);
// Stage 3
x0 = s4 + t2;
x1 = s4 - t2;
x2 = s7 - t3;
x3 = s7 + t3;
// Stage 4
t0 = x0 * cospi_28_64 + x3 * cospi_4_64;
t1 = x1 * cospi_12_64 + x2 * cospi_20_64;
t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
t3 = x3 * cospi_28_64 + x0 * -cospi_4_64;
output[6] = fdct_round_shift(t0);
output[4] = fdct_round_shift(t2);
output[2] = fdct_round_shift(t1);
output[0] = fdct_round_shift(t3);
#else
// {sin(pi/9), sin(pi*2/9), ..., sin(pi*4/9)} * sqrt(2/9) * 2
static const int sinvalue_lookup[] = {
86559612, 162678858, 219176632, 249238470
@ -84,9 +167,151 @ void vp9_fdst8(const tran_low_t *input, tran_low_t *output) {
sum = d07 * sinvalue_lookup[0] - d16 * sinvalue_lookup[1] +
d25 * sinvalue_lookup[2] - d34 * sinvalue_lookup[3];
output[7] = ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS));
#endif
}
// Forward 16-point DST of input into output.
// With USE_DST2 set, the transform is computed with a 16-point DCT butterfly
// in which the odd-indexed inputs enter with flipped sign and the results are
// stored in reverse index order: the embedded 8-point butterfly on in[] fills
// the odd output slots (15, 13, ..., 1) and the step1/step2/step3 path fills
// the even slots (14, 12, ..., 0).
// All sixteen inputs are read into in[] and step1[] before any output is
// written.
// The #else branch is the sine-lookup implementation; most of it lies outside
// the visible diff hunk.
void vp9_fdst16(const tran_low_t *input, tran_low_t *output) {
#if USE_DST2
// Matrix-multiply form, kept for reference (presumably equivalent up to
// rounding - TODO confirm):
// vp9_fgentx16(input, output, Tx16);
tran_high_t step1[8]; // canbe16
tran_high_t step2[8]; // canbe16
tran_high_t step3[8]; // canbe16
tran_high_t in[8]; // canbe16
tran_high_t temp1, temp2; // needs32
// step 1
// Butterfly on the sequence with every odd-indexed input negated.
in[0] = input[0] - input[15];
in[1] = -input[1] + input[14];
in[2] = input[2] - input[13];
in[3] = -input[3] + input[12];
in[4] = input[4] - input[11];
in[5] = -input[5] + input[10];
in[6] = input[6] - input[ 9];
in[7] = -input[7] + input[ 8];
step1[0] = -input[7] - input[ 8];
step1[1] = input[6] + input[ 9];
step1[2] = -input[5] - input[10];
step1[3] = input[4] + input[11];
step1[4] = -input[3] - input[12];
step1[5] = input[2] + input[13];
step1[6] = -input[1] - input[14];
step1[7] = input[0] + input[15];
// fdct8(step, step);
{
tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; // canbe16
tran_high_t t0, t1, t2, t3; // needs32
tran_high_t x0, x1, x2, x3; // canbe16
// stage 1
s0 = in[0] + in[7];
s1 = in[1] + in[6];
s2 = in[2] + in[5];
s3 = in[3] + in[4];
s4 = in[3] - in[4];
s5 = in[2] - in[5];
s6 = in[1] - in[6];
s7 = in[0] - in[7];
// fdct4(step, step);
x0 = s0 + s3;
x1 = s1 + s2;
x2 = s1 - s2;
x3 = s0 - s3;
t0 = (x0 + x1) * cospi_16_64;
t1 = (x0 - x1) * cospi_16_64;
t2 = x3 * cospi_8_64 + x2 * cospi_24_64;
t3 = x3 * cospi_24_64 - x2 * cospi_8_64;
output[15] = fdct_round_shift(t0);
output[11] = fdct_round_shift(t2);
output[7] = fdct_round_shift(t1);
output[3] = fdct_round_shift(t3);
// Stage 2
t0 = (s6 - s5) * cospi_16_64;
t1 = (s6 + s5) * cospi_16_64;
t2 = fdct_round_shift(t0);
t3 = fdct_round_shift(t1);
// Stage 3
x0 = s4 + t2;
x1 = s4 - t2;
x2 = s7 - t3;
x3 = s7 + t3;
// Stage 4
t0 = x0 * cospi_28_64 + x3 * cospi_4_64;
t1 = x1 * cospi_12_64 + x2 * cospi_20_64;
t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
t3 = x3 * cospi_28_64 + x0 * -cospi_4_64;
output[13] = fdct_round_shift(t0);
output[9] = fdct_round_shift(t2);
output[5] = fdct_round_shift(t1);
output[1] = fdct_round_shift(t3);
}
// step 2
temp1 = (step1[5] - step1[2]) * cospi_16_64;
temp2 = (step1[4] - step1[3]) * cospi_16_64;
step2[2] = fdct_round_shift(temp1);
step2[3] = fdct_round_shift(temp2);
temp1 = (step1[4] + step1[3]) * cospi_16_64;
temp2 = (step1[5] + step1[2]) * cospi_16_64;
step2[4] = fdct_round_shift(temp1);
step2[5] = fdct_round_shift(temp2);
// step 3
step3[0] = step1[0] + step2[3];
step3[1] = step1[1] + step2[2];
step3[2] = step1[1] - step2[2];
step3[3] = step1[0] - step2[3];
step3[4] = step1[7] - step2[4];
step3[5] = step1[6] - step2[5];
step3[6] = step1[6] + step2[5];
step3[7] = step1[7] + step2[4];
// step 4
temp1 = step3[1] * -cospi_8_64 + step3[6] * cospi_24_64;
temp2 = step3[2] * cospi_24_64 + step3[5] * cospi_8_64;
step2[1] = fdct_round_shift(temp1);
step2[2] = fdct_round_shift(temp2);
temp1 = step3[2] * cospi_8_64 - step3[5] * cospi_24_64;
temp2 = step3[1] * cospi_24_64 + step3[6] * cospi_8_64;
step2[5] = fdct_round_shift(temp1);
step2[6] = fdct_round_shift(temp2);
// step 5
step1[0] = step3[0] + step2[1];
step1[1] = step3[0] - step2[1];
step1[2] = step3[3] + step2[2];
step1[3] = step3[3] - step2[2];
step1[4] = step3[4] - step2[5];
step1[5] = step3[4] + step2[5];
step1[6] = step3[7] - step2[6];
step1[7] = step3[7] + step2[6];
// step 6
// Final rotations; results land in the even output slots, highest first.
temp1 = step1[0] * cospi_30_64 + step1[7] * cospi_2_64;
temp2 = step1[1] * cospi_14_64 + step1[6] * cospi_18_64;
output[14] = fdct_round_shift(temp1);
output[6] = fdct_round_shift(temp2);
temp1 = step1[2] * cospi_22_64 + step1[5] * cospi_10_64;
temp2 = step1[3] * cospi_6_64 + step1[4] * cospi_26_64;
output[10] = fdct_round_shift(temp1);
output[2] = fdct_round_shift(temp2);
temp1 = step1[3] * -cospi_26_64 + step1[4] * cospi_6_64;
temp2 = step1[2] * -cospi_10_64 + step1[5] * cospi_22_64;
output[12] = fdct_round_shift(temp1);
output[4] = fdct_round_shift(temp2);
temp1 = step1[1] * -cospi_18_64 + step1[6] * cospi_14_64;
temp2 = step1[0] * -cospi_2_64 + step1[7] * cospi_30_64;
output[8] = fdct_round_shift(temp1);
output[0] = fdct_round_shift(temp2);
#else
// {sin(pi/17), sin(pi*2/17), ..., sin(pi*8/17)} * sqrt(2/17) * 2 * sqrt(2)
static const int sinvalue_lookup[] = {
47852167, 94074787, 137093803, 175444254,
@ -189,6 +414,7 @ void vp9_fdst16(const tran_low_t *input, tran_low_t *output) {
d411 * sinvalue_lookup[4] - d510 * sinvalue_lookup[5] +
d69 * sinvalue_lookup[6] - d78 * sinvalue_lookup[7];
output[15] = ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS));
#endif
}
#endif // CONFIG_EXT_TX