Merge "Eliminate copying for FLIPADST in fwd transforms." into nextgenv2

This commit is contained in:
Debargha Mukherjee
2015-11-06 08:37:25 +00:00
committed by Gerrit Code Review
3 changed files with 366 additions and 369 deletions

View File

@@ -1161,6 +1161,106 @@ static void fadst16(const tran_low_t *input, tran_low_t *output) {
output[15] = (tran_low_t)-x1;
}
#if CONFIG_EXT_TX
// Copies an l x l block of int16_t samples from src to dest, one row per
// memcpy. Consecutive rows are src_stride elements apart in src and
// dest_stride elements apart in dest. Because memcpy is used, the source and
// destination rows must not overlap.
static void copy_block(const int16_t *src, int src_stride, int l,
                       int16_t *dest, int dest_stride) {
  int row = 0;
  while (row < l) {
    const int16_t *from = src + src_stride * row;
    int16_t *to = dest + dest_stride * row;
    memcpy(to, from, l * sizeof(*to));
    ++row;
  }
}
static void fliplr(int16_t *dest, int stride, int l) {
int i, j;
for (i = 0; i < l; ++i) {
for (j = 0; j < l / 2; ++j) {
const int16_t tmp = dest[i * stride + j];
dest[i * stride + j] = dest[i * stride + l - 1 - j];
dest[i * stride + l - 1 - j] = tmp;
}
}
}
// Mirrors an l x l block top-to-bottom in place: within every column, the
// element in row i trades places with the element in row l - 1 - i.
// stride is the distance, in elements, between the starts of adjacent rows.
static void flipud(int16_t *dest, int stride, int l) {
  int col;
  for (col = 0; col < l; ++col) {
    int top = 0;
    int bottom = l - 1;
    // Walk inwards from the first and last rows; the centre row (odd l)
    // stays put.
    while (top < bottom) {
      int16_t *const upper = dest + top * stride + col;
      int16_t *const lower = dest + bottom * stride + col;
      const int16_t held = *upper;
      *upper = *lower;
      *lower = held;
      ++top;
      --bottom;
    }
  }
}
// Rotates an l x l block by 180 degrees in place (left-right flip combined
// with an up-down flip): element (i, j) trades places with element
// (l - 1 - i, l - 1 - j). stride is the distance, in elements, between the
// starts of adjacent rows.
static void fliplrud(int16_t *dest, int stride, int l) {
  int i, j;
  // Swap every row in the top half with its mirrored row in the bottom half.
  for (i = 0; i < l / 2; ++i) {
    for (j = 0; j < l; ++j) {
      const int16_t tmp = dest[i * stride + j];
      dest[i * stride + j] = dest[(l - 1 - i) * stride + l - 1 - j];
      dest[(l - 1 - i) * stride + l - 1 - j] = tmp;
    }
  }
  // For odd l the centre row maps onto itself, so it is not touched by the
  // loop above and must be reversed on its own to complete the rotation.
  if (l > 0 && (l % 2) != 0) {
    int16_t *const mid = dest + (l / 2) * stride;
    for (j = 0; j < l / 2; ++j) {
      const int16_t tmp = mid[j];
      mid[j] = mid[l - 1 - j];
      mid[l - 1 - j] = tmp;
    }
  }
}
// Copies the l x l block at src (rows src_stride elements apart) into dest
// (rows dest_stride elements apart) with copy_block(), then mirrors the copy
// left-to-right in place with fliplr(). src itself is not modified.
static void copy_fliplr(const int16_t *src, int src_stride, int l,
int16_t *dest, int dest_stride) {
copy_block(src, src_stride, l, dest, dest_stride);
fliplr(dest, dest_stride, l);
}
// Copies the l x l block at src (rows src_stride elements apart) into dest
// (rows dest_stride elements apart) with copy_block(), then mirrors the copy
// top-to-bottom in place with flipud(). src itself is not modified.
static void copy_flipud(const int16_t *src, int src_stride, int l,
int16_t *dest, int dest_stride) {
copy_block(src, src_stride, l, dest, dest_stride);
flipud(dest, dest_stride, l);
}
// Copies the l x l block at src (rows src_stride elements apart) into dest
// (rows dest_stride elements apart) with copy_block(), then applies
// fliplrud() to the copy in place. src itself is not modified.
static void copy_fliplrud(const int16_t *src, int src_stride, int l,
int16_t *dest, int dest_stride) {
copy_block(src, src_stride, l, dest, dest_stride);
fliplrud(dest, dest_stride, l);
}
// For transform types containing a FLIPADST component, writes a flipped copy
// of the l x l input block into buff (packed with a row pitch of l) and
// redirects *src / *src_stride to that copy. For every other listed type the
// caller's pointer and stride are left untouched. Unknown types trip
// assert(0) and also leave *src / *src_stride unchanged.
//
// Flip selection:
//   FLIPADST_{DCT,ADST,DST}  -> up-down flip
//   {DCT,ADST,DST}_FLIPADST  -> left-right flip
//   FLIPADST_FLIPADST        -> both
static void maybe_flip_input(const int16_t **src, int *src_stride, int l,
                             int16_t *buff, int tx_type) {
  switch (tx_type) {
    case DCT_DCT:
    case ADST_DCT:
    case DCT_ADST:
    case ADST_ADST:
    case DST_DST:
    case DCT_DST:
    case DST_DCT:
    case DST_ADST:
    case ADST_DST:
      // No flipping required: keep using the caller's buffer directly.
      return;
    case FLIPADST_DCT:
    case FLIPADST_ADST:
    case FLIPADST_DST:
      copy_flipud(*src, *src_stride, l, buff, l);
      break;
    case DCT_FLIPADST:
    case ADST_FLIPADST:
    case DST_FLIPADST:
      copy_fliplr(*src, *src_stride, l, buff, l);
      break;
    case FLIPADST_FLIPADST:
      copy_fliplrud(*src, *src_stride, l, buff, l);
      break;
    default:
      assert(0);
      return;
  }

  // A flipped copy now lives in buff; point the caller at it.
  *src = buff;
  *src_stride = l;
}
#endif // CONFIG_EXT_TX
static const transform_2d FHT_4[] = {
{ fdct4, fdct4 }, // DCT_DCT = 0,
{ fadst4, fdct4 }, // ADST_DCT = 1,
@@ -1234,6 +1334,11 @@ void vp10_fht4x4_c(const int16_t *input, tran_low_t *output,
tran_low_t temp_in[4], temp_out[4];
const transform_2d ht = FHT_4[tx_type];
#if CONFIG_EXT_TX
int16_t flipped_input[4 * 4];
maybe_flip_input(&input, &stride, 4, flipped_input, tx_type);
#endif
// Columns
for (i = 0; i < 4; ++i) {
for (j = 0; j < 4; ++j)
@@ -1378,6 +1483,11 @@ void vp10_fht8x8_c(const int16_t *input, tran_low_t *output,
tran_low_t temp_in[8], temp_out[8];
const transform_2d ht = FHT_8[tx_type];
#if CONFIG_EXT_TX
int16_t flipped_input[8 * 8];
maybe_flip_input(&input, &stride, 8, flipped_input, tx_type);
#endif
// Columns
for (i = 0; i < 8; ++i) {
for (j = 0; j < 8; ++j)
@@ -1464,6 +1574,11 @@ void vp10_fht16x16_c(const int16_t *input, tran_low_t *output,
tran_low_t temp_in[16], temp_out[16];
const transform_2d ht = FHT_16[tx_type];
#if CONFIG_EXT_TX
int16_t flipped_input[16 * 16];
maybe_flip_input(&input, &stride, 16, flipped_input, tx_type);
#endif
// Columns
for (i = 0; i < 16; ++i) {
for (j = 0; j < 16; ++j)

View File

@@ -326,66 +326,6 @@ static INLINE void highbd_fdct32x32(int rd_transform, const int16_t *src,
#endif // CONFIG_VP9_HIGHBITDEPTH
#if CONFIG_EXT_TX
// Copies an l x l block of int16_t samples from src to dest, one row per
// memcpy. Consecutive rows are src_stride elements apart in src and
// dest_stride elements apart in dest. Because memcpy is used, the source and
// destination rows must not overlap.
static void copy_block(const int16_t *src, int src_stride, int l,
                       int16_t *dest, int dest_stride) {
  int row = 0;
  while (row < l) {
    const int16_t *from = src + src_stride * row;
    int16_t *to = dest + dest_stride * row;
    memcpy(to, from, l * sizeof(*to));
    ++row;
  }
}
static void fliplr(int16_t *dest, int stride, int l) {
int i, j;
for (i = 0; i < l; ++i) {
for (j = 0; j < l / 2; ++j) {
const int16_t tmp = dest[i * stride + j];
dest[i * stride + j] = dest[i * stride + l - 1 - j];
dest[i * stride + l - 1 - j] = tmp;
}
}
}
// Mirrors an l x l block top-to-bottom in place: within every column, the
// element in row i trades places with the element in row l - 1 - i.
// stride is the distance, in elements, between the starts of adjacent rows.
static void flipud(int16_t *dest, int stride, int l) {
  int col;
  for (col = 0; col < l; ++col) {
    int top = 0;
    int bottom = l - 1;
    // Walk inwards from the first and last rows; the centre row (odd l)
    // stays put.
    while (top < bottom) {
      int16_t *const upper = dest + top * stride + col;
      int16_t *const lower = dest + bottom * stride + col;
      const int16_t held = *upper;
      *upper = *lower;
      *lower = held;
      ++top;
      --bottom;
    }
  }
}
// Rotates an l x l block by 180 degrees in place (left-right flip combined
// with an up-down flip): element (i, j) trades places with element
// (l - 1 - i, l - 1 - j). stride is the distance, in elements, between the
// starts of adjacent rows.
static void fliplrud(int16_t *dest, int stride, int l) {
  int i, j;
  // Swap every row in the top half with its mirrored row in the bottom half.
  for (i = 0; i < l / 2; ++i) {
    for (j = 0; j < l; ++j) {
      const int16_t tmp = dest[i * stride + j];
      dest[i * stride + j] = dest[(l - 1 - i) * stride + l - 1 - j];
      dest[(l - 1 - i) * stride + l - 1 - j] = tmp;
    }
  }
  // For odd l the centre row maps onto itself, so it is not touched by the
  // loop above and must be reversed on its own to complete the rotation.
  if (l > 0 && (l % 2) != 0) {
    int16_t *const mid = dest + (l / 2) * stride;
    for (j = 0; j < l / 2; ++j) {
      const int16_t tmp = mid[j];
      mid[j] = mid[l - 1 - j];
      mid[l - 1 - j] = tmp;
    }
  }
}
// Copies the l x l block at src (rows src_stride elements apart) into dest
// (rows dest_stride elements apart) with copy_block(), then mirrors the copy
// left-to-right in place with fliplr(). src itself is not modified.
static void copy_fliplr(const int16_t *src, int src_stride, int l,
int16_t *dest, int dest_stride) {
copy_block(src, src_stride, l, dest, dest_stride);
fliplr(dest, dest_stride, l);
}
// Copies the l x l block at src (rows src_stride elements apart) into dest
// (rows dest_stride elements apart) with copy_block(), then mirrors the copy
// top-to-bottom in place with flipud(). src itself is not modified.
static void copy_flipud(const int16_t *src, int src_stride, int l,
int16_t *dest, int dest_stride) {
copy_block(src, src_stride, l, dest, dest_stride);
flipud(dest, dest_stride, l);
}
// Copies the l x l block at src (rows src_stride elements apart) into dest
// (rows dest_stride elements apart) with copy_block(), then applies
// fliplrud() to the copy in place. src itself is not modified.
static void copy_fliplrud(const int16_t *src, int src_stride, int l,
int16_t *dest, int dest_stride) {
copy_block(src, src_stride, l, dest, dest_stride);
fliplrud(dest, dest_stride, l);
}
// Forward identity transform.
static void fwd_idtx_c(const int16_t *src_diff, tran_low_t *coeff, int stride,
int bs) {
@@ -404,15 +344,13 @@ static void fwd_idtx_c(const int16_t *src_diff, tran_low_t *coeff, int stride,
void vp10_fwd_txfm_4x4(const int16_t *src_diff, tran_low_t *coeff,
int diff_stride, TX_TYPE tx_type, int lossless) {
if (lossless) {
assert(tx_type == DCT_DCT);
vp10_fwht4x4(src_diff, coeff, diff_stride);
} else {
#if CONFIG_EXT_TX
int16_t src_diff2[16];
#endif // CONFIG_EXT_TX
return;
}
switch (tx_type) {
case DCT_DCT:
vpx_fdct4x4(src_diff, coeff, diff_stride);
break;
case ADST_DCT:
case DCT_ADST:
case ADST_ADST:
@@ -420,41 +358,22 @@ void vp10_fwd_txfm_4x4(const int16_t *src_diff, tran_low_t *coeff,
break;
#if CONFIG_EXT_TX
case FLIPADST_DCT:
copy_flipud(src_diff, diff_stride, 4, src_diff2, 4);
vp10_fht4x4(src_diff2, coeff, 4, ADST_DCT);
break;
case DCT_FLIPADST:
copy_fliplr(src_diff, diff_stride, 4, src_diff2, 4);
vp10_fht4x4(src_diff2, coeff, 4, DCT_ADST);
break;
case FLIPADST_FLIPADST:
copy_fliplrud(src_diff, diff_stride, 4, src_diff2, 4);
vp10_fht4x4(src_diff2, coeff, 4, ADST_ADST);
break;
case ADST_FLIPADST:
copy_fliplr(src_diff, diff_stride, 4, src_diff2, 4);
vp10_fht4x4(src_diff2, coeff, 4, ADST_ADST);
break;
case FLIPADST_ADST:
copy_flipud(src_diff, diff_stride, 4, src_diff2, 4);
vp10_fht4x4(src_diff2, coeff, 4, ADST_ADST);
vp10_fht4x4(src_diff, coeff, diff_stride, tx_type);
break;
case DST_DST:
case DCT_DST:
case DST_DCT:
case DST_ADST:
case ADST_DST:
case DST_FLIPADST:
case FLIPADST_DST:
// Use C version since DST exists only in C
vp10_fht4x4_c(src_diff, coeff, diff_stride, tx_type);
break;
case DST_FLIPADST:
copy_fliplr(src_diff, diff_stride, 4, src_diff2, 4);
vp10_fht4x4_c(src_diff2, coeff, 4, DST_ADST);
break;
case FLIPADST_DST:
copy_flipud(src_diff, diff_stride, 4, src_diff2, 4);
vp10_fht4x4_c(src_diff2, coeff, 4, ADST_DST);
break;
case IDTX:
fwd_idtx_c(src_diff, coeff, diff_stride, 4);
break;
@@ -462,15 +381,11 @@ void vp10_fwd_txfm_4x4(const int16_t *src_diff, tran_low_t *coeff,
default:
assert(0);
break;
}
}
}
static void fwd_txfm_8x8(const int16_t *src_diff, tran_low_t *coeff,
int diff_stride, TX_TYPE tx_type) {
#if CONFIG_EXT_TX
int16_t src_diff2[64];
#endif // CONFIG_EXT_TX
switch (tx_type) {
case DCT_DCT:
case ADST_DCT:
@@ -480,41 +395,22 @@ static void fwd_txfm_8x8(const int16_t *src_diff, tran_low_t *coeff,
break;
#if CONFIG_EXT_TX
case FLIPADST_DCT:
copy_flipud(src_diff, diff_stride, 8, src_diff2, 8);
vp10_fht8x8(src_diff2, coeff, 8, ADST_DCT);
break;
case DCT_FLIPADST:
copy_fliplr(src_diff, diff_stride, 8, src_diff2, 8);
vp10_fht8x8(src_diff2, coeff, 8, DCT_ADST);
break;
case FLIPADST_FLIPADST:
copy_fliplrud(src_diff, diff_stride, 8, src_diff2, 8);
vp10_fht8x8(src_diff2, coeff, 8, ADST_ADST);
break;
case ADST_FLIPADST:
copy_fliplr(src_diff, diff_stride, 8, src_diff2, 8);
vp10_fht8x8(src_diff2, coeff, 8, ADST_ADST);
break;
case FLIPADST_ADST:
copy_flipud(src_diff, diff_stride, 8, src_diff2, 8);
vp10_fht8x8(src_diff2, coeff, 8, ADST_ADST);
vp10_fht8x8(src_diff, coeff, diff_stride, tx_type);
break;
case DST_DST:
case DCT_DST:
case DST_DCT:
case DST_ADST:
case ADST_DST:
case DST_FLIPADST:
case FLIPADST_DST:
// Use C version since DST exists only in C
vp10_fht8x8_c(src_diff, coeff, diff_stride, tx_type);
break;
case DST_FLIPADST:
copy_fliplr(src_diff, diff_stride, 8, src_diff2, 8);
vp10_fht8x8_c(src_diff2, coeff, 8, DST_ADST);
break;
case FLIPADST_DST:
copy_flipud(src_diff, diff_stride, 8, src_diff2, 8);
vp10_fht8x8_c(src_diff2, coeff, 8, ADST_DST);
break;
case IDTX:
fwd_idtx_c(src_diff, coeff, diff_stride, 8);
break;
@@ -527,9 +423,6 @@ static void fwd_txfm_8x8(const int16_t *src_diff, tran_low_t *coeff,
static void fwd_txfm_8x8_1(const int16_t *src_diff, tran_low_t *coeff,
int diff_stride, TX_TYPE tx_type) {
#if CONFIG_EXT_TX
int16_t src_diff2[64];
#endif // CONFIG_EXT_TX
switch (tx_type) {
case DCT_DCT:
case ADST_DCT:
@@ -539,56 +432,34 @@ static void fwd_txfm_8x8_1(const int16_t *src_diff, tran_low_t *coeff,
break;
#if CONFIG_EXT_TX
case FLIPADST_DCT:
copy_flipud(src_diff, diff_stride, 8, src_diff2, 8);
vp10_fht8x8(src_diff2, coeff, 8, ADST_DCT);
break;
case DCT_FLIPADST:
copy_fliplr(src_diff, diff_stride, 8, src_diff2, 8);
vp10_fht8x8(src_diff2, coeff, 8, DCT_ADST);
break;
case FLIPADST_FLIPADST:
copy_fliplrud(src_diff, diff_stride, 8, src_diff2, 8);
vp10_fht8x8(src_diff2, coeff, 8, ADST_ADST);
break;
case ADST_FLIPADST:
copy_fliplr(src_diff, diff_stride, 8, src_diff2, 8);
vp10_fht8x8(src_diff2, coeff, 8, ADST_ADST);
break;
case FLIPADST_ADST:
copy_flipud(src_diff, diff_stride, 8, src_diff2, 8);
vp10_fht8x8(src_diff2, coeff, 8, ADST_ADST);
vp10_fht8x8(src_diff, coeff, diff_stride, tx_type);
break;
case DST_DST:
case DCT_DST:
case DST_DCT:
case DST_ADST:
case ADST_DST:
case DST_FLIPADST:
case FLIPADST_DST:
// Use C version since DST exists only in C
vp10_fht8x8_c(src_diff, coeff, diff_stride, tx_type);
break;
case DST_FLIPADST:
copy_fliplr(src_diff, diff_stride, 8, src_diff2, 8);
vp10_fht8x8_c(src_diff2, coeff, 8, DST_ADST);
break;
case FLIPADST_DST:
copy_flipud(src_diff, diff_stride, 8, src_diff2, 8);
vp10_fht8x8_c(src_diff2, coeff, 8, ADST_DST);
break;
case IDTX:
fwd_idtx_c(src_diff, coeff, diff_stride, 8);
break;
#endif // CONFIG_EXT_TX
default:
assert(0);
break;
default:
assert(0);
break;
}
}
static void fwd_txfm_16x16(const int16_t *src_diff, tran_low_t *coeff,
int diff_stride, TX_TYPE tx_type) {
#if CONFIG_EXT_TX
int16_t src_diff2[256];
#endif // CONFIG_EXT_TX
switch (tx_type) {
case DCT_DCT:
case ADST_DCT:
@@ -598,41 +469,22 @@ static void fwd_txfm_16x16(const int16_t *src_diff, tran_low_t *coeff,
break;
#if CONFIG_EXT_TX
case FLIPADST_DCT:
copy_flipud(src_diff, diff_stride, 16, src_diff2, 16);
vp10_fht16x16(src_diff2, coeff, 16, ADST_DCT);
break;
case DCT_FLIPADST:
copy_fliplr(src_diff, diff_stride, 16, src_diff2, 16);
vp10_fht16x16(src_diff2, coeff, 16, DCT_ADST);
break;
case FLIPADST_FLIPADST:
copy_fliplrud(src_diff, diff_stride, 16, src_diff2, 16);
vp10_fht16x16(src_diff2, coeff, 16, ADST_ADST);
break;
case ADST_FLIPADST:
copy_fliplr(src_diff, diff_stride, 16, src_diff2, 16);
vp10_fht16x16(src_diff2, coeff, 16, ADST_ADST);
break;
case FLIPADST_ADST:
copy_flipud(src_diff, diff_stride, 16, src_diff2, 16);
vp10_fht16x16(src_diff2, coeff, 16, ADST_ADST);
vp10_fht16x16(src_diff, coeff, diff_stride, tx_type);
break;
case DST_DST:
case DCT_DST:
case DST_DCT:
case DST_ADST:
case ADST_DST:
case DST_FLIPADST:
case FLIPADST_DST:
// Use C version since DST exists only in C
vp10_fht16x16_c(src_diff, coeff, diff_stride, tx_type);
break;
case DST_FLIPADST:
copy_fliplr(src_diff, diff_stride, 16, src_diff2, 16);
vp10_fht16x16_c(src_diff2, coeff, 16, DST_ADST);
break;
case FLIPADST_DST:
copy_flipud(src_diff, diff_stride, 16, src_diff2, 16);
vp10_fht16x16_c(src_diff2, coeff, 16, ADST_DST);
break;
case IDTX:
fwd_idtx_c(src_diff, coeff, diff_stride, 16);
break;
@@ -645,9 +497,6 @@ static void fwd_txfm_16x16(const int16_t *src_diff, tran_low_t *coeff,
static void fwd_txfm_16x16_1(const int16_t *src_diff, tran_low_t *coeff,
int diff_stride, TX_TYPE tx_type) {
#if CONFIG_EXT_TX
int16_t src_diff2[256];
#endif // CONFIG_EXT_TX
switch (tx_type) {
case DCT_DCT:
case ADST_DCT:
@@ -657,41 +506,22 @@ static void fwd_txfm_16x16_1(const int16_t *src_diff, tran_low_t *coeff,
break;
#if CONFIG_EXT_TX
case FLIPADST_DCT:
copy_flipud(src_diff, diff_stride, 16, src_diff2, 16);
vp10_fht16x16(src_diff2, coeff, 16, ADST_DCT);
break;
case DCT_FLIPADST:
copy_fliplr(src_diff, diff_stride, 16, src_diff2, 16);
vp10_fht16x16(src_diff2, coeff, 16, DCT_ADST);
break;
case FLIPADST_FLIPADST:
copy_fliplrud(src_diff, diff_stride, 16, src_diff2, 16);
vp10_fht16x16(src_diff2, coeff, 16, ADST_ADST);
break;
case ADST_FLIPADST:
copy_fliplr(src_diff, diff_stride, 16, src_diff2, 16);
vp10_fht16x16(src_diff2, coeff, 16, ADST_ADST);
break;
case FLIPADST_ADST:
copy_flipud(src_diff, diff_stride, 16, src_diff2, 16);
vp10_fht16x16(src_diff2, coeff, 16, ADST_ADST);
vp10_fht16x16(src_diff, coeff, diff_stride, tx_type);
break;
case DST_DST:
case DCT_DST:
case DST_DCT:
case DST_ADST:
case ADST_DST:
case DST_FLIPADST:
case FLIPADST_DST:
// Use C version since DST exists only in C
vp10_fht16x16_c(src_diff, coeff, diff_stride, tx_type);
break;
case DST_FLIPADST:
copy_fliplr(src_diff, diff_stride, 16, src_diff2, 16);
vp10_fht16x16_c(src_diff2, coeff, 16, DST_ADST);
break;
case FLIPADST_DST:
copy_flipud(src_diff, diff_stride, 16, src_diff2, 16);
vp10_fht16x16_c(src_diff2, coeff, 16, ADST_DST);
break;
case IDTX:
fwd_idtx_c(src_diff, coeff, diff_stride, 16);
break;
@@ -754,76 +584,48 @@ void vp10_highbd_fwd_txfm_4x4(const int16_t *src_diff, tran_low_t *coeff,
if (lossless) {
assert(tx_type == DCT_DCT);
vp10_highbd_fwht4x4(src_diff, coeff, diff_stride);
} else {
#if CONFIG_EXT_TX
int16_t src_diff2[16];
#endif // CONFIG_EXT_TX
switch (tx_type) {
case DCT_DCT:
vpx_highbd_fdct4x4(src_diff, coeff, diff_stride);
break;
case ADST_DCT:
case DCT_ADST:
case ADST_ADST:
vp10_highbd_fht4x4(src_diff, coeff, diff_stride, tx_type);
break;
return;
}
switch (tx_type) {
case DCT_DCT:
case ADST_DCT:
case DCT_ADST:
case ADST_ADST:
vp10_highbd_fht4x4(src_diff, coeff, diff_stride, tx_type);
break;
#if CONFIG_EXT_TX
case FLIPADST_DCT:
copy_flipud(src_diff, diff_stride, 4, src_diff2, 4);
vp10_highbd_fht4x4(src_diff2, coeff, 4, ADST_DCT);
break;
case DCT_FLIPADST:
copy_fliplr(src_diff, diff_stride, 4, src_diff2, 4);
vp10_highbd_fht4x4(src_diff2, coeff, 4, DCT_ADST);
break;
case FLIPADST_FLIPADST:
copy_fliplrud(src_diff, diff_stride, 4, src_diff2, 4);
vp10_highbd_fht4x4(src_diff2, coeff, 4, ADST_ADST);
break;
case ADST_FLIPADST:
copy_fliplr(src_diff, diff_stride, 4, src_diff2, 4);
vp10_highbd_fht4x4(src_diff2, coeff, 4, ADST_ADST);
break;
case FLIPADST_ADST:
copy_flipud(src_diff, diff_stride, 4, src_diff2, 4);
vp10_highbd_fht4x4(src_diff2, coeff, 4, ADST_ADST);
vp10_highbd_fht4x4(src_diff, coeff, diff_stride, tx_type);
break;
case DST_DST:
case DCT_DST:
case DST_DCT:
case DST_ADST:
case ADST_DST:
case DST_FLIPADST:
case FLIPADST_DST:
// Use C version since DST exists only in C
vp10_highbd_fht4x4_c(src_diff, coeff, diff_stride, tx_type);
break;
case DST_FLIPADST:
copy_fliplr(src_diff, diff_stride, 4, src_diff2, 4);
vp10_highbd_fht4x4_c(src_diff2, coeff, 4, DST_ADST);
break;
case FLIPADST_DST:
copy_flipud(src_diff, diff_stride, 4, src_diff2, 4);
vp10_highbd_fht4x4_c(src_diff2, coeff, 4, ADST_DST);
break;
case IDTX:
fwd_idtx_c(src_diff, coeff, diff_stride, 4);
break;
#endif // CONFIG_EXT_TX
default:
assert(0);
break;
}
default:
assert(0);
break;
}
}
static void highbd_fwd_txfm_8x8(const int16_t *src_diff, tran_low_t *coeff,
int diff_stride, TX_TYPE tx_type) {
#if CONFIG_EXT_TX
int16_t src_diff2[64];
#endif // CONFIG_EXT_TX
switch (tx_type) {
case DCT_DCT:
vpx_highbd_fdct8x8(src_diff, coeff, diff_stride);
break;
case ADST_DCT:
case DCT_ADST:
case ADST_ADST:
@@ -831,41 +633,22 @@ static void highbd_fwd_txfm_8x8(const int16_t *src_diff, tran_low_t *coeff,
break;
#if CONFIG_EXT_TX
case FLIPADST_DCT:
copy_flipud(src_diff, diff_stride, 8, src_diff2, 8);
vp10_highbd_fht8x8(src_diff2, coeff, 8, ADST_DCT);
break;
case DCT_FLIPADST:
copy_fliplr(src_diff, diff_stride, 8, src_diff2, 8);
vp10_highbd_fht8x8(src_diff2, coeff, 8, DCT_ADST);
break;
case FLIPADST_FLIPADST:
copy_fliplrud(src_diff, diff_stride, 8, src_diff2, 8);
vp10_highbd_fht8x8(src_diff2, coeff, 8, ADST_ADST);
break;
case ADST_FLIPADST:
copy_fliplr(src_diff, diff_stride, 8, src_diff2, 8);
vp10_highbd_fht8x8(src_diff2, coeff, 8, ADST_ADST);
break;
case FLIPADST_ADST:
copy_flipud(src_diff, diff_stride, 8, src_diff2, 8);
vp10_highbd_fht8x8(src_diff2, coeff, 8, ADST_ADST);
vp10_highbd_fht8x8(src_diff, coeff, diff_stride, tx_type);
break;
case DST_DST:
case DCT_DST:
case DST_DCT:
case DST_ADST:
case ADST_DST:
case DST_FLIPADST:
case FLIPADST_DST:
// Use C version since DST exists only in C
vp10_highbd_fht8x8_c(src_diff, coeff, diff_stride, tx_type);
break;
case DST_FLIPADST:
copy_fliplr(src_diff, diff_stride, 8, src_diff2, 8);
vp10_highbd_fht8x8_c(src_diff2, coeff, 8, DST_ADST);
break;
case FLIPADST_DST:
copy_flipud(src_diff, diff_stride, 8, src_diff2, 8);
vp10_highbd_fht8x8_c(src_diff2, coeff, 8, ADST_DST);
break;
case IDTX:
fwd_idtx_c(src_diff, coeff, diff_stride, 8);
break;
@@ -878,13 +661,8 @@ static void highbd_fwd_txfm_8x8(const int16_t *src_diff, tran_low_t *coeff,
static void highbd_fwd_txfm_8x8_1(const int16_t *src_diff, tran_low_t *coeff,
int diff_stride, TX_TYPE tx_type) {
#if CONFIG_EXT_TX
int16_t src_diff2[64];
#endif // CONFIG_EXT_TX
switch (tx_type) {
case DCT_DCT:
vpx_highbd_fdct8x8_1(src_diff, coeff, diff_stride);
break;
case ADST_DCT:
case DCT_ADST:
case ADST_ADST:
@@ -892,41 +670,22 @@ static void highbd_fwd_txfm_8x8_1(const int16_t *src_diff, tran_low_t *coeff,
break;
#if CONFIG_EXT_TX
case FLIPADST_DCT:
copy_flipud(src_diff, diff_stride, 8, src_diff2, 8);
vp10_highbd_fht8x8(src_diff2, coeff, 8, ADST_DCT);
break;
case DCT_FLIPADST:
copy_fliplr(src_diff, diff_stride, 8, src_diff2, 8);
vp10_highbd_fht8x8(src_diff2, coeff, 8, DCT_ADST);
break;
case FLIPADST_FLIPADST:
copy_fliplrud(src_diff, diff_stride, 8, src_diff2, 8);
vp10_highbd_fht8x8(src_diff2, coeff, 8, ADST_ADST);
break;
case ADST_FLIPADST:
copy_fliplr(src_diff, diff_stride, 8, src_diff2, 8);
vp10_highbd_fht8x8(src_diff2, coeff, 8, ADST_ADST);
break;
case FLIPADST_ADST:
copy_flipud(src_diff, diff_stride, 8, src_diff2, 8);
vp10_highbd_fht8x8(src_diff2, coeff, 8, ADST_ADST);
vp10_highbd_fht8x8(src_diff, coeff, diff_stride, tx_type);
break;
case DST_DST:
case DCT_DST:
case DST_DCT:
case DST_ADST:
case ADST_DST:
case DST_FLIPADST:
case FLIPADST_DST:
// Use C version since DST exists only in C
vp10_highbd_fht8x8_c(src_diff, coeff, diff_stride, tx_type);
break;
case DST_FLIPADST:
copy_fliplr(src_diff, diff_stride, 8, src_diff2, 8);
vp10_highbd_fht8x8_c(src_diff2, coeff, 8, DST_ADST);
break;
case FLIPADST_DST:
copy_flipud(src_diff, diff_stride, 8, src_diff2, 8);
vp10_highbd_fht8x8_c(src_diff2, coeff, 8, ADST_DST);
break;
case IDTX:
fwd_idtx_c(src_diff, coeff, diff_stride, 8);
break;
@@ -939,13 +698,8 @@ static void highbd_fwd_txfm_8x8_1(const int16_t *src_diff, tran_low_t *coeff,
static void highbd_fwd_txfm_16x16(const int16_t *src_diff, tran_low_t *coeff,
int diff_stride, TX_TYPE tx_type) {
#if CONFIG_EXT_TX
int16_t src_diff2[256];
#endif // CONFIG_EXT_TX
switch (tx_type) {
case DCT_DCT:
vpx_highbd_fdct16x16(src_diff, coeff, diff_stride);
break;
case ADST_DCT:
case DCT_ADST:
case ADST_ADST:
@@ -953,41 +707,22 @@ static void highbd_fwd_txfm_16x16(const int16_t *src_diff, tran_low_t *coeff,
break;
#if CONFIG_EXT_TX
case FLIPADST_DCT:
copy_flipud(src_diff, diff_stride, 16, src_diff2, 16);
vp10_highbd_fht16x16(src_diff2, coeff, 16, ADST_DCT);
break;
case DCT_FLIPADST:
copy_fliplr(src_diff, diff_stride, 16, src_diff2, 16);
vp10_highbd_fht16x16(src_diff2, coeff, 16, DCT_ADST);
break;
case FLIPADST_FLIPADST:
copy_fliplrud(src_diff, diff_stride, 16, src_diff2, 16);
vp10_highbd_fht16x16(src_diff2, coeff, 16, ADST_ADST);
break;
case ADST_FLIPADST:
copy_fliplr(src_diff, diff_stride, 16, src_diff2, 16);
vp10_highbd_fht16x16(src_diff2, coeff, 16, ADST_ADST);
break;
case FLIPADST_ADST:
copy_flipud(src_diff, diff_stride, 16, src_diff2, 16);
vp10_highbd_fht16x16(src_diff2, coeff, 16, ADST_ADST);
vp10_highbd_fht16x16(src_diff, coeff, diff_stride, tx_type);
break;
case DST_DST:
case DCT_DST:
case DST_DCT:
case DST_ADST:
case ADST_DST:
case DST_FLIPADST:
case FLIPADST_DST:
// Use C version since DST exists only in C
vp10_highbd_fht16x16_c(src_diff, coeff, diff_stride, tx_type);
break;
case DST_FLIPADST:
copy_fliplr(src_diff, diff_stride, 16, src_diff2, 16);
vp10_highbd_fht16x16_c(src_diff2, coeff, 16, DST_ADST);
break;
case FLIPADST_DST:
copy_flipud(src_diff, diff_stride, 16, src_diff2, 16);
vp10_highbd_fht16x16_c(src_diff2, coeff, 16, ADST_DST);
break;
case IDTX:
fwd_idtx_c(src_diff, coeff, diff_stride, 16);
break;
@@ -1000,13 +735,8 @@ static void highbd_fwd_txfm_16x16(const int16_t *src_diff, tran_low_t *coeff,
static void highbd_fwd_txfm_16x16_1(const int16_t *src_diff, tran_low_t *coeff,
int diff_stride, TX_TYPE tx_type) {
#if CONFIG_EXT_TX
int16_t src_diff2[256];
#endif // CONFIG_EXT_TX
switch (tx_type) {
case DCT_DCT:
vpx_highbd_fdct16x16_1(src_diff, coeff, diff_stride);
break;
case ADST_DCT:
case DCT_ADST:
case ADST_ADST:
@@ -1014,41 +744,22 @@ static void highbd_fwd_txfm_16x16_1(const int16_t *src_diff, tran_low_t *coeff,
break;
#if CONFIG_EXT_TX
case FLIPADST_DCT:
copy_flipud(src_diff, diff_stride, 16, src_diff2, 16);
vp10_highbd_fht16x16(src_diff2, coeff, 16, ADST_DCT);
break;
case DCT_FLIPADST:
copy_fliplr(src_diff, diff_stride, 16, src_diff2, 16);
vp10_highbd_fht16x16(src_diff2, coeff, 16, DCT_ADST);
break;
case FLIPADST_FLIPADST:
copy_fliplrud(src_diff, diff_stride, 16, src_diff2, 16);
vp10_highbd_fht16x16(src_diff2, coeff, 16, ADST_ADST);
break;
case ADST_FLIPADST:
copy_fliplr(src_diff, diff_stride, 16, src_diff2, 16);
vp10_highbd_fht16x16(src_diff2, coeff, 16, ADST_ADST);
break;
case FLIPADST_ADST:
copy_flipud(src_diff, diff_stride, 16, src_diff2, 16);
vp10_highbd_fht16x16(src_diff2, coeff, 16, ADST_ADST);
vp10_highbd_fht16x16(src_diff, coeff, diff_stride, tx_type);
break;
case DST_DST:
case DCT_DST:
case DST_DCT:
case DST_ADST:
case ADST_DST:
case DST_FLIPADST:
case FLIPADST_DST:
// Use C version since DST exists only in C
vp10_highbd_fht16x16_c(src_diff, coeff, diff_stride, tx_type);
break;
case DST_FLIPADST:
copy_fliplr(src_diff, diff_stride, 16, src_diff2, 16);
vp10_highbd_fht16x16_c(src_diff2, coeff, 16, DST_ADST);
break;
case FLIPADST_DST:
copy_flipud(src_diff, diff_stride, 16, src_diff2, 16);
vp10_highbd_fht16x16_c(src_diff2, coeff, 16, ADST_DST);
break;
case IDTX:
fwd_idtx_c(src_diff, coeff, diff_stride, 16);
break;

View File

@@ -18,16 +18,37 @@
#include "vpx_dsp/x86/txfm_common_sse2.h"
#include "vpx_ports/mem.h"
// Returns x with its eight 16-bit lanes in reverse order (lane 0 <-> lane 7,
// lane 1 <-> lane 6, and so on).
static INLINE __m128i mm_reverse_epi16(const __m128i x) {
  // 0x4e selects dwords (2, 3, 0, 1): exchange the two 64-bit halves.
  const __m128i halves_swapped = _mm_shuffle_epi32(x, 0x4e);
  // 0x1b selects words (3, 2, 1, 0): reverse the four words of each half.
  const __m128i low_reversed = _mm_shufflelo_epi16(halves_swapped, 0x1b);
  return _mm_shufflehi_epi16(low_reversed, 0x1b);
}
static INLINE void load_buffer_4x4(const int16_t *input, __m128i *in,
int stride) {
int stride, int flipud, int fliplr) {
const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1);
const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0);
__m128i mask;
in[0] = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
in[1] = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
in[2] = _mm_loadl_epi64((const __m128i *)(input + 2 * stride));
in[3] = _mm_loadl_epi64((const __m128i *)(input + 3 * stride));
if (!flipud) {
in[0] = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
in[1] = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
in[2] = _mm_loadl_epi64((const __m128i *)(input + 2 * stride));
in[3] = _mm_loadl_epi64((const __m128i *)(input + 3 * stride));
} else {
in[0] = _mm_loadl_epi64((const __m128i *)(input + 3 * stride));
in[1] = _mm_loadl_epi64((const __m128i *)(input + 2 * stride));
in[2] = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
in[3] = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
}
if (fliplr) {
in[0] = _mm_shufflelo_epi16(in[0], 0x1b);
in[1] = _mm_shufflelo_epi16(in[1], 0x1b);
in[2] = _mm_shufflelo_epi16(in[2], 0x1b);
in[3] = _mm_shufflelo_epi16(in[3], 0x1b);
}
in[0] = _mm_slli_epi16(in[0], 4);
in[1] = _mm_slli_epi16(in[1], 4);
@@ -160,23 +181,55 @@ void vp10_fht4x4_sse2(const int16_t *input, tran_low_t *output,
vpx_fdct4x4_sse2(input, output, stride);
break;
case ADST_DCT:
load_buffer_4x4(input, in, stride);
load_buffer_4x4(input, in, stride, 0, 0);
fadst4_sse2(in);
fdct4_sse2(in);
write_buffer_4x4(output, in);
break;
case DCT_ADST:
load_buffer_4x4(input, in, stride);
load_buffer_4x4(input, in, stride, 0, 0);
fdct4_sse2(in);
fadst4_sse2(in);
write_buffer_4x4(output, in);
break;
case ADST_ADST:
load_buffer_4x4(input, in, stride);
load_buffer_4x4(input, in, stride, 0, 0);
fadst4_sse2(in);
fadst4_sse2(in);
write_buffer_4x4(output, in);
break;
#if CONFIG_EXT_TX
case FLIPADST_DCT:
load_buffer_4x4(input, in, stride, 1, 0);
fadst4_sse2(in);
fdct4_sse2(in);
write_buffer_4x4(output, in);
break;
case DCT_FLIPADST:
load_buffer_4x4(input, in, stride, 0, 1);
fdct4_sse2(in);
fadst4_sse2(in);
write_buffer_4x4(output, in);
break;
case FLIPADST_FLIPADST:
load_buffer_4x4(input, in, stride, 1, 1);
fadst4_sse2(in);
fadst4_sse2(in);
write_buffer_4x4(output, in);
break;
case ADST_FLIPADST:
load_buffer_4x4(input, in, stride, 0, 1);
fadst4_sse2(in);
fadst4_sse2(in);
write_buffer_4x4(output, in);
break;
case FLIPADST_ADST:
load_buffer_4x4(input, in, stride, 1, 0);
fadst4_sse2(in);
fadst4_sse2(in);
write_buffer_4x4(output, in);
break;
#endif // CONFIG_EXT_TX
default:
assert(0);
break;
@@ -627,15 +680,37 @@ void vp10_fdct8x8_quant_sse2(const int16_t *input, int stride,
// load 8x8 array
static INLINE void load_buffer_8x8(const int16_t *input, __m128i *in,
int stride) {
in[0] = _mm_load_si128((const __m128i *)(input + 0 * stride));
in[1] = _mm_load_si128((const __m128i *)(input + 1 * stride));
in[2] = _mm_load_si128((const __m128i *)(input + 2 * stride));
in[3] = _mm_load_si128((const __m128i *)(input + 3 * stride));
in[4] = _mm_load_si128((const __m128i *)(input + 4 * stride));
in[5] = _mm_load_si128((const __m128i *)(input + 5 * stride));
in[6] = _mm_load_si128((const __m128i *)(input + 6 * stride));
in[7] = _mm_load_si128((const __m128i *)(input + 7 * stride));
int stride, int flipud, int fliplr) {
if (!flipud) {
in[0] = _mm_load_si128((const __m128i *)(input + 0 * stride));
in[1] = _mm_load_si128((const __m128i *)(input + 1 * stride));
in[2] = _mm_load_si128((const __m128i *)(input + 2 * stride));
in[3] = _mm_load_si128((const __m128i *)(input + 3 * stride));
in[4] = _mm_load_si128((const __m128i *)(input + 4 * stride));
in[5] = _mm_load_si128((const __m128i *)(input + 5 * stride));
in[6] = _mm_load_si128((const __m128i *)(input + 6 * stride));
in[7] = _mm_load_si128((const __m128i *)(input + 7 * stride));
} else {
in[0] = _mm_load_si128((const __m128i *)(input + 7 * stride));
in[1] = _mm_load_si128((const __m128i *)(input + 6 * stride));
in[2] = _mm_load_si128((const __m128i *)(input + 5 * stride));
in[3] = _mm_load_si128((const __m128i *)(input + 4 * stride));
in[4] = _mm_load_si128((const __m128i *)(input + 3 * stride));
in[5] = _mm_load_si128((const __m128i *)(input + 2 * stride));
in[6] = _mm_load_si128((const __m128i *)(input + 1 * stride));
in[7] = _mm_load_si128((const __m128i *)(input + 0 * stride));
}
if (fliplr) {
in[0] = mm_reverse_epi16(in[0]);
in[1] = mm_reverse_epi16(in[1]);
in[2] = mm_reverse_epi16(in[2]);
in[3] = mm_reverse_epi16(in[3]);
in[4] = mm_reverse_epi16(in[4]);
in[5] = mm_reverse_epi16(in[5]);
in[6] = mm_reverse_epi16(in[6]);
in[7] = mm_reverse_epi16(in[7]);
}
in[0] = _mm_slli_epi16(in[0], 2);
in[1] = _mm_slli_epi16(in[1], 2);
@@ -1144,26 +1219,63 @@ void vp10_fht8x8_sse2(const int16_t *input, tran_low_t *output,
vpx_fdct8x8_sse2(input, output, stride);
break;
case ADST_DCT:
load_buffer_8x8(input, in, stride);
load_buffer_8x8(input, in, stride, 0, 0);
fadst8_sse2(in);
fdct8_sse2(in);
right_shift_8x8(in, 1);
write_buffer_8x8(output, in, 8);
break;
case DCT_ADST:
load_buffer_8x8(input, in, stride);
load_buffer_8x8(input, in, stride, 0, 0);
fdct8_sse2(in);
fadst8_sse2(in);
right_shift_8x8(in, 1);
write_buffer_8x8(output, in, 8);
break;
case ADST_ADST:
load_buffer_8x8(input, in, stride);
load_buffer_8x8(input, in, stride, 0, 0);
fadst8_sse2(in);
fadst8_sse2(in);
right_shift_8x8(in, 1);
write_buffer_8x8(output, in, 8);
break;
#if CONFIG_EXT_TX
case FLIPADST_DCT:
load_buffer_8x8(input, in, stride, 1, 0);
fadst8_sse2(in);
fdct8_sse2(in);
right_shift_8x8(in, 1);
write_buffer_8x8(output, in, 8);
break;
case DCT_FLIPADST:
load_buffer_8x8(input, in, stride, 0, 1);
fdct8_sse2(in);
fadst8_sse2(in);
right_shift_8x8(in, 1);
write_buffer_8x8(output, in, 8);
break;
case FLIPADST_FLIPADST:
load_buffer_8x8(input, in, stride, 1, 1);
fadst8_sse2(in);
fadst8_sse2(in);
right_shift_8x8(in, 1);
write_buffer_8x8(output, in, 8);
break;
case ADST_FLIPADST:
load_buffer_8x8(input, in, stride, 0, 1);
fadst8_sse2(in);
fadst8_sse2(in);
right_shift_8x8(in, 1);
write_buffer_8x8(output, in, 8);
break;
case FLIPADST_ADST:
load_buffer_8x8(input, in, stride, 1, 0);
fadst8_sse2(in);
fadst8_sse2(in);
right_shift_8x8(in, 1);
write_buffer_8x8(output, in, 8);
break;
#endif // CONFIG_EXT_TX
default:
assert(0);
break;
@@ -1171,15 +1283,37 @@ void vp10_fht8x8_sse2(const int16_t *input, tran_low_t *output,
}
static INLINE void load_buffer_16x16(const int16_t* input, __m128i *in0,
__m128i *in1, int stride) {
// load first 8 columns
load_buffer_8x8(input, in0, stride);
load_buffer_8x8(input + 8 * stride, in0 + 8, stride);
__m128i *in1, int stride,
int flipud, int fliplr) {
// Load 4 8x8 blocks
const int16_t *topL = input;
const int16_t *topR = input + 8;
const int16_t *botL = input + 8 * stride;
const int16_t *botR = input + 8 * stride + 8;
const int16_t *tmp;
if (flipud) {
// Swap left columns
tmp = topL; topL = botL; botL = tmp;
// Swap right columns
tmp = topR; topR = botR; botR = tmp;
}
if (fliplr) {
// Swap top rows
tmp = topL; topL = topR; topR = tmp;
// Swap bottom rows
tmp = botL; botL = botR; botR = tmp;
}
// load first 8 columns
load_buffer_8x8(topL, in0, stride, flipud, fliplr);
load_buffer_8x8(botL, in0 + 8, stride, flipud, fliplr);
input += 8;
// load second 8 columns
load_buffer_8x8(input, in1, stride);
load_buffer_8x8(input + 8 * stride, in1 + 8, stride);
load_buffer_8x8(topR, in1, stride, flipud, fliplr);
load_buffer_8x8(botR, in1 + 8, stride, flipud, fliplr);
}
static INLINE void write_buffer_16x16(tran_low_t *output, __m128i *in0,
@@ -2031,26 +2165,63 @@ void vp10_fht16x16_sse2(const int16_t *input, tran_low_t *output,
vpx_fdct16x16_sse2(input, output, stride);
break;
case ADST_DCT:
load_buffer_16x16(input, in0, in1, stride);
load_buffer_16x16(input, in0, in1, stride, 0, 0);
fadst16_sse2(in0, in1);
right_shift_16x16(in0, in1);
fdct16_sse2(in0, in1);
write_buffer_16x16(output, in0, in1, 16);
break;
case DCT_ADST:
load_buffer_16x16(input, in0, in1, stride);
load_buffer_16x16(input, in0, in1, stride, 0, 0);
fdct16_sse2(in0, in1);
right_shift_16x16(in0, in1);
fadst16_sse2(in0, in1);
write_buffer_16x16(output, in0, in1, 16);
break;
case ADST_ADST:
load_buffer_16x16(input, in0, in1, stride);
load_buffer_16x16(input, in0, in1, stride, 0, 0);
fadst16_sse2(in0, in1);
right_shift_16x16(in0, in1);
fadst16_sse2(in0, in1);
write_buffer_16x16(output, in0, in1, 16);
break;
#if CONFIG_EXT_TX
case FLIPADST_DCT:
load_buffer_16x16(input, in0, in1, stride, 1, 0);
fadst16_sse2(in0, in1);
right_shift_16x16(in0, in1);
fdct16_sse2(in0, in1);
write_buffer_16x16(output, in0, in1, 16);
break;
case DCT_FLIPADST:
load_buffer_16x16(input, in0, in1, stride, 0, 1);
fdct16_sse2(in0, in1);
right_shift_16x16(in0, in1);
fadst16_sse2(in0, in1);
write_buffer_16x16(output, in0, in1, 16);
break;
case FLIPADST_FLIPADST:
load_buffer_16x16(input, in0, in1, stride, 1, 1);
fadst16_sse2(in0, in1);
right_shift_16x16(in0, in1);
fadst16_sse2(in0, in1);
write_buffer_16x16(output, in0, in1, 16);
break;
case ADST_FLIPADST:
load_buffer_16x16(input, in0, in1, stride, 0, 1);
fadst16_sse2(in0, in1);
right_shift_16x16(in0, in1);
fadst16_sse2(in0, in1);
write_buffer_16x16(output, in0, in1, 16);
break;
case FLIPADST_ADST:
load_buffer_16x16(input, in0, in1, stride, 1, 0);
fadst16_sse2(in0, in1);
right_shift_16x16(in0, in1);
fadst16_sse2(in0, in1);
write_buffer_16x16(output, in0, in1, 16);
break;
#endif // CONFIG_EXT_TX
default:
assert(0);
break;