Flip the result of the inv transform for FLIPADST.

This is a port of 4f5108090a6047d5d4d9ce1df302da23b2ef4bc5

This commit also fixes a bug where FLIPADST transforms, when combined
with a DST (that is, FLIPADST_DST and DST_FLIPADST), did not actually
perform a flipped transform but a straight ADST instead. This was
because the C implementation they fell back on did not implement
flipping. This is now fixed as well, and FLIPADST_DST and DST_FLIPADST
now do what they are supposed to do.
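
As a rough illustration (the helper name and the fixed 4x4 size below are
mine, not part of the patch): instead of flipping the destination pixels
before and after a straight transform, the fixed code runs the inverse
transform normally and applies the flip while adding its output to the
destination, e.g. by pointing dest at the last row of the block and
negating the stride, which is what the FLIPUD_PTR macro in the diff does.

#include <stdint.h>

/* Illustrative sketch only: add a 4x4 residual to dest with its rows
 * mirrored top-to-bottom, using the negative-stride trick. */
static void add_residual_flipud_4x4(const int16_t *res, uint8_t *dest,
                                    int stride) {
  int i, j;
  dest += 3 * stride;  /* point at the last row of the 4x4 block */
  stride = -stride;    /* walk upwards */
  for (i = 0; i < 4; ++i) {
    for (j = 0; j < 4; ++j) {
      int v = dest[i * stride + j] + res[i * 4 + j];
      dest[i * stride + j] = (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v);
    }
  }
}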

There are 3 functions in the SR_MODE experiment that should be updated,
but given that the build of SR_MODE is broken at the upstream tip of
nextgen, I could not test these, so I have put in assertions and FIXME
notes at the problematic places.

Change-Id: I5b8175b85f944f2369b183a26256e08d97f4bdef
Authored by Geza Lore on 2015-11-13 15:16:28 +00:00; committed by Debargha Mukherjee
parent f1f3a8ab14
commit 85ab9d56cc
2 changed files with 424 additions and 345 deletions

@ -16,6 +16,59 @@
#include "vp9/common/vp9_idct.h"
#if CONFIG_EXT_TX
#define FLIPUD_PTR(dest, stride, size) do { \
(dest) = (dest) + ((size) - 1) * (stride); \
(stride) = - (stride); \
} while (0)
static void maybe_flip_strides(uint8_t **dst, int *dstride,
tran_low_t **src, int *sstride,
int tx_type, int size) {
// Note that the transpose of src will be added to dst. In order to LR
// flip the addends (in dst coordinates), we UD flip the src. To UD flip
// the addends, we UD flip the dst.
switch (tx_type) {
case DCT_DCT:
case ADST_DCT:
case DCT_ADST:
case ADST_ADST:
break;
case FLIPADST_DCT:
case FLIPADST_ADST:
// flip UD
FLIPUD_PTR(*dst, *dstride, size);
break;
case DCT_FLIPADST:
case ADST_FLIPADST:
// flip LR
FLIPUD_PTR(*src, *sstride, size);
break;
case FLIPADST_FLIPADST:
// flip UD
FLIPUD_PTR(*dst, *dstride, size);
// flip LR
FLIPUD_PTR(*src, *sstride, size);
break;
case DST_DST:
case DCT_DST:
case DST_DCT:
case DST_ADST:
case ADST_DST:
break;
case DST_FLIPADST:
// flip LR
FLIPUD_PTR(*src, *sstride, size);
break;
case FLIPADST_DST:
// flip UD
FLIPUD_PTR(*dst, *dstride, size);
break;
default:
assert(0);
break;
}
}
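
A hedged aside on the comment above (the check below is illustrative, not
library code): the sum loops later in this file read the transform output
transposed, dst(i, j) += src(j, i), so flipping the source pointer and
stride makes that read pick up src(size - 1 - j, i), which mirrors the
addends left/right in destination coordinates, while flipping the
destination pointer mirrors them up/down. A small check of the index
identity:

#include <assert.h>

/* Illustrative check: after FLIPUD_PTR(p, sstride, n), the transposed
 * read p[j * sstride + i] sees src[n - 1 - j][i]. */
static void flip_index_identity_check(void) {
  enum { n = 4 };
  int src[n][n], *p = &src[0][0];
  int sstride = n, i, j;
  for (i = 0; i < n; ++i)
    for (j = 0; j < n; ++j)
      src[i][j] = i * n + j;
  p += (n - 1) * sstride;  /* FLIPUD_PTR(p, sstride, n) */
  sstride = -sstride;
  for (i = 0; i < n; ++i)
    for (j = 0; j < n; ++j)
      assert(p[j * sstride + i] == src[n - 1 - j][i]);
}
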
void idst4(const tran_low_t *input, tran_low_t *output) {
// {sin(pi/5), sin(pi*2/5)} * sqrt(2/5) * sqrt(2)
static const int32_t sinvalue_lookup[] = {
@ -635,25 +688,41 @@ void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride,
};
int i, j;
tran_low_t out[4 * 4];
tran_low_t *outptr = out;
tran_low_t temp_in[4], temp_out[4];
tran_low_t tmp;
tran_low_t out[4][4];
tran_low_t *outp = &out[0][0];
int outstride = 4;
// inverse transform row vectors
for (i = 0; i < 4; ++i) {
IHT_4[tx_type].rows(input, outptr);
IHT_4[tx_type].rows(input, out[i]);
input += 4;
outptr += 4;
}
// transpose
for (i = 1 ; i < 4; i++) {
for (j = 0; j < i; j++) {
tmp = out[i][j];
out[i][j] = out[j][i];
out[j][i] = tmp;
}
}
// inverse transform column vectors
for (i = 0; i < 4; ++i) {
for (j = 0; j < 4; ++j)
temp_in[j] = out[j * 4 + i];
IHT_4[tx_type].cols(temp_in, temp_out);
IHT_4[tx_type].cols(out[i], out[i]);
}
#if CONFIG_EXT_TX
maybe_flip_strides(&dest, &stride, &outp, &outstride, tx_type, 4);
#endif
// Sum with the destination
for (i = 0; i < 4; ++i) {
for (j = 0; j < 4; ++j) {
dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
ROUND_POWER_OF_TWO(temp_out[j], 4));
int d = i * stride + j;
int s = j * outstride + i;
dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 4));
}
}
}
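
The same row transform / transpose / column transform / optional flip /
clamped sum structure is repeated below for the 8x8, 16x16 and
high-bitdepth variants; only the block size and the final rounding shift
(4, 5 or 6 bits) change. For reference, the rounding and clamping helpers
behave roughly like this sketch (the real definitions live in the vpx/vp9
headers):

#include <stdint.h>

/* Approximate behaviour of ROUND_POWER_OF_TWO and clip_pixel_add as used
 * in the sum loops; illustrative only. */
static int round_power_of_two(int value, int n) {
  return (value + (1 << (n - 1))) >> n;
}

static uint8_t clip_pixel_add_sketch(uint8_t dest, int trans) {
  int v = dest + trans;
  return (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v);
}
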
@ -756,97 +825,44 @@ static const transform_2d IHT_8[] = {
#endif // CONFIG_EXT_TX
};
#if CONFIG_EXT_TX
void fliplr(uint8_t *dest, int stride, int l) {
int i, j;
for (i = 0; i < l; ++i) {
for (j = 0; j < l / 2; ++j) {
const uint8_t tmp = dest[i * stride + j];
dest[i * stride + j] = dest[i * stride + l - 1 - j];
dest[i * stride + l - 1 - j] = tmp;
}
}
}
void flipud(uint8_t *dest, int stride, int l) {
int i, j;
for (j = 0; j < l; ++j) {
for (i = 0; i < l / 2; ++i) {
const uint8_t tmp = dest[i * stride + j];
dest[i * stride + j] = dest[(l - 1 - i) * stride + j];
dest[(l - 1 - i) * stride + j] = tmp;
}
}
}
void fliplrud(uint8_t *dest, int stride, int l) {
int i, j;
for (i = 0; i < l / 2; ++i) {
for (j = 0; j < l; ++j) {
const uint8_t tmp = dest[i * stride + j];
dest[i * stride + j] = dest[(l - 1 - i) * stride + l - 1 - j];
dest[(l - 1 - i) * stride + l - 1 - j] = tmp;
}
}
}
void fliplr16(uint16_t *dest, int stride, int l) {
int i, j;
for (i = 0; i < l; ++i) {
for (j = 0; j < l / 2; ++j) {
const uint16_t tmp = dest[i * stride + j];
dest[i * stride + j] = dest[i * stride + l - 1 - j];
dest[i * stride + l - 1 - j] = tmp;
}
}
}
void flipud16(uint16_t *dest, int stride, int l) {
int i, j;
for (j = 0; j < l; ++j) {
for (i = 0; i < l / 2; ++i) {
const uint16_t tmp = dest[i * stride + j];
dest[i * stride + j] = dest[(l - 1 - i) * stride + j];
dest[(l - 1 - i) * stride + j] = tmp;
}
}
}
void fliplrud16(uint16_t *dest, int stride, int l) {
int i, j;
for (i = 0; i < l / 2; ++i) {
for (j = 0; j < l; ++j) {
const uint16_t tmp = dest[i * stride + j];
dest[i * stride + j] = dest[(l - 1 - i) * stride + l - 1 - j];
dest[(l - 1 - i) * stride + l - 1 - j] = tmp;
}
}
}
#endif // CONFIG_EXT_TX
void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride,
int tx_type) {
int i, j;
tran_low_t out[8 * 8];
tran_low_t *outptr = out;
tran_low_t temp_in[8], temp_out[8];
const transform_2d ht = IHT_8[tx_type];
tran_low_t tmp;
tran_low_t out[8][8];
tran_low_t *outp = &out[0][0];
int outstride = 8;
// inverse transform row vectors
for (i = 0; i < 8; ++i) {
ht.rows(input, outptr);
input += 8;
outptr += 8;
IHT_8[tx_type].rows(input, out[i]);
input += 8;
}
// transpose
for (i = 1 ; i < 8; i++) {
for (j = 0; j < i; j++) {
tmp = out[i][j];
out[i][j] = out[j][i];
out[j][i] = tmp;
}
}
// inverse transform column vectors
for (i = 0; i < 8; ++i) {
for (j = 0; j < 8; ++j)
temp_in[j] = out[j * 8 + i];
ht.cols(temp_in, temp_out);
IHT_8[tx_type].cols(out[i], out[i]);
}
#if CONFIG_EXT_TX
maybe_flip_strides(&dest, &stride, &outp, &outstride, tx_type, 8);
#endif
// Sum with the destination
for (i = 0; i < 8; ++i) {
for (j = 0; j < 8; ++j) {
dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
ROUND_POWER_OF_TWO(temp_out[j], 5));
int d = i * stride + j;
int s = j * outstride + i;
dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 5));
}
}
}
@ -1291,26 +1307,41 @@ static const transform_2d IHT_16[] = {
void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride,
int tx_type) {
int i, j;
tran_low_t out[16 * 16];
tran_low_t *outptr = out;
tran_low_t temp_in[16], temp_out[16];
const transform_2d ht = IHT_16[tx_type];
tran_low_t tmp;
tran_low_t out[16][16];
tran_low_t *outp = &out[0][0];
int outstride = 16;
// Rows
// inverse transform row vectors
for (i = 0; i < 16; ++i) {
ht.rows(input, outptr);
input += 16;
outptr += 16;
IHT_16[tx_type].rows(input, out[i]);
input += 16;
}
// Columns
// transpose
for (i = 1 ; i < 16; i++) {
for (j = 0; j < i; j++) {
tmp = out[i][j];
out[i][j] = out[j][i];
out[j][i] = tmp;
}
}
// inverse transform column vectors
for (i = 0; i < 16; ++i) {
IHT_16[tx_type].cols(out[i], out[i]);
}
#if CONFIG_EXT_TX
maybe_flip_strides(&dest, &stride, &outp, &outstride, tx_type, 16);
#endif
// Sum with the destination
for (i = 0; i < 16; ++i) {
for (j = 0; j < 16; ++j)
temp_in[j] = out[j * 16 + i];
ht.cols(temp_in, temp_out);
for (j = 0; j < 16; ++j) {
dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
ROUND_POWER_OF_TWO(temp_out[j], 6));
int d = i * stride + j;
int s = j * outstride + i;
dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 6));
}
}
}
@ -1911,26 +1942,6 @@ void vp9_iht4x4_add(TX_TYPE tx_type, const tran_low_t *input, uint8_t *dest,
#if CONFIG_EXT_TX
} else if (is_dst_used(tx_type)) {
vp9_iht4x4_16_add_c(input, dest, stride, tx_type);
} else if (tx_type == FLIPADST_DCT) {
flipud(dest, stride, 4);
vp9_iht4x4_16_add(input, dest, stride, ADST_DCT);
flipud(dest, stride, 4);
} else if (tx_type == DCT_FLIPADST) {
fliplr(dest, stride, 4);
vp9_iht4x4_16_add(input, dest, stride, DCT_ADST);
fliplr(dest, stride, 4);
} else if (tx_type == FLIPADST_FLIPADST) {
fliplrud(dest, stride, 4);
vp9_iht4x4_16_add(input, dest, stride, ADST_ADST);
fliplrud(dest, stride, 4);
} else if (tx_type == ADST_FLIPADST) {
fliplr(dest, stride, 4);
vp9_iht4x4_16_add(input, dest, stride, ADST_ADST);
fliplr(dest, stride, 4);
} else if (tx_type == FLIPADST_ADST) {
flipud(dest, stride, 4);
vp9_iht4x4_16_add(input, dest, stride, ADST_ADST);
flipud(dest, stride, 4);
#endif // CONFIG_EXT_TX
} else {
vp9_iht4x4_16_add(input, dest, stride, tx_type);
@ -1944,26 +1955,6 @@ void vp9_iht8x8_add(TX_TYPE tx_type, const tran_low_t *input, uint8_t *dest,
#if CONFIG_EXT_TX
} else if (is_dst_used(tx_type)) {
vp9_iht8x8_64_add_c(input, dest, stride, tx_type);
} else if (tx_type == FLIPADST_DCT) {
flipud(dest, stride, 8);
vp9_iht8x8_64_add(input, dest, stride, ADST_DCT);
flipud(dest, stride, 8);
} else if (tx_type == DCT_FLIPADST) {
fliplr(dest, stride, 8);
vp9_iht8x8_64_add(input, dest, stride, DCT_ADST);
fliplr(dest, stride, 8);
} else if (tx_type == FLIPADST_FLIPADST) {
fliplrud(dest, stride, 8);
vp9_iht8x8_64_add(input, dest, stride, ADST_ADST);
fliplrud(dest, stride, 8);
} else if (tx_type == ADST_FLIPADST) {
fliplr(dest, stride, 8);
vp9_iht8x8_64_add(input, dest, stride, ADST_ADST);
fliplr(dest, stride, 8);
} else if (tx_type == FLIPADST_ADST) {
flipud(dest, stride, 8);
vp9_iht8x8_64_add(input, dest, stride, ADST_ADST);
flipud(dest, stride, 8);
#endif // CONFIG_EXT_TX
} else {
vp9_iht8x8_64_add(input, dest, stride, tx_type);
@ -1977,26 +1968,6 @@ void vp9_iht16x16_add(TX_TYPE tx_type, const tran_low_t *input, uint8_t *dest,
#if CONFIG_EXT_TX
} else if (is_dst_used(tx_type)) {
vp9_iht16x16_256_add_c(input, dest, stride, tx_type);
} else if (tx_type == FLIPADST_DCT) {
flipud(dest, stride, 16);
vp9_iht16x16_256_add(input, dest, stride, ADST_DCT);
flipud(dest, stride, 16);
} else if (tx_type == DCT_FLIPADST) {
fliplr(dest, stride, 16);
vp9_iht16x16_256_add(input, dest, stride, DCT_ADST);
fliplr(dest, stride, 16);
} else if (tx_type == FLIPADST_FLIPADST) {
fliplrud(dest, stride, 16);
vp9_iht16x16_256_add(input, dest, stride, ADST_ADST);
fliplrud(dest, stride, 16);
} else if (tx_type == ADST_FLIPADST) {
fliplr(dest, stride, 16);
vp9_iht16x16_256_add(input, dest, stride, ADST_ADST);
fliplr(dest, stride, 16);
} else if (tx_type == FLIPADST_ADST) {
flipud(dest, stride, 16);
vp9_iht16x16_256_add(input, dest, stride, ADST_ADST);
flipud(dest, stride, 16);
#endif // CONFIG_EXT_TX
} else {
vp9_iht16x16_256_add(input, dest, stride, tx_type);
@ -2775,7 +2746,7 @@ static void highbd_iadst4(const tran_low_t *input, tran_low_t *output, int bd) {
void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
int stride, int tx_type, int bd) {
const highbd_transform_2d IHT_4[] = {
const highbd_transform_2d HIGH_IHT_4[] = {
{ vp9_highbd_idct4, vp9_highbd_idct4 }, // DCT_DCT = 0
{ highbd_iadst4, vp9_highbd_idct4 }, // ADST_DCT = 1
{ vp9_highbd_idct4, highbd_iadst4 }, // DCT_ADST = 2
@ -2798,25 +2769,43 @@ void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
int i, j;
tran_low_t out[4 * 4];
tran_low_t *outptr = out;
tran_low_t temp_in[4], temp_out[4];
tran_low_t tmp;
tran_low_t out[4][4];
tran_low_t *outp = &out[0][0];
int outstride = 4;
// Inverse transform row vectors.
// inverse transform row vectors
for (i = 0; i < 4; ++i) {
IHT_4[tx_type].rows(input, outptr, bd);
HIGH_IHT_4[tx_type].rows(input, out[i], bd);
input += 4;
outptr += 4;
}
// Inverse transform column vectors.
// transpose
for (i = 1 ; i < 4; i++) {
for (j = 0; j < i; j++) {
tmp = out[i][j];
out[i][j] = out[j][i];
out[j][i] = tmp;
}
}
// inverse transform column vectors
for (i = 0; i < 4; ++i) {
HIGH_IHT_4[tx_type].cols(out[i], out[i], bd);
}
#if CONFIG_EXT_TX
maybe_flip_strides((uint8_t**)&dest,
&stride, &outp, &outstride, tx_type, 4 * 2);
#endif
// Sum with the destination
for (i = 0; i < 4; ++i) {
for (j = 0; j < 4; ++j)
temp_in[j] = out[j * 4 + i];
IHT_4[tx_type].cols(temp_in, temp_out, bd);
for (j = 0; j < 4; ++j) {
dest[j * stride + i] = highbd_clip_pixel_add(
dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd);
int d = i * stride + j;
int s = j * outstride + i;
dest[d] = highbd_clip_pixel_add(dest[d],
ROUND_POWER_OF_TWO(outp[s], 4), bd);
}
}
}
@ -2921,28 +2910,46 @@ static const highbd_transform_2d HIGH_IHT_8[] = {
void vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest8,
int stride, int tx_type, int bd) {
int i, j;
tran_low_t out[8 * 8];
tran_low_t *outptr = out;
tran_low_t temp_in[8], temp_out[8];
const highbd_transform_2d ht = HIGH_IHT_8[tx_type];
uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
// Inverse transform row vectors.
int i, j;
tran_low_t tmp;
tran_low_t out[8][8];
tran_low_t *outp = &out[0][0];
int outstride = 8;
// inverse transform row vectors
for (i = 0; i < 8; ++i) {
ht.rows(input, outptr, bd);
input += 8;
outptr += 8;
HIGH_IHT_8[tx_type].rows(input, out[i], bd);
input += 8;
}
// Inverse transform column vectors.
// transpose
for (i = 1 ; i < 8; i++) {
for (j = 0; j < i; j++) {
tmp = out[i][j];
out[i][j] = out[j][i];
out[j][i] = tmp;
}
}
// inverse transform column vectors
for (i = 0; i < 8; ++i) {
HIGH_IHT_8[tx_type].cols(out[i], out[i], bd);
}
#if CONFIG_EXT_TX
maybe_flip_strides((uint8_t**)&dest,
&stride, &outp, &outstride, tx_type, 8 * 2);
#endif
// Sum with the destination
for (i = 0; i < 8; ++i) {
for (j = 0; j < 8; ++j)
temp_in[j] = out[j * 8 + i];
ht.cols(temp_in, temp_out, bd);
for (j = 0; j < 8; ++j) {
dest[j * stride + i] = highbd_clip_pixel_add(
dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
int d = i * stride + j;
int s = j * outstride + i;
dest[d] = highbd_clip_pixel_add(dest[d],
ROUND_POWER_OF_TWO(outp[s], 5), bd);
}
}
}
@ -3361,28 +3368,46 @@ static const highbd_transform_2d HIGH_IHT_16[] = {
void vp9_highbd_iht16x16_256_add_c(const tran_low_t *input, uint8_t *dest8,
int stride, int tx_type, int bd) {
int i, j;
tran_low_t out[16 * 16];
tran_low_t *outptr = out;
tran_low_t temp_in[16], temp_out[16];
const highbd_transform_2d ht = HIGH_IHT_16[tx_type];
uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
// Rows
int i, j;
tran_low_t tmp;
tran_low_t out[16][16];
tran_low_t *outp = &out[0][0];
int outstride = 16;
// inverse transform row vectors
for (i = 0; i < 16; ++i) {
ht.rows(input, outptr, bd);
input += 16;
outptr += 16;
HIGH_IHT_16[tx_type].rows(input, out[i], bd);
input += 16;
}
// Columns
// transpose
for (i = 1 ; i < 16; i++) {
for (j = 0; j < i; j++) {
tmp = out[i][j];
out[i][j] = out[j][i];
out[j][i] = tmp;
}
}
// inverse transform column vectors
for (i = 0; i < 16; ++i) {
HIGH_IHT_16[tx_type].cols(out[i], out[i], bd);
}
#if CONFIG_EXT_TX
maybe_flip_strides((uint8_t**)&dest, &stride,
&outp, &outstride, tx_type, 16 * 2);
#endif
// Sum with the destination
for (i = 0; i < 16; ++i) {
for (j = 0; j < 16; ++j)
temp_in[j] = out[j * 16 + i];
ht.cols(temp_in, temp_out, bd);
for (j = 0; j < 16; ++j) {
dest[j * stride + i] = highbd_clip_pixel_add(
dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
int d = i * stride + j;
int s = j * outstride + i;
dest[d] = highbd_clip_pixel_add(dest[d],
ROUND_POWER_OF_TWO(outp[s], 6), bd);
}
}
}
@ -3954,26 +3979,6 @@ void vp9_highbd_iht4x4_add(TX_TYPE tx_type, const tran_low_t *input,
#if CONFIG_EXT_TX
} else if (is_dst_used(tx_type)) {
vp9_highbd_iht4x4_16_add_c(input, dest, stride, tx_type, bd);
} else if (tx_type == FLIPADST_DCT) {
flipud16(CONVERT_TO_SHORTPTR(dest), stride, 4);
vp9_highbd_iht4x4_16_add(input, dest, stride, ADST_DCT, bd);
flipud16(CONVERT_TO_SHORTPTR(dest), stride, 4);
} else if (tx_type == DCT_FLIPADST) {
fliplr16(CONVERT_TO_SHORTPTR(dest), stride, 4);
vp9_highbd_iht4x4_16_add(input, dest, stride, DCT_ADST, bd);
fliplr16(CONVERT_TO_SHORTPTR(dest), stride, 4);
} else if (tx_type == FLIPADST_FLIPADST) {
fliplrud16(CONVERT_TO_SHORTPTR(dest), stride, 4);
vp9_highbd_iht4x4_16_add(input, dest, stride, ADST_ADST, bd);
fliplrud16(CONVERT_TO_SHORTPTR(dest), stride, 4);
} else if (tx_type == ADST_FLIPADST) {
fliplr16(CONVERT_TO_SHORTPTR(dest), stride, 4);
vp9_highbd_iht4x4_16_add(input, dest, stride, ADST_ADST, bd);
fliplr16(CONVERT_TO_SHORTPTR(dest), stride, 4);
} else if (tx_type == FLIPADST_ADST) {
flipud16(CONVERT_TO_SHORTPTR(dest), stride, 4);
vp9_highbd_iht4x4_16_add(input, dest, stride, ADST_ADST, bd);
flipud16(CONVERT_TO_SHORTPTR(dest), stride, 4);
#endif // CONFIG_EXT_TX
} else {
vp9_highbd_iht4x4_16_add(input, dest, stride, tx_type, bd);
@ -3987,26 +3992,6 @@ void vp9_highbd_iht8x8_add(TX_TYPE tx_type, const tran_low_t *input,
#if CONFIG_EXT_TX
} else if (is_dst_used(tx_type)) {
vp9_highbd_iht8x8_64_add_c(input, dest, stride, tx_type, bd);
} else if (tx_type == FLIPADST_DCT) {
flipud16(CONVERT_TO_SHORTPTR(dest), stride, 8);
vp9_highbd_iht8x8_64_add(input, dest, stride, ADST_DCT, bd);
flipud16(CONVERT_TO_SHORTPTR(dest), stride, 8);
} else if (tx_type == DCT_FLIPADST) {
fliplr16(CONVERT_TO_SHORTPTR(dest), stride, 8);
vp9_highbd_iht8x8_64_add(input, dest, stride, DCT_ADST, bd);
fliplr16(CONVERT_TO_SHORTPTR(dest), stride, 8);
} else if (tx_type == FLIPADST_FLIPADST) {
fliplrud16(CONVERT_TO_SHORTPTR(dest), stride, 8);
vp9_highbd_iht8x8_64_add(input, dest, stride, ADST_ADST, bd);
fliplrud16(CONVERT_TO_SHORTPTR(dest), stride, 8);
} else if (tx_type == ADST_FLIPADST) {
fliplr16(CONVERT_TO_SHORTPTR(dest), stride, 8);
vp9_highbd_iht8x8_64_add(input, dest, stride, ADST_ADST, bd);
fliplr16(CONVERT_TO_SHORTPTR(dest), stride, 8);
} else if (tx_type == FLIPADST_ADST) {
flipud16(CONVERT_TO_SHORTPTR(dest), stride, 8);
vp9_highbd_iht8x8_64_add(input, dest, stride, ADST_ADST, bd);
flipud16(CONVERT_TO_SHORTPTR(dest), stride, 8);
#endif // CONFIG_EXT_TX
} else {
vp9_highbd_iht8x8_64_add(input, dest, stride, tx_type, bd);
@ -4020,26 +4005,6 @@ void vp9_highbd_iht16x16_add(TX_TYPE tx_type, const tran_low_t *input,
#if CONFIG_EXT_TX
} else if (is_dst_used(tx_type)) {
vp9_highbd_iht16x16_256_add_c(input, dest, stride, tx_type, bd);
} else if (tx_type == FLIPADST_DCT) {
flipud16(CONVERT_TO_SHORTPTR(dest), stride, 16);
vp9_highbd_iht16x16_256_add(input, dest, stride, ADST_DCT, bd);
flipud16(CONVERT_TO_SHORTPTR(dest), stride, 16);
} else if (tx_type == DCT_FLIPADST) {
fliplr16(CONVERT_TO_SHORTPTR(dest), stride, 16);
vp9_highbd_iht16x16_256_add(input, dest, stride, DCT_ADST, bd);
fliplr16(CONVERT_TO_SHORTPTR(dest), stride, 16);
} else if (tx_type == FLIPADST_FLIPADST) {
fliplrud16(CONVERT_TO_SHORTPTR(dest), stride, 16);
vp9_highbd_iht16x16_256_add(input, dest, stride, ADST_ADST, bd);
fliplrud16(CONVERT_TO_SHORTPTR(dest), stride, 16);
} else if (tx_type == ADST_FLIPADST) {
fliplr16(CONVERT_TO_SHORTPTR(dest), stride, 16);
vp9_highbd_iht16x16_256_add(input, dest, stride, ADST_ADST, bd);
fliplr16(CONVERT_TO_SHORTPTR(dest), stride, 16);
} else if (tx_type == FLIPADST_ADST) {
flipud16(CONVERT_TO_SHORTPTR(dest), stride, 16);
vp9_highbd_iht16x16_256_add(input, dest, stride, ADST_ADST, bd);
flipud16(CONVERT_TO_SHORTPTR(dest), stride, 16);
#endif // CONFIG_EXT_TX
} else {
vp9_highbd_iht16x16_256_add(input, dest, stride, tx_type, bd);
@ -4276,6 +4241,19 @@ void vp9_iht4x4_16_c(const tran_low_t *input, int16_t *dest, int stride,
tran_low_t *outptr = out;
tran_low_t temp_in[4], temp_out[4];
// FIXME: If the SR_MODE experiment is resurrected, then this function must
// be fixed to handle the FLIPADST cases by actually flipping its output
// See the other vp9_iht*add_c functions
#if CONFIG_EXT_TX
assert(tx_type != FLIPADST_DCT);
assert(tx_type != DCT_FLIPADST);
assert(tx_type != FLIPADST_FLIPADST);
assert(tx_type != ADST_FLIPADST);
assert(tx_type != FLIPADST_ADST);
assert(tx_type != DST_FLIPADST);
assert(tx_type != FLIPADST_DST);
#endif // CONFIG_EXT_TX
// inverse transform row vectors
for (i = 0; i < 4; ++i) {
IHT_4[tx_type].rows(input, outptr);
@ -4302,6 +4280,19 @@ void vp9_iht8x8_64_c(const tran_low_t *input, int16_t *dest, int stride,
tran_low_t temp_in[8], temp_out[8];
const transform_2d ht = IHT_8[tx_type];
// FIXME: If the SR_MODE experiment is resurrected, then this function must
// be fixed to handle the FLIPADST cases by actually flipping its output
// See the other vp9_iht*add_c functions
#if CONFIG_EXT_TX
assert(tx_type != FLIPADST_DCT);
assert(tx_type != DCT_FLIPADST);
assert(tx_type != FLIPADST_FLIPADST);
assert(tx_type != ADST_FLIPADST);
assert(tx_type != FLIPADST_ADST);
assert(tx_type != DST_FLIPADST);
assert(tx_type != FLIPADST_DST);
#endif // CONFIG_EXT_TX
// inverse transform row vectors
for (i = 0; i < 8; ++i) {
ht.rows(input, outptr);
@ -4378,6 +4369,19 @@ void vp9_iht16x16_256_c(const tran_low_t *input, int16_t *dest, int stride,
tran_low_t temp_in[16], temp_out[16];
const transform_2d ht = IHT_16[tx_type];
// FIXME: If the SR_MODE experiment is resurrected, then this function must
// be fixed to handle the FLIPADST cases by actually flipping its output
// See the other vp9_iht*add_c functions
#if CONFIG_EXT_TX
assert(tx_type != FLIPADST_DCT);
assert(tx_type != DCT_FLIPADST);
assert(tx_type != FLIPADST_FLIPADST);
assert(tx_type != ADST_FLIPADST);
assert(tx_type != FLIPADST_ADST);
assert(tx_type != DST_FLIPADST);
assert(tx_type != FLIPADST_DST);
#endif // CONFIG_EXT_TX
// Rows
for (i = 0; i < 16; ++i) {
ht.rows(input, outptr);
@ -4582,26 +4586,6 @@ void vp9_iht4x4(TX_TYPE tx_type, const tran_low_t *input, int16_t *dest,
#if CONFIG_EXT_TX
} else if (is_dst_used(tx_type)) {
vp9_iht4x4_16_c(input, dest, stride, tx_type);
} else if (tx_type == FLIPADST_DCT) {
flipud(dest, stride, 4);
vp9_iht4x4_16(input, dest, stride, ADST_DCT);
flipud(dest, stride, 4);
} else if (tx_type == DCT_FLIPADST) {
fliplr(dest, stride, 4);
vp9_iht4x4_16(input, dest, stride, DCT_ADST);
fliplr(dest, stride, 4);
} else if (tx_type == FLIPADST_FLIPADST) {
fliplrud(dest, stride, 4);
vp9_iht4x4_16(input, dest, stride, ADST_ADST);
fliplrud(dest, stride, 4);
} else if (tx_type == ADST_FLIPADST) {
fliplr(dest, stride, 4);
vp9_iht4x4_16(input, dest, stride, ADST_ADST);
fliplr(dest, stride, 4);
} else if (tx_type == FLIPADST_ADST) {
flipud(dest, stride, 4);
vp9_iht4x4_16(input, dest, stride, ADST_ADST);
flipud(dest, stride, 4);
#endif // CONFIG_EXT_TX
} else {
vp9_iht4x4_16(input, dest, stride, tx_type);
@ -4615,26 +4599,6 @@ void vp9_iht8x8(TX_TYPE tx_type, const tran_low_t *input, int16_t *dest,
#if CONFIG_EXT_TX
} else if (is_dst_used(tx_type)) {
vp9_iht8x8_64_c(input, dest, stride, tx_type);
} else if (tx_type == FLIPADST_DCT) {
flipud(dest, stride, 8);
vp9_iht8x8_64(input, dest, stride, ADST_DCT);
flipud(dest, stride, 8);
} else if (tx_type == DCT_FLIPADST) {
fliplr(dest, stride, 8);
vp9_iht8x8_64(input, dest, stride, DCT_ADST);
fliplr(dest, stride, 8);
} else if (tx_type == FLIPADST_FLIPADST) {
fliplrud(dest, stride, 8);
vp9_iht8x8_64(input, dest, stride, ADST_ADST);
fliplrud(dest, stride, 8);
} else if (tx_type == ADST_FLIPADST) {
fliplr(dest, stride, 8);
vp9_iht8x8_64(input, dest, stride, ADST_ADST);
fliplr(dest, stride, 8);
} else if (tx_type == FLIPADST_ADST) {
flipud(dest, stride, 8);
vp9_iht8x8_64(input, dest, stride, ADST_ADST);
flipud(dest, stride, 8);
#endif // CONFIG_EXT_TX
} else {
vp9_iht8x8_64(input, dest, stride, tx_type);
@ -4648,26 +4612,6 @@ void vp9_iht16x16(TX_TYPE tx_type, const tran_low_t *input, int16_t *dest,
#if CONFIG_EXT_TX
} else if (is_dst_used(tx_type)) {
vp9_iht16x16_256_c(input, dest, stride, tx_type);
} else if (tx_type == FLIPADST_DCT) {
flipud(dest, stride, 16);
vp9_iht16x16_256(input, dest, stride, ADST_DCT);
flipud(dest, stride, 16);
} else if (tx_type == DCT_FLIPADST) {
fliplr(dest, stride, 16);
vp9_iht16x16_256(input, dest, stride, DCT_ADST);
fliplr(dest, stride, 16);
} else if (tx_type == FLIPADST_FLIPADST) {
fliplrud(dest, stride, 16);
vp9_iht16x16_256(input, dest, stride, ADST_ADST);
fliplrud(dest, stride, 16);
} else if (tx_type == ADST_FLIPADST) {
fliplr(dest, stride, 16);
vp9_iht16x16_256(input, dest, stride, ADST_ADST);
fliplr(dest, stride, 16);
} else if (tx_type == FLIPADST_ADST) {
flipud(dest, stride, 16);
vp9_iht16x16_256(input, dest, stride, ADST_ADST);
flipud(dest, stride, 16);
#endif // CONFIG_EXT_TX
} else {
vp9_iht16x16_256(input, dest, stride, tx_type);

@ -11,6 +11,55 @@
#include "vp9/common/x86/vp9_idct_intrin_sse2.h"
#include "vp9/common/vp9_idct.h"
#include "vp9/common/vp9_enums.h"
#if CONFIG_EXT_TX
// Reverse the 8 16 bit words in __m128i
static INLINE __m128i mm_reverse_epi16(const __m128i x) {
const __m128i a = _mm_shufflelo_epi16(x, 0x1b);
const __m128i b = _mm_shufflehi_epi16(a, 0x1b);
return _mm_shuffle_epi32(b, 0x4e);
}
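
A note on the shuffle immediates (my reading of the intrinsics, not text
from the patch): 0x1b encodes the permutation 3,2,1,0, so the
_mm_shufflelo_epi16 / _mm_shufflehi_epi16 pair reverses each group of
four 16-bit words, and 0x4e then selects the 32-bit lanes in the order
2,3,0,1, swapping the two 64-bit halves and leaving all eight words
reversed. A scalar equivalent:

#include <stdint.h>

/* Scalar sketch of mm_reverse_epi16: reverse eight 16-bit values. */
static void reverse8_u16(uint16_t w[8]) {
  int i;
  for (i = 0; i < 4; ++i) {
    uint16_t t = w[i];
    w[i] = w[7 - i];
    w[7 - i] = t;
  }
}
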
static INLINE void fliplr_4x4(__m128i in[2]) {
in[0] = _mm_shufflelo_epi16(in[0], 0x1b);
in[0] = _mm_shufflehi_epi16(in[0], 0x1b);
in[1] = _mm_shufflelo_epi16(in[1], 0x1b);
in[1] = _mm_shufflehi_epi16(in[1], 0x1b);
}
static INLINE void fliplr_8x8(__m128i in[8]) {
in[0] = mm_reverse_epi16(in[0]);
in[1] = mm_reverse_epi16(in[1]);
in[2] = mm_reverse_epi16(in[2]);
in[3] = mm_reverse_epi16(in[3]);
in[4] = mm_reverse_epi16(in[4]);
in[5] = mm_reverse_epi16(in[5]);
in[6] = mm_reverse_epi16(in[6]);
in[7] = mm_reverse_epi16(in[7]);
}
static INLINE void fliplr_16x8(__m128i in[16]) {
fliplr_8x8(&in[0]);
fliplr_8x8(&in[8]);
}
#define FLIPLR_16x16(in0, in1) do { \
__m128i *tmp; \
fliplr_16x8(in0); \
fliplr_16x8(in1); \
tmp = (in0); \
(in0) = (in1); \
(in1) = tmp; \
} while (0)
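
For context (a summary plus a scalar sketch, not code from the patch):
the 16x16 rows are held as two arrays of sixteen __m128i, in0 covering
the first eight 16-bit values of each row and in1 the last eight, so a
left/right flip has to reverse the words inside each half and then
exchange the halves, which is why FLIPLR_16x16 swaps the in0/in1
pointers after calling fliplr_16x8 on both.

#include <stdint.h>

/* Scalar sketch: left/right flip of one 16-wide row stored as two
 * 8-wide halves -- reverse each half, then swap the halves. */
static void fliplr_row16(uint16_t lo[8], uint16_t hi[8]) {
  uint16_t tmp[8];
  int i;
  for (i = 0; i < 8; ++i) tmp[i] = lo[7 - i];  /* reversed low half */
  for (i = 0; i < 8; ++i) lo[i] = hi[7 - i];   /* reversed high half */
  for (i = 0; i < 8; ++i) hi[i] = tmp[i];      /* into the high half */
}
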
#define FLIPUD_PTR(dest, stride, size) do { \
(dest) = (dest) + ((size) - 1) * (stride); \
(stride) = - (stride); \
} while (0)
#endif
#define RECON_AND_STORE4X4(dest, in_x) \
{ \
__m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest)); \
@ -126,12 +175,12 @@ void vp9_idct4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
// Reconstruction and Store
{
__m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest));
__m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 0));
__m128i d1 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 1));
__m128i d2 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 2));
d0 = _mm_unpacklo_epi32(d0,
_mm_cvtsi32_si128(*(const int *) (dest + stride)));
d2 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(
*(const int *) (dest + stride * 3)), d2);
__m128i d3 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 3));
d0 = _mm_unpacklo_epi32(d0, d1);
d2 = _mm_unpacklo_epi32(d3, d2);
d0 = _mm_unpacklo_epi8(d0, zero);
d2 = _mm_unpacklo_epi8(d2, zero);
d0 = _mm_add_epi16(d0, input2);
@ -271,22 +320,50 @@ void vp9_iht4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride,
in[1]= _mm_loadu_si128((const __m128i *)(input + 8));
switch (tx_type) {
case 0: // DCT_DCT
case DCT_DCT:
idct4_sse2(in);
idct4_sse2(in);
break;
case 1: // ADST_DCT
case ADST_DCT:
idct4_sse2(in);
iadst4_sse2(in);
break;
case 2: // DCT_ADST
case DCT_ADST:
iadst4_sse2(in);
idct4_sse2(in);
break;
case 3: // ADST_ADST
case ADST_ADST:
iadst4_sse2(in);
iadst4_sse2(in);
break;
#if CONFIG_EXT_TX
case FLIPADST_DCT:
idct4_sse2(in);
iadst4_sse2(in);
FLIPUD_PTR(dest, stride, 4);
break;
case DCT_FLIPADST:
iadst4_sse2(in);
idct4_sse2(in);
fliplr_4x4(in);
break;
case FLIPADST_FLIPADST:
iadst4_sse2(in);
iadst4_sse2(in);
FLIPUD_PTR(dest, stride, 4);
fliplr_4x4(in);
break;
case ADST_FLIPADST:
iadst4_sse2(in);
iadst4_sse2(in);
fliplr_4x4(in);
break;
case FLIPADST_ADST:
iadst4_sse2(in);
iadst4_sse2(in);
FLIPUD_PTR(dest, stride, 4);
break;
#endif // CONFIG_EXT_TX
default:
assert(0);
break;
@ -875,22 +952,50 @@ void vp9_iht8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride,
in[7] = _mm_load_si128((const __m128i *)(input + 8 * 7));
switch (tx_type) {
case 0: // DCT_DCT
case DCT_DCT:
idct8_sse2(in);
idct8_sse2(in);
break;
case 1: // ADST_DCT
case ADST_DCT:
idct8_sse2(in);
iadst8_sse2(in);
break;
case 2: // DCT_ADST
case DCT_ADST:
iadst8_sse2(in);
idct8_sse2(in);
break;
case 3: // ADST_ADST
case ADST_ADST:
iadst8_sse2(in);
iadst8_sse2(in);
break;
#if CONFIG_EXT_TX
case FLIPADST_DCT:
idct8_sse2(in);
iadst8_sse2(in);
FLIPUD_PTR(dest, stride, 8);
break;
case DCT_FLIPADST:
iadst8_sse2(in);
idct8_sse2(in);
fliplr_8x8(in);
break;
case FLIPADST_FLIPADST:
iadst8_sse2(in);
iadst8_sse2(in);
FLIPUD_PTR(dest, stride, 8);
fliplr_8x8(in);
break;
case ADST_FLIPADST:
iadst8_sse2(in);
iadst8_sse2(in);
fliplr_8x8(in);
break;
case FLIPADST_ADST:
iadst8_sse2(in);
iadst8_sse2(in);
FLIPUD_PTR(dest, stride, 8);
break;
#endif // CONFIG_EXT_TX
default:
assert(0);
break;
@ -2331,29 +2436,59 @@ static void iadst16_sse2(__m128i *in0, __m128i *in1) {
void vp9_iht16x16_256_add_sse2(const int16_t *input, uint8_t *dest, int stride,
int tx_type) {
__m128i in0[16], in1[16];
__m128i in[32];
__m128i *in0 = &in[0];
__m128i *in1 = &in[16];
load_buffer_8x16(input, in0);
input += 8;
load_buffer_8x16(input, in1);
switch (tx_type) {
case 0: // DCT_DCT
case DCT_DCT:
idct16_sse2(in0, in1);
idct16_sse2(in0, in1);
break;
case 1: // ADST_DCT
case ADST_DCT:
idct16_sse2(in0, in1);
iadst16_sse2(in0, in1);
break;
case 2: // DCT_ADST
case DCT_ADST:
iadst16_sse2(in0, in1);
idct16_sse2(in0, in1);
break;
case 3: // ADST_ADST
case ADST_ADST:
iadst16_sse2(in0, in1);
iadst16_sse2(in0, in1);
break;
#if CONFIG_EXT_TX
case FLIPADST_DCT:
idct16_sse2(in0, in1);
iadst16_sse2(in0, in1);
FLIPUD_PTR(dest, stride, 16);
break;
case DCT_FLIPADST:
iadst16_sse2(in0, in1);
idct16_sse2(in0, in1);
FLIPLR_16x16(in0, in1);
break;
case FLIPADST_FLIPADST:
iadst16_sse2(in0, in1);
iadst16_sse2(in0, in1);
FLIPUD_PTR(dest, stride, 16);
FLIPLR_16x16(in0, in1);
break;
case ADST_FLIPADST:
iadst16_sse2(in0, in1);
iadst16_sse2(in0, in1);
FLIPLR_16x16(in0, in1);
break;
case FLIPADST_ADST:
iadst16_sse2(in0, in1);
iadst16_sse2(in0, in1);
FLIPUD_PTR(dest, stride, 16);
break;
#endif // CONFIG_EXT_TX
default:
assert(0);
break;