Merge "Integrate HBD inverse HT flip types sse4.1 optimization" into nextgenv2
@@ -15,6 +15,7 @@
 #include "test/clear_system_state.h"
 #include "test/register_state_check.h"
 #include "test/util.h"
+#include "vp10/common/enums.h"
 #include "vpx_dsp/vpx_dsp_common.h"
 #include "vpx_ports/mem.h"

@@ -149,32 +150,68 @@ using std::tr1::make_tuple;

 const IHbdHtParam kArrayIhtParam[] = {
   // 16x16
-  make_tuple(PARAM_LIST_16X16, 0, 10),
-  make_tuple(PARAM_LIST_16X16, 0, 12),
-  make_tuple(PARAM_LIST_16X16, 1, 10),
-  make_tuple(PARAM_LIST_16X16, 1, 12),
-  make_tuple(PARAM_LIST_16X16, 2, 10),
-  make_tuple(PARAM_LIST_16X16, 2, 12),
-  make_tuple(PARAM_LIST_16X16, 3, 10),
-  make_tuple(PARAM_LIST_16X16, 3, 12),
+  make_tuple(PARAM_LIST_16X16, DCT_DCT, 10),
+  make_tuple(PARAM_LIST_16X16, DCT_DCT, 12),
+  make_tuple(PARAM_LIST_16X16, ADST_DCT, 10),
+  make_tuple(PARAM_LIST_16X16, ADST_DCT, 12),
+  make_tuple(PARAM_LIST_16X16, DCT_ADST, 10),
+  make_tuple(PARAM_LIST_16X16, DCT_ADST, 12),
+  make_tuple(PARAM_LIST_16X16, ADST_ADST, 10),
+  make_tuple(PARAM_LIST_16X16, ADST_ADST, 12),
+#if CONFIG_EXT_TX
+  make_tuple(PARAM_LIST_16X16, FLIPADST_DCT, 10),
+  make_tuple(PARAM_LIST_16X16, FLIPADST_DCT, 12),
+  make_tuple(PARAM_LIST_16X16, DCT_FLIPADST, 10),
+  make_tuple(PARAM_LIST_16X16, DCT_FLIPADST, 12),
+  make_tuple(PARAM_LIST_16X16, FLIPADST_FLIPADST, 10),
+  make_tuple(PARAM_LIST_16X16, FLIPADST_FLIPADST, 12),
+  make_tuple(PARAM_LIST_16X16, ADST_FLIPADST, 10),
+  make_tuple(PARAM_LIST_16X16, ADST_FLIPADST, 12),
+  make_tuple(PARAM_LIST_16X16, FLIPADST_ADST, 10),
+  make_tuple(PARAM_LIST_16X16, FLIPADST_ADST, 12),
+#endif
   // 8x8
-  make_tuple(PARAM_LIST_8X8, 0, 10),
-  make_tuple(PARAM_LIST_8X8, 0, 12),
-  make_tuple(PARAM_LIST_8X8, 1, 10),
-  make_tuple(PARAM_LIST_8X8, 1, 12),
-  make_tuple(PARAM_LIST_8X8, 2, 10),
-  make_tuple(PARAM_LIST_8X8, 2, 12),
-  make_tuple(PARAM_LIST_8X8, 3, 10),
-  make_tuple(PARAM_LIST_8X8, 3, 12),
+  make_tuple(PARAM_LIST_8X8, DCT_DCT, 10),
+  make_tuple(PARAM_LIST_8X8, DCT_DCT, 12),
+  make_tuple(PARAM_LIST_8X8, ADST_DCT, 10),
+  make_tuple(PARAM_LIST_8X8, ADST_DCT, 12),
+  make_tuple(PARAM_LIST_8X8, DCT_ADST, 10),
+  make_tuple(PARAM_LIST_8X8, DCT_ADST, 12),
+  make_tuple(PARAM_LIST_8X8, ADST_ADST, 10),
+  make_tuple(PARAM_LIST_8X8, ADST_ADST, 12),
+#if CONFIG_EXT_TX
+  make_tuple(PARAM_LIST_8X8, FLIPADST_DCT, 10),
+  make_tuple(PARAM_LIST_8X8, FLIPADST_DCT, 12),
+  make_tuple(PARAM_LIST_8X8, DCT_FLIPADST, 10),
+  make_tuple(PARAM_LIST_8X8, DCT_FLIPADST, 12),
+  make_tuple(PARAM_LIST_8X8, FLIPADST_FLIPADST, 10),
+  make_tuple(PARAM_LIST_8X8, FLIPADST_FLIPADST, 12),
+  make_tuple(PARAM_LIST_8X8, ADST_FLIPADST, 10),
+  make_tuple(PARAM_LIST_8X8, ADST_FLIPADST, 12),
+  make_tuple(PARAM_LIST_8X8, FLIPADST_ADST, 10),
+  make_tuple(PARAM_LIST_8X8, FLIPADST_ADST, 12),
+#endif
   // 4x4
-  make_tuple(PARAM_LIST_4X4, 0, 10),
-  make_tuple(PARAM_LIST_4X4, 0, 12),
-  make_tuple(PARAM_LIST_4X4, 1, 10),
-  make_tuple(PARAM_LIST_4X4, 1, 12),
-  make_tuple(PARAM_LIST_4X4, 2, 10),
-  make_tuple(PARAM_LIST_4X4, 2, 12),
-  make_tuple(PARAM_LIST_4X4, 3, 10),
-  make_tuple(PARAM_LIST_4X4, 3, 12),
+  make_tuple(PARAM_LIST_4X4, DCT_DCT, 10),
+  make_tuple(PARAM_LIST_4X4, DCT_DCT, 12),
+  make_tuple(PARAM_LIST_4X4, ADST_DCT, 10),
+  make_tuple(PARAM_LIST_4X4, ADST_DCT, 12),
+  make_tuple(PARAM_LIST_4X4, DCT_ADST, 10),
+  make_tuple(PARAM_LIST_4X4, DCT_ADST, 12),
+  make_tuple(PARAM_LIST_4X4, ADST_ADST, 10),
+  make_tuple(PARAM_LIST_4X4, ADST_ADST, 12),
+#if CONFIG_EXT_TX
+  make_tuple(PARAM_LIST_4X4, FLIPADST_DCT, 10),
+  make_tuple(PARAM_LIST_4X4, FLIPADST_DCT, 12),
+  make_tuple(PARAM_LIST_4X4, DCT_FLIPADST, 10),
+  make_tuple(PARAM_LIST_4X4, DCT_FLIPADST, 12),
+  make_tuple(PARAM_LIST_4X4, FLIPADST_FLIPADST, 10),
+  make_tuple(PARAM_LIST_4X4, FLIPADST_FLIPADST, 12),
+  make_tuple(PARAM_LIST_4X4, ADST_FLIPADST, 10),
+  make_tuple(PARAM_LIST_4X4, ADST_FLIPADST, 12),
+  make_tuple(PARAM_LIST_4X4, FLIPADST_ADST, 10),
+  make_tuple(PARAM_LIST_4X4, FLIPADST_ADST, 12),
+#endif
 };

 INSTANTIATE_TEST_CASE_P(

@@ -1297,7 +1297,7 @@ void vp10_highbd_inv_txfm_add_4x4(const tran_low_t *input, uint8_t *dest,
     case FLIPADST_FLIPADST:
     case ADST_FLIPADST:
     case FLIPADST_ADST:
-      vp10_inv_txfm2d_add_4x4_c(input, CONVERT_TO_SHORTPTR(dest), stride,
+      vp10_inv_txfm2d_add_4x4(input, CONVERT_TO_SHORTPTR(dest), stride,
                                tx_type, bd);
       break;
     case V_DCT:

@@ -1337,7 +1337,7 @@ void vp10_highbd_inv_txfm_add_8x8(const tran_low_t *input, uint8_t *dest,
     case FLIPADST_FLIPADST:
     case ADST_FLIPADST:
     case FLIPADST_ADST:
-      vp10_inv_txfm2d_add_8x8_c(input, CONVERT_TO_SHORTPTR(dest), stride,
+      vp10_inv_txfm2d_add_8x8(input, CONVERT_TO_SHORTPTR(dest), stride,
                                tx_type, bd);
       break;
     case V_DCT:

@@ -1377,7 +1377,7 @@ void vp10_highbd_inv_txfm_add_16x16(const tran_low_t *input, uint8_t *dest,
     case FLIPADST_FLIPADST:
     case ADST_FLIPADST:
     case FLIPADST_ADST:
-      vp10_inv_txfm2d_add_16x16_c(input, CONVERT_TO_SHORTPTR(dest), stride,
+      vp10_inv_txfm2d_add_16x16(input, CONVERT_TO_SHORTPTR(dest), stride,
                                  tx_type, bd);
       break;
     case V_DCT:

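The three hunks above route the flipped transform types through the un-suffixed entry points instead of the `_c` reference functions. In libvpx-style RTCD the un-suffixed name resolves at run time to the fastest implementation the CPU supports, so these cases can now reach the SSE4.1 code added further down. A rough sketch of that dispatch idea follows; the hand-rolled wiring and the `_dispatch` pointer name are illustrative only (the real glue is generated into vp10_rtcd.h), and the argument types are assumptions read off the call sites above.

#include <stdint.h>

typedef void (*inv_txfm2d_add_4x4_fn)(const int32_t *input, uint16_t *output,
                                      int stride, int tx_type, int bd);

/* Reference and SIMD versions; prototypes assumed from the call sites. */
void vp10_inv_txfm2d_add_4x4_c(const int32_t *input, uint16_t *output,
                               int stride, int tx_type, int bd);
void vp10_inv_txfm2d_add_4x4_sse4_1(const int32_t *input, uint16_t *output,
                                    int stride, int tx_type, int bd);

/* Hypothetical stand-in for the generated dispatch: the un-suffixed name
 * starts at the C version and is upgraded when SSE4.1 is detected. */
static inv_txfm2d_add_4x4_fn vp10_inv_txfm2d_add_4x4_dispatch =
    vp10_inv_txfm2d_add_4x4_c;

static void rtcd_sketch_init(int cpu_has_sse4_1) {
  if (cpu_has_sse4_1)
    vp10_inv_txfm2d_add_4x4_dispatch = vp10_inv_txfm2d_add_4x4_sse4_1;
}
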
@@ -176,7 +176,7 @@ static INLINE __m128i highbd_clamp_epi16(__m128i u, int bd) {
 }

 static void write_buffer_4x4(__m128i *in, uint16_t *output, int stride,
-                             int shift, int bd) {
+                             int fliplr, int flipud, int shift, int bd) {
   const __m128i zero = _mm_setzero_si128();
   __m128i u0, u1, u2, u3;
   __m128i v0, v1, v2, v3;

@@ -193,10 +193,24 @@ static void write_buffer_4x4(__m128i *in, uint16_t *output, int stride,
   v2 = _mm_unpacklo_epi16(v2, zero);
   v3 = _mm_unpacklo_epi16(v3, zero);

+  if (fliplr) {
+    in[0] = _mm_shuffle_epi32(in[0], 0x1B);
+    in[1] = _mm_shuffle_epi32(in[1], 0x1B);
+    in[2] = _mm_shuffle_epi32(in[2], 0x1B);
+    in[3] = _mm_shuffle_epi32(in[3], 0x1B);
+  }
+
+  if (flipud) {
+    u0 = _mm_add_epi32(in[3], v0);
+    u1 = _mm_add_epi32(in[2], v1);
+    u2 = _mm_add_epi32(in[1], v2);
+    u3 = _mm_add_epi32(in[0], v3);
+  } else {
+    u0 = _mm_add_epi32(in[0], v0);
+    u1 = _mm_add_epi32(in[1], v1);
+    u2 = _mm_add_epi32(in[2], v2);
+    u3 = _mm_add_epi32(in[3], v3);
+  }

   v0 = _mm_packus_epi32(u0, u1);
   v2 = _mm_packus_epi32(u2, u3);

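The fliplr path above rests on one SSE2 idiom: _mm_shuffle_epi32 with the immediate 0x1B selects the four 32-bit lanes in the order 3, 2, 1, 0 and so mirrors a 4-wide row of residuals; flipud is handled separately by pairing residual rows with prediction rows in reverse order. A small self-contained check of the shuffle constant (illustrative, not part of the patch):

#include <smmintrin.h>
#include <stdio.h>

int main(void) {
  /* 0x1B == (0 << 6) | (1 << 4) | (2 << 2) | 3, i.e. take lane 3 first. */
  __m128i row = _mm_setr_epi32(10, 20, 30, 40);
  __m128i rev = _mm_shuffle_epi32(row, 0x1B);
  int out[4];
  _mm_storeu_si128((__m128i *)out, rev);
  printf("%d %d %d %d\n", out[0], out[1], out[2], out[3]);  /* 40 30 20 10 */
  return 0;
}
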
@@ -226,29 +240,66 @@ void vp10_inv_txfm2d_add_4x4_sse4_1(const int32_t *coeff, uint16_t *output,
       load_buffer_4x4(coeff, in);
       idct4x4_sse4_1(in, cfg->cos_bit_row[2]);
       idct4x4_sse4_1(in, cfg->cos_bit_col[2]);
-      write_buffer_4x4(in, output, stride, -cfg->shift[1], bd);
+      write_buffer_4x4(in, output, stride, 0, 0, -cfg->shift[1], bd);
       break;
     case ADST_DCT:
       cfg = &inv_txfm_2d_cfg_adst_dct_4;
       load_buffer_4x4(coeff, in);
       idct4x4_sse4_1(in, cfg->cos_bit_row[2]);
       iadst4x4_sse4_1(in, cfg->cos_bit_col[2]);
-      write_buffer_4x4(in, output, stride, -cfg->shift[1], bd);
+      write_buffer_4x4(in, output, stride, 0, 0, -cfg->shift[1], bd);
       break;
     case DCT_ADST:
       cfg = &inv_txfm_2d_cfg_dct_adst_4;
       load_buffer_4x4(coeff, in);
       iadst4x4_sse4_1(in, cfg->cos_bit_row[2]);
       idct4x4_sse4_1(in, cfg->cos_bit_col[2]);
-      write_buffer_4x4(in, output, stride, -cfg->shift[1], bd);
+      write_buffer_4x4(in, output, stride, 0, 0, -cfg->shift[1], bd);
       break;
     case ADST_ADST:
       cfg = &inv_txfm_2d_cfg_adst_adst_4;
       load_buffer_4x4(coeff, in);
       iadst4x4_sse4_1(in, cfg->cos_bit_row[2]);
       iadst4x4_sse4_1(in, cfg->cos_bit_col[2]);
-      write_buffer_4x4(in, output, stride, -cfg->shift[1], bd);
+      write_buffer_4x4(in, output, stride, 0, 0, -cfg->shift[1], bd);
       break;
+#if CONFIG_EXT_TX
+    case FLIPADST_DCT:
+      cfg = &inv_txfm_2d_cfg_adst_dct_4;
+      load_buffer_4x4(coeff, in);
+      idct4x4_sse4_1(in, cfg->cos_bit_row[2]);
+      iadst4x4_sse4_1(in, cfg->cos_bit_col[2]);
+      write_buffer_4x4(in, output, stride, 0, 1, -cfg->shift[1], bd);
+      break;
+    case DCT_FLIPADST:
+      cfg = &inv_txfm_2d_cfg_dct_adst_4;
+      load_buffer_4x4(coeff, in);
+      iadst4x4_sse4_1(in, cfg->cos_bit_row[2]);
+      idct4x4_sse4_1(in, cfg->cos_bit_col[2]);
+      write_buffer_4x4(in, output, stride, 1, 0, -cfg->shift[1], bd);
+      break;
+    case FLIPADST_FLIPADST:
+      cfg = &inv_txfm_2d_cfg_adst_adst_4;
+      load_buffer_4x4(coeff, in);
+      iadst4x4_sse4_1(in, cfg->cos_bit_row[2]);
+      iadst4x4_sse4_1(in, cfg->cos_bit_col[2]);
+      write_buffer_4x4(in, output, stride, 1, 1, -cfg->shift[1], bd);
+      break;
+    case ADST_FLIPADST:
+      cfg = &inv_txfm_2d_cfg_adst_adst_4;
+      load_buffer_4x4(coeff, in);
+      iadst4x4_sse4_1(in, cfg->cos_bit_row[2]);
+      iadst4x4_sse4_1(in, cfg->cos_bit_col[2]);
+      write_buffer_4x4(in, output, stride, 1, 0, -cfg->shift[1], bd);
+      break;
+    case FLIPADST_ADST:
+      cfg = &inv_txfm_2d_cfg_adst_adst_4;
+      load_buffer_4x4(coeff, in);
+      iadst4x4_sse4_1(in, cfg->cos_bit_row[2]);
+      iadst4x4_sse4_1(in, cfg->cos_bit_col[2]);
+      write_buffer_4x4(in, output, stride, 0, 1, -cfg->shift[1], bd);
+      break;
+#endif  // CONFIG_EXT_TX
     default:
       assert(0);
   }

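The CONFIG_EXT_TX cases above all follow one pattern: a forward FLIPADST is an ADST applied to a reversed input, so its inverse can reuse the plain inverse-ADST kernel and apply the reversal only when the residual is added back to the prediction. Each flipped type therefore borrows the config of its unflipped counterpart and just passes fliplr/flipud flags to write_buffer_4x4. A minimal sketch of that mapping, assuming the TX type names from vp10/common/enums.h; the struct and helper here are illustrative and not part of the patch:

#include "vp10/common/enums.h"

typedef struct {
  int base_tx_type;  /* unflipped type whose cfg and kernels are reused */
  int fliplr;        /* mirror left-right when writing the residual back */
  int flipud;        /* mirror up-down when writing the residual back */
} flip_map;

static flip_map map_flip_tx_type(int tx_type) {
  flip_map m = { tx_type, 0, 0 };
#if CONFIG_EXT_TX
  switch (tx_type) {
    case FLIPADST_DCT:      m.base_tx_type = ADST_DCT;  m.flipud = 1; break;
    case DCT_FLIPADST:      m.base_tx_type = DCT_ADST;  m.fliplr = 1; break;
    case FLIPADST_FLIPADST: m.base_tx_type = ADST_ADST; m.fliplr = 1; m.flipud = 1; break;
    case ADST_FLIPADST:     m.base_tx_type = ADST_ADST; m.fliplr = 1; break;
    case FLIPADST_ADST:     m.base_tx_type = ADST_ADST; m.flipud = 1; break;
    default: break;
  }
#endif
  return m;
}
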
@@ -576,12 +627,33 @@ static void round_shift_8x8(__m128i *in , int shift) {
   round_shift_4x4(&in[12], shift);
 }

-static void write_buffer_8x8(__m128i *in, uint16_t *output, int stride,
-                             int shift, int bd) {
+static __m128i get_recon_8x8(const __m128i pred, __m128i res_lo,
+                             __m128i res_hi, int fliplr, int bd) {
+  __m128i x0, x1;
+  const __m128i zero = _mm_setzero_si128();
+
+  x0 = _mm_unpacklo_epi16(pred, zero);
+  x1 = _mm_unpackhi_epi16(pred, zero);
+
+  if (fliplr) {
+    res_lo = _mm_shuffle_epi32(res_lo, 0x1B);
+    res_hi = _mm_shuffle_epi32(res_hi, 0x1B);
+    x0 = _mm_add_epi32(res_hi, x0);
+    x1 = _mm_add_epi32(res_lo, x1);
+
+  } else {
+    x0 = _mm_add_epi32(res_lo, x0);
+    x1 = _mm_add_epi32(res_hi, x1);
+  }
+
+  x0 = _mm_packus_epi32(x0, x1);
+  return highbd_clamp_epi16(x0, bd);
+}
+
+static void write_buffer_8x8(__m128i *in, uint16_t *output, int stride,
+                             int fliplr, int flipud, int shift, int bd) {
   __m128i u0, u1, u2, u3, u4, u5, u6, u7;
   __m128i v0, v1, v2, v3, v4, v5, v6, v7;
   __m128i x0, x1;

   round_shift_8x8(in, shift);

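get_recon_8x8 folds the whole store-side step for one 8-pixel row into a helper: widen the 16-bit prediction to 32 bits, add the residual (mirrored first when fliplr is set), then pack with unsigned saturation and clamp to the bit-depth ceiling via highbd_clamp_epi16. A scalar sketch of the same computation, illustrative only; the low clamp at 0 mirrors what the _mm_packus_epi32 saturation does:

#include <stdint.h>

static void recon_row_8(const uint16_t *pred, const int32_t *res,
                        uint16_t *out, int fliplr, int bd) {
  const int32_t peak = (1 << bd) - 1;
  int i;
  for (i = 0; i < 8; ++i) {
    /* fliplr mirrors the residual within the row before it is added. */
    int32_t v = (int32_t)pred[i] + (fliplr ? res[7 - i] : res[i]);
    if (v < 0) v = 0;
    if (v > peak) v = peak;
    out[i] = (uint16_t)v;
  }
}
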
@@ -594,61 +666,25 @@ static void write_buffer_8x8(__m128i *in, uint16_t *output, int stride,
   v6 = _mm_load_si128((__m128i const *)(output + 6 * stride));
   v7 = _mm_load_si128((__m128i const *)(output + 7 * stride));

-  x0 = _mm_unpacklo_epi16(v0, zero);
-  x1 = _mm_unpackhi_epi16(v0, zero);
-  x0 = _mm_add_epi32(in[0], x0);
-  x1 = _mm_add_epi32(in[1], x1);
-  x0 = _mm_packus_epi32(x0, x1);
-  u0 = highbd_clamp_epi16(x0, bd);
-
-  x0 = _mm_unpacklo_epi16(v1, zero);
-  x1 = _mm_unpackhi_epi16(v1, zero);
-  x0 = _mm_add_epi32(in[2], x0);
-  x1 = _mm_add_epi32(in[3], x1);
-  x0 = _mm_packus_epi32(x0, x1);
-  u1 = highbd_clamp_epi16(x0, bd);
-
-  x0 = _mm_unpacklo_epi16(v2, zero);
-  x1 = _mm_unpackhi_epi16(v2, zero);
-  x0 = _mm_add_epi32(in[4], x0);
-  x1 = _mm_add_epi32(in[5], x1);
-  x0 = _mm_packus_epi32(x0, x1);
-  u2 = highbd_clamp_epi16(x0, bd);
-
-  x0 = _mm_unpacklo_epi16(v3, zero);
-  x1 = _mm_unpackhi_epi16(v3, zero);
-  x0 = _mm_add_epi32(in[6], x0);
-  x1 = _mm_add_epi32(in[7], x1);
-  x0 = _mm_packus_epi32(x0, x1);
-  u3 = highbd_clamp_epi16(x0, bd);
-
-  x0 = _mm_unpacklo_epi16(v4, zero);
-  x1 = _mm_unpackhi_epi16(v4, zero);
-  x0 = _mm_add_epi32(in[8], x0);
-  x1 = _mm_add_epi32(in[9], x1);
-  x0 = _mm_packus_epi32(x0, x1);
-  u4 = highbd_clamp_epi16(x0, bd);
-
-  x0 = _mm_unpacklo_epi16(v5, zero);
-  x1 = _mm_unpackhi_epi16(v5, zero);
-  x0 = _mm_add_epi32(in[10], x0);
-  x1 = _mm_add_epi32(in[11], x1);
-  x0 = _mm_packus_epi32(x0, x1);
-  u5 = highbd_clamp_epi16(x0, bd);
-
-  x0 = _mm_unpacklo_epi16(v6, zero);
-  x1 = _mm_unpackhi_epi16(v6, zero);
-  x0 = _mm_add_epi32(in[12], x0);
-  x1 = _mm_add_epi32(in[13], x1);
-  x0 = _mm_packus_epi32(x0, x1);
-  u6 = highbd_clamp_epi16(x0, bd);
-
-  x0 = _mm_unpacklo_epi16(v7, zero);
-  x1 = _mm_unpackhi_epi16(v7, zero);
-  x0 = _mm_add_epi32(in[14], x0);
-  x1 = _mm_add_epi32(in[15], x1);
-  x0 = _mm_packus_epi32(x0, x1);
-  u7 = highbd_clamp_epi16(x0, bd);
+  if (flipud) {
+    u0 = get_recon_8x8(v0, in[14], in[15], fliplr, bd);
+    u1 = get_recon_8x8(v1, in[12], in[13], fliplr, bd);
+    u2 = get_recon_8x8(v2, in[10], in[11], fliplr, bd);
+    u3 = get_recon_8x8(v3, in[8], in[9], fliplr, bd);
+    u4 = get_recon_8x8(v4, in[6], in[7], fliplr, bd);
+    u5 = get_recon_8x8(v5, in[4], in[5], fliplr, bd);
+    u6 = get_recon_8x8(v6, in[2], in[3], fliplr, bd);
+    u7 = get_recon_8x8(v7, in[0], in[1], fliplr, bd);
+  } else {
+    u0 = get_recon_8x8(v0, in[0], in[1], fliplr, bd);
+    u1 = get_recon_8x8(v1, in[2], in[3], fliplr, bd);
+    u2 = get_recon_8x8(v2, in[4], in[5], fliplr, bd);
+    u3 = get_recon_8x8(v3, in[6], in[7], fliplr, bd);
+    u4 = get_recon_8x8(v4, in[8], in[9], fliplr, bd);
+    u5 = get_recon_8x8(v5, in[10], in[11], fliplr, bd);
+    u6 = get_recon_8x8(v6, in[12], in[13], fliplr, bd);
+    u7 = get_recon_8x8(v7, in[14], in[15], fliplr, bd);
+  }

   _mm_store_si128((__m128i *)(output + 0 * stride), u0);
   _mm_store_si128((__m128i *)(output + 1 * stride), u1);

@@ -673,7 +709,7 @@ void vp10_inv_txfm2d_add_8x8_sse4_1(const int32_t *coeff, uint16_t *output,
       idct8x8_sse4_1(out, in, cfg->cos_bit_row[2]);
       transpose_8x8(in, out);
       idct8x8_sse4_1(out, in, cfg->cos_bit_col[2]);
-      write_buffer_8x8(in, output, stride, -cfg->shift[1], bd);
+      write_buffer_8x8(in, output, stride, 0, 0, -cfg->shift[1], bd);
       break;
     case DCT_ADST:
       cfg = &inv_txfm_2d_cfg_dct_adst_8;

@@ -682,7 +718,7 @@ void vp10_inv_txfm2d_add_8x8_sse4_1(const int32_t *coeff, uint16_t *output,
       iadst8x8_sse4_1(out, in, cfg->cos_bit_row[2]);
       transpose_8x8(in, out);
       idct8x8_sse4_1(out, in, cfg->cos_bit_col[2]);
-      write_buffer_8x8(in, output, stride, -cfg->shift[1], bd);
+      write_buffer_8x8(in, output, stride, 0, 0, -cfg->shift[1], bd);
       break;
     case ADST_DCT:
       cfg = &inv_txfm_2d_cfg_adst_dct_8;

@@ -691,7 +727,7 @@ void vp10_inv_txfm2d_add_8x8_sse4_1(const int32_t *coeff, uint16_t *output,
       idct8x8_sse4_1(out, in, cfg->cos_bit_row[2]);
       transpose_8x8(in, out);
       iadst8x8_sse4_1(out, in, cfg->cos_bit_col[2]);
-      write_buffer_8x8(in, output, stride, -cfg->shift[1], bd);
+      write_buffer_8x8(in, output, stride, 0, 0, -cfg->shift[1], bd);
       break;
     case ADST_ADST:
       cfg = &inv_txfm_2d_cfg_adst_adst_8;

@@ -700,8 +736,55 @@ void vp10_inv_txfm2d_add_8x8_sse4_1(const int32_t *coeff, uint16_t *output,
       iadst8x8_sse4_1(out, in, cfg->cos_bit_row[2]);
       transpose_8x8(in, out);
       iadst8x8_sse4_1(out, in, cfg->cos_bit_col[2]);
-      write_buffer_8x8(in, output, stride, -cfg->shift[1], bd);
+      write_buffer_8x8(in, output, stride, 0, 0, -cfg->shift[1], bd);
       break;
+#if CONFIG_EXT_TX
+    case FLIPADST_DCT:
+      cfg = &inv_txfm_2d_cfg_adst_dct_8;
+      load_buffer_8x8(coeff, in);
+      transpose_8x8(in, out);
+      idct8x8_sse4_1(out, in, cfg->cos_bit_row[2]);
+      transpose_8x8(in, out);
+      iadst8x8_sse4_1(out, in, cfg->cos_bit_col[2]);
+      write_buffer_8x8(in, output, stride, 0, 1, -cfg->shift[1], bd);
+      break;
+    case DCT_FLIPADST:
+      cfg = &inv_txfm_2d_cfg_dct_adst_8;
+      load_buffer_8x8(coeff, in);
+      transpose_8x8(in, out);
+      iadst8x8_sse4_1(out, in, cfg->cos_bit_row[2]);
+      transpose_8x8(in, out);
+      idct8x8_sse4_1(out, in, cfg->cos_bit_col[2]);
+      write_buffer_8x8(in, output, stride, 1, 0, -cfg->shift[1], bd);
+      break;
+    case ADST_FLIPADST:
+      cfg = &inv_txfm_2d_cfg_adst_adst_8;
+      load_buffer_8x8(coeff, in);
+      transpose_8x8(in, out);
+      iadst8x8_sse4_1(out, in, cfg->cos_bit_row[2]);
+      transpose_8x8(in, out);
+      iadst8x8_sse4_1(out, in, cfg->cos_bit_col[2]);
+      write_buffer_8x8(in, output, stride, 1, 0, -cfg->shift[1], bd);
+      break;
+    case FLIPADST_FLIPADST:
+      cfg = &inv_txfm_2d_cfg_adst_adst_8;
+      load_buffer_8x8(coeff, in);
+      transpose_8x8(in, out);
+      iadst8x8_sse4_1(out, in, cfg->cos_bit_row[2]);
+      transpose_8x8(in, out);
+      iadst8x8_sse4_1(out, in, cfg->cos_bit_col[2]);
+      write_buffer_8x8(in, output, stride, 1, 1, -cfg->shift[1], bd);
+      break;
+    case FLIPADST_ADST:
+      cfg = &inv_txfm_2d_cfg_adst_adst_8;
+      load_buffer_8x8(coeff, in);
+      transpose_8x8(in, out);
+      iadst8x8_sse4_1(out, in, cfg->cos_bit_row[2]);
+      transpose_8x8(in, out);
+      iadst8x8_sse4_1(out, in, cfg->cos_bit_col[2]);
+      write_buffer_8x8(in, output, stride, 0, 1, -cfg->shift[1], bd);
+      break;
+#endif  // CONFIG_EXT_TX
     default:
       assert(0);
   }

@@ -725,25 +808,46 @@ static void assign_8x8_input_from_16x16(const __m128i *in, __m128i *in8x8,
   }
 }

+static void swap_addr(uint16_t **output1, uint16_t **output2) {
+  uint16_t *tmp;
+  tmp = *output1;
+  *output1 = *output2;
+  *output2 = tmp;
+}
+
 static void write_buffer_16x16(__m128i *in, uint16_t *output, int stride,
-                               int shift, int bd) {
+                               int fliplr, int flipud, int shift, int bd) {
   __m128i in8x8[16];
+  uint16_t *leftUp = &output[0];
+  uint16_t *rightUp = &output[8];
+  uint16_t *leftDown = &output[8 * stride];
+  uint16_t *rightDown = &output[8 * stride + 8];
+
+  if (fliplr) {
+    swap_addr(&leftUp, &rightUp);
+    swap_addr(&leftDown, &rightDown);
+  }
+
+  if (flipud) {
+    swap_addr(&leftUp, &leftDown);
+    swap_addr(&rightUp, &rightDown);
+  }

   // Left-up quarter
   assign_8x8_input_from_16x16(in, in8x8, 0);
-  write_buffer_8x8(in8x8, &output[0], stride, shift, bd);
+  write_buffer_8x8(in8x8, leftUp, stride, fliplr, flipud, shift, bd);

   // Right-up quarter
   assign_8x8_input_from_16x16(in, in8x8, 2);
-  write_buffer_8x8(in8x8, &output[8], stride, shift, bd);
+  write_buffer_8x8(in8x8, rightUp, stride, fliplr, flipud, shift, bd);

   // Left-down quarter
   assign_8x8_input_from_16x16(in, in8x8, 32);
-  write_buffer_8x8(in8x8, &output[8 * stride], stride, shift, bd);
+  write_buffer_8x8(in8x8, leftDown, stride, fliplr, flipud, shift, bd);

   // Right-down quarter
   assign_8x8_input_from_16x16(in, in8x8, 34);
-  write_buffer_8x8(in8x8, &output[8 * stride + 8], stride, shift, bd);
+  write_buffer_8x8(in8x8, rightDown, stride, fliplr, flipud, shift, bd);
 }

 static void idct16x16_sse4_1(__m128i *in, __m128i *out, int bit) {

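write_buffer_16x16 does not flip anything itself: it reuses the 8x8 writer on the four quarters and realizes a 16x16 flip by swapping the destination pointers of the quarters (swap_addr) while forwarding the same fliplr/flipud flags, so each quarter is also mirrored internally. A tiny scalar check of why that composition equals a full 16-wide mirror (illustrative only):

#include <assert.h>

int main(void) {
  int i;
  for (i = 0; i < 16; ++i) {
    int swapped = (i < 8) ? i + 8 : i - 8;               /* swap the two 8-wide halves */
    int mirrored = (swapped & ~7) + (7 - (swapped & 7)); /* mirror within each half */
    assert(mirrored == 15 - i);                          /* same as mirroring all 16 */
  }
  return 0;
}
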
@@ -1207,7 +1311,7 @@ void vp10_inv_txfm2d_add_16x16_sse4_1(const int32_t *coeff, uint16_t *output,
       round_shift_16x16(in, -cfg->shift[0]);
       transpose_16x16(in, out);
       idct16x16_sse4_1(out, in, cfg->cos_bit_col[2]);
-      write_buffer_16x16(in, output, stride, -cfg->shift[1], bd);
+      write_buffer_16x16(in, output, stride, 0, 0, -cfg->shift[1], bd);
       break;
     case DCT_ADST:
       cfg = &inv_txfm_2d_cfg_dct_adst_16;

@@ -1217,7 +1321,7 @@ void vp10_inv_txfm2d_add_16x16_sse4_1(const int32_t *coeff, uint16_t *output,
       round_shift_16x16(in, -cfg->shift[0]);
       transpose_16x16(in, out);
       idct16x16_sse4_1(out, in, cfg->cos_bit_col[2]);
-      write_buffer_16x16(in, output, stride, -cfg->shift[1], bd);
+      write_buffer_16x16(in, output, stride, 0, 0, -cfg->shift[1], bd);
       break;
     case ADST_DCT:
       cfg = &inv_txfm_2d_cfg_adst_dct_16;

@@ -1227,7 +1331,7 @@ void vp10_inv_txfm2d_add_16x16_sse4_1(const int32_t *coeff, uint16_t *output,
       round_shift_16x16(in, -cfg->shift[0]);
       transpose_16x16(in, out);
       iadst16x16_sse4_1(out, in, cfg->cos_bit_col[2]);
-      write_buffer_16x16(in, output, stride, -cfg->shift[1], bd);
+      write_buffer_16x16(in, output, stride, 0, 0, -cfg->shift[1], bd);
       break;
     case ADST_ADST:
       cfg = &inv_txfm_2d_cfg_adst_adst_16;

@@ -1237,8 +1341,60 @@ void vp10_inv_txfm2d_add_16x16_sse4_1(const int32_t *coeff, uint16_t *output,
       round_shift_16x16(in, -cfg->shift[0]);
       transpose_16x16(in, out);
       iadst16x16_sse4_1(out, in, cfg->cos_bit_col[2]);
-      write_buffer_16x16(in, output, stride, -cfg->shift[1], bd);
+      write_buffer_16x16(in, output, stride, 0, 0, -cfg->shift[1], bd);
       break;
+#if CONFIG_EXT_TX
+    case FLIPADST_DCT:
+      cfg = &inv_txfm_2d_cfg_adst_dct_16;
+      load_buffer_16x16(coeff, in);
+      transpose_16x16(in, out);
+      idct16x16_sse4_1(out, in, cfg->cos_bit_row[2]);
+      round_shift_16x16(in, -cfg->shift[0]);
+      transpose_16x16(in, out);
+      iadst16x16_sse4_1(out, in, cfg->cos_bit_col[2]);
+      write_buffer_16x16(in, output, stride, 0, 1, -cfg->shift[1], bd);
+      break;
+    case DCT_FLIPADST:
+      cfg = &inv_txfm_2d_cfg_dct_adst_16;
+      load_buffer_16x16(coeff, in);
+      transpose_16x16(in, out);
+      iadst16x16_sse4_1(out, in, cfg->cos_bit_row[2]);
+      round_shift_16x16(in, -cfg->shift[0]);
+      transpose_16x16(in, out);
+      idct16x16_sse4_1(out, in, cfg->cos_bit_col[2]);
+      write_buffer_16x16(in, output, stride, 1, 0, -cfg->shift[1], bd);
+      break;
+    case ADST_FLIPADST:
+      cfg = &inv_txfm_2d_cfg_adst_adst_16;
+      load_buffer_16x16(coeff, in);
+      transpose_16x16(in, out);
+      iadst16x16_sse4_1(out, in, cfg->cos_bit_row[2]);
+      round_shift_16x16(in, -cfg->shift[0]);
+      transpose_16x16(in, out);
+      iadst16x16_sse4_1(out, in, cfg->cos_bit_col[2]);
+      write_buffer_16x16(in, output, stride, 1, 0, -cfg->shift[1], bd);
+      break;
+    case FLIPADST_FLIPADST:
+      cfg = &inv_txfm_2d_cfg_adst_adst_16;
+      load_buffer_16x16(coeff, in);
+      transpose_16x16(in, out);
+      iadst16x16_sse4_1(out, in, cfg->cos_bit_row[2]);
+      round_shift_16x16(in, -cfg->shift[0]);
+      transpose_16x16(in, out);
+      iadst16x16_sse4_1(out, in, cfg->cos_bit_col[2]);
+      write_buffer_16x16(in, output, stride, 1, 1, -cfg->shift[1], bd);
+      break;
+    case FLIPADST_ADST:
+      cfg = &inv_txfm_2d_cfg_adst_adst_16;
+      load_buffer_16x16(coeff, in);
+      transpose_16x16(in, out);
+      iadst16x16_sse4_1(out, in, cfg->cos_bit_row[2]);
+      round_shift_16x16(in, -cfg->shift[0]);
+      transpose_16x16(in, out);
+      iadst16x16_sse4_1(out, in, cfg->cos_bit_col[2]);
+      write_buffer_16x16(in, output, stride, 0, 1, -cfg->shift[1], bd);
+      break;
+#endif
     default:
       assert(0);
   }