Added sse2 inverse 8x16 and 16x8 transforms
Change-Id: I43628407b11e5c8e6af4df69f2acdc67ac827834
This commit is contained in:

committed by
Debargha Mukherjee

parent
71e4553c3b
commit
1baecfeb03
@@ -1308,7 +1308,7 @@ void aom_idct16x16_1_add_sse2(const tran_low_t *input, uint8_t *dest,
|
||||
}
|
||||
}
|
||||
|
||||
static void iadst16_8col(__m128i *in) {
|
||||
void iadst16_8col(__m128i *in) {
|
||||
// perform 16x16 1-D ADST for 8 columns
|
||||
__m128i s[16], x[16], u[32], v[32];
|
||||
const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64);
|
||||
@@ -1778,7 +1778,7 @@ static void iadst16_8col(__m128i *in) {
|
||||
in[15] = _mm_sub_epi16(kZero, s[1]);
|
||||
}
|
||||
|
||||
static void idct16_8col(__m128i *in) {
|
||||
void idct16_8col(__m128i *in) {
|
||||
const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
|
||||
const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
|
||||
const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
|
||||
|
@@ -187,6 +187,8 @@ static INLINE void write_buffer_8x16(uint8_t *dest, __m128i *in, int stride) {
|
||||
RECON_AND_STORE(dest + 15 * stride, in[15]);
|
||||
}
|
||||
|
||||
void iadst16_8col(__m128i *in);
|
||||
void idct16_8col(__m128i *in);
|
||||
void idct4_sse2(__m128i *in);
|
||||
void idct8_sse2(__m128i *in);
|
||||
void idct16_sse2(__m128i *in0, __m128i *in1);
|
||||
|
@@ -73,6 +73,14 @@ static INLINE __m128i xx_roundn_epu32(__m128i v_val_d, int bits) {
|
||||
return _mm_srli_epi32(v_tmp_d, bits);
|
||||
}
|
||||
|
||||
// This is equivalent to ROUND_POWER_OF_TWO(v_val_d, bits)
|
||||
static INLINE __m128i xx_roundn_epi32_unsigned(__m128i v_val_d, int bits) {
|
||||
const __m128i v_bias_d = _mm_set1_epi32((1 << bits) >> 1);
|
||||
const __m128i v_tmp_d = _mm_add_epi32(v_val_d, v_bias_d);
|
||||
return _mm_srai_epi32(v_tmp_d, bits);
|
||||
}
|
||||
|
||||
// This is equivalent to ROUND_POWER_OF_TWO_SIGNED(v_val_d, bits)
|
||||
static INLINE __m128i xx_roundn_epi32(__m128i v_val_d, int bits) {
|
||||
const __m128i v_bias_d = _mm_set1_epi32((1 << bits) >> 1);
|
||||
const __m128i v_sign_d = _mm_srai_epi32(v_val_d, 31);
|
||||
|
@@ -60,23 +60,25 @@ if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") {
|
||||
add_proto qw/void av1_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
|
||||
specialize qw/av1_iht4x4_16_add/;
|
||||
|
||||
add_proto qw/void av1_iht4x8_32_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
|
||||
specialize qw/av1_iht4x8_32_add/;
|
||||
if (aom_config("CONFIG_EXT_TX") eq "yes") {
|
||||
add_proto qw/void av1_iht4x8_32_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
|
||||
specialize qw/av1_iht4x8_32_add/;
|
||||
|
||||
add_proto qw/void av1_iht8x4_32_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
|
||||
specialize qw/av1_iht8x4_32_add/;
|
||||
add_proto qw/void av1_iht8x4_32_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
|
||||
specialize qw/av1_iht8x4_32_add/;
|
||||
|
||||
add_proto qw/void av1_iht8x16_128_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
|
||||
specialize qw/av1_iht8x16_128_add/;
|
||||
add_proto qw/void av1_iht8x16_128_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
|
||||
specialize qw/av1_iht8x16_128_add/;
|
||||
|
||||
add_proto qw/void av1_iht16x8_128_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
|
||||
specialize qw/av1_iht16x8_128_add/;
|
||||
add_proto qw/void av1_iht16x8_128_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
|
||||
specialize qw/av1_iht16x8_128_add/;
|
||||
|
||||
add_proto qw/void av1_iht16x32_512_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
|
||||
specialize qw/av1_iht16x32_512_add/;
|
||||
add_proto qw/void av1_iht16x32_512_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
|
||||
specialize qw/av1_iht16x32_512_add/;
|
||||
|
||||
add_proto qw/void av1_iht32x16_512_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
|
||||
specialize qw/av1_iht32x16_512_add/;
|
||||
add_proto qw/void av1_iht32x16_512_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
|
||||
specialize qw/av1_iht32x16_512_add/;
|
||||
}
|
||||
|
||||
add_proto qw/void av1_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
|
||||
specialize qw/av1_iht8x8_64_add/;
|
||||
@@ -87,23 +89,25 @@ if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") {
|
||||
add_proto qw/void av1_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
|
||||
specialize qw/av1_iht4x4_16_add sse2/;
|
||||
|
||||
add_proto qw/void av1_iht4x8_32_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
|
||||
specialize qw/av1_iht4x8_32_add/;
|
||||
if (aom_config("CONFIG_EXT_TX") eq "yes") {
|
||||
add_proto qw/void av1_iht4x8_32_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
|
||||
specialize qw/av1_iht4x8_32_add/;
|
||||
|
||||
add_proto qw/void av1_iht8x4_32_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
|
||||
specialize qw/av1_iht8x4_32_add/;
|
||||
add_proto qw/void av1_iht8x4_32_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
|
||||
specialize qw/av1_iht8x4_32_add/;
|
||||
|
||||
add_proto qw/void av1_iht8x16_128_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
|
||||
specialize qw/av1_iht8x16_128_add/;
|
||||
add_proto qw/void av1_iht8x16_128_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
|
||||
specialize qw/av1_iht8x16_128_add sse2/;
|
||||
|
||||
add_proto qw/void av1_iht16x8_128_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
|
||||
specialize qw/av1_iht16x8_128_add/;
|
||||
add_proto qw/void av1_iht16x8_128_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
|
||||
specialize qw/av1_iht16x8_128_add sse2/;
|
||||
|
||||
add_proto qw/void av1_iht16x32_512_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
|
||||
specialize qw/av1_iht16x32_512_add/;
|
||||
add_proto qw/void av1_iht16x32_512_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
|
||||
specialize qw/av1_iht16x32_512_add/;
|
||||
|
||||
add_proto qw/void av1_iht32x16_512_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
|
||||
specialize qw/av1_iht32x16_512_add/;
|
||||
add_proto qw/void av1_iht32x16_512_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
|
||||
specialize qw/av1_iht32x16_512_add/;
|
||||
}
|
||||
|
||||
add_proto qw/void av1_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
|
||||
specialize qw/av1_iht8x8_64_add sse2/;
|
||||
@@ -117,23 +121,25 @@ if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") {
|
||||
add_proto qw/void av1_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
|
||||
specialize qw/av1_iht4x4_16_add/;
|
||||
|
||||
add_proto qw/void av1_iht4x8_32_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
|
||||
specialize qw/av1_iht4x8_32_add/;
|
||||
if (aom_config("CONFIG_EXT_TX") eq "yes") {
|
||||
add_proto qw/void av1_iht4x8_32_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
|
||||
specialize qw/av1_iht4x8_32_add/;
|
||||
|
||||
add_proto qw/void av1_iht8x4_32_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
|
||||
specialize qw/av1_iht8x4_32_add/;
|
||||
add_proto qw/void av1_iht8x4_32_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
|
||||
specialize qw/av1_iht8x4_32_add/;
|
||||
|
||||
add_proto qw/void av1_iht8x16_128_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
|
||||
specialize qw/av1_iht8x16_128_add/;
|
||||
add_proto qw/void av1_iht8x16_128_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
|
||||
specialize qw/av1_iht8x16_128_add/;
|
||||
|
||||
add_proto qw/void av1_iht16x8_128_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
|
||||
specialize qw/av1_iht16x8_128_add/;
|
||||
add_proto qw/void av1_iht16x8_128_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
|
||||
specialize qw/av1_iht16x8_128_add/;
|
||||
|
||||
add_proto qw/void av1_iht16x32_512_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
|
||||
specialize qw/av1_iht16x32_512_add/;
|
||||
add_proto qw/void av1_iht16x32_512_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
|
||||
specialize qw/av1_iht16x32_512_add/;
|
||||
|
||||
add_proto qw/void av1_iht32x16_512_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
|
||||
specialize qw/av1_iht32x16_512_add/;
|
||||
add_proto qw/void av1_iht32x16_512_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
|
||||
specialize qw/av1_iht32x16_512_add/;
|
||||
}
|
||||
|
||||
add_proto qw/void av1_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
|
||||
specialize qw/av1_iht8x8_64_add/;
|
||||
@@ -144,23 +150,25 @@ if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") {
|
||||
add_proto qw/void av1_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
|
||||
specialize qw/av1_iht4x4_16_add sse2 neon dspr2/;
|
||||
|
||||
add_proto qw/void av1_iht4x8_32_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
|
||||
specialize qw/av1_iht4x8_32_add/;
|
||||
if (aom_config("CONFIG_EXT_TX") eq "yes") {
|
||||
add_proto qw/void av1_iht4x8_32_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
|
||||
specialize qw/av1_iht4x8_32_add/;
|
||||
|
||||
add_proto qw/void av1_iht8x4_32_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
|
||||
specialize qw/av1_iht8x4_32_add/;
|
||||
add_proto qw/void av1_iht8x4_32_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
|
||||
specialize qw/av1_iht8x4_32_add/;
|
||||
|
||||
add_proto qw/void av1_iht8x16_128_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
|
||||
specialize qw/av1_iht8x16_128_add/;
|
||||
add_proto qw/void av1_iht8x16_128_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
|
||||
specialize qw/av1_iht8x16_128_add sse2/;
|
||||
|
||||
add_proto qw/void av1_iht16x8_128_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
|
||||
specialize qw/av1_iht16x8_128_add/;
|
||||
add_proto qw/void av1_iht16x8_128_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
|
||||
specialize qw/av1_iht16x8_128_add sse2/;
|
||||
|
||||
add_proto qw/void av1_iht16x32_512_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
|
||||
specialize qw/av1_iht16x32_512_add/;
|
||||
add_proto qw/void av1_iht16x32_512_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
|
||||
specialize qw/av1_iht16x32_512_add/;
|
||||
|
||||
add_proto qw/void av1_iht32x16_512_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
|
||||
specialize qw/av1_iht32x16_512_add/;
|
||||
add_proto qw/void av1_iht32x16_512_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
|
||||
specialize qw/av1_iht32x16_512_add/;
|
||||
}
|
||||
|
||||
add_proto qw/void av1_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
|
||||
specialize qw/av1_iht8x8_64_add sse2 neon dspr2/;
|
||||
@@ -274,23 +282,25 @@ if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") {
|
||||
add_proto qw/void av1_highbd_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd";
|
||||
specialize qw/av1_highbd_iht4x4_16_add/;
|
||||
|
||||
add_proto qw/void av1_highbd_iht4x8_32_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd";
|
||||
specialize qw/av1_highbd_iht4x8_32_add/;
|
||||
if (aom_config("CONFIG_EXT_TX") eq "yes") {
|
||||
add_proto qw/void av1_highbd_iht4x8_32_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd";
|
||||
specialize qw/av1_highbd_iht4x8_32_add/;
|
||||
|
||||
add_proto qw/void av1_highbd_iht8x4_32_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd";
|
||||
specialize qw/av1_highbd_iht8x4_32_add/;
|
||||
add_proto qw/void av1_highbd_iht8x4_32_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd";
|
||||
specialize qw/av1_highbd_iht8x4_32_add/;
|
||||
|
||||
add_proto qw/void av1_highbd_iht8x16_128_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd";
|
||||
specialize qw/av1_highbd_iht8x16_128_add/;
|
||||
add_proto qw/void av1_highbd_iht8x16_128_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd";
|
||||
specialize qw/av1_highbd_iht8x16_128_add/;
|
||||
|
||||
add_proto qw/void av1_highbd_iht16x8_128_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd";
|
||||
specialize qw/av1_highbd_iht16x8_128_add/;
|
||||
add_proto qw/void av1_highbd_iht16x8_128_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd";
|
||||
specialize qw/av1_highbd_iht16x8_128_add/;
|
||||
|
||||
add_proto qw/void av1_highbd_iht16x32_512_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd";
|
||||
specialize qw/av1_highbd_iht16x32_512_add/;
|
||||
add_proto qw/void av1_highbd_iht16x32_512_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd";
|
||||
specialize qw/av1_highbd_iht16x32_512_add/;
|
||||
|
||||
add_proto qw/void av1_highbd_iht32x16_512_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd";
|
||||
specialize qw/av1_highbd_iht32x16_512_add/;
|
||||
add_proto qw/void av1_highbd_iht32x16_512_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd";
|
||||
specialize qw/av1_highbd_iht32x16_512_add/;
|
||||
}
|
||||
|
||||
add_proto qw/void av1_highbd_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd";
|
||||
specialize qw/av1_highbd_iht8x8_64_add/;
|
||||
|
@@ -11,6 +11,7 @@
|
||||
|
||||
#include "./av1_rtcd.h"
|
||||
#include "aom_dsp/x86/inv_txfm_sse2.h"
|
||||
#include "aom_dsp/x86/synonyms.h"
|
||||
#include "aom_dsp/x86/txfm_common_sse2.h"
|
||||
#include "aom_ports/mem.h"
|
||||
#include "av1/common/enums.h"
|
||||
@@ -303,3 +304,535 @@ void av1_iht16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest,
|
||||
dest += 8;
|
||||
write_buffer_8x16(dest, in1, stride);
|
||||
}
|
||||
|
||||
#if CONFIG_EXT_TX
|
||||
static void iidtx16_8col(__m128i *in) {
|
||||
const __m128i k__zero_epi16 = _mm_set1_epi16((int16_t)0);
|
||||
const __m128i k__sqrt2_epi16 = _mm_set1_epi16((int16_t)Sqrt2);
|
||||
const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
|
||||
|
||||
__m128i v0, v1, v2, v3, v4, v5, v6, v7;
|
||||
__m128i u0, u1, u2, u3, u4, u5, u6, u7;
|
||||
__m128i x0, x1, x2, x3, x4, x5, x6, x7;
|
||||
__m128i y0, y1, y2, y3, y4, y5, y6, y7;
|
||||
|
||||
in[0] = _mm_slli_epi16(in[0], 1);
|
||||
in[1] = _mm_slli_epi16(in[1], 1);
|
||||
in[2] = _mm_slli_epi16(in[2], 1);
|
||||
in[3] = _mm_slli_epi16(in[3], 1);
|
||||
in[4] = _mm_slli_epi16(in[4], 1);
|
||||
in[5] = _mm_slli_epi16(in[5], 1);
|
||||
in[6] = _mm_slli_epi16(in[6], 1);
|
||||
in[7] = _mm_slli_epi16(in[7], 1);
|
||||
in[8] = _mm_slli_epi16(in[8], 1);
|
||||
in[9] = _mm_slli_epi16(in[9], 1);
|
||||
in[10] = _mm_slli_epi16(in[10], 1);
|
||||
in[11] = _mm_slli_epi16(in[11], 1);
|
||||
in[12] = _mm_slli_epi16(in[12], 1);
|
||||
in[13] = _mm_slli_epi16(in[13], 1);
|
||||
in[14] = _mm_slli_epi16(in[14], 1);
|
||||
in[15] = _mm_slli_epi16(in[15], 1);
|
||||
|
||||
v0 = _mm_unpacklo_epi16(in[0], k__zero_epi16);
|
||||
v1 = _mm_unpacklo_epi16(in[1], k__zero_epi16);
|
||||
v2 = _mm_unpacklo_epi16(in[2], k__zero_epi16);
|
||||
v3 = _mm_unpacklo_epi16(in[3], k__zero_epi16);
|
||||
v4 = _mm_unpacklo_epi16(in[4], k__zero_epi16);
|
||||
v5 = _mm_unpacklo_epi16(in[5], k__zero_epi16);
|
||||
v6 = _mm_unpacklo_epi16(in[6], k__zero_epi16);
|
||||
v7 = _mm_unpacklo_epi16(in[7], k__zero_epi16);
|
||||
|
||||
u0 = _mm_unpacklo_epi16(in[8], k__zero_epi16);
|
||||
u1 = _mm_unpacklo_epi16(in[9], k__zero_epi16);
|
||||
u2 = _mm_unpacklo_epi16(in[10], k__zero_epi16);
|
||||
u3 = _mm_unpacklo_epi16(in[11], k__zero_epi16);
|
||||
u4 = _mm_unpacklo_epi16(in[12], k__zero_epi16);
|
||||
u5 = _mm_unpacklo_epi16(in[13], k__zero_epi16);
|
||||
u6 = _mm_unpacklo_epi16(in[14], k__zero_epi16);
|
||||
u7 = _mm_unpacklo_epi16(in[15], k__zero_epi16);
|
||||
|
||||
x0 = _mm_unpackhi_epi16(in[0], k__zero_epi16);
|
||||
x1 = _mm_unpackhi_epi16(in[1], k__zero_epi16);
|
||||
x2 = _mm_unpackhi_epi16(in[2], k__zero_epi16);
|
||||
x3 = _mm_unpackhi_epi16(in[3], k__zero_epi16);
|
||||
x4 = _mm_unpackhi_epi16(in[4], k__zero_epi16);
|
||||
x5 = _mm_unpackhi_epi16(in[5], k__zero_epi16);
|
||||
x6 = _mm_unpackhi_epi16(in[6], k__zero_epi16);
|
||||
x7 = _mm_unpackhi_epi16(in[7], k__zero_epi16);
|
||||
|
||||
y0 = _mm_unpackhi_epi16(in[8], k__zero_epi16);
|
||||
y1 = _mm_unpackhi_epi16(in[9], k__zero_epi16);
|
||||
y2 = _mm_unpackhi_epi16(in[10], k__zero_epi16);
|
||||
y3 = _mm_unpackhi_epi16(in[11], k__zero_epi16);
|
||||
y4 = _mm_unpackhi_epi16(in[12], k__zero_epi16);
|
||||
y5 = _mm_unpackhi_epi16(in[13], k__zero_epi16);
|
||||
y6 = _mm_unpackhi_epi16(in[14], k__zero_epi16);
|
||||
y7 = _mm_unpackhi_epi16(in[15], k__zero_epi16);
|
||||
|
||||
v0 = _mm_madd_epi16(v0, k__sqrt2_epi16);
|
||||
v1 = _mm_madd_epi16(v1, k__sqrt2_epi16);
|
||||
v2 = _mm_madd_epi16(v2, k__sqrt2_epi16);
|
||||
v3 = _mm_madd_epi16(v3, k__sqrt2_epi16);
|
||||
v4 = _mm_madd_epi16(v4, k__sqrt2_epi16);
|
||||
v5 = _mm_madd_epi16(v5, k__sqrt2_epi16);
|
||||
v6 = _mm_madd_epi16(v6, k__sqrt2_epi16);
|
||||
v7 = _mm_madd_epi16(v7, k__sqrt2_epi16);
|
||||
|
||||
x0 = _mm_madd_epi16(x0, k__sqrt2_epi16);
|
||||
x1 = _mm_madd_epi16(x1, k__sqrt2_epi16);
|
||||
x2 = _mm_madd_epi16(x2, k__sqrt2_epi16);
|
||||
x3 = _mm_madd_epi16(x3, k__sqrt2_epi16);
|
||||
x4 = _mm_madd_epi16(x4, k__sqrt2_epi16);
|
||||
x5 = _mm_madd_epi16(x5, k__sqrt2_epi16);
|
||||
x6 = _mm_madd_epi16(x6, k__sqrt2_epi16);
|
||||
x7 = _mm_madd_epi16(x7, k__sqrt2_epi16);
|
||||
|
||||
u0 = _mm_madd_epi16(u0, k__sqrt2_epi16);
|
||||
u1 = _mm_madd_epi16(u1, k__sqrt2_epi16);
|
||||
u2 = _mm_madd_epi16(u2, k__sqrt2_epi16);
|
||||
u3 = _mm_madd_epi16(u3, k__sqrt2_epi16);
|
||||
u4 = _mm_madd_epi16(u4, k__sqrt2_epi16);
|
||||
u5 = _mm_madd_epi16(u5, k__sqrt2_epi16);
|
||||
u6 = _mm_madd_epi16(u6, k__sqrt2_epi16);
|
||||
u7 = _mm_madd_epi16(u7, k__sqrt2_epi16);
|
||||
|
||||
y0 = _mm_madd_epi16(y0, k__sqrt2_epi16);
|
||||
y1 = _mm_madd_epi16(y1, k__sqrt2_epi16);
|
||||
y2 = _mm_madd_epi16(y2, k__sqrt2_epi16);
|
||||
y3 = _mm_madd_epi16(y3, k__sqrt2_epi16);
|
||||
y4 = _mm_madd_epi16(y4, k__sqrt2_epi16);
|
||||
y5 = _mm_madd_epi16(y5, k__sqrt2_epi16);
|
||||
y6 = _mm_madd_epi16(y6, k__sqrt2_epi16);
|
||||
y7 = _mm_madd_epi16(y7, k__sqrt2_epi16);
|
||||
|
||||
v0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
|
||||
v1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
|
||||
v2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
|
||||
v3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
|
||||
v4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING);
|
||||
v5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING);
|
||||
v6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING);
|
||||
v7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING);
|
||||
|
||||
x0 = _mm_add_epi32(x0, k__DCT_CONST_ROUNDING);
|
||||
x1 = _mm_add_epi32(x1, k__DCT_CONST_ROUNDING);
|
||||
x2 = _mm_add_epi32(x2, k__DCT_CONST_ROUNDING);
|
||||
x3 = _mm_add_epi32(x3, k__DCT_CONST_ROUNDING);
|
||||
x4 = _mm_add_epi32(x4, k__DCT_CONST_ROUNDING);
|
||||
x5 = _mm_add_epi32(x5, k__DCT_CONST_ROUNDING);
|
||||
x6 = _mm_add_epi32(x6, k__DCT_CONST_ROUNDING);
|
||||
x7 = _mm_add_epi32(x7, k__DCT_CONST_ROUNDING);
|
||||
|
||||
u0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
|
||||
u1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
|
||||
u2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
|
||||
u3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
|
||||
u4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
|
||||
u5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
|
||||
u6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
|
||||
u7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
|
||||
|
||||
y0 = _mm_add_epi32(y0, k__DCT_CONST_ROUNDING);
|
||||
y1 = _mm_add_epi32(y1, k__DCT_CONST_ROUNDING);
|
||||
y2 = _mm_add_epi32(y2, k__DCT_CONST_ROUNDING);
|
||||
y3 = _mm_add_epi32(y3, k__DCT_CONST_ROUNDING);
|
||||
y4 = _mm_add_epi32(y4, k__DCT_CONST_ROUNDING);
|
||||
y5 = _mm_add_epi32(y5, k__DCT_CONST_ROUNDING);
|
||||
y6 = _mm_add_epi32(y6, k__DCT_CONST_ROUNDING);
|
||||
y7 = _mm_add_epi32(y7, k__DCT_CONST_ROUNDING);
|
||||
|
||||
v0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
|
||||
v1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
|
||||
v2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
|
||||
v3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
|
||||
v4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
|
||||
v5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
|
||||
v6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
|
||||
v7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
|
||||
|
||||
x0 = _mm_srai_epi32(x0, DCT_CONST_BITS);
|
||||
x1 = _mm_srai_epi32(x1, DCT_CONST_BITS);
|
||||
x2 = _mm_srai_epi32(x2, DCT_CONST_BITS);
|
||||
x3 = _mm_srai_epi32(x3, DCT_CONST_BITS);
|
||||
x4 = _mm_srai_epi32(x4, DCT_CONST_BITS);
|
||||
x5 = _mm_srai_epi32(x5, DCT_CONST_BITS);
|
||||
x6 = _mm_srai_epi32(x6, DCT_CONST_BITS);
|
||||
x7 = _mm_srai_epi32(x7, DCT_CONST_BITS);
|
||||
|
||||
u0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
|
||||
u1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
|
||||
u2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
|
||||
u3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
|
||||
u4 = _mm_srai_epi32(u4, DCT_CONST_BITS);
|
||||
u5 = _mm_srai_epi32(u5, DCT_CONST_BITS);
|
||||
u6 = _mm_srai_epi32(u6, DCT_CONST_BITS);
|
||||
u7 = _mm_srai_epi32(u7, DCT_CONST_BITS);
|
||||
|
||||
y0 = _mm_srai_epi32(y0, DCT_CONST_BITS);
|
||||
y1 = _mm_srai_epi32(y1, DCT_CONST_BITS);
|
||||
y2 = _mm_srai_epi32(y2, DCT_CONST_BITS);
|
||||
y3 = _mm_srai_epi32(y3, DCT_CONST_BITS);
|
||||
y4 = _mm_srai_epi32(y4, DCT_CONST_BITS);
|
||||
y5 = _mm_srai_epi32(y5, DCT_CONST_BITS);
|
||||
y6 = _mm_srai_epi32(y6, DCT_CONST_BITS);
|
||||
y7 = _mm_srai_epi32(y7, DCT_CONST_BITS);
|
||||
|
||||
in[0] = _mm_packs_epi32(v0, x0);
|
||||
in[1] = _mm_packs_epi32(v1, x1);
|
||||
in[2] = _mm_packs_epi32(v2, x2);
|
||||
in[3] = _mm_packs_epi32(v3, x3);
|
||||
in[4] = _mm_packs_epi32(v4, x4);
|
||||
in[5] = _mm_packs_epi32(v5, x5);
|
||||
in[6] = _mm_packs_epi32(v6, x6);
|
||||
in[7] = _mm_packs_epi32(v7, x7);
|
||||
|
||||
in[8] = _mm_packs_epi32(u0, y0);
|
||||
in[9] = _mm_packs_epi32(u1, y1);
|
||||
in[10] = _mm_packs_epi32(u2, y2);
|
||||
in[11] = _mm_packs_epi32(u3, y3);
|
||||
in[12] = _mm_packs_epi32(u4, y4);
|
||||
in[13] = _mm_packs_epi32(u5, y5);
|
||||
in[14] = _mm_packs_epi32(u6, y6);
|
||||
in[15] = _mm_packs_epi32(u7, y7);
|
||||
}
|
||||
|
||||
// Doubles every 16-bit lane of the 8 vectors in in[] in place (left shift
// by one bit). No transpose is performed.
static void iidtx8_sse2(__m128i *in) {
  int i;
  for (i = 0; i < 8; ++i) {
    in[i] = _mm_slli_epi16(in[i], 1);
  }
}
|
||||
|
||||
// load 8x8 array
|
||||
static INLINE void flip_buffer_lr_8x8(__m128i *in) {
|
||||
in[0] = mm_reverse_epi16(in[0]);
|
||||
in[1] = mm_reverse_epi16(in[1]);
|
||||
in[2] = mm_reverse_epi16(in[2]);
|
||||
in[3] = mm_reverse_epi16(in[3]);
|
||||
in[4] = mm_reverse_epi16(in[4]);
|
||||
in[5] = mm_reverse_epi16(in[5]);
|
||||
in[6] = mm_reverse_epi16(in[6]);
|
||||
in[7] = mm_reverse_epi16(in[7]);
|
||||
}
|
||||
|
||||
static INLINE void scale_sqrt2_8x8(__m128i *in) {
|
||||
// Implements 'ROUND_POWER_OF_TWO_SIGNED(input * Sqrt2, DCT_CONST_BITS)'
|
||||
// for each element
|
||||
const __m128i v_scale_w = _mm_set1_epi16(Sqrt2);
|
||||
|
||||
const __m128i v_p0l_w = _mm_mullo_epi16(in[0], v_scale_w);
|
||||
const __m128i v_p0h_w = _mm_mulhi_epi16(in[0], v_scale_w);
|
||||
const __m128i v_p1l_w = _mm_mullo_epi16(in[1], v_scale_w);
|
||||
const __m128i v_p1h_w = _mm_mulhi_epi16(in[1], v_scale_w);
|
||||
const __m128i v_p2l_w = _mm_mullo_epi16(in[2], v_scale_w);
|
||||
const __m128i v_p2h_w = _mm_mulhi_epi16(in[2], v_scale_w);
|
||||
const __m128i v_p3l_w = _mm_mullo_epi16(in[3], v_scale_w);
|
||||
const __m128i v_p3h_w = _mm_mulhi_epi16(in[3], v_scale_w);
|
||||
const __m128i v_p4l_w = _mm_mullo_epi16(in[4], v_scale_w);
|
||||
const __m128i v_p4h_w = _mm_mulhi_epi16(in[4], v_scale_w);
|
||||
const __m128i v_p5l_w = _mm_mullo_epi16(in[5], v_scale_w);
|
||||
const __m128i v_p5h_w = _mm_mulhi_epi16(in[5], v_scale_w);
|
||||
const __m128i v_p6l_w = _mm_mullo_epi16(in[6], v_scale_w);
|
||||
const __m128i v_p6h_w = _mm_mulhi_epi16(in[6], v_scale_w);
|
||||
const __m128i v_p7l_w = _mm_mullo_epi16(in[7], v_scale_w);
|
||||
const __m128i v_p7h_w = _mm_mulhi_epi16(in[7], v_scale_w);
|
||||
|
||||
const __m128i v_p0a_d = _mm_unpacklo_epi16(v_p0l_w, v_p0h_w);
|
||||
const __m128i v_p0b_d = _mm_unpackhi_epi16(v_p0l_w, v_p0h_w);
|
||||
const __m128i v_p1a_d = _mm_unpacklo_epi16(v_p1l_w, v_p1h_w);
|
||||
const __m128i v_p1b_d = _mm_unpackhi_epi16(v_p1l_w, v_p1h_w);
|
||||
const __m128i v_p2a_d = _mm_unpacklo_epi16(v_p2l_w, v_p2h_w);
|
||||
const __m128i v_p2b_d = _mm_unpackhi_epi16(v_p2l_w, v_p2h_w);
|
||||
const __m128i v_p3a_d = _mm_unpacklo_epi16(v_p3l_w, v_p3h_w);
|
||||
const __m128i v_p3b_d = _mm_unpackhi_epi16(v_p3l_w, v_p3h_w);
|
||||
const __m128i v_p4a_d = _mm_unpacklo_epi16(v_p4l_w, v_p4h_w);
|
||||
const __m128i v_p4b_d = _mm_unpackhi_epi16(v_p4l_w, v_p4h_w);
|
||||
const __m128i v_p5a_d = _mm_unpacklo_epi16(v_p5l_w, v_p5h_w);
|
||||
const __m128i v_p5b_d = _mm_unpackhi_epi16(v_p5l_w, v_p5h_w);
|
||||
const __m128i v_p6a_d = _mm_unpacklo_epi16(v_p6l_w, v_p6h_w);
|
||||
const __m128i v_p6b_d = _mm_unpackhi_epi16(v_p6l_w, v_p6h_w);
|
||||
const __m128i v_p7a_d = _mm_unpacklo_epi16(v_p7l_w, v_p7h_w);
|
||||
const __m128i v_p7b_d = _mm_unpackhi_epi16(v_p7l_w, v_p7h_w);
|
||||
|
||||
in[0] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p0a_d, DCT_CONST_BITS),
|
||||
xx_roundn_epi32_unsigned(v_p0b_d, DCT_CONST_BITS));
|
||||
in[1] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p1a_d, DCT_CONST_BITS),
|
||||
xx_roundn_epi32_unsigned(v_p1b_d, DCT_CONST_BITS));
|
||||
in[2] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p2a_d, DCT_CONST_BITS),
|
||||
xx_roundn_epi32_unsigned(v_p2b_d, DCT_CONST_BITS));
|
||||
in[3] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p3a_d, DCT_CONST_BITS),
|
||||
xx_roundn_epi32_unsigned(v_p3b_d, DCT_CONST_BITS));
|
||||
in[4] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p4a_d, DCT_CONST_BITS),
|
||||
xx_roundn_epi32_unsigned(v_p4b_d, DCT_CONST_BITS));
|
||||
in[5] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p5a_d, DCT_CONST_BITS),
|
||||
xx_roundn_epi32_unsigned(v_p5b_d, DCT_CONST_BITS));
|
||||
in[6] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p6a_d, DCT_CONST_BITS),
|
||||
xx_roundn_epi32_unsigned(v_p6b_d, DCT_CONST_BITS));
|
||||
in[7] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p7a_d, DCT_CONST_BITS),
|
||||
xx_roundn_epi32_unsigned(v_p7b_d, DCT_CONST_BITS));
|
||||
}
|
||||
|
||||
void av1_iht8x16_128_add_sse2(const tran_low_t *input, uint8_t *dest,
|
||||
int stride, int tx_type) {
|
||||
__m128i in[16];
|
||||
|
||||
in[0] = load_input_data(input + 0 * 8);
|
||||
in[1] = load_input_data(input + 1 * 8);
|
||||
in[2] = load_input_data(input + 2 * 8);
|
||||
in[3] = load_input_data(input + 3 * 8);
|
||||
in[4] = load_input_data(input + 4 * 8);
|
||||
in[5] = load_input_data(input + 5 * 8);
|
||||
in[6] = load_input_data(input + 6 * 8);
|
||||
in[7] = load_input_data(input + 7 * 8);
|
||||
|
||||
in[8] = load_input_data(input + 8 * 8);
|
||||
in[9] = load_input_data(input + 9 * 8);
|
||||
in[10] = load_input_data(input + 10 * 8);
|
||||
in[11] = load_input_data(input + 11 * 8);
|
||||
in[12] = load_input_data(input + 12 * 8);
|
||||
in[13] = load_input_data(input + 13 * 8);
|
||||
in[14] = load_input_data(input + 14 * 8);
|
||||
in[15] = load_input_data(input + 15 * 8);
|
||||
|
||||
// Row transform
|
||||
switch (tx_type) {
|
||||
case DCT_DCT:
|
||||
case ADST_DCT:
|
||||
case FLIPADST_DCT:
|
||||
case H_DCT:
|
||||
idct8_sse2(in);
|
||||
array_transpose_8x8(in, in);
|
||||
idct8_sse2(in + 8);
|
||||
array_transpose_8x8(in + 8, in + 8);
|
||||
break;
|
||||
case DCT_ADST:
|
||||
case ADST_ADST:
|
||||
case DCT_FLIPADST:
|
||||
case FLIPADST_FLIPADST:
|
||||
case ADST_FLIPADST:
|
||||
case FLIPADST_ADST:
|
||||
case H_ADST:
|
||||
case H_FLIPADST:
|
||||
iadst8_sse2(in);
|
||||
array_transpose_8x8(in, in);
|
||||
iadst8_sse2(in + 8);
|
||||
array_transpose_8x8(in + 8, in + 8);
|
||||
break;
|
||||
case V_FLIPADST:
|
||||
case V_ADST:
|
||||
case V_DCT:
|
||||
case IDTX:
|
||||
iidtx8_sse2(in);
|
||||
iidtx8_sse2(in + 8);
|
||||
break;
|
||||
default: assert(0); break;
|
||||
}
|
||||
scale_sqrt2_8x8(in);
|
||||
scale_sqrt2_8x8(in + 8);
|
||||
|
||||
// Column transform
|
||||
switch (tx_type) {
|
||||
case DCT_DCT:
|
||||
case DCT_ADST:
|
||||
case DCT_FLIPADST:
|
||||
case V_DCT: idct16_8col(in); break;
|
||||
case ADST_DCT:
|
||||
case ADST_ADST:
|
||||
case FLIPADST_ADST:
|
||||
case ADST_FLIPADST:
|
||||
case FLIPADST_FLIPADST:
|
||||
case FLIPADST_DCT:
|
||||
case V_ADST:
|
||||
case V_FLIPADST: iadst16_8col(in); break;
|
||||
case H_DCT:
|
||||
case H_ADST:
|
||||
case H_FLIPADST:
|
||||
case IDTX: iidtx16_8col(in); break;
|
||||
default: assert(0); break;
|
||||
}
|
||||
|
||||
switch (tx_type) {
|
||||
case DCT_DCT:
|
||||
case ADST_DCT:
|
||||
case H_DCT:
|
||||
case DCT_ADST:
|
||||
case ADST_ADST:
|
||||
case H_ADST:
|
||||
case V_ADST:
|
||||
case V_DCT:
|
||||
case IDTX: write_buffer_8x16(dest, in, stride); break;
|
||||
case FLIPADST_DCT:
|
||||
case FLIPADST_ADST:
|
||||
case V_FLIPADST: write_buffer_8x16(dest + stride * 15, in, -stride); break;
|
||||
case DCT_FLIPADST:
|
||||
case ADST_FLIPADST:
|
||||
case H_FLIPADST:
|
||||
flip_buffer_lr_8x8(in);
|
||||
flip_buffer_lr_8x8(in + 8);
|
||||
write_buffer_8x16(dest, in, stride);
|
||||
break;
|
||||
case FLIPADST_FLIPADST:
|
||||
flip_buffer_lr_8x8(in);
|
||||
flip_buffer_lr_8x8(in + 8);
|
||||
write_buffer_8x16(dest + stride * 15, in, -stride);
|
||||
break;
|
||||
default: assert(0); break;
|
||||
}
|
||||
}
|
||||
|
||||
// Rounds the 8 vectors in in[] by 6 bits (saturating add of 1 << 5, then an
// arithmetic right shift by 6) and passes each one to RECON_AND_STORE for
// destination row 0..7. in[] is modified in place.
//   dest   - first destination row
//   in     - 8 vectors of 16-bit values, one per row
//   stride - offset between consecutive destination rows
static INLINE void write_buffer_8x8_round6(uint8_t *dest, __m128i *in,
                                           int stride) {
  const __m128i final_rounding = _mm_set1_epi16(1 << 5);
  // NOTE(review): 'zero' is not referenced directly in this function;
  // presumably the RECON_AND_STORE macro uses it -- confirm before renaming
  // or removing it.
  const __m128i zero = _mm_setzero_si128();
  int i;

  // Final rounding and shift
  for (i = 0; i < 8; ++i) {
    in[i] = _mm_srai_epi16(_mm_adds_epi16(in[i], final_rounding), 6);
  }

  for (i = 0; i < 8; ++i) {
    RECON_AND_STORE(dest + i * stride, in[i]);
  }
}
|
||||
|
||||
void av1_iht16x8_128_add_sse2(const tran_low_t *input, uint8_t *dest,
|
||||
int stride, int tx_type) {
|
||||
__m128i in[16];
|
||||
|
||||
// Transpose 16x8 input into in[]
|
||||
in[0] = load_input_data(input + 0 * 16);
|
||||
in[1] = load_input_data(input + 1 * 16);
|
||||
in[2] = load_input_data(input + 2 * 16);
|
||||
in[3] = load_input_data(input + 3 * 16);
|
||||
in[4] = load_input_data(input + 4 * 16);
|
||||
in[5] = load_input_data(input + 5 * 16);
|
||||
in[6] = load_input_data(input + 6 * 16);
|
||||
in[7] = load_input_data(input + 7 * 16);
|
||||
array_transpose_8x8(in, in);
|
||||
|
||||
in[8] = load_input_data(input + 8 + 0 * 16);
|
||||
in[9] = load_input_data(input + 8 + 1 * 16);
|
||||
in[10] = load_input_data(input + 8 + 2 * 16);
|
||||
in[11] = load_input_data(input + 8 + 3 * 16);
|
||||
in[12] = load_input_data(input + 8 + 4 * 16);
|
||||
in[13] = load_input_data(input + 8 + 5 * 16);
|
||||
in[14] = load_input_data(input + 8 + 6 * 16);
|
||||
in[15] = load_input_data(input + 8 + 7 * 16);
|
||||
array_transpose_8x8(in + 8, in + 8);
|
||||
|
||||
// Row transform
|
||||
switch (tx_type) {
|
||||
case DCT_DCT:
|
||||
case ADST_DCT:
|
||||
case FLIPADST_DCT:
|
||||
case H_DCT: idct16_8col(in); break;
|
||||
case DCT_ADST:
|
||||
case ADST_ADST:
|
||||
case DCT_FLIPADST:
|
||||
case FLIPADST_FLIPADST:
|
||||
case ADST_FLIPADST:
|
||||
case FLIPADST_ADST:
|
||||
case H_ADST:
|
||||
case H_FLIPADST: iadst16_8col(in); break;
|
||||
case V_FLIPADST:
|
||||
case V_ADST:
|
||||
case V_DCT:
|
||||
case IDTX: iidtx16_8col(in); break;
|
||||
default: assert(0); break;
|
||||
}
|
||||
|
||||
// Scale
|
||||
scale_sqrt2_8x8(in);
|
||||
scale_sqrt2_8x8(in + 8);
|
||||
|
||||
// Column transform
|
||||
switch (tx_type) {
|
||||
case DCT_DCT:
|
||||
case DCT_ADST:
|
||||
case DCT_FLIPADST:
|
||||
case V_DCT:
|
||||
idct8_sse2(in);
|
||||
idct8_sse2(in + 8);
|
||||
break;
|
||||
case ADST_DCT:
|
||||
case ADST_ADST:
|
||||
case FLIPADST_ADST:
|
||||
case ADST_FLIPADST:
|
||||
case FLIPADST_FLIPADST:
|
||||
case FLIPADST_DCT:
|
||||
case V_ADST:
|
||||
case V_FLIPADST:
|
||||
iadst8_sse2(in);
|
||||
iadst8_sse2(in + 8);
|
||||
break;
|
||||
case H_DCT:
|
||||
case H_ADST:
|
||||
case H_FLIPADST:
|
||||
case IDTX:
|
||||
array_transpose_8x8(in, in);
|
||||
array_transpose_8x8(in + 8, in + 8);
|
||||
iidtx8_sse2(in);
|
||||
iidtx8_sse2(in + 8);
|
||||
break;
|
||||
default: assert(0); break;
|
||||
}
|
||||
|
||||
switch (tx_type) {
|
||||
case DCT_DCT:
|
||||
case ADST_DCT:
|
||||
case H_DCT:
|
||||
case DCT_ADST:
|
||||
case ADST_ADST:
|
||||
case H_ADST:
|
||||
case V_ADST:
|
||||
case V_DCT:
|
||||
case IDTX:
|
||||
write_buffer_8x8_round6(dest, in, stride);
|
||||
write_buffer_8x8_round6(dest + 8, in + 8, stride);
|
||||
break;
|
||||
case FLIPADST_DCT:
|
||||
case FLIPADST_ADST:
|
||||
case V_FLIPADST:
|
||||
write_buffer_8x8_round6(dest + stride * 7, in, -stride);
|
||||
write_buffer_8x8_round6(dest + stride * 7 + 8, in + 8, -stride);
|
||||
break;
|
||||
case DCT_FLIPADST:
|
||||
case ADST_FLIPADST:
|
||||
case H_FLIPADST:
|
||||
flip_buffer_lr_8x8(in);
|
||||
flip_buffer_lr_8x8(in + 8);
|
||||
write_buffer_8x8_round6(dest, in + 8, stride);
|
||||
write_buffer_8x8_round6(dest + 8, in, stride);
|
||||
break;
|
||||
case FLIPADST_FLIPADST:
|
||||
flip_buffer_lr_8x8(in);
|
||||
flip_buffer_lr_8x8(in + 8);
|
||||
write_buffer_8x8_round6(dest + stride * 7, in + 8, -stride);
|
||||
write_buffer_8x8_round6(dest + stride * 7 + 8, in, -stride);
|
||||
break;
|
||||
default: assert(0); break;
|
||||
}
|
||||
}
|
||||
#endif // CONFIG_EXT_TX
|
||||
|
96
test/av1_iht16x8_test.cc
Normal file
96
test/av1_iht16x8_test.cc
Normal file
@@ -0,0 +1,96 @@
|
||||
/*
|
||||
* Copyright (c) 2016 The WebM project authors. All Rights Reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#include "third_party/googletest/src/include/gtest/gtest.h"
|
||||
|
||||
#include "./aom_dsp_rtcd.h"
|
||||
#include "./av1_rtcd.h"
|
||||
|
||||
#include "aom_ports/mem.h"
|
||||
#include "test/acm_random.h"
|
||||
#include "test/clear_system_state.h"
|
||||
#include "test/register_state_check.h"
|
||||
#include "test/transform_test_base.h"
|
||||
#include "test/util.h"
|
||||
|
||||
using libaom_test::ACMRandom;
|
||||
|
||||
namespace {
|
||||
using std::tr1::tuple;
|
||||
using libaom_test::FhtFunc;
|
||||
using libaom_test::IhtFunc;
|
||||
typedef tuple<IhtFunc, int, aom_bit_depth_t, int> Ht16x8Param;
|
||||
|
||||
// Reference forward 16x8 transform used by the fixture: forwards all
// arguments unchanged to av1_fht16x8_c().
void fht16x8_ref(const int16_t *in, tran_low_t *out, int stride,
                 int tx_type) {
  av1_fht16x8_c(in, out, stride, tx_type);
}
|
||||
|
||||
// Reference inverse 16x8 transform used by the fixture: forwards all
// arguments unchanged to av1_iht16x8_128_add_c().
void iht16x8_ref(const tran_low_t *in, uint8_t *out, int stride,
                 int tx_type) {
  av1_iht16x8_128_add_c(in, out, stride, tx_type);
}
|
||||
|
||||
class AV1Trans16x8IHT : public libaom_test::TransformTestBase,
|
||||
public ::testing::TestWithParam<Ht16x8Param> {
|
||||
public:
|
||||
virtual ~AV1Trans16x8IHT() {}
|
||||
|
||||
virtual void SetUp() {
|
||||
inv_txfm_ = GET_PARAM(0);
|
||||
tx_type_ = GET_PARAM(1);
|
||||
pitch_ = 8;
|
||||
inv_txfm_ref = iht16x8_ref;
|
||||
fwd_txfm_ref = fht16x8_ref;
|
||||
bit_depth_ = GET_PARAM(2);
|
||||
mask_ = (1 << bit_depth_) - 1;
|
||||
num_coeffs_ = GET_PARAM(3);
|
||||
}
|
||||
virtual void TearDown() { libaom_test::ClearSystemState(); }
|
||||
|
||||
protected:
|
||||
void RunFwdTxfm(const int16_t *in, tran_low_t *out, int stride) {
|
||||
(void)in;
|
||||
(void)out;
|
||||
(void)stride;
|
||||
}
|
||||
|
||||
void RunInvTxfm(const tran_low_t *out, uint8_t *dst, int stride) {
|
||||
inv_txfm_(out, dst, stride, tx_type_);
|
||||
}
|
||||
|
||||
IhtFunc inv_txfm_;
|
||||
};
|
||||
|
||||
// Runs the shared inverse-coefficient check for every parameter tuple.
TEST_P(AV1Trans16x8IHT, InvCoeffCheck) {
  RunInvCoeffCheck();
}
|
||||
|
||||
using std::tr1::make_tuple;
|
||||
|
||||
#if HAVE_SSE2
// One entry per tx_type value 0..15, all at 8-bit depth with 128
// coefficients per block.
const Ht16x8Param kArrayHt16x8Param_sse2[] = {
  make_tuple(&av1_iht16x8_128_add_sse2, 0, AOM_BITS_8, 128),
  make_tuple(&av1_iht16x8_128_add_sse2, 1, AOM_BITS_8, 128),
  make_tuple(&av1_iht16x8_128_add_sse2, 2, AOM_BITS_8, 128),
  make_tuple(&av1_iht16x8_128_add_sse2, 3, AOM_BITS_8, 128),
  make_tuple(&av1_iht16x8_128_add_sse2, 4, AOM_BITS_8, 128),
  make_tuple(&av1_iht16x8_128_add_sse2, 5, AOM_BITS_8, 128),
  make_tuple(&av1_iht16x8_128_add_sse2, 6, AOM_BITS_8, 128),
  make_tuple(&av1_iht16x8_128_add_sse2, 7, AOM_BITS_8, 128),
  make_tuple(&av1_iht16x8_128_add_sse2, 8, AOM_BITS_8, 128),
  make_tuple(&av1_iht16x8_128_add_sse2, 9, AOM_BITS_8, 128),
  make_tuple(&av1_iht16x8_128_add_sse2, 10, AOM_BITS_8, 128),
  make_tuple(&av1_iht16x8_128_add_sse2, 11, AOM_BITS_8, 128),
  make_tuple(&av1_iht16x8_128_add_sse2, 12, AOM_BITS_8, 128),
  make_tuple(&av1_iht16x8_128_add_sse2, 13, AOM_BITS_8, 128),
  make_tuple(&av1_iht16x8_128_add_sse2, 14, AOM_BITS_8, 128),
  make_tuple(&av1_iht16x8_128_add_sse2, 15, AOM_BITS_8, 128)
};

INSTANTIATE_TEST_CASE_P(SSE2, AV1Trans16x8IHT,
                        ::testing::ValuesIn(kArrayHt16x8Param_sse2));
#endif  // HAVE_SSE2
|
||||
} // namespace
|
97
test/av1_iht8x16_test.cc
Normal file
97
test/av1_iht8x16_test.cc
Normal file
@@ -0,0 +1,97 @@
|
||||
/*
|
||||
* Copyright (c) 2016 The WebM project authors. All Rights Reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#include "third_party/googletest/src/include/gtest/gtest.h"
|
||||
|
||||
#include "./aom_dsp_rtcd.h"
|
||||
#include "./av1_rtcd.h"
|
||||
|
||||
#include "aom_ports/mem.h"
|
||||
#include "test/acm_random.h"
|
||||
#include "test/clear_system_state.h"
|
||||
#include "test/register_state_check.h"
|
||||
#include "test/transform_test_base.h"
|
||||
#include "test/util.h"
|
||||
|
||||
using libaom_test::ACMRandom;
|
||||
|
||||
namespace {
|
||||
using std::tr1::tuple;
|
||||
using libaom_test::FhtFunc;
|
||||
using libaom_test::IhtFunc;
|
||||
typedef tuple<IhtFunc, int, aom_bit_depth_t, int> Ht8x16Param;
|
||||
|
||||
// Reference forward 8x16 transform used by the fixture: forwards all
// arguments unchanged to av1_fht8x16_c().
void fht8x16_ref(const int16_t *in, tran_low_t *out, int stride,
                 int tx_type) {
  av1_fht8x16_c(in, out, stride, tx_type);
}
|
||||
|
||||
// Reference inverse 8x16 transform used by the fixture: forwards all
// arguments unchanged to av1_iht8x16_128_add_c().
void iht8x16_ref(const tran_low_t *in, uint8_t *out, int stride,
                 int tx_type) {
  av1_iht8x16_128_add_c(in, out, stride, tx_type);
}
|
||||
|
||||
class AV1Trans8x16IHT : public libaom_test::TransformTestBase,
|
||||
public ::testing::TestWithParam<Ht8x16Param> {
|
||||
public:
|
||||
virtual ~AV1Trans8x16IHT() {}
|
||||
|
||||
virtual void SetUp() {
|
||||
inv_txfm_ = GET_PARAM(0);
|
||||
tx_type_ = GET_PARAM(1);
|
||||
pitch_ = 8;
|
||||
inv_txfm_ref = iht8x16_ref;
|
||||
fwd_txfm_ref = fht8x16_ref;
|
||||
bit_depth_ = GET_PARAM(2);
|
||||
mask_ = (1 << bit_depth_) - 1;
|
||||
num_coeffs_ = GET_PARAM(3);
|
||||
}
|
||||
virtual void TearDown() { libaom_test::ClearSystemState(); }
|
||||
|
||||
protected:
|
||||
void RunFwdTxfm(const int16_t *in, tran_low_t *out, int stride) {
|
||||
(void)in;
|
||||
(void)out;
|
||||
(void)stride;
|
||||
}
|
||||
|
||||
void RunInvTxfm(const tran_low_t *out, uint8_t *dst, int stride) {
|
||||
inv_txfm_(out, dst, stride, tx_type_);
|
||||
}
|
||||
|
||||
IhtFunc inv_txfm_;
|
||||
};
|
||||
|
||||
// Runs the shared inverse-coefficient check for every parameter tuple.
TEST_P(AV1Trans8x16IHT, InvCoeffCheck) {
  RunInvCoeffCheck();
}
|
||||
|
||||
using std::tr1::make_tuple;
|
||||
|
||||
#if HAVE_SSE2
// One entry per tx_type value 0..15, all at 8-bit depth with 128
// coefficients per block.
const Ht8x16Param kArrayHt8x16Param_sse2[] = {
  make_tuple(&av1_iht8x16_128_add_sse2, 0, AOM_BITS_8, 128),
  make_tuple(&av1_iht8x16_128_add_sse2, 1, AOM_BITS_8, 128),
  make_tuple(&av1_iht8x16_128_add_sse2, 2, AOM_BITS_8, 128),
  make_tuple(&av1_iht8x16_128_add_sse2, 3, AOM_BITS_8, 128),
  make_tuple(&av1_iht8x16_128_add_sse2, 4, AOM_BITS_8, 128),
  make_tuple(&av1_iht8x16_128_add_sse2, 5, AOM_BITS_8, 128),
  make_tuple(&av1_iht8x16_128_add_sse2, 6, AOM_BITS_8, 128),
  make_tuple(&av1_iht8x16_128_add_sse2, 7, AOM_BITS_8, 128),
  make_tuple(&av1_iht8x16_128_add_sse2, 8, AOM_BITS_8, 128),
  make_tuple(&av1_iht8x16_128_add_sse2, 9, AOM_BITS_8, 128),
  make_tuple(&av1_iht8x16_128_add_sse2, 10, AOM_BITS_8, 128),
  make_tuple(&av1_iht8x16_128_add_sse2, 11, AOM_BITS_8, 128),
  make_tuple(&av1_iht8x16_128_add_sse2, 12, AOM_BITS_8, 128),
  make_tuple(&av1_iht8x16_128_add_sse2, 13, AOM_BITS_8, 128),
  make_tuple(&av1_iht8x16_128_add_sse2, 14, AOM_BITS_8, 128),
  make_tuple(&av1_iht8x16_128_add_sse2, 15, AOM_BITS_8, 128)
};

INSTANTIATE_TEST_CASE_P(SSE2, AV1Trans8x16IHT,
                        ::testing::ValuesIn(kArrayHt8x16Param_sse2));
#endif  // HAVE_SSE2
|
||||
|
||||
} // namespace
|
@@ -139,6 +139,8 @@ LIBAOM_TEST_SRCS-$(CONFIG_AV1_ENCODER) += av1_fht16x16_test.cc
|
||||
ifeq ($(CONFIG_EXT_TX),yes)
|
||||
LIBAOM_TEST_SRCS-$(CONFIG_AV1_ENCODER) += av1_fht8x16_test.cc
|
||||
LIBAOM_TEST_SRCS-$(CONFIG_AV1_ENCODER) += av1_fht16x8_test.cc
|
||||
LIBAOM_TEST_SRCS-$(CONFIG_AV1_ENCODER) += av1_iht8x16_test.cc
|
||||
LIBAOM_TEST_SRCS-$(CONFIG_AV1_ENCODER) += av1_iht16x8_test.cc
|
||||
endif
|
||||
LIBAOM_TEST_SRCS-$(CONFIG_EXT_TILE) += av1_ext_tile_test.cc
|
||||
|
||||
|
@@ -28,6 +28,9 @@ const int kDctMaxValue = 16384;
|
||||
// Signature of a forward hybrid transform: residual samples in, transform
// coefficients out, with the input row pitch and the transform type.
typedef void (*FhtFunc)(const int16_t *in, tran_low_t *out, int stride,
                        int tx_type);
|
||||
|
||||
// Signature of an inverse hybrid transform: transform coefficients in,
// 8-bit output buffer, with the output row pitch and the transform type.
typedef void (*IhtFunc)(const tran_low_t *in, uint8_t *out, int stride,
                        int tx_type);
|
||||
|
||||
class TransformTestBase {
|
||||
public:
|
||||
virtual ~TransformTestBase() {}
|
||||
@@ -128,9 +131,16 @@ class TransformTestBase {
|
||||
aom_memalign(16, sizeof(tran_low_t) * num_coeffs_));
|
||||
|
||||
for (int i = 0; i < count_test_block; ++i) {
|
||||
// Initialize a test block with input range [-mask_, mask_].
|
||||
for (int j = 0; j < num_coeffs_; ++j)
|
||||
for (int j = 0; j < num_coeffs_; ++j) {
|
||||
input_block[j] = (rnd.Rand16() & mask_) - (rnd.Rand16() & mask_);
|
||||
if (bit_depth_ == AOM_BITS_8) {
|
||||
output_block[j] = output_ref_block[j] = rnd.Rand8();
|
||||
#if CONFIG_AOM_HIGHBITDEPTH
|
||||
} else {
|
||||
output_block[j] = output_ref_block[j] = rnd.Rand16() & mask_;
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
fwd_txfm_ref(input_block, output_ref_block, pitch_, tx_type_);
|
||||
ASM_REGISTER_STATE_CHECK(RunFwdTxfm(input_block, output_block, pitch_));
|
||||
@@ -147,6 +157,44 @@ class TransformTestBase {
|
||||
aom_free(output_block);
|
||||
}
|
||||
|
||||
void RunInvCoeffCheck() {
|
||||
ACMRandom rnd(ACMRandom::DeterministicSeed());
|
||||
const int count_test_block = 5000;
|
||||
|
||||
int16_t *input_block = reinterpret_cast<int16_t *>(
|
||||
aom_memalign(16, sizeof(int16_t) * num_coeffs_));
|
||||
tran_low_t *trans_block = reinterpret_cast<tran_low_t *>(
|
||||
aom_memalign(16, sizeof(tran_low_t) * num_coeffs_));
|
||||
uint8_t *output_block = reinterpret_cast<uint8_t *>(
|
||||
aom_memalign(16, sizeof(uint8_t) * num_coeffs_));
|
||||
uint8_t *output_ref_block = reinterpret_cast<uint8_t *>(
|
||||
aom_memalign(16, sizeof(uint8_t) * num_coeffs_));
|
||||
|
||||
for (int i = 0; i < count_test_block; ++i) {
|
||||
// Initialize a test block with input range [-mask_, mask_].
|
||||
for (int j = 0; j < num_coeffs_; ++j) {
|
||||
input_block[j] = (rnd.Rand16() & mask_) - (rnd.Rand16() & mask_);
|
||||
output_ref_block[j] = rnd.Rand16() & mask_;
|
||||
output_block[j] = output_ref_block[j];
|
||||
}
|
||||
|
||||
fwd_txfm_ref(input_block, trans_block, pitch_, tx_type_);
|
||||
|
||||
inv_txfm_ref(trans_block, output_ref_block, pitch_, tx_type_);
|
||||
ASM_REGISTER_STATE_CHECK(RunInvTxfm(trans_block, output_block, pitch_));
|
||||
|
||||
for (int j = 0; j < num_coeffs_; ++j) {
|
||||
ASSERT_EQ(output_block[j], output_ref_block[j])
|
||||
<< "Error: not bit-exact result at index: " << j
|
||||
<< " at test block: " << i;
|
||||
}
|
||||
}
|
||||
aom_free(input_block);
|
||||
aom_free(trans_block);
|
||||
aom_free(output_ref_block);
|
||||
aom_free(output_block);
|
||||
}
|
||||
|
||||
void RunMemCheck() {
|
||||
ACMRandom rnd(ACMRandom::DeterministicSeed());
|
||||
const int count_test_block = 5000;
|
||||
@@ -259,6 +307,7 @@ class TransformTestBase {
|
||||
int pitch_;
|
||||
int tx_type_;
|
||||
FhtFunc fwd_txfm_ref;
|
||||
IhtFunc inv_txfm_ref;
|
||||
aom_bit_depth_t bit_depth_;
|
||||
int mask_;
|
||||
int num_coeffs_;
|
||||
|
Reference in New Issue
Block a user