Merge "Making input pointer constant for all fdct/fht functions."
This commit is contained in:
@@ -257,17 +257,18 @@ void reference_16x16_dct_2d(int16_t input[256], double output[256]) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
typedef void (*fdct_t)(int16_t *in, int16_t *out, int stride);
|
typedef void (*fdct_t)(const int16_t *in, int16_t *out, int stride);
|
||||||
typedef void (*idct_t)(const int16_t *in, uint8_t *dst, int stride);
|
typedef void (*idct_t)(const int16_t *in, uint8_t *out, int stride);
|
||||||
typedef void (*fht_t) (int16_t *in, int16_t *out, int stride, int tx_type);
|
typedef void (*fht_t) (const int16_t *in, int16_t *out, int stride,
|
||||||
typedef void (*iht_t) (const int16_t *in, uint8_t *dst, int stride,
|
int tx_type);
|
||||||
|
typedef void (*iht_t) (const int16_t *in, uint8_t *out, int stride,
|
||||||
int tx_type);
|
int tx_type);
|
||||||
|
|
||||||
void fdct16x16_ref(int16_t *in, int16_t *out, int stride, int tx_type) {
|
void fdct16x16_ref(const int16_t *in, int16_t *out, int stride, int tx_type) {
|
||||||
vp9_fdct16x16_c(in, out, stride);
|
vp9_fdct16x16_c(in, out, stride);
|
||||||
}
|
}
|
||||||
|
|
||||||
void fht16x16_ref(int16_t *in, int16_t *out, int stride, int tx_type) {
|
void fht16x16_ref(const int16_t *in, int16_t *out, int stride, int tx_type) {
|
||||||
vp9_short_fht16x16_c(in, out, stride, tx_type);
|
vp9_short_fht16x16_c(in, out, stride, tx_type);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@@ -74,8 +74,8 @@ void reference_32x32_dct_2d(const int16_t input[kNumCoeffs],
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
typedef void (*fwd_txfm_t)(int16_t *in, int16_t *out, int stride);
|
typedef void (*fwd_txfm_t)(const int16_t *in, int16_t *out, int stride);
|
||||||
typedef void (*inv_txfm_t)(const int16_t *in, uint8_t *dst, int stride);
|
typedef void (*inv_txfm_t)(const int16_t *in, uint8_t *out, int stride);
|
||||||
|
|
||||||
class Trans32x32Test : public PARAMS(fwd_txfm_t, inv_txfm_t, int) {
|
class Trans32x32Test : public PARAMS(fwd_txfm_t, inv_txfm_t, int) {
|
||||||
public:
|
public:
|
||||||
|
@@ -28,17 +28,18 @@ void vp9_idct8x8_64_add_c(const int16_t *input, uint8_t *output, int pitch);
|
|||||||
using libvpx_test::ACMRandom;
|
using libvpx_test::ACMRandom;
|
||||||
|
|
||||||
namespace {
|
namespace {
|
||||||
typedef void (*fdct_t)(int16_t *in, int16_t *out, int stride);
|
typedef void (*fdct_t)(const int16_t *in, int16_t *out, int stride);
|
||||||
typedef void (*idct_t)(const int16_t *in, uint8_t *dst, int stride);
|
typedef void (*idct_t)(const int16_t *in, uint8_t *out, int stride);
|
||||||
typedef void (*fht_t) (int16_t *in, int16_t *out, int stride, int tx_type);
|
typedef void (*fht_t) (const int16_t *in, int16_t *out, int stride,
|
||||||
typedef void (*iht_t) (const int16_t *in, uint8_t *dst, int stride,
|
int tx_type);
|
||||||
int tx_type);
|
typedef void (*iht_t) (const int16_t *in, uint8_t *out, int stride,
|
||||||
|
int tx_type);
|
||||||
|
|
||||||
void fdct8x8_ref(int16_t *in, int16_t *out, int stride, int tx_type) {
|
void fdct8x8_ref(const int16_t *in, int16_t *out, int stride, int tx_type) {
|
||||||
vp9_fdct8x8_c(in, out, stride);
|
vp9_fdct8x8_c(in, out, stride);
|
||||||
}
|
}
|
||||||
|
|
||||||
void fht8x8_ref(int16_t *in, int16_t *out, int stride, int tx_type) {
|
void fht8x8_ref(const int16_t *in, int16_t *out, int stride, int tx_type) {
|
||||||
vp9_short_fht8x8_c(in, out, stride, tx_type);
|
vp9_short_fht8x8_c(in, out, stride, tx_type);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@@ -689,31 +689,31 @@ if [ "$CONFIG_INTERNAL_STATS" = "yes" ]; then
|
|||||||
fi
|
fi
|
||||||
|
|
||||||
# fdct functions
|
# fdct functions
|
||||||
prototype void vp9_short_fht4x4 "int16_t *InputData, int16_t *OutputData, int pitch, int tx_type"
|
prototype void vp9_short_fht4x4 "const int16_t *input, int16_t *output, int stride, int tx_type"
|
||||||
specialize vp9_short_fht4x4 sse2
|
specialize vp9_short_fht4x4 sse2
|
||||||
|
|
||||||
prototype void vp9_short_fht8x8 "int16_t *InputData, int16_t *OutputData, int pitch, int tx_type"
|
prototype void vp9_short_fht8x8 "const int16_t *input, int16_t *output, int stride, int tx_type"
|
||||||
specialize vp9_short_fht8x8 sse2
|
specialize vp9_short_fht8x8 sse2
|
||||||
|
|
||||||
prototype void vp9_short_fht16x16 "int16_t *InputData, int16_t *OutputData, int pitch, int tx_type"
|
prototype void vp9_short_fht16x16 "const int16_t *input, int16_t *output, int stride, int tx_type"
|
||||||
specialize vp9_short_fht16x16 sse2
|
specialize vp9_short_fht16x16 sse2
|
||||||
|
|
||||||
prototype void vp9_fwht4x4 "int16_t *input, int16_t *output, int stride"
|
prototype void vp9_fwht4x4 "const int16_t *input, int16_t *output, int stride"
|
||||||
specialize vp9_fwht4x4
|
specialize vp9_fwht4x4
|
||||||
|
|
||||||
prototype void vp9_fdct4x4 "int16_t *input, int16_t *output, int stride"
|
prototype void vp9_fdct4x4 "const int16_t *input, int16_t *output, int stride"
|
||||||
specialize vp9_fdct4x4 sse2
|
specialize vp9_fdct4x4 sse2
|
||||||
|
|
||||||
prototype void vp9_fdct8x8 "int16_t *input, int16_t *output, int stride"
|
prototype void vp9_fdct8x8 "const int16_t *input, int16_t *output, int stride"
|
||||||
specialize vp9_fdct8x8 sse2
|
specialize vp9_fdct8x8 sse2
|
||||||
|
|
||||||
prototype void vp9_fdct16x16 "int16_t *input, int16_t *output, int stride"
|
prototype void vp9_fdct16x16 "const int16_t *input, int16_t *output, int stride"
|
||||||
specialize vp9_fdct16x16 sse2
|
specialize vp9_fdct16x16 sse2
|
||||||
|
|
||||||
prototype void vp9_fdct32x32 "int16_t *input, int16_t *output, int stride"
|
prototype void vp9_fdct32x32 "const int16_t *input, int16_t *output, int stride"
|
||||||
specialize vp9_fdct32x32 sse2
|
specialize vp9_fdct32x32 sse2
|
||||||
|
|
||||||
prototype void vp9_fdct32x32_rd "int16_t *input, int16_t *output, int stride"
|
prototype void vp9_fdct32x32_rd "const int16_t *input, int16_t *output, int stride"
|
||||||
specialize vp9_fdct32x32_rd sse2
|
specialize vp9_fdct32x32_rd sse2
|
||||||
|
|
||||||
#
|
#
|
||||||
|
@@ -173,7 +173,7 @@ struct macroblock {
|
|||||||
BLOCK_SIZE sb_partitioning[4];
|
BLOCK_SIZE sb_partitioning[4];
|
||||||
BLOCK_SIZE sb64_partitioning;
|
BLOCK_SIZE sb64_partitioning;
|
||||||
|
|
||||||
void (*fwd_txm4x4)(int16_t *input, int16_t *output, int pitch);
|
void (*fwd_txm4x4)(const int16_t *input, int16_t *output, int stride);
|
||||||
};
|
};
|
||||||
|
|
||||||
// TODO(jingning): the variables used here are little complicated. need further
|
// TODO(jingning): the variables used here are little complicated. need further
|
||||||
|
@@ -36,7 +36,7 @@ static void fdct4(const int16_t *input, int16_t *output) {
|
|||||||
output[3] = dct_const_round_shift(temp2);
|
output[3] = dct_const_round_shift(temp2);
|
||||||
}
|
}
|
||||||
|
|
||||||
void vp9_fdct4x4_c(int16_t *input, int16_t *output, int stride) {
|
void vp9_fdct4x4_c(const int16_t *input, int16_t *output, int stride) {
|
||||||
// The 2D transform is done with two passes which are actually pretty
|
// The 2D transform is done with two passes which are actually pretty
|
||||||
// similar. In the first one, we transform the columns and transpose
|
// similar. In the first one, we transform the columns and transpose
|
||||||
// the results. In the second one, we transform the rows. To achieve that,
|
// the results. In the second one, we transform the rows. To achieve that,
|
||||||
@@ -46,7 +46,7 @@ void vp9_fdct4x4_c(int16_t *input, int16_t *output, int stride) {
|
|||||||
int pass;
|
int pass;
|
||||||
// We need an intermediate buffer between passes.
|
// We need an intermediate buffer between passes.
|
||||||
int16_t intermediate[4 * 4];
|
int16_t intermediate[4 * 4];
|
||||||
int16_t *in = input;
|
const int16_t *in = input;
|
||||||
int16_t *out = intermediate;
|
int16_t *out = intermediate;
|
||||||
// Do the two transform/transpose passes
|
// Do the two transform/transpose passes
|
||||||
for (pass = 0; pass < 2; ++pass) {
|
for (pass = 0; pass < 2; ++pass) {
|
||||||
@@ -148,8 +148,8 @@ static const transform_2d FHT_4[] = {
|
|||||||
{ fadst4, fadst4 } // ADST_ADST = 3
|
{ fadst4, fadst4 } // ADST_ADST = 3
|
||||||
};
|
};
|
||||||
|
|
||||||
void vp9_short_fht4x4_c(int16_t *input, int16_t *output,
|
void vp9_short_fht4x4_c(const int16_t *input, int16_t *output,
|
||||||
int pitch, TX_TYPE tx_type) {
|
int stride, TX_TYPE tx_type) {
|
||||||
int16_t out[4 * 4];
|
int16_t out[4 * 4];
|
||||||
int16_t *outptr = &out[0];
|
int16_t *outptr = &out[0];
|
||||||
int i, j;
|
int i, j;
|
||||||
@@ -159,7 +159,7 @@ void vp9_short_fht4x4_c(int16_t *input, int16_t *output,
|
|||||||
// Columns
|
// Columns
|
||||||
for (i = 0; i < 4; ++i) {
|
for (i = 0; i < 4; ++i) {
|
||||||
for (j = 0; j < 4; ++j)
|
for (j = 0; j < 4; ++j)
|
||||||
temp_in[j] = input[j * pitch + i] * 16;
|
temp_in[j] = input[j * stride + i] * 16;
|
||||||
if (i == 0 && temp_in[0])
|
if (i == 0 && temp_in[0])
|
||||||
temp_in[0] += 1;
|
temp_in[0] += 1;
|
||||||
ht.cols(temp_in, temp_out);
|
ht.cols(temp_in, temp_out);
|
||||||
@@ -229,7 +229,7 @@ static void fdct8(const int16_t *input, int16_t *output) {
|
|||||||
output[7] = dct_const_round_shift(t3);
|
output[7] = dct_const_round_shift(t3);
|
||||||
}
|
}
|
||||||
|
|
||||||
void vp9_fdct8x8_c(int16_t *input, int16_t *final_output, int stride) {
|
void vp9_fdct8x8_c(const int16_t *input, int16_t *final_output, int stride) {
|
||||||
int i, j;
|
int i, j;
|
||||||
int16_t intermediate[64];
|
int16_t intermediate[64];
|
||||||
|
|
||||||
@@ -300,7 +300,7 @@ void vp9_fdct8x8_c(int16_t *input, int16_t *final_output, int stride) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void vp9_fdct16x16_c(int16_t *input, int16_t *output, int stride) {
|
void vp9_fdct16x16_c(const int16_t *input, int16_t *output, int stride) {
|
||||||
// The 2D transform is done with two passes which are actually pretty
|
// The 2D transform is done with two passes which are actually pretty
|
||||||
// similar. In the first one, we transform the columns and transpose
|
// similar. In the first one, we transform the columns and transpose
|
||||||
// the results. In the second one, we transform the rows. To achieve that,
|
// the results. In the second one, we transform the rows. To achieve that,
|
||||||
@@ -310,7 +310,7 @@ void vp9_fdct16x16_c(int16_t *input, int16_t *output, int stride) {
|
|||||||
int pass;
|
int pass;
|
||||||
// We need an intermediate buffer between passes.
|
// We need an intermediate buffer between passes.
|
||||||
int16_t intermediate[256];
|
int16_t intermediate[256];
|
||||||
int16_t *in = input;
|
const int16_t *in = input;
|
||||||
int16_t *out = intermediate;
|
int16_t *out = intermediate;
|
||||||
// Do the two transform/transpose passes
|
// Do the two transform/transpose passes
|
||||||
for (pass = 0; pass < 2; ++pass) {
|
for (pass = 0; pass < 2; ++pass) {
|
||||||
@@ -556,8 +556,8 @@ static const transform_2d FHT_8[] = {
|
|||||||
{ fadst8, fadst8 } // ADST_ADST = 3
|
{ fadst8, fadst8 } // ADST_ADST = 3
|
||||||
};
|
};
|
||||||
|
|
||||||
void vp9_short_fht8x8_c(int16_t *input, int16_t *output,
|
void vp9_short_fht8x8_c(const int16_t *input, int16_t *output,
|
||||||
int pitch, TX_TYPE tx_type) {
|
int stride, TX_TYPE tx_type) {
|
||||||
int16_t out[64];
|
int16_t out[64];
|
||||||
int16_t *outptr = &out[0];
|
int16_t *outptr = &out[0];
|
||||||
int i, j;
|
int i, j;
|
||||||
@@ -567,7 +567,7 @@ void vp9_short_fht8x8_c(int16_t *input, int16_t *output,
|
|||||||
// Columns
|
// Columns
|
||||||
for (i = 0; i < 8; ++i) {
|
for (i = 0; i < 8; ++i) {
|
||||||
for (j = 0; j < 8; ++j)
|
for (j = 0; j < 8; ++j)
|
||||||
temp_in[j] = input[j * pitch + i] * 4;
|
temp_in[j] = input[j * stride + i] * 4;
|
||||||
ht.cols(temp_in, temp_out);
|
ht.cols(temp_in, temp_out);
|
||||||
for (j = 0; j < 8; ++j)
|
for (j = 0; j < 8; ++j)
|
||||||
outptr[j * 8 + i] = temp_out[j];
|
outptr[j * 8 + i] = temp_out[j];
|
||||||
@@ -585,10 +585,10 @@ void vp9_short_fht8x8_c(int16_t *input, int16_t *output,
|
|||||||
|
|
||||||
/* 4-point reversible, orthonormal Walsh-Hadamard in 3.5 adds, 0.5 shifts per
|
/* 4-point reversible, orthonormal Walsh-Hadamard in 3.5 adds, 0.5 shifts per
|
||||||
pixel. */
|
pixel. */
|
||||||
void vp9_fwht4x4_c(int16_t *input, int16_t *output, int stride) {
|
void vp9_fwht4x4_c(const int16_t *input, int16_t *output, int stride) {
|
||||||
int i;
|
int i;
|
||||||
int a1, b1, c1, d1, e1;
|
int a1, b1, c1, d1, e1;
|
||||||
int16_t *ip = input;
|
const int16_t *ip = input;
|
||||||
int16_t *op = output;
|
int16_t *op = output;
|
||||||
|
|
||||||
for (i = 0; i < 4; i++) {
|
for (i = 0; i < 4; i++) {
|
||||||
@@ -949,8 +949,8 @@ static const transform_2d FHT_16[] = {
|
|||||||
{ fadst16, fadst16 } // ADST_ADST = 3
|
{ fadst16, fadst16 } // ADST_ADST = 3
|
||||||
};
|
};
|
||||||
|
|
||||||
void vp9_short_fht16x16_c(int16_t *input, int16_t *output,
|
void vp9_short_fht16x16_c(const int16_t *input, int16_t *output,
|
||||||
int pitch, TX_TYPE tx_type) {
|
int stride, TX_TYPE tx_type) {
|
||||||
int16_t out[256];
|
int16_t out[256];
|
||||||
int16_t *outptr = &out[0];
|
int16_t *outptr = &out[0];
|
||||||
int i, j;
|
int i, j;
|
||||||
@@ -960,7 +960,7 @@ void vp9_short_fht16x16_c(int16_t *input, int16_t *output,
|
|||||||
// Columns
|
// Columns
|
||||||
for (i = 0; i < 16; ++i) {
|
for (i = 0; i < 16; ++i) {
|
||||||
for (j = 0; j < 16; ++j)
|
for (j = 0; j < 16; ++j)
|
||||||
temp_in[j] = input[j * pitch + i] * 4;
|
temp_in[j] = input[j * stride + i] * 4;
|
||||||
ht.cols(temp_in, temp_out);
|
ht.cols(temp_in, temp_out);
|
||||||
for (j = 0; j < 16; ++j)
|
for (j = 0; j < 16; ++j)
|
||||||
outptr[j * 16 + i] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2;
|
outptr[j * 16 + i] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2;
|
||||||
@@ -1311,7 +1311,7 @@ static void dct32_1d(const int *input, int *output, int round) {
|
|||||||
output[31] = dct_32_round(step[31] * cospi_31_64 + step[16] * -cospi_1_64);
|
output[31] = dct_32_round(step[31] * cospi_31_64 + step[16] * -cospi_1_64);
|
||||||
}
|
}
|
||||||
|
|
||||||
void vp9_fdct32x32_c(int16_t *input, int16_t *out, int stride) {
|
void vp9_fdct32x32_c(const int16_t *input, int16_t *out, int stride) {
|
||||||
int i, j;
|
int i, j;
|
||||||
int output[32 * 32];
|
int output[32 * 32];
|
||||||
|
|
||||||
@@ -1339,7 +1339,7 @@ void vp9_fdct32x32_c(int16_t *input, int16_t *out, int stride) {
|
|||||||
// Note that although we use dct_32_round in dct32_1d computation flow,
|
// Note that although we use dct_32_round in dct32_1d computation flow,
|
||||||
// this 2d fdct32x32 for rate-distortion optimization loop is operating
|
// this 2d fdct32x32 for rate-distortion optimization loop is operating
|
||||||
// within 16 bits precision.
|
// within 16 bits precision.
|
||||||
void vp9_fdct32x32_rd_c(int16_t *input, int16_t *out, int stride) {
|
void vp9_fdct32x32_rd_c(const int16_t *input, int16_t *out, int stride) {
|
||||||
int i, j;
|
int i, j;
|
||||||
int output[32 * 32];
|
int output[32 * 32];
|
||||||
|
|
||||||
|
@@ -29,7 +29,7 @@ static INLINE __m128i k_packs_epi64(__m128i a, __m128i b) {
|
|||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
void FDCT32x32_2D(int16_t *input,
|
void FDCT32x32_2D(const int16_t *input,
|
||||||
int16_t *output_org, int stride) {
|
int16_t *output_org, int stride) {
|
||||||
// Calculate pre-multiplied strides
|
// Calculate pre-multiplied strides
|
||||||
const int str1 = stride;
|
const int str1 = stride;
|
||||||
@@ -93,13 +93,13 @@ void FDCT32x32_2D(int16_t *input,
|
|||||||
// Note: even though all the loads below are aligned, using the aligned
|
// Note: even though all the loads below are aligned, using the aligned
|
||||||
// intrinsic make the code slightly slower.
|
// intrinsic make the code slightly slower.
|
||||||
if (0 == pass) {
|
if (0 == pass) {
|
||||||
int16_t *in = &input[column_start];
|
const int16_t *in = &input[column_start];
|
||||||
// step1[i] = (in[ 0 * stride] + in[(32 - 1) * stride]) << 2;
|
// step1[i] = (in[ 0 * stride] + in[(32 - 1) * stride]) << 2;
|
||||||
// Note: the next four blocks could be in a loop. That would help the
|
// Note: the next four blocks could be in a loop. That would help the
|
||||||
// instruction cache but is actually slower.
|
// instruction cache but is actually slower.
|
||||||
{
|
{
|
||||||
int16_t *ina = in + 0 * str1;
|
const int16_t *ina = in + 0 * str1;
|
||||||
int16_t *inb = in + 31 * str1;
|
const int16_t *inb = in + 31 * str1;
|
||||||
__m128i *step1a = &step1[ 0];
|
__m128i *step1a = &step1[ 0];
|
||||||
__m128i *step1b = &step1[31];
|
__m128i *step1b = &step1[31];
|
||||||
const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina));
|
const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina));
|
||||||
@@ -128,8 +128,8 @@ void FDCT32x32_2D(int16_t *input,
|
|||||||
step1b[-0] = _mm_slli_epi16(step1b[-0], 2);
|
step1b[-0] = _mm_slli_epi16(step1b[-0], 2);
|
||||||
}
|
}
|
||||||
{
|
{
|
||||||
int16_t *ina = in + 4 * str1;
|
const int16_t *ina = in + 4 * str1;
|
||||||
int16_t *inb = in + 27 * str1;
|
const int16_t *inb = in + 27 * str1;
|
||||||
__m128i *step1a = &step1[ 4];
|
__m128i *step1a = &step1[ 4];
|
||||||
__m128i *step1b = &step1[27];
|
__m128i *step1b = &step1[27];
|
||||||
const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina));
|
const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina));
|
||||||
@@ -158,8 +158,8 @@ void FDCT32x32_2D(int16_t *input,
|
|||||||
step1b[-0] = _mm_slli_epi16(step1b[-0], 2);
|
step1b[-0] = _mm_slli_epi16(step1b[-0], 2);
|
||||||
}
|
}
|
||||||
{
|
{
|
||||||
int16_t *ina = in + 8 * str1;
|
const int16_t *ina = in + 8 * str1;
|
||||||
int16_t *inb = in + 23 * str1;
|
const int16_t *inb = in + 23 * str1;
|
||||||
__m128i *step1a = &step1[ 8];
|
__m128i *step1a = &step1[ 8];
|
||||||
__m128i *step1b = &step1[23];
|
__m128i *step1b = &step1[23];
|
||||||
const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina));
|
const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina));
|
||||||
@@ -188,8 +188,8 @@ void FDCT32x32_2D(int16_t *input,
|
|||||||
step1b[-0] = _mm_slli_epi16(step1b[-0], 2);
|
step1b[-0] = _mm_slli_epi16(step1b[-0], 2);
|
||||||
}
|
}
|
||||||
{
|
{
|
||||||
int16_t *ina = in + 12 * str1;
|
const int16_t *ina = in + 12 * str1;
|
||||||
int16_t *inb = in + 19 * str1;
|
const int16_t *inb = in + 19 * str1;
|
||||||
__m128i *step1a = &step1[12];
|
__m128i *step1a = &step1[12];
|
||||||
__m128i *step1b = &step1[19];
|
__m128i *step1b = &step1[19];
|
||||||
const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina));
|
const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina));
|
||||||
|
@@ -12,7 +12,7 @@
|
|||||||
#include "vp9/common/vp9_idct.h" // for cospi constants
|
#include "vp9/common/vp9_idct.h" // for cospi constants
|
||||||
#include "vpx_ports/mem.h"
|
#include "vpx_ports/mem.h"
|
||||||
|
|
||||||
void vp9_fdct4x4_sse2(int16_t *input, int16_t *output, int stride) {
|
void vp9_fdct4x4_sse2(const int16_t *input, int16_t *output, int stride) {
|
||||||
// The 2D transform is done with two passes which are actually pretty
|
// The 2D transform is done with two passes which are actually pretty
|
||||||
// similar. In the first one, we transform the columns and transpose
|
// similar. In the first one, we transform the columns and transpose
|
||||||
// the results. In the second one, we transform the rows. To achieve that,
|
// the results. In the second one, we transform the rows. To achieve that,
|
||||||
@@ -111,7 +111,8 @@ void vp9_fdct4x4_sse2(int16_t *input, int16_t *output, int stride) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
static INLINE void load_buffer_4x4(int16_t *input, __m128i *in, int stride) {
|
static INLINE void load_buffer_4x4(const int16_t *input, __m128i *in,
|
||||||
|
int stride) {
|
||||||
const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1);
|
const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1);
|
||||||
const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0);
|
const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0);
|
||||||
__m128i mask;
|
__m128i mask;
|
||||||
@@ -242,7 +243,7 @@ void fadst4_1d_sse2(__m128i *in) {
|
|||||||
transpose_4x4(in);
|
transpose_4x4(in);
|
||||||
}
|
}
|
||||||
|
|
||||||
void vp9_short_fht4x4_sse2(int16_t *input, int16_t *output,
|
void vp9_short_fht4x4_sse2(const int16_t *input, int16_t *output,
|
||||||
int stride, int tx_type) {
|
int stride, int tx_type) {
|
||||||
__m128i in[4];
|
__m128i in[4];
|
||||||
load_buffer_4x4(input, in, stride);
|
load_buffer_4x4(input, in, stride);
|
||||||
@@ -270,7 +271,7 @@ void vp9_short_fht4x4_sse2(int16_t *input, int16_t *output,
|
|||||||
write_buffer_4x4(output, in);
|
write_buffer_4x4(output, in);
|
||||||
}
|
}
|
||||||
|
|
||||||
void vp9_fdct8x8_sse2(int16_t *input, int16_t *output, int stride) {
|
void vp9_fdct8x8_sse2(const int16_t *input, int16_t *output, int stride) {
|
||||||
int pass;
|
int pass;
|
||||||
// Constants
|
// Constants
|
||||||
// When we use them, in one case, they are all the same. In all others
|
// When we use them, in one case, they are all the same. In all others
|
||||||
@@ -527,15 +528,16 @@ void vp9_fdct8x8_sse2(int16_t *input, int16_t *output, int stride) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// load 8x8 array
|
// load 8x8 array
|
||||||
static INLINE void load_buffer_8x8(int16_t *input, __m128i *in, int stride) {
|
static INLINE void load_buffer_8x8(const int16_t *input, __m128i *in,
|
||||||
in[0] = _mm_load_si128((__m128i *)(input + 0 * stride));
|
int stride) {
|
||||||
in[1] = _mm_load_si128((__m128i *)(input + 1 * stride));
|
in[0] = _mm_load_si128((const __m128i *)(input + 0 * stride));
|
||||||
in[2] = _mm_load_si128((__m128i *)(input + 2 * stride));
|
in[1] = _mm_load_si128((const __m128i *)(input + 1 * stride));
|
||||||
in[3] = _mm_load_si128((__m128i *)(input + 3 * stride));
|
in[2] = _mm_load_si128((const __m128i *)(input + 2 * stride));
|
||||||
in[4] = _mm_load_si128((__m128i *)(input + 4 * stride));
|
in[3] = _mm_load_si128((const __m128i *)(input + 3 * stride));
|
||||||
in[5] = _mm_load_si128((__m128i *)(input + 5 * stride));
|
in[4] = _mm_load_si128((const __m128i *)(input + 4 * stride));
|
||||||
in[6] = _mm_load_si128((__m128i *)(input + 6 * stride));
|
in[5] = _mm_load_si128((const __m128i *)(input + 5 * stride));
|
||||||
in[7] = _mm_load_si128((__m128i *)(input + 7 * stride));
|
in[6] = _mm_load_si128((const __m128i *)(input + 6 * stride));
|
||||||
|
in[7] = _mm_load_si128((const __m128i *)(input + 7 * stride));
|
||||||
|
|
||||||
in[0] = _mm_slli_epi16(in[0], 2);
|
in[0] = _mm_slli_epi16(in[0], 2);
|
||||||
in[1] = _mm_slli_epi16(in[1], 2);
|
in[1] = _mm_slli_epi16(in[1], 2);
|
||||||
@@ -1025,7 +1027,7 @@ void fadst8_1d_sse2(__m128i *in) {
|
|||||||
array_transpose_8x8(in, in);
|
array_transpose_8x8(in, in);
|
||||||
}
|
}
|
||||||
|
|
||||||
void vp9_short_fht8x8_sse2(int16_t *input, int16_t *output,
|
void vp9_short_fht8x8_sse2(const int16_t *input, int16_t *output,
|
||||||
int stride, int tx_type) {
|
int stride, int tx_type) {
|
||||||
__m128i in[8];
|
__m128i in[8];
|
||||||
load_buffer_8x8(input, in, stride);
|
load_buffer_8x8(input, in, stride);
|
||||||
@@ -1054,7 +1056,7 @@ void vp9_short_fht8x8_sse2(int16_t *input, int16_t *output,
|
|||||||
write_buffer_8x8(output, in, 8);
|
write_buffer_8x8(output, in, 8);
|
||||||
}
|
}
|
||||||
|
|
||||||
void vp9_fdct16x16_sse2(int16_t *input, int16_t *output, int stride) {
|
void vp9_fdct16x16_sse2(const int16_t *input, int16_t *output, int stride) {
|
||||||
// The 2D transform is done with two passes which are actually pretty
|
// The 2D transform is done with two passes which are actually pretty
|
||||||
// similar. In the first one, we transform the columns and transpose
|
// similar. In the first one, we transform the columns and transpose
|
||||||
// the results. In the second one, we transform the rows. To achieve that,
|
// the results. In the second one, we transform the rows. To achieve that,
|
||||||
@@ -1064,7 +1066,7 @@ void vp9_fdct16x16_sse2(int16_t *input, int16_t *output, int stride) {
|
|||||||
int pass;
|
int pass;
|
||||||
// We need an intermediate buffer between passes.
|
// We need an intermediate buffer between passes.
|
||||||
DECLARE_ALIGNED_ARRAY(16, int16_t, intermediate, 256);
|
DECLARE_ALIGNED_ARRAY(16, int16_t, intermediate, 256);
|
||||||
int16_t *in = input;
|
const int16_t *in = input;
|
||||||
int16_t *out = intermediate;
|
int16_t *out = intermediate;
|
||||||
// Constants
|
// Constants
|
||||||
// When we use them, in one case, they are all the same. In all others
|
// When we use them, in one case, they are all the same. In all others
|
||||||
@@ -1679,7 +1681,7 @@ void vp9_fdct16x16_sse2(int16_t *input, int16_t *output, int stride) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
static INLINE void load_buffer_16x16(int16_t* input, __m128i *in0,
|
static INLINE void load_buffer_16x16(const int16_t* input, __m128i *in0,
|
||||||
__m128i *in1, int stride) {
|
__m128i *in1, int stride) {
|
||||||
// load first 8 columns
|
// load first 8 columns
|
||||||
load_buffer_8x8(input, in0, stride);
|
load_buffer_8x8(input, in0, stride);
|
||||||
@@ -2531,7 +2533,7 @@ void fadst16_1d_sse2(__m128i *in0, __m128i *in1) {
|
|||||||
array_transpose_16x16(in0, in1);
|
array_transpose_16x16(in0, in1);
|
||||||
}
|
}
|
||||||
|
|
||||||
void vp9_short_fht16x16_sse2(int16_t *input, int16_t *output,
|
void vp9_short_fht16x16_sse2(const int16_t *input, int16_t *output,
|
||||||
int stride, int tx_type) {
|
int stride, int tx_type) {
|
||||||
__m128i in0[16], in1[16];
|
__m128i in0[16], in1[16];
|
||||||
load_buffer_16x16(input, in0, in1, stride);
|
load_buffer_16x16(input, in0, in1, stride);
|
||||||
|
Reference in New Issue
Block a user