From 600a3860a4d3d7567eda9d81e3fc00de0f0f479c Mon Sep 17 00:00:00 2001 From: Dmitry Kovalev Date: Thu, 24 Oct 2013 11:48:25 -0700 Subject: [PATCH] Making input pointer constant for all fdct/fht functions. Change-Id: I78f7012f967a777ddd39bae6671eb501df6bbfe8 --- test/dct16x16_test.cc | 13 +++++----- test/dct32x32_test.cc | 4 +-- test/fdct8x8_test.cc | 15 ++++++------ vp9/common/vp9_rtcd_defs.sh | 18 +++++++------- vp9/encoder/vp9_block.h | 2 +- vp9/encoder/vp9_dct.c | 36 +++++++++++++-------------- vp9/encoder/x86/vp9_dct32x32_sse2.c | 20 +++++++-------- vp9/encoder/x86/vp9_dct_sse2.c | 38 +++++++++++++++-------------- 8 files changed, 75 insertions(+), 71 deletions(-) diff --git a/test/dct16x16_test.cc b/test/dct16x16_test.cc index 451aa6038..b61df8d0d 100644 --- a/test/dct16x16_test.cc +++ b/test/dct16x16_test.cc @@ -257,17 +257,18 @@ void reference_16x16_dct_2d(int16_t input[256], double output[256]) { } } -typedef void (*fdct_t)(int16_t *in, int16_t *out, int stride); -typedef void (*idct_t)(const int16_t *in, uint8_t *dst, int stride); -typedef void (*fht_t) (int16_t *in, int16_t *out, int stride, int tx_type); -typedef void (*iht_t) (const int16_t *in, uint8_t *dst, int stride, +typedef void (*fdct_t)(const int16_t *in, int16_t *out, int stride); +typedef void (*idct_t)(const int16_t *in, uint8_t *out, int stride); +typedef void (*fht_t) (const int16_t *in, int16_t *out, int stride, + int tx_type); +typedef void (*iht_t) (const int16_t *in, uint8_t *out, int stride, int tx_type); -void fdct16x16_ref(int16_t *in, int16_t *out, int stride, int tx_type) { +void fdct16x16_ref(const int16_t *in, int16_t *out, int stride, int tx_type) { vp9_fdct16x16_c(in, out, stride); } -void fht16x16_ref(int16_t *in, int16_t *out, int stride, int tx_type) { +void fht16x16_ref(const int16_t *in, int16_t *out, int stride, int tx_type) { vp9_short_fht16x16_c(in, out, stride, tx_type); } diff --git a/test/dct32x32_test.cc b/test/dct32x32_test.cc index de3a438bf..1e792da8d 100644 --- a/test/dct32x32_test.cc +++ b/test/dct32x32_test.cc @@ -74,8 +74,8 @@ void reference_32x32_dct_2d(const int16_t input[kNumCoeffs], } } -typedef void (*fwd_txfm_t)(int16_t *in, int16_t *out, int stride); -typedef void (*inv_txfm_t)(const int16_t *in, uint8_t *dst, int stride); +typedef void (*fwd_txfm_t)(const int16_t *in, int16_t *out, int stride); +typedef void (*inv_txfm_t)(const int16_t *in, uint8_t *out, int stride); class Trans32x32Test : public PARAMS(fwd_txfm_t, inv_txfm_t, int) { public: diff --git a/test/fdct8x8_test.cc b/test/fdct8x8_test.cc index 863350382..3777b1151 100644 --- a/test/fdct8x8_test.cc +++ b/test/fdct8x8_test.cc @@ -28,17 +28,18 @@ void vp9_idct8x8_64_add_c(const int16_t *input, uint8_t *output, int pitch); using libvpx_test::ACMRandom; namespace { -typedef void (*fdct_t)(int16_t *in, int16_t *out, int stride); -typedef void (*idct_t)(const int16_t *in, uint8_t *dst, int stride); -typedef void (*fht_t) (int16_t *in, int16_t *out, int stride, int tx_type); -typedef void (*iht_t) (const int16_t *in, uint8_t *dst, int stride, - int tx_type); +typedef void (*fdct_t)(const int16_t *in, int16_t *out, int stride); +typedef void (*idct_t)(const int16_t *in, uint8_t *out, int stride); +typedef void (*fht_t) (const int16_t *in, int16_t *out, int stride, + int tx_type); +typedef void (*iht_t) (const int16_t *in, uint8_t *out, int stride, + int tx_type); -void fdct8x8_ref(int16_t *in, int16_t *out, int stride, int tx_type) { +void fdct8x8_ref(const int16_t *in, int16_t *out, int stride, int tx_type) { vp9_fdct8x8_c(in, out, stride); } -void fht8x8_ref(int16_t *in, int16_t *out, int stride, int tx_type) { +void fht8x8_ref(const int16_t *in, int16_t *out, int stride, int tx_type) { vp9_short_fht8x8_c(in, out, stride, tx_type); } diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh index 60636eee0..ba96e5ad6 100644 --- a/vp9/common/vp9_rtcd_defs.sh +++ b/vp9/common/vp9_rtcd_defs.sh @@ -686,31 +686,31 @@ if [ "$CONFIG_INTERNAL_STATS" = "yes" ]; then fi # fdct functions -prototype void vp9_short_fht4x4 "int16_t *InputData, int16_t *OutputData, int pitch, int tx_type" +prototype void vp9_short_fht4x4 "const int16_t *input, int16_t *output, int stride, int tx_type" specialize vp9_short_fht4x4 sse2 -prototype void vp9_short_fht8x8 "int16_t *InputData, int16_t *OutputData, int pitch, int tx_type" +prototype void vp9_short_fht8x8 "const int16_t *input, int16_t *output, int stride, int tx_type" specialize vp9_short_fht8x8 sse2 -prototype void vp9_short_fht16x16 "int16_t *InputData, int16_t *OutputData, int pitch, int tx_type" +prototype void vp9_short_fht16x16 "const int16_t *input, int16_t *output, int stride, int tx_type" specialize vp9_short_fht16x16 sse2 -prototype void vp9_fwht4x4 "int16_t *input, int16_t *output, int stride" +prototype void vp9_fwht4x4 "const int16_t *input, int16_t *output, int stride" specialize vp9_fwht4x4 -prototype void vp9_fdct4x4 "int16_t *input, int16_t *output, int stride" +prototype void vp9_fdct4x4 "const int16_t *input, int16_t *output, int stride" specialize vp9_fdct4x4 sse2 -prototype void vp9_fdct8x8 "int16_t *input, int16_t *output, int stride" +prototype void vp9_fdct8x8 "const int16_t *input, int16_t *output, int stride" specialize vp9_fdct8x8 sse2 -prototype void vp9_fdct16x16 "int16_t *input, int16_t *output, int stride" +prototype void vp9_fdct16x16 "const int16_t *input, int16_t *output, int stride" specialize vp9_fdct16x16 sse2 -prototype void vp9_fdct32x32 "int16_t *input, int16_t *output, int stride" +prototype void vp9_fdct32x32 "const int16_t *input, int16_t *output, int stride" specialize vp9_fdct32x32 sse2 -prototype void vp9_fdct32x32_rd "int16_t *input, int16_t *output, int stride" +prototype void vp9_fdct32x32_rd "const int16_t *input, int16_t *output, int stride" specialize vp9_fdct32x32_rd sse2 # diff --git a/vp9/encoder/vp9_block.h b/vp9/encoder/vp9_block.h index 12dad0311..db2564b4a 100644 --- a/vp9/encoder/vp9_block.h +++ b/vp9/encoder/vp9_block.h @@ -173,7 +173,7 @@ struct macroblock { BLOCK_SIZE sb_partitioning[4]; BLOCK_SIZE sb64_partitioning; - void (*fwd_txm4x4)(int16_t *input, int16_t *output, int pitch); + void (*fwd_txm4x4)(const int16_t *input, int16_t *output, int stride); }; // TODO(jingning): the variables used here are little complicated. need further diff --git a/vp9/encoder/vp9_dct.c b/vp9/encoder/vp9_dct.c index 94fcf9101..0a0afedfd 100644 --- a/vp9/encoder/vp9_dct.c +++ b/vp9/encoder/vp9_dct.c @@ -36,7 +36,7 @@ static void fdct4(const int16_t *input, int16_t *output) { output[3] = dct_const_round_shift(temp2); } -void vp9_fdct4x4_c(int16_t *input, int16_t *output, int stride) { +void vp9_fdct4x4_c(const int16_t *input, int16_t *output, int stride) { // The 2D transform is done with two passes which are actually pretty // similar. In the first one, we transform the columns and transpose // the results. In the second one, we transform the rows. To achieve that, @@ -46,7 +46,7 @@ void vp9_fdct4x4_c(int16_t *input, int16_t *output, int stride) { int pass; // We need an intermediate buffer between passes. int16_t intermediate[4 * 4]; - int16_t *in = input; + const int16_t *in = input; int16_t *out = intermediate; // Do the two transform/transpose passes for (pass = 0; pass < 2; ++pass) { @@ -148,8 +148,8 @@ static const transform_2d FHT_4[] = { { fadst4, fadst4 } // ADST_ADST = 3 }; -void vp9_short_fht4x4_c(int16_t *input, int16_t *output, - int pitch, TX_TYPE tx_type) { +void vp9_short_fht4x4_c(const int16_t *input, int16_t *output, + int stride, TX_TYPE tx_type) { int16_t out[4 * 4]; int16_t *outptr = &out[0]; int i, j; @@ -159,7 +159,7 @@ void vp9_short_fht4x4_c(int16_t *input, int16_t *output, // Columns for (i = 0; i < 4; ++i) { for (j = 0; j < 4; ++j) - temp_in[j] = input[j * pitch + i] * 16; + temp_in[j] = input[j * stride + i] * 16; if (i == 0 && temp_in[0]) temp_in[0] += 1; ht.cols(temp_in, temp_out); @@ -229,7 +229,7 @@ static void fdct8(const int16_t *input, int16_t *output) { output[7] = dct_const_round_shift(t3); } -void vp9_fdct8x8_c(int16_t *input, int16_t *final_output, int stride) { +void vp9_fdct8x8_c(const int16_t *input, int16_t *final_output, int stride) { int i, j; int16_t intermediate[64]; @@ -300,7 +300,7 @@ void vp9_fdct8x8_c(int16_t *input, int16_t *final_output, int stride) { } } -void vp9_fdct16x16_c(int16_t *input, int16_t *output, int stride) { +void vp9_fdct16x16_c(const int16_t *input, int16_t *output, int stride) { // The 2D transform is done with two passes which are actually pretty // similar. In the first one, we transform the columns and transpose // the results. In the second one, we transform the rows. To achieve that, @@ -310,7 +310,7 @@ void vp9_fdct16x16_c(int16_t *input, int16_t *output, int stride) { int pass; // We need an intermediate buffer between passes. int16_t intermediate[256]; - int16_t *in = input; + const int16_t *in = input; int16_t *out = intermediate; // Do the two transform/transpose passes for (pass = 0; pass < 2; ++pass) { @@ -556,8 +556,8 @@ static const transform_2d FHT_8[] = { { fadst8, fadst8 } // ADST_ADST = 3 }; -void vp9_short_fht8x8_c(int16_t *input, int16_t *output, - int pitch, TX_TYPE tx_type) { +void vp9_short_fht8x8_c(const int16_t *input, int16_t *output, + int stride, TX_TYPE tx_type) { int16_t out[64]; int16_t *outptr = &out[0]; int i, j; @@ -567,7 +567,7 @@ void vp9_short_fht8x8_c(int16_t *input, int16_t *output, // Columns for (i = 0; i < 8; ++i) { for (j = 0; j < 8; ++j) - temp_in[j] = input[j * pitch + i] * 4; + temp_in[j] = input[j * stride + i] * 4; ht.cols(temp_in, temp_out); for (j = 0; j < 8; ++j) outptr[j * 8 + i] = temp_out[j]; @@ -585,10 +585,10 @@ void vp9_short_fht8x8_c(int16_t *input, int16_t *output, /* 4-point reversible, orthonormal Walsh-Hadamard in 3.5 adds, 0.5 shifts per pixel. */ -void vp9_fwht4x4_c(int16_t *input, int16_t *output, int stride) { +void vp9_fwht4x4_c(const int16_t *input, int16_t *output, int stride) { int i; int a1, b1, c1, d1, e1; - int16_t *ip = input; + const int16_t *ip = input; int16_t *op = output; for (i = 0; i < 4; i++) { @@ -949,8 +949,8 @@ static const transform_2d FHT_16[] = { { fadst16, fadst16 } // ADST_ADST = 3 }; -void vp9_short_fht16x16_c(int16_t *input, int16_t *output, - int pitch, TX_TYPE tx_type) { +void vp9_short_fht16x16_c(const int16_t *input, int16_t *output, + int stride, TX_TYPE tx_type) { int16_t out[256]; int16_t *outptr = &out[0]; int i, j; @@ -960,7 +960,7 @@ void vp9_short_fht16x16_c(int16_t *input, int16_t *output, // Columns for (i = 0; i < 16; ++i) { for (j = 0; j < 16; ++j) - temp_in[j] = input[j * pitch + i] * 4; + temp_in[j] = input[j * stride + i] * 4; ht.cols(temp_in, temp_out); for (j = 0; j < 16; ++j) outptr[j * 16 + i] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2; @@ -1311,7 +1311,7 @@ static void dct32_1d(const int *input, int *output, int round) { output[31] = dct_32_round(step[31] * cospi_31_64 + step[16] * -cospi_1_64); } -void vp9_fdct32x32_c(int16_t *input, int16_t *out, int stride) { +void vp9_fdct32x32_c(const int16_t *input, int16_t *out, int stride) { int i, j; int output[32 * 32]; @@ -1339,7 +1339,7 @@ void vp9_fdct32x32_c(int16_t *input, int16_t *out, int stride) { // Note that although we use dct_32_round in dct32_1d computation flow, // this 2d fdct32x32 for rate-distortion optimization loop is operating // within 16 bits precision. -void vp9_fdct32x32_rd_c(int16_t *input, int16_t *out, int stride) { +void vp9_fdct32x32_rd_c(const int16_t *input, int16_t *out, int stride) { int i, j; int output[32 * 32]; diff --git a/vp9/encoder/x86/vp9_dct32x32_sse2.c b/vp9/encoder/x86/vp9_dct32x32_sse2.c index de47a5bf1..2d59775ce 100644 --- a/vp9/encoder/x86/vp9_dct32x32_sse2.c +++ b/vp9/encoder/x86/vp9_dct32x32_sse2.c @@ -29,7 +29,7 @@ static INLINE __m128i k_packs_epi64(__m128i a, __m128i b) { } #endif -void FDCT32x32_2D(int16_t *input, +void FDCT32x32_2D(const int16_t *input, int16_t *output_org, int stride) { // Calculate pre-multiplied strides const int str1 = stride; @@ -93,13 +93,13 @@ void FDCT32x32_2D(int16_t *input, // Note: even though all the loads below are aligned, using the aligned // intrinsic make the code slightly slower. if (0 == pass) { - int16_t *in = &input[column_start]; + const int16_t *in = &input[column_start]; // step1[i] = (in[ 0 * stride] + in[(32 - 1) * stride]) << 2; // Note: the next four blocks could be in a loop. That would help the // instruction cache but is actually slower. { - int16_t *ina = in + 0 * str1; - int16_t *inb = in + 31 * str1; + const int16_t *ina = in + 0 * str1; + const int16_t *inb = in + 31 * str1; __m128i *step1a = &step1[ 0]; __m128i *step1b = &step1[31]; const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina)); @@ -128,8 +128,8 @@ void FDCT32x32_2D(int16_t *input, step1b[-0] = _mm_slli_epi16(step1b[-0], 2); } { - int16_t *ina = in + 4 * str1; - int16_t *inb = in + 27 * str1; + const int16_t *ina = in + 4 * str1; + const int16_t *inb = in + 27 * str1; __m128i *step1a = &step1[ 4]; __m128i *step1b = &step1[27]; const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina)); @@ -158,8 +158,8 @@ void FDCT32x32_2D(int16_t *input, step1b[-0] = _mm_slli_epi16(step1b[-0], 2); } { - int16_t *ina = in + 8 * str1; - int16_t *inb = in + 23 * str1; + const int16_t *ina = in + 8 * str1; + const int16_t *inb = in + 23 * str1; __m128i *step1a = &step1[ 8]; __m128i *step1b = &step1[23]; const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina)); @@ -188,8 +188,8 @@ void FDCT32x32_2D(int16_t *input, step1b[-0] = _mm_slli_epi16(step1b[-0], 2); } { - int16_t *ina = in + 12 * str1; - int16_t *inb = in + 19 * str1; + const int16_t *ina = in + 12 * str1; + const int16_t *inb = in + 19 * str1; __m128i *step1a = &step1[12]; __m128i *step1b = &step1[19]; const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina)); diff --git a/vp9/encoder/x86/vp9_dct_sse2.c b/vp9/encoder/x86/vp9_dct_sse2.c index 25b9e7e46..dc115018e 100644 --- a/vp9/encoder/x86/vp9_dct_sse2.c +++ b/vp9/encoder/x86/vp9_dct_sse2.c @@ -12,7 +12,7 @@ #include "vp9/common/vp9_idct.h" // for cospi constants #include "vpx_ports/mem.h" -void vp9_fdct4x4_sse2(int16_t *input, int16_t *output, int stride) { +void vp9_fdct4x4_sse2(const int16_t *input, int16_t *output, int stride) { // The 2D transform is done with two passes which are actually pretty // similar. In the first one, we transform the columns and transpose // the results. In the second one, we transform the rows. To achieve that, @@ -111,7 +111,8 @@ void vp9_fdct4x4_sse2(int16_t *input, int16_t *output, int stride) { } } -static INLINE void load_buffer_4x4(int16_t *input, __m128i *in, int stride) { +static INLINE void load_buffer_4x4(const int16_t *input, __m128i *in, + int stride) { const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1); const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0); __m128i mask; @@ -242,7 +243,7 @@ void fadst4_1d_sse2(__m128i *in) { transpose_4x4(in); } -void vp9_short_fht4x4_sse2(int16_t *input, int16_t *output, +void vp9_short_fht4x4_sse2(const int16_t *input, int16_t *output, int stride, int tx_type) { __m128i in[4]; load_buffer_4x4(input, in, stride); @@ -270,7 +271,7 @@ void vp9_short_fht4x4_sse2(int16_t *input, int16_t *output, write_buffer_4x4(output, in); } -void vp9_fdct8x8_sse2(int16_t *input, int16_t *output, int stride) { +void vp9_fdct8x8_sse2(const int16_t *input, int16_t *output, int stride) { int pass; // Constants // When we use them, in one case, they are all the same. In all others @@ -527,15 +528,16 @@ void vp9_fdct8x8_sse2(int16_t *input, int16_t *output, int stride) { } // load 8x8 array -static INLINE void load_buffer_8x8(int16_t *input, __m128i *in, int stride) { - in[0] = _mm_load_si128((__m128i *)(input + 0 * stride)); - in[1] = _mm_load_si128((__m128i *)(input + 1 * stride)); - in[2] = _mm_load_si128((__m128i *)(input + 2 * stride)); - in[3] = _mm_load_si128((__m128i *)(input + 3 * stride)); - in[4] = _mm_load_si128((__m128i *)(input + 4 * stride)); - in[5] = _mm_load_si128((__m128i *)(input + 5 * stride)); - in[6] = _mm_load_si128((__m128i *)(input + 6 * stride)); - in[7] = _mm_load_si128((__m128i *)(input + 7 * stride)); +static INLINE void load_buffer_8x8(const int16_t *input, __m128i *in, + int stride) { + in[0] = _mm_load_si128((const __m128i *)(input + 0 * stride)); + in[1] = _mm_load_si128((const __m128i *)(input + 1 * stride)); + in[2] = _mm_load_si128((const __m128i *)(input + 2 * stride)); + in[3] = _mm_load_si128((const __m128i *)(input + 3 * stride)); + in[4] = _mm_load_si128((const __m128i *)(input + 4 * stride)); + in[5] = _mm_load_si128((const __m128i *)(input + 5 * stride)); + in[6] = _mm_load_si128((const __m128i *)(input + 6 * stride)); + in[7] = _mm_load_si128((const __m128i *)(input + 7 * stride)); in[0] = _mm_slli_epi16(in[0], 2); in[1] = _mm_slli_epi16(in[1], 2); @@ -1025,7 +1027,7 @@ void fadst8_1d_sse2(__m128i *in) { array_transpose_8x8(in, in); } -void vp9_short_fht8x8_sse2(int16_t *input, int16_t *output, +void vp9_short_fht8x8_sse2(const int16_t *input, int16_t *output, int stride, int tx_type) { __m128i in[8]; load_buffer_8x8(input, in, stride); @@ -1054,7 +1056,7 @@ void vp9_short_fht8x8_sse2(int16_t *input, int16_t *output, write_buffer_8x8(output, in, 8); } -void vp9_fdct16x16_sse2(int16_t *input, int16_t *output, int stride) { +void vp9_fdct16x16_sse2(const int16_t *input, int16_t *output, int stride) { // The 2D transform is done with two passes which are actually pretty // similar. In the first one, we transform the columns and transpose // the results. In the second one, we transform the rows. To achieve that, @@ -1064,7 +1066,7 @@ void vp9_fdct16x16_sse2(int16_t *input, int16_t *output, int stride) { int pass; // We need an intermediate buffer between passes. DECLARE_ALIGNED_ARRAY(16, int16_t, intermediate, 256); - int16_t *in = input; + const int16_t *in = input; int16_t *out = intermediate; // Constants // When we use them, in one case, they are all the same. In all others @@ -1679,7 +1681,7 @@ void vp9_fdct16x16_sse2(int16_t *input, int16_t *output, int stride) { } } -static INLINE void load_buffer_16x16(int16_t* input, __m128i *in0, +static INLINE void load_buffer_16x16(const int16_t* input, __m128i *in0, __m128i *in1, int stride) { // load first 8 columns load_buffer_8x8(input, in0, stride); @@ -2531,7 +2533,7 @@ void fadst16_1d_sse2(__m128i *in0, __m128i *in1) { array_transpose_16x16(in0, in1); } -void vp9_short_fht16x16_sse2(int16_t *input, int16_t *output, +void vp9_short_fht16x16_sse2(const int16_t *input, int16_t *output, int stride, int tx_type) { __m128i in0[16], in1[16]; load_buffer_16x16(input, in0, in1, stride);