diff --git a/vp9/common/vp9_idct.c b/vp9/common/vp9_idct.c
index 533f7f361..20b78bfed 100644
--- a/vp9/common/vp9_idct.c
+++ b/vp9/common/vp9_idct.c
@@ -96,7 +96,7 @@ void vp9_iwht4x4_1_add_c(const int16_t *in, uint8_t *dest, int dest_stride) {
   }
 }
 
-static void idct4_1d(const int16_t *input, int16_t *output) {
+static void idct4(const int16_t *input, int16_t *output) {
   int16_t step[4];
   int temp1, temp2;
   // stage 1
@@ -124,7 +124,7 @@ void vp9_idct4x4_16_add_c(const int16_t *input, uint8_t *dest, int stride) {
 
   // Rows
   for (i = 0; i < 4; ++i) {
-    idct4_1d(input, outptr);
+    idct4(input, outptr);
     input += 4;
     outptr += 4;
   }
@@ -133,7 +133,7 @@ void vp9_idct4x4_16_add_c(const int16_t *input, uint8_t *dest, int stride) {
   for (i = 0; i < 4; ++i) {
     for (j = 0; j < 4; ++j)
       temp_in[j] = out[j * 4 + i];
-    idct4_1d(temp_in, temp_out);
+    idct4(temp_in, temp_out);
     for (j = 0; j < 4; ++j)
       dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 4)
                                         + dest[j * stride + i]);
@@ -156,7 +156,7 @@ void vp9_idct4x4_1_add_c(const int16_t *input, uint8_t *dest, int dest_stride) {
   }
 }
 
-static void idct8_1d(const int16_t *input, int16_t *output) {
+static void idct8(const int16_t *input, int16_t *output) {
   int16_t step1[8], step2[8];
   int temp1, temp2;
   // stage 1
@@ -174,7 +174,7 @@ static void idct8_1d(const int16_t *input, int16_t *output) {
   step1[6] = dct_const_round_shift(temp2);
 
   // stage 2 & stage 3 - even half
-  idct4_1d(step1, step1);
+  idct4(step1, step1);
 
   // stage 2 - odd half
   step2[4] = step1[4] + step1[5];
@@ -209,7 +209,7 @@ void vp9_idct8x8_64_add_c(const int16_t *input, uint8_t *dest, int stride) {
 
   // First transform rows
   for (i = 0; i < 8; ++i) {
-    idct8_1d(input, outptr);
+    idct8(input, outptr);
     input += 8;
     outptr += 8;
   }
@@ -218,7 +218,7 @@ void vp9_idct8x8_64_add_c(const int16_t *input, uint8_t *dest, int stride) {
   for (i = 0; i < 8; ++i) {
     for (j = 0; j < 8; ++j)
       temp_in[j] = out[j * 8 + i];
-    idct8_1d(temp_in, temp_out);
+    idct8(temp_in, temp_out);
     for (j = 0; j < 8; ++j)
       dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5)
                                         + dest[j * stride + i]);
@@ -238,7 +238,7 @@ void vp9_idct8x8_1_add_c(const int16_t *input, uint8_t *dest, int stride) {
   }
 }
 
-static void iadst4_1d(const int16_t *input, int16_t *output) {
+static void iadst4(const int16_t *input, int16_t *output) {
   int s0, s1, s2, s3, s4, s5, s6, s7;
 
   int x0 = input[0];
@@ -283,10 +283,10 @@ void vp9_iht4x4_16_add_c(const int16_t *input, uint8_t *dest, int stride,
                          int tx_type) {
   const transform_2d IHT_4[] = {
-    { idct4_1d, idct4_1d },   // DCT_DCT  = 0
-    { iadst4_1d, idct4_1d },  // ADST_DCT = 1
-    { idct4_1d, iadst4_1d },  // DCT_ADST = 2
-    { iadst4_1d, iadst4_1d }  // ADST_ADST = 3
+    { idct4, idct4 },   // DCT_DCT  = 0
+    { iadst4, idct4 },  // ADST_DCT = 1
+    { idct4, iadst4 },  // DCT_ADST = 2
+    { iadst4, iadst4 }  // ADST_ADST = 3
   };
   int i, j;
@@ -311,7 +311,7 @@ void vp9_iht4x4_16_add_c(const int16_t *input, uint8_t *dest, int stride,
                                   + dest[j * stride + i]);
   }
 }
-static void iadst8_1d(const int16_t *input, int16_t *output) {
+static void iadst8(const int16_t *input, int16_t *output) {
   int s0, s1, s2, s3, s4, s5, s6, s7;
 
   int x0 = input[7];
@@ -389,10 +389,10 @@ static void iadst8_1d(const int16_t *input, int16_t *output) {
 }
 
 static const transform_2d IHT_8[] = {
-  { idct8_1d, idct8_1d },   // DCT_DCT  = 0
-  { iadst8_1d, idct8_1d },  // ADST_DCT = 1
-  { idct8_1d, iadst8_1d },  // DCT_ADST = 2
-  { iadst8_1d, iadst8_1d }  // ADST_ADST = 3
+  { idct8, idct8 },   // DCT_DCT  = 0
+  { iadst8, idct8 },  // ADST_DCT = 1
+  { idct8, iadst8 },  // DCT_ADST = 2
+  { iadst8, iadst8 }  // ADST_ADST = 3
 };
 
 void vp9_iht8x8_64_add_c(const int16_t *input, uint8_t *dest, int stride,
@@ -430,7 +430,7 @@ void vp9_idct8x8_10_add_c(const int16_t *input, uint8_t *dest, int stride) {
   // First transform rows
   // only first 4 row has non-zero coefs
   for (i = 0; i < 4; ++i) {
-    idct8_1d(input, outptr);
+    idct8(input, outptr);
     input += 8;
     outptr += 8;
   }
@@ -439,14 +439,14 @@ void vp9_idct8x8_10_add_c(const int16_t *input, uint8_t *dest, int stride) {
   for (i = 0; i < 8; ++i) {
     for (j = 0; j < 8; ++j)
       temp_in[j] = out[j * 8 + i];
-    idct8_1d(temp_in, temp_out);
+    idct8(temp_in, temp_out);
     for (j = 0; j < 8; ++j)
       dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5)
                                         + dest[j * stride + i]);
   }
 }
 
-static void idct16_1d(const int16_t *input, int16_t *output) {
+static void idct16(const int16_t *input, int16_t *output) {
   int16_t step1[16], step2[16];
   int temp1, temp2;
@@ -619,7 +619,7 @@ void vp9_idct16x16_256_add_c(const int16_t *input, uint8_t *dest, int stride) {
 
   // First transform rows
   for (i = 0; i < 16; ++i) {
-    idct16_1d(input, outptr);
+    idct16(input, outptr);
     input += 16;
     outptr += 16;
   }
@@ -628,14 +628,14 @@ void vp9_idct16x16_256_add_c(const int16_t *input, uint8_t *dest, int stride) {
   for (i = 0; i < 16; ++i) {
     for (j = 0; j < 16; ++j)
       temp_in[j] = out[j * 16 + i];
-    idct16_1d(temp_in, temp_out);
+    idct16(temp_in, temp_out);
     for (j = 0; j < 16; ++j)
       dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
                                         + dest[j * stride + i]);
   }
 }
 
-static void iadst16_1d(const int16_t *input, int16_t *output) {
+static void iadst16(const int16_t *input, int16_t *output) {
   int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15;
 
   int x0 = input[15];
@@ -807,10 +807,10 @@ static void iadst16_1d(const int16_t *input, int16_t *output) {
 }
 
 static const transform_2d IHT_16[] = {
-  { idct16_1d, idct16_1d },   // DCT_DCT  = 0
-  { iadst16_1d, idct16_1d },  // ADST_DCT = 1
-  { idct16_1d, iadst16_1d },  // DCT_ADST = 2
-  { iadst16_1d, iadst16_1d }  // ADST_ADST = 3
+  { idct16, idct16 },   // DCT_DCT  = 0
+  { iadst16, idct16 },  // ADST_DCT = 1
+  { idct16, iadst16 },  // DCT_ADST = 2
+  { iadst16, iadst16 }  // ADST_ADST = 3
 };
 
 void vp9_iht16x16_256_add_c(const int16_t *input, uint8_t *dest, int stride,
@@ -848,7 +848,7 @@ void vp9_idct16x16_10_add_c(const int16_t *input, uint8_t *dest, int stride) {
   // First transform rows. Since all non-zero dct coefficients are in
   // upper-left 4x4 area, we only need to calculate first 4 rows here.
   for (i = 0; i < 4; ++i) {
-    idct16_1d(input, outptr);
+    idct16(input, outptr);
     input += 16;
     outptr += 16;
   }
@@ -857,7 +857,7 @@ void vp9_idct16x16_10_add_c(const int16_t *input, uint8_t *dest, int stride) {
   for (i = 0; i < 16; ++i) {
     for (j = 0; j < 16; ++j)
       temp_in[j] = out[j*16 + i];
-    idct16_1d(temp_in, temp_out);
+    idct16(temp_in, temp_out);
     for (j = 0; j < 16; ++j)
       dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
                                         + dest[j * stride + i]);
@@ -877,7 +877,7 @@ void vp9_idct16x16_1_add_c(const int16_t *input, uint8_t *dest, int stride) {
   }
 }
 
-static void idct32_1d(const int16_t *input, int16_t *output) {
+static void idct32(const int16_t *input, int16_t *output) {
   int16_t step1[32], step2[32];
   int temp1, temp2;
@@ -1263,7 +1263,7 @@ void vp9_idct32x32_1024_add_c(const int16_t *input, uint8_t *dest, int stride) {
       zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
 
     if (zero_coeff[0] | zero_coeff[1])
-      idct32_1d(input, outptr);
+      idct32(input, outptr);
     else
       vpx_memset(outptr, 0, sizeof(int16_t) * 32);
     input += 32;
@@ -1274,7 +1274,7 @@ void vp9_idct32x32_1024_add_c(const int16_t *input, uint8_t *dest, int stride) {
   for (i = 0; i < 32; ++i) {
     for (j = 0; j < 32; ++j)
       temp_in[j] = out[j * 32 + i];
-    idct32_1d(temp_in, temp_out);
+    idct32(temp_in, temp_out);
     for (j = 0; j < 32; ++j)
       dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
                                         + dest[j * stride + i]);
@@ -1290,7 +1290,7 @@ void vp9_idct32x32_34_add_c(const int16_t *input, uint8_t *dest, int stride) {
   // Rows
   // only upper-left 8x8 has non-zero coeff
   for (i = 0; i < 8; ++i) {
-    idct32_1d(input, outptr);
+    idct32(input, outptr);
     input += 32;
     outptr += 32;
   }
@@ -1299,7 +1299,7 @@ void vp9_idct32x32_34_add_c(const int16_t *input, uint8_t *dest, int stride) {
   for (i = 0; i < 32; ++i) {
     for (j = 0; j < 32; ++j)
       temp_in[j] = out[j * 32 + i];
-    idct32_1d(temp_in, temp_out);
+    idct32(temp_in, temp_out);
     for (j = 0; j < 32; ++j)
       dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
                                         + dest[j * stride + i]);
diff --git a/vp9/common/x86/vp9_idct_intrin_sse2.c b/vp9/common/x86/vp9_idct_intrin_sse2.c
index 2f6149464..13a5b5a82 100644
--- a/vp9/common/x86/vp9_idct_intrin_sse2.c
+++ b/vp9/common/x86/vp9_idct_intrin_sse2.c
@@ -180,7 +180,7 @@ static INLINE void transpose_4x4(__m128i *res) {
   res[1] = _mm_unpackhi_epi16(tr0_0, tr0_1);
 }
 
-static void idct4_1d_sse2(__m128i *in) {
+static void idct4_sse2(__m128i *in) {
   const __m128i k__cospi_p16_p16 = pair_set_epi16(cospi_16_64, cospi_16_64);
   const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
   const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
@@ -216,7 +216,7 @@ static void idct4_1d_sse2(__m128i *in) {
   in[1] = _mm_shuffle_epi32(in[1], 0x4E);
 }
 
-static void iadst4_1d_sse2(__m128i *in) {
+static void iadst4_sse2(__m128i *in) {
   const __m128i k__sinpi_p01_p04 = pair_set_epi16(sinpi_1_9, sinpi_4_9);
   const __m128i k__sinpi_p03_p02 = pair_set_epi16(sinpi_3_9, sinpi_2_9);
   const __m128i k__sinpi_p02_m01 = pair_set_epi16(sinpi_2_9, -sinpi_1_9);
@@ -276,20 +276,20 @@ void vp9_iht4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride,
   switch (tx_type) {
     case 0:  // DCT_DCT
-      idct4_1d_sse2(in);
-      idct4_1d_sse2(in);
+      idct4_sse2(in);
+      idct4_sse2(in);
       break;
     case 1:  // ADST_DCT
-      idct4_1d_sse2(in);
-      iadst4_1d_sse2(in);
+      idct4_sse2(in);
+      iadst4_sse2(in);
       break;
     case 2:  // DCT_ADST
-      iadst4_1d_sse2(in);
-      idct4_1d_sse2(in);
+      iadst4_sse2(in);
+      idct4_sse2(in);
       break;
     case 3:  // ADST_ADST
-      iadst4_1d_sse2(in);
-      iadst4_1d_sse2(in);
+      iadst4_sse2(in);
+      iadst4_sse2(in);
       break;
     default:
       assert(0);
@@ -455,7 +455,7 @@ void vp9_iht4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride,
     res1 = _mm_packs_epi32(tmp2, tmp3); \
   }
 
-#define IDCT8_1D(in0, in1, in2, in3, in4, in5, in6, in7, \
+#define IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, \
                  out0, out1, out2, out3, out4, out5, out6, out7) \
   { \
     /* Stage1 */ \
@@ -573,7 +573,7 @@ void vp9_idct8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
                   in0, in1, in2, in3, in4, in5, in6, in7);
 
     // 4-stage 1D idct8x8
-    IDCT8_1D(in0, in1, in2, in3, in4, in5, in6, in7,
+    IDCT8(in0, in1, in2, in3, in4, in5, in6, in7,
           in0, in1, in2, in3, in4, in5, in6, in7);
   }
@@ -674,7 +674,7 @@ static INLINE void array_transpose_4X8(__m128i *in, __m128i * out) {
   out[3] = _mm_unpackhi_epi64(tr1_2, tr1_6);
 }
 
-static void idct8_1d_sse2(__m128i *in) {
+static void idct8_sse2(__m128i *in) {
   const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
   const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
   const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
@@ -695,11 +695,11 @@ static void idct8_1d_sse2(__m128i *in) {
                 in0, in1, in2, in3, in4, in5, in6, in7);
 
   // 4-stage 1D idct8x8
-  IDCT8_1D(in0, in1, in2, in3, in4, in5, in6, in7,
+  IDCT8(in0, in1, in2, in3, in4, in5, in6, in7,
         in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7]);
 }
 
-static void iadst8_1d_sse2(__m128i *in) {
+static void iadst8_sse2(__m128i *in) {
   const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
   const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
   const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
@@ -946,20 +946,20 @@ void vp9_iht8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride,
   switch (tx_type) {
     case 0:  // DCT_DCT
-      idct8_1d_sse2(in);
-      idct8_1d_sse2(in);
+      idct8_sse2(in);
+      idct8_sse2(in);
       break;
     case 1:  // ADST_DCT
-      idct8_1d_sse2(in);
-      iadst8_1d_sse2(in);
+      idct8_sse2(in);
+      iadst8_sse2(in);
       break;
     case 2:  // DCT_ADST
-      iadst8_1d_sse2(in);
-      idct8_1d_sse2(in);
+      iadst8_sse2(in);
+      idct8_sse2(in);
       break;
     case 3:  // ADST_ADST
-      iadst8_1d_sse2(in);
-      iadst8_1d_sse2(in);
+      iadst8_sse2(in);
+      iadst8_sse2(in);
       break;
     default:
       assert(0);
@@ -1104,7 +1104,7 @@ void vp9_idct8x8_10_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
 
   TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, in0, in1, in2, in3)
 
-  IDCT8_1D(in0, in1, in2, in3, zero, zero, zero, zero,
+  IDCT8(in0, in1, in2, in3, zero, zero, zero, zero,
         in0, in1, in2, in3, in4, in5, in6, in7);
   // Final rounding and shift
   in0 = _mm_adds_epi16(in0, final_rounding);
@@ -1135,7 +1135,7 @@ void vp9_idct8x8_10_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
   RECON_AND_STORE(dest, in7);
 }
 
-#define IDCT16_1D \
+#define IDCT16 \
   /* Stage2 */ \
   { \
     const __m128i lo_1_15 = _mm_unpacklo_epi16(in[1], in[15]); \
@@ -1264,7 +1264,7 @@ void vp9_idct8x8_10_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
                          stp2_10, stp2_13, stp2_11, stp2_12) \
   }
 
-#define IDCT16_10_1D \
+#define IDCT16_10 \
   /* Stage2 */ \
   { \
     const __m128i lo_1_15 = _mm_unpacklo_epi16(in[1], zero); \
@@ -1437,7 +1437,7 @@ void vp9_idct16x16_256_add_sse2(const int16_t *input, uint8_t *dest,
       array_transpose_8x8(in, in);
       array_transpose_8x8(in+8, in+8);
 
-      IDCT16_1D
+      IDCT16
 
       // Stage7
       curr1[0] = _mm_add_epi16(stp2_0, stp1_15);
@@ -1465,7 +1465,7 @@ void vp9_idct16x16_256_add_sse2(const int16_t *input, uint8_t *dest,
       array_transpose_8x8(l+i*8, in);
      array_transpose_8x8(r+i*8, in+8);
 
-      IDCT16_1D
+      IDCT16
 
       // 2-D
      in[0] = _mm_add_epi16(stp2_0, stp1_15);
@@ -1590,7 +1590,7 @@ static INLINE void array_transpose_16x16(__m128i *res0, __m128i *res1) {
   res0[15] = tbuf[7];
 }
 
-static void iadst16_1d_8col(__m128i *in) {
+static void iadst16_8col(__m128i *in) {
   // perform 16x16 1-D ADST for 8 columns
   __m128i s[16], x[16], u[32], v[32];
   const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64);
@@ -2060,7 +2060,7 @@
   in[15] = _mm_sub_epi16(kZero, s[1]);
 }
 
-static void idct16_1d_8col(__m128i *in) {
+static void idct16_8col(__m128i *in) {
   const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
   const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
   const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
@@ -2404,16 +2404,16 @@
   in[15] = _mm_sub_epi16(s[0], s[15]);
 }
 
-static void idct16_1d_sse2(__m128i *in0, __m128i *in1) {
+static void idct16_sse2(__m128i *in0, __m128i *in1) {
   array_transpose_16x16(in0, in1);
-  idct16_1d_8col(in0);
-  idct16_1d_8col(in1);
+  idct16_8col(in0);
+  idct16_8col(in1);
 }
 
-static void iadst16_1d_sse2(__m128i *in0, __m128i *in1) {
+static void iadst16_sse2(__m128i *in0, __m128i *in1) {
   array_transpose_16x16(in0, in1);
-  iadst16_1d_8col(in0);
-  iadst16_1d_8col(in1);
+  iadst16_8col(in0);
+  iadst16_8col(in1);
 }
 
 static INLINE void load_buffer_8x16(const int16_t *input, __m128i *in) {
@@ -2502,20 +2502,20 @@ void vp9_iht16x16_256_add_sse2(const int16_t *input, uint8_t *dest, int stride,
   switch (tx_type) {
     case 0:  // DCT_DCT
-      idct16_1d_sse2(in0, in1);
-      idct16_1d_sse2(in0, in1);
+      idct16_sse2(in0, in1);
+      idct16_sse2(in0, in1);
       break;
     case 1:  // ADST_DCT
-      idct16_1d_sse2(in0, in1);
-      iadst16_1d_sse2(in0, in1);
+      idct16_sse2(in0, in1);
+      iadst16_sse2(in0, in1);
       break;
     case 2:  // DCT_ADST
-      iadst16_1d_sse2(in0, in1);
-      idct16_1d_sse2(in0, in1);
+      iadst16_sse2(in0, in1);
+      idct16_sse2(in0, in1);
       break;
     case 3:  // ADST_ADST
-      iadst16_1d_sse2(in0, in1);
-      iadst16_1d_sse2(in0, in1);
+      iadst16_sse2(in0, in1);
+      iadst16_sse2(in0, in1);
       break;
     default:
       assert(0);
@@ -2732,7 +2732,7 @@ void vp9_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest,
   for (i = 0; i < 2; i++) {
     array_transpose_4X8(l + 8*i, in);
 
-    IDCT16_10_1D
+    IDCT16_10
 
     // Stage7
     in[0] = _mm_add_epi16(stp2_0, stp1_15);
@@ -2814,7 +2814,7 @@ void vp9_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest,
     input += 8; \
   } \
 
-#define IDCT32_1D_34 \
+#define IDCT32_34 \
   /* Stage1 */ \
   { \
     const __m128i zero = _mm_setzero_si128();\
@@ -3115,7 +3115,7 @@ void vp9_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest,
   }
 
 
-#define IDCT32_1D \
+#define IDCT32 \
   /* Stage1 */ \
   { \
     const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], in[31]); \
@@ -3554,7 +3554,7 @@ void vp9_idct32x32_34_add_sse2(const int16_t *input, uint8_t *dest,
     array_transpose_8x8(in+16, in+16);
     array_transpose_8x8(in+24, in+24);
 
-    IDCT32_1D
+    IDCT32
 
     // 1_D: Store 32 intermediate results for each 8x32 block.
     col[0] = _mm_add_epi16(stp1_0, stp1_31);
@@ -3593,7 +3593,7 @@ void vp9_idct32x32_34_add_sse2(const int16_t *input, uint8_t *dest,
     const __m128i zero = _mm_setzero_si128();
     // Transpose 32x8 block to 8x32 block
     array_transpose_8x8(col+i*8, in);
-    IDCT32_1D_34
+    IDCT32_34
 
     // 2_D: Calculate the results and store them to destination.
     in[0] = _mm_add_epi16(stp1_0, stp1_31);
@@ -3922,7 +3922,7 @@ void vp9_idct32x32_1024_add_sse2(const int16_t *input, uint8_t *dest,
       array_transpose_8x8(in+16, in+16);
       array_transpose_8x8(in+24, in+24);
 
-      IDCT32_1D
+      IDCT32
 
       // 1_D: Store 32 intermediate results for each 8x32 block.
       col[i32 + 0] = _mm_add_epi16(stp1_0, stp1_31);
@@ -3969,7 +3969,7 @@ void vp9_idct32x32_1024_add_sse2(const int16_t *input, uint8_t *dest,
       array_transpose_8x8(col+j+64, in+16);
       array_transpose_8x8(col+j+96, in+24);
 
-      IDCT32_1D
+      IDCT32
 
       // 2_D: Calculate the results and store them to destination.
      in[0] = _mm_add_epi16(stp1_0, stp1_31);
diff --git a/vp9/encoder/vp9_dct.c b/vp9/encoder/vp9_dct.c
index 0f4a6bb63..a840b480a 100644
--- a/vp9/encoder/vp9_dct.c
+++ b/vp9/encoder/vp9_dct.c
@@ -997,7 +997,7 @@ static INLINE int half_round_shift(int input) {
   return rv;
 }
 
-static void dct32_1d(const int *input, int *output, int round) {
+static void fdct32(const int *input, int *output, int round) {
   int step[32];
   // Stage 1
   step[0] = input[0] + input[(32 - 1)];
@@ -1329,7 +1329,7 @@ void vp9_fdct32x32_c(const int16_t *input, int16_t *out, int stride) {
     int temp_in[32], temp_out[32];
     for (j = 0; j < 32; ++j)
       temp_in[j] = input[j * stride + i] * 4;
-    dct32_1d(temp_in, temp_out, 0);
+    fdct32(temp_in, temp_out, 0);
     for (j = 0; j < 32; ++j)
       output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
   }
@@ -1339,13 +1339,13 @@ void vp9_fdct32x32_c(const int16_t *input, int16_t *out, int stride) {
     int temp_in[32], temp_out[32];
     for (j = 0; j < 32; ++j)
      temp_in[j] = output[j + i * 32];
-    dct32_1d(temp_in, temp_out, 0);
+    fdct32(temp_in, temp_out, 0);
     for (j = 0; j < 32; ++j)
       out[j + i * 32] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2;
   }
 }
 
-// Note that although we use dct_32_round in dct32_1d computation flow,
+// Note that although we use dct_32_round in dct32 computation flow,
 // this 2d fdct32x32 for rate-distortion optimization loop is operating
 // within 16 bits precision.
 void vp9_fdct32x32_rd_c(const int16_t *input, int16_t *out, int stride) {
@@ -1357,7 +1357,7 @@ void vp9_fdct32x32_rd_c(const int16_t *input, int16_t *out, int stride) {
     int temp_in[32], temp_out[32];
     for (j = 0; j < 32; ++j)
       temp_in[j] = input[j * stride + i] * 4;
-    dct32_1d(temp_in, temp_out, 0);
+    fdct32(temp_in, temp_out, 0);
     for (j = 0; j < 32; ++j)
       // TODO(cd): see quality impact of only doing
       //           output[j * 32 + i] = (temp_out[j] + 1) >> 2;
@@ -1370,7 +1370,7 @@ void vp9_fdct32x32_rd_c(const int16_t *input, int16_t *out, int stride) {
     int temp_in[32], temp_out[32];
     for (j = 0; j < 32; ++j)
       temp_in[j] = output[j + i * 32];
-    dct32_1d(temp_in, temp_out, 1);
+    fdct32(temp_in, temp_out, 1);
     for (j = 0; j < 32; ++j)
       out[j + i * 32] = temp_out[j];
   }
diff --git a/vp9/encoder/x86/vp9_dct_avx2.c b/vp9/encoder/x86/vp9_dct_avx2.c
index d81b72bba..ea031fb07 100644
--- a/vp9/encoder/x86/vp9_dct_avx2.c
+++ b/vp9/encoder/x86/vp9_dct_avx2.c
@@ -163,7 +163,7 @@ static INLINE void transpose_4x4_avx2(__m128i *res) {
   res[3] = _mm_unpackhi_epi64(res[2], res[2]);
 }
 
-void fdct4_1d_avx2(__m128i *in) {
+void fdct4_avx2(__m128i *in) {
   const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
   const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
   const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
@@ -196,7 +196,7 @@ void fdct4_1d_avx2(__m128i *in) {
   transpose_4x4_avx2(in);
 }
 
-void fadst4_1d_avx2(__m128i *in) {
+void fadst4_avx2(__m128i *in) {
   const __m128i k__sinpi_p01_p02 = pair_set_epi16(sinpi_1_9, sinpi_2_9);
   const __m128i k__sinpi_p04_m01 = pair_set_epi16(sinpi_4_9, -sinpi_1_9);
   const __m128i k__sinpi_p03_p04 = pair_set_epi16(sinpi_3_9, sinpi_4_9);
@@ -250,20 +250,20 @@ void vp9_short_fht4x4_avx2(const int16_t *input, int16_t *output,
   load_buffer_4x4_avx2(input, in, stride);
   switch (tx_type) {
     case 0:  // DCT_DCT
-      fdct4_1d_avx2(in);
-      fdct4_1d_avx2(in);
+      fdct4_avx2(in);
+      fdct4_avx2(in);
       break;
     case 1:  // ADST_DCT
-      fadst4_1d_avx2(in);
-      fdct4_1d_avx2(in);
+      fadst4_avx2(in);
+      fdct4_avx2(in);
       break;
     case 2:  // DCT_ADST
-      fdct4_1d_avx2(in);
-      fadst4_1d_avx2(in);
+      fdct4_avx2(in);
+      fadst4_avx2(in);
       break;
     case 3:  // ADST_ADST
-      fadst4_1d_avx2(in);
-      fadst4_1d_avx2(in);
+      fadst4_avx2(in);
+      fadst4_avx2(in);
       break;
     default:
       assert(0);
@@ -658,7 +658,7 @@ static INLINE void array_transpose_8x8_avx2(__m128i *in, __m128i *res) {
   // 07 17 27 37 47 57 67 77
 }
 
-void fdct8_1d_avx2(__m128i *in) {
+void fdct8_avx2(__m128i *in) {
   // constants
   const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
   const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
@@ -798,7 +798,7 @@ void fdct8_1d_avx2(__m128i *in) {
   array_transpose_8x8_avx2(in, in);
 }
 
-void fadst8_1d_avx2(__m128i *in) {
+void fadst8_avx2(__m128i *in) {
   // Constants
   const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
   const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
@@ -1034,20 +1034,20 @@ void vp9_short_fht8x8_avx2(const int16_t *input, int16_t *output,
   load_buffer_8x8_avx2(input, in, stride);
   switch (tx_type) {
     case 0:  // DCT_DCT
-      fdct8_1d_avx2(in);
-      fdct8_1d_avx2(in);
+      fdct8_avx2(in);
+      fdct8_avx2(in);
       break;
     case 1:  // ADST_DCT
-      fadst8_1d_avx2(in);
-      fdct8_1d_avx2(in);
+      fadst8_avx2(in);
+      fdct8_avx2(in);
       break;
     case 2:  // DCT_ADST
-      fdct8_1d_avx2(in);
-      fadst8_1d_avx2(in);
+      fdct8_avx2(in);
+      fadst8_avx2(in);
       break;
     case 3:  // ADST_ADST
-      fadst8_1d_avx2(in);
-      fadst8_1d_avx2(in);
+      fadst8_avx2(in);
+      fadst8_avx2(in);
       break;
     default:
       assert(0);
@@ -1216,7 +1216,7 @@ void vp9_fdct16x16_avx2(const int16_t *input, int16_t *output, int stride) {
       step1_6 = _mm_sub_epi16(in01, in14);
       step1_7 = _mm_sub_epi16(in00, in15);
     }
-    // Work on the first eight values; fdct8_1d(input, even_results);
+    // Work on the first eight values; fdct8(input, even_results);
     {
       // Add/substract
       const __m128i q0 = _mm_add_epi16(input0, input7);
@@ -1730,7 +1730,7 @@ static INLINE void right_shift_16x16_avx2(__m128i *res0, __m128i *res1) {
   right_shift_8x8_avx2(res1 + 8, 2);
 }
 
-void fdct16_1d_8col_avx2(__m128i *in) {
+void fdct16_8col_avx2(__m128i *in) {
   // perform 16x16 1-D DCT for 8 columns
   __m128i i[8], s[8], p[8], t[8], u[16], v[16];
   const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
@@ -2052,7 +2052,7 @@ void fdct16_1d_8col_avx2(__m128i *in) {
   in[15] = _mm_packs_epi32(v[14], v[15]);
 }
 
-void fadst16_1d_8col_avx2(__m128i *in) {
+void fadst16_8col_avx2(__m128i *in) {
   // perform 16x16 1-D ADST for 8 columns
   __m128i s[16], x[16], u[32], v[32];
   const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64);
@@ -2522,15 +2522,15 @@ void fadst16_1d_8col_avx2(__m128i *in) {
   in[15] = _mm_sub_epi16(kZero, s[1]);
 }
 
-void fdct16_1d_avx2(__m128i *in0, __m128i *in1) {
-  fdct16_1d_8col_avx2(in0);
-  fdct16_1d_8col_avx2(in1);
+void fdct16_avx2(__m128i *in0, __m128i *in1) {
+  fdct16_8col_avx2(in0);
+  fdct16_8col_avx2(in1);
   array_transpose_16x16_avx2(in0, in1);
 }
 
-void fadst16_1d_avx2(__m128i *in0, __m128i *in1) {
-  fadst16_1d_8col_avx2(in0);
-  fadst16_1d_8col_avx2(in1);
+void fadst16_avx2(__m128i *in0, __m128i *in1) {
+  fadst16_8col_avx2(in0);
+  fadst16_8col_avx2(in1);
   array_transpose_16x16_avx2(in0, in1);
 }
@@ -2540,24 +2540,24 @@ void vp9_short_fht16x16_avx2(const int16_t *input, int16_t *output,
   load_buffer_16x16_avx2(input, in0, in1, stride);
   switch (tx_type) {
     case 0:  // DCT_DCT
-      fdct16_1d_avx2(in0, in1);
+      fdct16_avx2(in0, in1);
       right_shift_16x16_avx2(in0, in1);
-      fdct16_1d_avx2(in0, in1);
+      fdct16_avx2(in0, in1);
       break;
     case 1:  // ADST_DCT
-      fadst16_1d_avx2(in0, in1);
+      fadst16_avx2(in0, in1);
       right_shift_16x16_avx2(in0, in1);
-      fdct16_1d_avx2(in0, in1);
+      fdct16_avx2(in0, in1);
       break;
     case 2:  // DCT_ADST
-      fdct16_1d_avx2(in0, in1);
+      fdct16_avx2(in0, in1);
       right_shift_16x16_avx2(in0, in1);
-      fadst16_1d_avx2(in0, in1);
+      fadst16_avx2(in0, in1);
       break;
     case 3:  // ADST_ADST
-      fadst16_1d_avx2(in0, in1);
+      fadst16_avx2(in0, in1);
       right_shift_16x16_avx2(in0, in1);
-      fadst16_1d_avx2(in0, in1);
+      fadst16_avx2(in0, in1);
       break;
     default:
       assert(0);
diff --git a/vp9/encoder/x86/vp9_dct_sse2.c b/vp9/encoder/x86/vp9_dct_sse2.c
index 65431bdbf..c876cc273 100644
--- a/vp9/encoder/x86/vp9_dct_sse2.c
+++ b/vp9/encoder/x86/vp9_dct_sse2.c
@@ -161,7 +161,7 @@ static INLINE void transpose_4x4(__m128i *res) {
   res[3] = _mm_unpackhi_epi64(res[2], res[2]);
 }
 
-void fdct4_1d_sse2(__m128i *in) {
+void fdct4_sse2(__m128i *in) {
   const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
   const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
   const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
@@ -194,7 +194,7 @@ void fdct4_1d_sse2(__m128i *in) {
   transpose_4x4(in);
 }
 
-void fadst4_1d_sse2(__m128i *in) {
+void fadst4_sse2(__m128i *in) {
   const __m128i k__sinpi_p01_p02 = pair_set_epi16(sinpi_1_9, sinpi_2_9);
   const __m128i k__sinpi_p04_m01 = pair_set_epi16(sinpi_4_9, -sinpi_1_9);
   const __m128i k__sinpi_p03_p04 = pair_set_epi16(sinpi_3_9, sinpi_4_9);
@@ -248,20 +248,20 @@ void vp9_short_fht4x4_sse2(const int16_t *input, int16_t *output,
   load_buffer_4x4(input, in, stride);
   switch (tx_type) {
     case 0:  // DCT_DCT
-      fdct4_1d_sse2(in);
-      fdct4_1d_sse2(in);
+      fdct4_sse2(in);
+      fdct4_sse2(in);
       break;
     case 1:  // ADST_DCT
-      fadst4_1d_sse2(in);
-      fdct4_1d_sse2(in);
+      fadst4_sse2(in);
+      fdct4_sse2(in);
       break;
     case 2:  // DCT_ADST
-      fdct4_1d_sse2(in);
-      fadst4_1d_sse2(in);
+      fdct4_sse2(in);
+      fadst4_sse2(in);
       break;
     case 3:  // ADST_ADST
-      fadst4_1d_sse2(in);
-      fadst4_1d_sse2(in);
+      fadst4_sse2(in);
+      fadst4_sse2(in);
       break;
     default:
       assert(0);
@@ -656,7 +656,7 @@ static INLINE void array_transpose_8x8(__m128i *in, __m128i *res) {
   // 07 17 27 37 47 57 67 77
 }
 
-void fdct8_1d_sse2(__m128i *in) {
+void fdct8_sse2(__m128i *in) {
   // constants
   const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
   const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
@@ -796,7 +796,7 @@ void fdct8_1d_sse2(__m128i *in) {
   array_transpose_8x8(in, in);
 }
 
-void fadst8_1d_sse2(__m128i *in) {
+void fadst8_sse2(__m128i *in) {
   // Constants
   const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
   const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
@@ -1032,20 +1032,20 @@ void vp9_short_fht8x8_sse2(const int16_t *input, int16_t *output,
   load_buffer_8x8(input, in, stride);
   switch (tx_type) {
     case 0:  // DCT_DCT
-      fdct8_1d_sse2(in);
-      fdct8_1d_sse2(in);
+      fdct8_sse2(in);
+      fdct8_sse2(in);
       break;
     case 1:  // ADST_DCT
-      fadst8_1d_sse2(in);
-      fdct8_1d_sse2(in);
+      fadst8_sse2(in);
+      fdct8_sse2(in);
       break;
     case 2:  // DCT_ADST
-      fdct8_1d_sse2(in);
-      fadst8_1d_sse2(in);
+      fdct8_sse2(in);
+      fadst8_sse2(in);
       break;
     case 3:  // ADST_ADST
-      fadst8_1d_sse2(in);
-      fadst8_1d_sse2(in);
+      fadst8_sse2(in);
+      fadst8_sse2(in);
       break;
     default:
       assert(0);
@@ -1214,7 +1214,7 @@ void vp9_fdct16x16_sse2(const int16_t *input, int16_t *output, int stride) {
      step1_6 = _mm_sub_epi16(in01, in14);
      step1_7 = _mm_sub_epi16(in00, in15);
     }
-    // Work on the first eight values; fdct8_1d(input, even_results);
+    // Work on the first eight values; fdct8(input, even_results);
     {
       // Add/substract
       const __m128i q0 = _mm_add_epi16(input0, input7);
@@ -1728,7 +1728,7 @@ static INLINE void right_shift_16x16(__m128i *res0, __m128i *res1) {
   right_shift_8x8(res1 + 8, 2);
 }
 
-void fdct16_1d_8col(__m128i *in) {
+void fdct16_8col(__m128i *in) {
   // perform 16x16 1-D DCT for 8 columns
   __m128i i[8], s[8], p[8], t[8], u[16], v[16];
   const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
@@ -2050,7 +2050,7 @@ void fdct16_1d_8col(__m128i *in) {
   in[15] = _mm_packs_epi32(v[14], v[15]);
 }
 
-void fadst16_1d_8col(__m128i *in) {
+void fadst16_8col(__m128i *in) {
   // perform 16x16 1-D ADST for 8 columns
   __m128i s[16], x[16], u[32], v[32];
   const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64);
@@ -2520,15 +2520,15 @@ void fadst16_1d_8col(__m128i *in) {
   in[15] = _mm_sub_epi16(kZero, s[1]);
 }
 
-void fdct16_1d_sse2(__m128i *in0, __m128i *in1) {
-  fdct16_1d_8col(in0);
-  fdct16_1d_8col(in1);
+void fdct16_sse2(__m128i *in0, __m128i *in1) {
+  fdct16_8col(in0);
+  fdct16_8col(in1);
   array_transpose_16x16(in0, in1);
 }
 
-void fadst16_1d_sse2(__m128i *in0, __m128i *in1) {
-  fadst16_1d_8col(in0);
-  fadst16_1d_8col(in1);
+void fadst16_sse2(__m128i *in0, __m128i *in1) {
+  fadst16_8col(in0);
+  fadst16_8col(in1);
   array_transpose_16x16(in0, in1);
 }
@@ -2538,24 +2538,24 @@ void vp9_short_fht16x16_sse2(const int16_t *input, int16_t *output,
   load_buffer_16x16(input, in0, in1, stride);
   switch (tx_type) {
     case 0:  // DCT_DCT
-      fdct16_1d_sse2(in0, in1);
+      fdct16_sse2(in0, in1);
       right_shift_16x16(in0, in1);
-      fdct16_1d_sse2(in0, in1);
+      fdct16_sse2(in0, in1);
       break;
     case 1:  // ADST_DCT
-      fadst16_1d_sse2(in0, in1);
+      fadst16_sse2(in0, in1);
       right_shift_16x16(in0, in1);
-      fdct16_1d_sse2(in0, in1);
+      fdct16_sse2(in0, in1);
       break;
     case 2:  // DCT_ADST
-      fdct16_1d_sse2(in0, in1);
+      fdct16_sse2(in0, in1);
       right_shift_16x16(in0, in1);
-      fadst16_1d_sse2(in0, in1);
+      fadst16_sse2(in0, in1);
       break;
    case 3:  // ADST_ADST
-      fadst16_1d_sse2(in0, in1);
+      fadst16_sse2(in0, in1);
       right_shift_16x16(in0, in1);
-      fadst16_1d_sse2(in0, in1);
+      fadst16_sse2(in0, in1);
       break;
     default:
       assert(0);
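Note on the rename: every `idctN`/`iadstN` (and `fdctN`/`fadstN`) helper touched above is a 1-D transform that the 2-D wrappers apply twice, once over rows and once over columns, so the `_1d` suffix carried no information and could be dropped without loss. A minimal, self-contained sketch of that separable row/column pattern, modeled on `vp9_idct4x4_16_add_c` from this patch; the driver name `inv_txfm_4x4_add` and the generic `transform_1d` callback are illustrative stand-ins, not part of the patch:

#include <stdint.h>

/* Any 4-point 1-D transform, e.g. idct4 or iadst4 above. */
typedef void (*transform_1d)(const int16_t *input, int16_t *output);

/* Same rounding used by the wrappers in vp9_idct.c. */
#define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n) - 1))) >> (n))

static uint8_t clip_pixel(int val) {
  return (uint8_t)(val > 255 ? 255 : (val < 0 ? 0 : val));
}

/* Separable 2-D inverse transform: run the 1-D transform over the 4 rows,
 * then over the 4 columns, round by 2^4, and add into the prediction. */
static void inv_txfm_4x4_add(const int16_t *input, uint8_t *dest, int stride,
                             transform_1d row_txfm, transform_1d col_txfm) {
  int16_t out[4 * 4];
  int16_t *outptr = out;
  int16_t temp_in[4], temp_out[4];
  int i, j;

  // First pass: rows
  for (i = 0; i < 4; ++i) {
    row_txfm(input, outptr);
    input += 4;
    outptr += 4;
  }

  // Second pass: columns, then round/shift and reconstruct
  for (i = 0; i < 4; ++i) {
    for (j = 0; j < 4; ++j)
      temp_in[j] = out[j * 4 + i];
    col_txfm(temp_in, temp_out);
    for (j = 0; j < 4; ++j)
      dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 4)
                                        + dest[j * stride + i]);
  }
}

Pairing the two 1-D passes per `tx_type` is exactly what the `IHT_4`/`IHT_8`/`IHT_16` tables in the patch encode: DCT_DCT, ADST_DCT, DCT_ADST, and ADST_ADST simply select which of the renamed 1-D helpers runs in each direction.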