diff --git a/vpx_dsp/x86/highbd_idct16x16_add_sse2.c b/vpx_dsp/x86/highbd_idct16x16_add_sse2.c index 3d1a43dc8..c5d82d9ed 100644 --- a/vpx_dsp/x86/highbd_idct16x16_add_sse2.c +++ b/vpx_dsp/x86/highbd_idct16x16_add_sse2.c @@ -106,20 +106,20 @@ static INLINE void highbd_idct16x16_38_4col(__m128i *const io /*io[16]*/) { __m128i temp1[2], sign[2]; // stage 2 - highbd_multiplication_sse2(io[1], (int)cospi_30_64, (int)cospi_2_64, - &step2[8], &step2[15]); - highbd_multiplication_neg_sse2(io[7], (int)cospi_14_64, (int)cospi_18_64, - &step2[9], &step2[14]); - highbd_multiplication_sse2(io[5], (int)cospi_22_64, (int)cospi_10_64, - &step2[10], &step2[13]); - highbd_multiplication_neg_sse2(io[3], (int)cospi_6_64, (int)cospi_26_64, - &step2[11], &step2[12]); + highbd_partial_butterfly_sse2(io[1], (int)cospi_30_64, (int)cospi_2_64, + &step2[8], &step2[15]); + highbd_partial_butterfly_neg_sse2(io[7], (int)cospi_14_64, (int)cospi_18_64, + &step2[9], &step2[14]); + highbd_partial_butterfly_sse2(io[5], (int)cospi_22_64, (int)cospi_10_64, + &step2[10], &step2[13]); + highbd_partial_butterfly_neg_sse2(io[3], (int)cospi_6_64, (int)cospi_26_64, + &step2[11], &step2[12]); // stage 3 - highbd_multiplication_sse2(io[2], (int)cospi_28_64, (int)cospi_4_64, - &step1[4], &step1[7]); - highbd_multiplication_neg_sse2(io[6], (int)cospi_12_64, (int)cospi_20_64, - &step1[5], &step1[6]); + highbd_partial_butterfly_sse2(io[2], (int)cospi_28_64, (int)cospi_4_64, + &step1[4], &step1[7]); + highbd_partial_butterfly_neg_sse2(io[6], (int)cospi_12_64, (int)cospi_20_64, + &step1[5], &step1[6]); step1[8] = _mm_add_epi32(step2[8], step2[9]); step1[9] = _mm_sub_epi32(step2[8], step2[9]); step1[10] = _mm_sub_epi32(step2[10], step2[11]); // step1[10] = -step1[10] @@ -133,8 +133,8 @@ static INLINE void highbd_idct16x16_38_4col(__m128i *const io /*io[16]*/) { abs_extend_64bit_sse2(io[0], temp1, sign); step2[0] = multiplication_round_shift_sse2(temp1, sign, (int)cospi_16_64); step2[1] = step2[0]; - highbd_multiplication_sse2(io[4], (int)cospi_24_64, (int)cospi_8_64, - &step2[2], &step2[3]); + highbd_partial_butterfly_sse2(io[4], (int)cospi_24_64, (int)cospi_8_64, + &step2[2], &step2[3]); highbd_butterfly_sse2(step1[14], step1[9], (int)cospi_24_64, (int)cospi_8_64, &step2[9], &step2[14]); highbd_butterfly_sse2(step1[10], step1[13], (int)cospi_8_64, (int)cospi_24_64, @@ -158,14 +158,14 @@ static INLINE void highbd_idct16x16_10_4col(__m128i *const io /*io[16]*/) { __m128i temp[2], sign[2]; // stage 2 - highbd_multiplication_sse2(io[1], (int)cospi_30_64, (int)cospi_2_64, - &step2[8], &step2[15]); - highbd_multiplication_neg_sse2(io[3], (int)cospi_6_64, (int)cospi_26_64, - &step2[11], &step2[12]); + highbd_partial_butterfly_sse2(io[1], (int)cospi_30_64, (int)cospi_2_64, + &step2[8], &step2[15]); + highbd_partial_butterfly_neg_sse2(io[3], (int)cospi_6_64, (int)cospi_26_64, + &step2[11], &step2[12]); // stage 3 - highbd_multiplication_sse2(io[2], (int)cospi_28_64, (int)cospi_4_64, - &step1[4], &step1[7]); + highbd_partial_butterfly_sse2(io[2], (int)cospi_28_64, (int)cospi_4_64, + &step1[4], &step1[7]); step1[8] = step2[8]; step1[9] = step2[8]; step1[10] = @@ -209,25 +209,8 @@ void vpx_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint16_t *dest, in = l; for (i = 0; i < 2; i++) { - in[0] = load_pack_8_32bit(input + 0 * 16); - in[1] = load_pack_8_32bit(input + 1 * 16); - in[2] = load_pack_8_32bit(input + 2 * 16); - in[3] = load_pack_8_32bit(input + 3 * 16); - in[4] = load_pack_8_32bit(input + 4 * 16); - in[5] = load_pack_8_32bit(input + 5 * 16); - in[6] = load_pack_8_32bit(input + 6 * 16); - in[7] = load_pack_8_32bit(input + 7 * 16); - transpose_16bit_8x8(in, in); - - in[8] = load_pack_8_32bit(input + 0 * 16 + 8); - in[9] = load_pack_8_32bit(input + 1 * 16 + 8); - in[10] = load_pack_8_32bit(input + 2 * 16 + 8); - in[11] = load_pack_8_32bit(input + 3 * 16 + 8); - in[12] = load_pack_8_32bit(input + 4 * 16 + 8); - in[13] = load_pack_8_32bit(input + 5 * 16 + 8); - in[14] = load_pack_8_32bit(input + 6 * 16 + 8); - in[15] = load_pack_8_32bit(input + 7 * 16 + 8); - transpose_16bit_8x8(in + 8, in + 8); + highbd_load_pack_transpose_32bit_8x8(&input[0], 16, &in[0]); + highbd_load_pack_transpose_32bit_8x8(&input[8], 16, &in[8]); idct16_8col(in, in); in = r; input += 128; @@ -249,52 +232,18 @@ void vpx_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint16_t *dest, for (i = 0; i < 4; i++) { in = all[i]; - in[0] = _mm_load_si128((const __m128i *)(input + 0 * 16 + 0)); - in[1] = _mm_load_si128((const __m128i *)(input + 0 * 16 + 4)); - in[2] = _mm_load_si128((const __m128i *)(input + 1 * 16 + 0)); - in[3] = _mm_load_si128((const __m128i *)(input + 1 * 16 + 4)); - in[4] = _mm_load_si128((const __m128i *)(input + 2 * 16 + 0)); - in[5] = _mm_load_si128((const __m128i *)(input + 2 * 16 + 4)); - in[6] = _mm_load_si128((const __m128i *)(input + 3 * 16 + 0)); - in[7] = _mm_load_si128((const __m128i *)(input + 3 * 16 + 4)); - transpose_32bit_8x4(in, in); - - in[8] = _mm_load_si128((const __m128i *)(input + 0 * 16 + 8)); - in[9] = _mm_load_si128((const __m128i *)(input + 0 * 16 + 12)); - in[10] = _mm_load_si128((const __m128i *)(input + 1 * 16 + 8)); - in[11] = _mm_load_si128((const __m128i *)(input + 1 * 16 + 12)); - in[12] = _mm_load_si128((const __m128i *)(input + 2 * 16 + 8)); - in[13] = _mm_load_si128((const __m128i *)(input + 2 * 16 + 12)); - in[14] = _mm_load_si128((const __m128i *)(input + 3 * 16 + 8)); - in[15] = _mm_load_si128((const __m128i *)(input + 3 * 16 + 12)); - transpose_32bit_8x4(in + 8, in + 8); - + highbd_load_transpose_32bit_8x4(&input[0], 16, &in[0]); + highbd_load_transpose_32bit_8x4(&input[8], 16, &in[8]); highbd_idct16_4col(in); input += 4 * 16; } for (i = 0; i < 16; i += 4) { int j; - out[0] = all[0][i + 0]; - out[1] = all[1][i + 0]; - out[2] = all[0][i + 1]; - out[3] = all[1][i + 1]; - out[4] = all[0][i + 2]; - out[5] = all[1][i + 2]; - out[6] = all[0][i + 3]; - out[7] = all[1][i + 3]; - transpose_32bit_8x4(out, out); - - out[8] = all[2][i + 0]; - out[9] = all[3][i + 0]; - out[10] = all[2][i + 1]; - out[11] = all[3][i + 1]; - out[12] = all[2][i + 2]; - out[13] = all[3][i + 2]; - out[14] = all[2][i + 3]; - out[15] = all[3][i + 3]; - transpose_32bit_8x4(out + 8, out + 8); - + transpose_32bit_4x4(all[0] + i, out + 0); + transpose_32bit_4x4(all[1] + i, out + 4); + transpose_32bit_4x4(all[2] + i, out + 8); + transpose_32bit_4x4(all[3] + i, out + 12); highbd_idct16_4col(out); for (j = 0; j < 16; ++j) { @@ -313,16 +262,7 @@ void vpx_highbd_idct16x16_38_add_sse2(const tran_low_t *input, uint16_t *dest, if (bd == 8) { __m128i in[16], temp[16]; - in[0] = load_pack_8_32bit(input + 0 * 16); - in[1] = load_pack_8_32bit(input + 1 * 16); - in[2] = load_pack_8_32bit(input + 2 * 16); - in[3] = load_pack_8_32bit(input + 3 * 16); - in[4] = load_pack_8_32bit(input + 4 * 16); - in[5] = load_pack_8_32bit(input + 5 * 16); - in[6] = load_pack_8_32bit(input + 6 * 16); - in[7] = load_pack_8_32bit(input + 7 * 16); - transpose_16bit_8x8(in, in); - + highbd_load_pack_transpose_32bit_8x8(input, 16, in); for (i = 8; i < 16; i++) { in[i] = _mm_setzero_si128(); } @@ -343,30 +283,15 @@ void vpx_highbd_idct16x16_38_add_sse2(const tran_low_t *input, uint16_t *dest, for (i = 0; i < 2; i++) { in = all[i]; - in[0] = _mm_load_si128((const __m128i *)(input + 0 * 16 + 0)); - in[1] = _mm_load_si128((const __m128i *)(input + 0 * 16 + 4)); - in[2] = _mm_load_si128((const __m128i *)(input + 1 * 16 + 0)); - in[3] = _mm_load_si128((const __m128i *)(input + 1 * 16 + 4)); - in[4] = _mm_load_si128((const __m128i *)(input + 2 * 16 + 0)); - in[5] = _mm_load_si128((const __m128i *)(input + 2 * 16 + 4)); - in[6] = _mm_load_si128((const __m128i *)(input + 3 * 16 + 0)); - in[7] = _mm_load_si128((const __m128i *)(input + 3 * 16 + 4)); - transpose_32bit_8x4(in, in); + highbd_load_transpose_32bit_8x4(input, 16, in); highbd_idct16x16_38_4col(in); input += 4 * 16; } for (i = 0; i < 16; i += 4) { int j; - out[0] = all[0][i + 0]; - out[1] = all[1][i + 0]; - out[2] = all[0][i + 1]; - out[3] = all[1][i + 1]; - out[4] = all[0][i + 2]; - out[5] = all[1][i + 2]; - out[6] = all[0][i + 3]; - out[7] = all[1][i + 3]; - transpose_32bit_8x4(out, out); + transpose_32bit_4x4(all[0] + i, out + 0); + transpose_32bit_4x4(all[1] + i, out + 4); highbd_idct16x16_38_4col(out); for (j = 0; j < 16; ++j) { @@ -406,11 +331,7 @@ void vpx_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint16_t *dest, for (i = 0; i < 2; i++) { in = all[i]; - in[0] = _mm_load_si128((const __m128i *)(input + 0 * 16)); - in[1] = _mm_load_si128((const __m128i *)(input + 1 * 16)); - in[2] = _mm_load_si128((const __m128i *)(input + 2 * 16)); - in[3] = _mm_load_si128((const __m128i *)(input + 3 * 16)); - transpose_32bit_4x4(in, in); + highbd_load_transpose_32bit_4x4(input, 16, in); highbd_idct16x16_10_4col(in); input += 4 * 16; } diff --git a/vpx_dsp/x86/highbd_idct16x16_add_sse4.c b/vpx_dsp/x86/highbd_idct16x16_add_sse4.c index 6f5f66394..4c5452cc4 100644 --- a/vpx_dsp/x86/highbd_idct16x16_add_sse4.c +++ b/vpx_dsp/x86/highbd_idct16x16_add_sse4.c @@ -107,20 +107,20 @@ static INLINE void highbd_idct16x16_38_4col(__m128i *const io /*io[16]*/) { __m128i temp1[2]; // stage 2 - highbd_multiplication_sse4_1(io[1], (int)cospi_30_64, (int)cospi_2_64, - &step2[8], &step2[15]); - highbd_multiplication_sse4_1(io[7], -(int)cospi_18_64, (int)cospi_14_64, - &step2[9], &step2[14]); - highbd_multiplication_sse4_1(io[5], (int)cospi_22_64, (int)cospi_10_64, - &step2[10], &step2[13]); - highbd_multiplication_sse4_1(io[3], -(int)cospi_26_64, (int)cospi_6_64, - &step2[11], &step2[12]); + highbd_partial_butterfly_sse4_1(io[1], (int)cospi_30_64, (int)cospi_2_64, + &step2[8], &step2[15]); + highbd_partial_butterfly_sse4_1(io[7], -(int)cospi_18_64, (int)cospi_14_64, + &step2[9], &step2[14]); + highbd_partial_butterfly_sse4_1(io[5], (int)cospi_22_64, (int)cospi_10_64, + &step2[10], &step2[13]); + highbd_partial_butterfly_sse4_1(io[3], -(int)cospi_26_64, (int)cospi_6_64, + &step2[11], &step2[12]); // stage 3 - highbd_multiplication_sse4_1(io[2], (int)cospi_28_64, (int)cospi_4_64, - &step1[4], &step1[7]); - highbd_multiplication_sse4_1(io[6], -(int)cospi_20_64, (int)cospi_12_64, - &step1[5], &step1[6]); + highbd_partial_butterfly_sse4_1(io[2], (int)cospi_28_64, (int)cospi_4_64, + &step1[4], &step1[7]); + highbd_partial_butterfly_sse4_1(io[6], -(int)cospi_20_64, (int)cospi_12_64, + &step1[5], &step1[6]); step1[8] = _mm_add_epi32(step2[8], step2[9]); step1[9] = _mm_sub_epi32(step2[8], step2[9]); step1[10] = _mm_sub_epi32(step2[11], step2[10]); @@ -134,8 +134,8 @@ static INLINE void highbd_idct16x16_38_4col(__m128i *const io /*io[16]*/) { extend_64bit(io[0], temp1); step2[0] = multiplication_round_shift_sse4_1(temp1, (int)cospi_16_64); step2[1] = step2[0]; - highbd_multiplication_sse4_1(io[4], (int)cospi_24_64, (int)cospi_8_64, - &step2[2], &step2[3]); + highbd_partial_butterfly_sse4_1(io[4], (int)cospi_24_64, (int)cospi_8_64, + &step2[2], &step2[3]); highbd_butterfly_sse4_1(step1[14], step1[9], (int)cospi_24_64, (int)cospi_8_64, &step2[9], &step2[14]); highbd_butterfly_sse4_1(step1[10], step1[13], -(int)cospi_8_64, @@ -159,14 +159,14 @@ static INLINE void highbd_idct16x16_10_4col(__m128i *const io /*io[16]*/) { __m128i temp[2]; // stage 2 - highbd_multiplication_sse4_1(io[1], (int)cospi_30_64, (int)cospi_2_64, - &step2[8], &step2[15]); - highbd_multiplication_sse4_1(io[3], -(int)cospi_26_64, (int)cospi_6_64, - &step2[11], &step2[12]); + highbd_partial_butterfly_sse4_1(io[1], (int)cospi_30_64, (int)cospi_2_64, + &step2[8], &step2[15]); + highbd_partial_butterfly_sse4_1(io[3], -(int)cospi_26_64, (int)cospi_6_64, + &step2[11], &step2[12]); // stage 3 - highbd_multiplication_sse4_1(io[2], (int)cospi_28_64, (int)cospi_4_64, - &step1[4], &step1[7]); + highbd_partial_butterfly_sse4_1(io[2], (int)cospi_28_64, (int)cospi_4_64, + &step1[4], &step1[7]); step1[8] = step2[8]; step1[9] = step2[8]; step1[10] = step2[11]; @@ -208,25 +208,8 @@ void vpx_highbd_idct16x16_256_add_sse4_1(const tran_low_t *input, in = l; for (i = 0; i < 2; i++) { - in[0] = load_pack_8_32bit(input + 0 * 16); - in[1] = load_pack_8_32bit(input + 1 * 16); - in[2] = load_pack_8_32bit(input + 2 * 16); - in[3] = load_pack_8_32bit(input + 3 * 16); - in[4] = load_pack_8_32bit(input + 4 * 16); - in[5] = load_pack_8_32bit(input + 5 * 16); - in[6] = load_pack_8_32bit(input + 6 * 16); - in[7] = load_pack_8_32bit(input + 7 * 16); - transpose_16bit_8x8(in, in); - - in[8] = load_pack_8_32bit(input + 0 * 16 + 8); - in[9] = load_pack_8_32bit(input + 1 * 16 + 8); - in[10] = load_pack_8_32bit(input + 2 * 16 + 8); - in[11] = load_pack_8_32bit(input + 3 * 16 + 8); - in[12] = load_pack_8_32bit(input + 4 * 16 + 8); - in[13] = load_pack_8_32bit(input + 5 * 16 + 8); - in[14] = load_pack_8_32bit(input + 6 * 16 + 8); - in[15] = load_pack_8_32bit(input + 7 * 16 + 8); - transpose_16bit_8x8(in + 8, in + 8); + highbd_load_pack_transpose_32bit_8x8(&input[0], 16, &in[0]); + highbd_load_pack_transpose_32bit_8x8(&input[8], 16, &in[8]); idct16_8col(in, in); in = r; input += 128; @@ -248,52 +231,18 @@ void vpx_highbd_idct16x16_256_add_sse4_1(const tran_low_t *input, for (i = 0; i < 4; i++) { in = all[i]; - in[0] = _mm_load_si128((const __m128i *)(input + 0 * 16 + 0)); - in[1] = _mm_load_si128((const __m128i *)(input + 0 * 16 + 4)); - in[2] = _mm_load_si128((const __m128i *)(input + 1 * 16 + 0)); - in[3] = _mm_load_si128((const __m128i *)(input + 1 * 16 + 4)); - in[4] = _mm_load_si128((const __m128i *)(input + 2 * 16 + 0)); - in[5] = _mm_load_si128((const __m128i *)(input + 2 * 16 + 4)); - in[6] = _mm_load_si128((const __m128i *)(input + 3 * 16 + 0)); - in[7] = _mm_load_si128((const __m128i *)(input + 3 * 16 + 4)); - transpose_32bit_8x4(in, in); - - in[8] = _mm_load_si128((const __m128i *)(input + 0 * 16 + 8)); - in[9] = _mm_load_si128((const __m128i *)(input + 0 * 16 + 12)); - in[10] = _mm_load_si128((const __m128i *)(input + 1 * 16 + 8)); - in[11] = _mm_load_si128((const __m128i *)(input + 1 * 16 + 12)); - in[12] = _mm_load_si128((const __m128i *)(input + 2 * 16 + 8)); - in[13] = _mm_load_si128((const __m128i *)(input + 2 * 16 + 12)); - in[14] = _mm_load_si128((const __m128i *)(input + 3 * 16 + 8)); - in[15] = _mm_load_si128((const __m128i *)(input + 3 * 16 + 12)); - transpose_32bit_8x4(in + 8, in + 8); - + highbd_load_transpose_32bit_8x4(&input[0], 16, &in[0]); + highbd_load_transpose_32bit_8x4(&input[8], 16, &in[8]); highbd_idct16_4col(in); input += 4 * 16; } for (i = 0; i < 16; i += 4) { int j; - out[0] = all[0][i + 0]; - out[1] = all[1][i + 0]; - out[2] = all[0][i + 1]; - out[3] = all[1][i + 1]; - out[4] = all[0][i + 2]; - out[5] = all[1][i + 2]; - out[6] = all[0][i + 3]; - out[7] = all[1][i + 3]; - transpose_32bit_8x4(out, out); - - out[8] = all[2][i + 0]; - out[9] = all[3][i + 0]; - out[10] = all[2][i + 1]; - out[11] = all[3][i + 1]; - out[12] = all[2][i + 2]; - out[13] = all[3][i + 2]; - out[14] = all[2][i + 3]; - out[15] = all[3][i + 3]; - transpose_32bit_8x4(out + 8, out + 8); - + transpose_32bit_4x4(all[0] + i, out + 0); + transpose_32bit_4x4(all[1] + i, out + 4); + transpose_32bit_4x4(all[2] + i, out + 8); + transpose_32bit_4x4(all[3] + i, out + 12); highbd_idct16_4col(out); for (j = 0; j < 16; ++j) { @@ -312,16 +261,7 @@ void vpx_highbd_idct16x16_38_add_sse4_1(const tran_low_t *input, uint16_t *dest, if (bd == 8) { __m128i in[16], temp[16]; - in[0] = load_pack_8_32bit(input + 0 * 16); - in[1] = load_pack_8_32bit(input + 1 * 16); - in[2] = load_pack_8_32bit(input + 2 * 16); - in[3] = load_pack_8_32bit(input + 3 * 16); - in[4] = load_pack_8_32bit(input + 4 * 16); - in[5] = load_pack_8_32bit(input + 5 * 16); - in[6] = load_pack_8_32bit(input + 6 * 16); - in[7] = load_pack_8_32bit(input + 7 * 16); - transpose_16bit_8x8(in, in); - + highbd_load_pack_transpose_32bit_8x8(&input[0], 16, &in[0]); for (i = 8; i < 16; i++) { in[i] = _mm_setzero_si128(); } @@ -342,30 +282,15 @@ void vpx_highbd_idct16x16_38_add_sse4_1(const tran_low_t *input, uint16_t *dest, for (i = 0; i < 2; i++) { in = all[i]; - in[0] = _mm_load_si128((const __m128i *)(input + 0 * 16 + 0)); - in[1] = _mm_load_si128((const __m128i *)(input + 0 * 16 + 4)); - in[2] = _mm_load_si128((const __m128i *)(input + 1 * 16 + 0)); - in[3] = _mm_load_si128((const __m128i *)(input + 1 * 16 + 4)); - in[4] = _mm_load_si128((const __m128i *)(input + 2 * 16 + 0)); - in[5] = _mm_load_si128((const __m128i *)(input + 2 * 16 + 4)); - in[6] = _mm_load_si128((const __m128i *)(input + 3 * 16 + 0)); - in[7] = _mm_load_si128((const __m128i *)(input + 3 * 16 + 4)); - transpose_32bit_8x4(in, in); + highbd_load_transpose_32bit_8x4(input, 16, in); highbd_idct16x16_38_4col(in); input += 4 * 16; } for (i = 0; i < 16; i += 4) { int j; - out[0] = all[0][i + 0]; - out[1] = all[1][i + 0]; - out[2] = all[0][i + 1]; - out[3] = all[1][i + 1]; - out[4] = all[0][i + 2]; - out[5] = all[1][i + 2]; - out[6] = all[0][i + 3]; - out[7] = all[1][i + 3]; - transpose_32bit_8x4(out, out); + transpose_32bit_4x4(all[0] + i, out + 0); + transpose_32bit_4x4(all[1] + i, out + 4); highbd_idct16x16_38_4col(out); for (j = 0; j < 16; ++j) { @@ -405,11 +330,7 @@ void vpx_highbd_idct16x16_10_add_sse4_1(const tran_low_t *input, uint16_t *dest, for (i = 0; i < 2; i++) { in = all[i]; - in[0] = _mm_load_si128((const __m128i *)(input + 0 * 16)); - in[1] = _mm_load_si128((const __m128i *)(input + 1 * 16)); - in[2] = _mm_load_si128((const __m128i *)(input + 2 * 16)); - in[3] = _mm_load_si128((const __m128i *)(input + 3 * 16)); - transpose_32bit_4x4(in, in); + highbd_load_transpose_32bit_4x4(input, 16, in); highbd_idct16x16_10_4col(in); input += 4 * 16; } diff --git a/vpx_dsp/x86/highbd_inv_txfm_sse2.h b/vpx_dsp/x86/highbd_inv_txfm_sse2.h index 81187e0fd..c58e49c02 100644 --- a/vpx_dsp/x86/highbd_inv_txfm_sse2.h +++ b/vpx_dsp/x86/highbd_inv_txfm_sse2.h @@ -16,6 +16,7 @@ #include "./vpx_config.h" #include "vpx/vpx_integer.h" #include "vpx_dsp/inv_txfm.h" +#include "vpx_dsp/x86/transpose_sse2.h" #include "vpx_dsp/x86/txfm_common_sse2.h" static INLINE void extend_64bit(const __m128i in, @@ -84,6 +85,7 @@ static INLINE __m128i multiplication_round_shift_sse2( const __m128i pair_c = pair_set_epi32(c << 2, 0); __m128i t0, t1; + assert(c >= 0); t0 = multiply_apply_sign_sse2(in[0], sign[0], pair_c); t1 = multiply_apply_sign_sse2(in[1], sign[1], pair_c); t0 = dct_const_round_shift_64bit(t0); @@ -99,6 +101,7 @@ static INLINE __m128i multiplication_neg_round_shift_sse2( const __m128i pair_c = pair_set_epi32(c << 2, 0); __m128i t0, t1; + assert(c >= 0); t0 = multiply_apply_sign_sse2(in[0], sign[0], pair_c); t1 = multiply_apply_sign_sse2(in[1], sign[1], pair_c); t0 = _mm_sub_epi64(_mm_setzero_si128(), t0); @@ -118,6 +121,8 @@ static INLINE void highbd_butterfly_sse2(const __m128i in0, const __m128i in1, const __m128i pair_c1 = pair_set_epi32(c1 << 2, 0); __m128i temp1[4], temp2[4], sign1[2], sign2[2]; + assert(c0 >= 0); + assert(c1 >= 0); abs_extend_64bit_sse2(in0, temp1, sign1); abs_extend_64bit_sse2(in1, temp2, sign2); temp1[2] = multiply_apply_sign_sse2(temp1[0], sign1[0], pair_c1); @@ -140,6 +145,34 @@ static INLINE void highbd_butterfly_sse2(const __m128i in0, const __m128i in1, *out1 = pack_4(temp2[0], temp2[1]); } +// Note: c0 and c1 must be non negative. +static INLINE void highbd_partial_butterfly_sse2(const __m128i in, const int c0, + const int c1, + __m128i *const out0, + __m128i *const out1) { + __m128i temp[2], sign[2]; + + assert(c0 >= 0); + assert(c1 >= 0); + abs_extend_64bit_sse2(in, temp, sign); + *out0 = multiplication_round_shift_sse2(temp, sign, c0); + *out1 = multiplication_round_shift_sse2(temp, sign, c1); +} + +// Note: c0 and c1 must be non negative. +static INLINE void highbd_partial_butterfly_neg_sse2(const __m128i in, + const int c0, const int c1, + __m128i *const out0, + __m128i *const out1) { + __m128i temp[2], sign[2]; + + assert(c0 >= 0); + assert(c1 >= 0); + abs_extend_64bit_sse2(in, temp, sign); + *out0 = multiplication_neg_round_shift_sse2(temp, sign, c1); + *out1 = multiplication_round_shift_sse2(temp, sign, c0); +} + static INLINE void highbd_butterfly_cospi16_sse2(const __m128i in0, const __m128i in1, __m128i *const out0, @@ -154,27 +187,17 @@ static INLINE void highbd_butterfly_cospi16_sse2(const __m128i in0, *out1 = multiplication_round_shift_sse2(temp1, sign, (int)cospi_16_64); } -// Note: c0 and c1 must be non negative. -static INLINE void highbd_multiplication_sse2(const __m128i in, const int c0, - const int c1, __m128i *const out0, - __m128i *const out1) { - __m128i temp[2], sign[2]; - - abs_extend_64bit_sse2(in, temp, sign); - *out0 = multiplication_round_shift_sse2(temp, sign, c0); - *out1 = multiplication_round_shift_sse2(temp, sign, c1); -} - -// Note: c0 and c1 must be non negative. -static INLINE void highbd_multiplication_neg_sse2(const __m128i in, - const int c0, const int c1, - __m128i *const out0, - __m128i *const out1) { - __m128i temp[2], sign[2]; - - abs_extend_64bit_sse2(in, temp, sign); - *out0 = multiplication_neg_round_shift_sse2(temp, sign, c1); - *out1 = multiplication_round_shift_sse2(temp, sign, c0); +// Only do addition and subtraction butterfly, size = 16, 32 +static INLINE void highbd_add_sub_butterfly(const __m128i *in, __m128i *out, + int size) { + int i = 0; + const int num = size >> 1; + const int bound = size - 1; + while (i < num) { + out[i] = _mm_add_epi32(in[i], in[bound - i]); + out[bound - i] = _mm_sub_epi32(in[i], in[bound - i]); + i++; + } } static INLINE void highbd_idct8_stage4(const __m128i *const in, @@ -313,6 +336,44 @@ static INLINE __m128i load_pack_8_32bit(const tran_low_t *const input) { return _mm_packs_epi32(t0, t1); } +static INLINE void highbd_load_pack_transpose_32bit_8x8(const tran_low_t *input, + const int stride, + __m128i *const in) { + in[0] = load_pack_8_32bit(input + 0 * stride); + in[1] = load_pack_8_32bit(input + 1 * stride); + in[2] = load_pack_8_32bit(input + 2 * stride); + in[3] = load_pack_8_32bit(input + 3 * stride); + in[4] = load_pack_8_32bit(input + 4 * stride); + in[5] = load_pack_8_32bit(input + 5 * stride); + in[6] = load_pack_8_32bit(input + 6 * stride); + in[7] = load_pack_8_32bit(input + 7 * stride); + transpose_16bit_8x8(in, in); +} + +static INLINE void highbd_load_transpose_32bit_8x4(const tran_low_t *input, + const int stride, + __m128i *in) { + in[0] = _mm_load_si128((const __m128i *)(input + 0 * stride + 0)); + in[1] = _mm_load_si128((const __m128i *)(input + 0 * stride + 4)); + in[2] = _mm_load_si128((const __m128i *)(input + 1 * stride + 0)); + in[3] = _mm_load_si128((const __m128i *)(input + 1 * stride + 4)); + in[4] = _mm_load_si128((const __m128i *)(input + 2 * stride + 0)); + in[5] = _mm_load_si128((const __m128i *)(input + 2 * stride + 4)); + in[6] = _mm_load_si128((const __m128i *)(input + 3 * stride + 0)); + in[7] = _mm_load_si128((const __m128i *)(input + 3 * stride + 4)); + transpose_32bit_8x4(in, in); +} + +static INLINE void highbd_load_transpose_32bit_4x4(const tran_low_t *input, + const int stride, + __m128i *in) { + in[0] = _mm_load_si128((const __m128i *)(input + 0 * stride)); + in[1] = _mm_load_si128((const __m128i *)(input + 1 * stride)); + in[2] = _mm_load_si128((const __m128i *)(input + 2 * stride)); + in[3] = _mm_load_si128((const __m128i *)(input + 3 * stride)); + transpose_32bit_4x4(in, in); +} + static INLINE void highbd_write_buffer_8(uint16_t *dest, const __m128i in, const int bd) { const __m128i final_rounding = _mm_set1_epi16(1 << 5); diff --git a/vpx_dsp/x86/highbd_inv_txfm_sse4.h b/vpx_dsp/x86/highbd_inv_txfm_sse4.h index 66af7c699..68344ff79 100644 --- a/vpx_dsp/x86/highbd_inv_txfm_sse4.h +++ b/vpx_dsp/x86/highbd_inv_txfm_sse4.h @@ -73,10 +73,10 @@ static INLINE void highbd_butterfly_cospi16_sse4_1(const __m128i in0, *out1 = multiplication_round_shift_sse4_1(temp1, (int)cospi_16_64); } -static INLINE void highbd_multiplication_sse4_1(const __m128i in, const int c0, - const int c1, - __m128i *const out0, - __m128i *const out1) { +static INLINE void highbd_partial_butterfly_sse4_1(const __m128i in, + const int c0, const int c1, + __m128i *const out0, + __m128i *const out1) { __m128i temp[2]; extend_64bit(in, temp);