Merge "Fold adds in 16->32-bit converts in SSE2/AVX2 fDCT"

This commit is contained in:
Kyle Siefring 2018-02-21 23:12:07 +00:00 committed by Gerrit Code Review
commit dccb8b45bb
2 changed files with 181 additions and 324 deletions

View File

@ -1374,59 +1374,37 @@ void FDCT32x32_2D_AVX2(const int16_t *input, int16_t *output_org, int stride) {
__m256i lstep1[64], lstep2[64], lstep3[64]; __m256i lstep1[64], lstep2[64], lstep3[64];
__m256i u[32], v[32], sign[16]; __m256i u[32], v[32], sign[16];
const __m256i K32One = _mm256_set_epi32(1, 1, 1, 1, 1, 1, 1, 1); const __m256i K32One = _mm256_set_epi32(1, 1, 1, 1, 1, 1, 1, 1);
const __m256i k__pOne_mOne = pair256_set_epi16(1, -1);
// start using 32-bit operations // start using 32-bit operations
// stage 3 // stage 3
{ {
// expanding to 32-bit length priori to addition operations // expanding to 32-bit length while adding and subtracting
lstep2[0] = _mm256_unpacklo_epi16(step2[0], kZero); lstep2[0] = _mm256_unpacklo_epi16(step2[0], step2[7]);
lstep2[1] = _mm256_unpackhi_epi16(step2[0], kZero); lstep2[1] = _mm256_unpackhi_epi16(step2[0], step2[7]);
lstep2[2] = _mm256_unpacklo_epi16(step2[1], kZero); lstep2[2] = _mm256_unpacklo_epi16(step2[1], step2[6]);
lstep2[3] = _mm256_unpackhi_epi16(step2[1], kZero); lstep2[3] = _mm256_unpackhi_epi16(step2[1], step2[6]);
lstep2[4] = _mm256_unpacklo_epi16(step2[2], kZero); lstep2[4] = _mm256_unpacklo_epi16(step2[2], step2[5]);
lstep2[5] = _mm256_unpackhi_epi16(step2[2], kZero); lstep2[5] = _mm256_unpackhi_epi16(step2[2], step2[5]);
lstep2[6] = _mm256_unpacklo_epi16(step2[3], kZero); lstep2[6] = _mm256_unpacklo_epi16(step2[3], step2[4]);
lstep2[7] = _mm256_unpackhi_epi16(step2[3], kZero); lstep2[7] = _mm256_unpackhi_epi16(step2[3], step2[4]);
lstep2[8] = _mm256_unpacklo_epi16(step2[4], kZero);
lstep2[9] = _mm256_unpackhi_epi16(step2[4], kZero);
lstep2[10] = _mm256_unpacklo_epi16(step2[5], kZero);
lstep2[11] = _mm256_unpackhi_epi16(step2[5], kZero);
lstep2[12] = _mm256_unpacklo_epi16(step2[6], kZero);
lstep2[13] = _mm256_unpackhi_epi16(step2[6], kZero);
lstep2[14] = _mm256_unpacklo_epi16(step2[7], kZero);
lstep2[15] = _mm256_unpackhi_epi16(step2[7], kZero);
lstep2[0] = _mm256_madd_epi16(lstep2[0], kOne);
lstep2[1] = _mm256_madd_epi16(lstep2[1], kOne);
lstep2[2] = _mm256_madd_epi16(lstep2[2], kOne);
lstep2[3] = _mm256_madd_epi16(lstep2[3], kOne);
lstep2[4] = _mm256_madd_epi16(lstep2[4], kOne);
lstep2[5] = _mm256_madd_epi16(lstep2[5], kOne);
lstep2[6] = _mm256_madd_epi16(lstep2[6], kOne);
lstep2[7] = _mm256_madd_epi16(lstep2[7], kOne);
lstep2[8] = _mm256_madd_epi16(lstep2[8], kOne);
lstep2[9] = _mm256_madd_epi16(lstep2[9], kOne);
lstep2[10] = _mm256_madd_epi16(lstep2[10], kOne);
lstep2[11] = _mm256_madd_epi16(lstep2[11], kOne);
lstep2[12] = _mm256_madd_epi16(lstep2[12], kOne);
lstep2[13] = _mm256_madd_epi16(lstep2[13], kOne);
lstep2[14] = _mm256_madd_epi16(lstep2[14], kOne);
lstep2[15] = _mm256_madd_epi16(lstep2[15], kOne);
lstep3[0] = _mm256_add_epi32(lstep2[14], lstep2[0]); lstep3[0] = _mm256_madd_epi16(lstep2[0], kOne);
lstep3[1] = _mm256_add_epi32(lstep2[15], lstep2[1]); lstep3[1] = _mm256_madd_epi16(lstep2[1], kOne);
lstep3[2] = _mm256_add_epi32(lstep2[12], lstep2[2]); lstep3[2] = _mm256_madd_epi16(lstep2[2], kOne);
lstep3[3] = _mm256_add_epi32(lstep2[13], lstep2[3]); lstep3[3] = _mm256_madd_epi16(lstep2[3], kOne);
lstep3[4] = _mm256_add_epi32(lstep2[10], lstep2[4]); lstep3[4] = _mm256_madd_epi16(lstep2[4], kOne);
lstep3[5] = _mm256_add_epi32(lstep2[11], lstep2[5]); lstep3[5] = _mm256_madd_epi16(lstep2[5], kOne);
lstep3[6] = _mm256_add_epi32(lstep2[8], lstep2[6]); lstep3[6] = _mm256_madd_epi16(lstep2[6], kOne);
lstep3[7] = _mm256_add_epi32(lstep2[9], lstep2[7]); lstep3[7] = _mm256_madd_epi16(lstep2[7], kOne);
lstep3[8] = _mm256_sub_epi32(lstep2[6], lstep2[8]);
lstep3[9] = _mm256_sub_epi32(lstep2[7], lstep2[9]); lstep3[8] = _mm256_madd_epi16(lstep2[6], k__pOne_mOne);
lstep3[10] = _mm256_sub_epi32(lstep2[4], lstep2[10]); lstep3[9] = _mm256_madd_epi16(lstep2[7], k__pOne_mOne);
lstep3[11] = _mm256_sub_epi32(lstep2[5], lstep2[11]); lstep3[10] = _mm256_madd_epi16(lstep2[4], k__pOne_mOne);
lstep3[12] = _mm256_sub_epi32(lstep2[2], lstep2[12]); lstep3[11] = _mm256_madd_epi16(lstep2[5], k__pOne_mOne);
lstep3[13] = _mm256_sub_epi32(lstep2[3], lstep2[13]); lstep3[12] = _mm256_madd_epi16(lstep2[2], k__pOne_mOne);
lstep3[14] = _mm256_sub_epi32(lstep2[0], lstep2[14]); lstep3[13] = _mm256_madd_epi16(lstep2[3], k__pOne_mOne);
lstep3[15] = _mm256_sub_epi32(lstep2[1], lstep2[15]); lstep3[14] = _mm256_madd_epi16(lstep2[0], k__pOne_mOne);
lstep3[15] = _mm256_madd_epi16(lstep2[1], k__pOne_mOne);
} }
{ {
const __m256i s3_10_0 = _mm256_unpacklo_epi16(step2[13], step2[10]); const __m256i s3_10_0 = _mm256_unpacklo_epi16(step2[13], step2[10]);
@ -1468,126 +1446,76 @@ void FDCT32x32_2D_AVX2(const int16_t *input, int16_t *output_org, int stride) {
lstep3[27] = _mm256_srai_epi32(s3_13_5, DCT_CONST_BITS); lstep3[27] = _mm256_srai_epi32(s3_13_5, DCT_CONST_BITS);
} }
{ {
lstep2[40] = _mm256_unpacklo_epi16(step2[20], kZero); lstep1[32] = _mm256_unpacklo_epi16(step1[16], step2[23]);
lstep2[41] = _mm256_unpackhi_epi16(step2[20], kZero); lstep1[33] = _mm256_unpackhi_epi16(step1[16], step2[23]);
lstep2[42] = _mm256_unpacklo_epi16(step2[21], kZero); lstep1[34] = _mm256_unpacklo_epi16(step1[17], step2[22]);
lstep2[43] = _mm256_unpackhi_epi16(step2[21], kZero); lstep1[35] = _mm256_unpackhi_epi16(step1[17], step2[22]);
lstep2[44] = _mm256_unpacklo_epi16(step2[22], kZero); lstep1[36] = _mm256_unpacklo_epi16(step1[18], step2[21]);
lstep2[45] = _mm256_unpackhi_epi16(step2[22], kZero); lstep1[37] = _mm256_unpackhi_epi16(step1[18], step2[21]);
lstep2[46] = _mm256_unpacklo_epi16(step2[23], kZero); lstep1[38] = _mm256_unpacklo_epi16(step1[19], step2[20]);
lstep2[47] = _mm256_unpackhi_epi16(step2[23], kZero); lstep1[39] = _mm256_unpackhi_epi16(step1[19], step2[20]);
lstep2[48] = _mm256_unpacklo_epi16(step2[24], kZero);
lstep2[49] = _mm256_unpackhi_epi16(step2[24], kZero);
lstep2[50] = _mm256_unpacklo_epi16(step2[25], kZero);
lstep2[51] = _mm256_unpackhi_epi16(step2[25], kZero);
lstep2[52] = _mm256_unpacklo_epi16(step2[26], kZero);
lstep2[53] = _mm256_unpackhi_epi16(step2[26], kZero);
lstep2[54] = _mm256_unpacklo_epi16(step2[27], kZero);
lstep2[55] = _mm256_unpackhi_epi16(step2[27], kZero);
lstep2[40] = _mm256_madd_epi16(lstep2[40], kOne);
lstep2[41] = _mm256_madd_epi16(lstep2[41], kOne);
lstep2[42] = _mm256_madd_epi16(lstep2[42], kOne);
lstep2[43] = _mm256_madd_epi16(lstep2[43], kOne);
lstep2[44] = _mm256_madd_epi16(lstep2[44], kOne);
lstep2[45] = _mm256_madd_epi16(lstep2[45], kOne);
lstep2[46] = _mm256_madd_epi16(lstep2[46], kOne);
lstep2[47] = _mm256_madd_epi16(lstep2[47], kOne);
lstep2[48] = _mm256_madd_epi16(lstep2[48], kOne);
lstep2[49] = _mm256_madd_epi16(lstep2[49], kOne);
lstep2[50] = _mm256_madd_epi16(lstep2[50], kOne);
lstep2[51] = _mm256_madd_epi16(lstep2[51], kOne);
lstep2[52] = _mm256_madd_epi16(lstep2[52], kOne);
lstep2[53] = _mm256_madd_epi16(lstep2[53], kOne);
lstep2[54] = _mm256_madd_epi16(lstep2[54], kOne);
lstep2[55] = _mm256_madd_epi16(lstep2[55], kOne);
lstep1[32] = _mm256_unpacklo_epi16(step1[16], kZero); lstep1[56] = _mm256_unpacklo_epi16(step1[28], step2[27]);
lstep1[33] = _mm256_unpackhi_epi16(step1[16], kZero); lstep1[57] = _mm256_unpackhi_epi16(step1[28], step2[27]);
lstep1[34] = _mm256_unpacklo_epi16(step1[17], kZero); lstep1[58] = _mm256_unpacklo_epi16(step1[29], step2[26]);
lstep1[35] = _mm256_unpackhi_epi16(step1[17], kZero); lstep1[59] = _mm256_unpackhi_epi16(step1[29], step2[26]);
lstep1[36] = _mm256_unpacklo_epi16(step1[18], kZero); lstep1[60] = _mm256_unpacklo_epi16(step1[30], step2[25]);
lstep1[37] = _mm256_unpackhi_epi16(step1[18], kZero); lstep1[61] = _mm256_unpackhi_epi16(step1[30], step2[25]);
lstep1[38] = _mm256_unpacklo_epi16(step1[19], kZero); lstep1[62] = _mm256_unpacklo_epi16(step1[31], step2[24]);
lstep1[39] = _mm256_unpackhi_epi16(step1[19], kZero); lstep1[63] = _mm256_unpackhi_epi16(step1[31], step2[24]);
lstep1[56] = _mm256_unpacklo_epi16(step1[28], kZero);
lstep1[57] = _mm256_unpackhi_epi16(step1[28], kZero);
lstep1[58] = _mm256_unpacklo_epi16(step1[29], kZero);
lstep1[59] = _mm256_unpackhi_epi16(step1[29], kZero);
lstep1[60] = _mm256_unpacklo_epi16(step1[30], kZero);
lstep1[61] = _mm256_unpackhi_epi16(step1[30], kZero);
lstep1[62] = _mm256_unpacklo_epi16(step1[31], kZero);
lstep1[63] = _mm256_unpackhi_epi16(step1[31], kZero);
lstep1[32] = _mm256_madd_epi16(lstep1[32], kOne);
lstep1[33] = _mm256_madd_epi16(lstep1[33], kOne);
lstep1[34] = _mm256_madd_epi16(lstep1[34], kOne);
lstep1[35] = _mm256_madd_epi16(lstep1[35], kOne);
lstep1[36] = _mm256_madd_epi16(lstep1[36], kOne);
lstep1[37] = _mm256_madd_epi16(lstep1[37], kOne);
lstep1[38] = _mm256_madd_epi16(lstep1[38], kOne);
lstep1[39] = _mm256_madd_epi16(lstep1[39], kOne);
lstep1[56] = _mm256_madd_epi16(lstep1[56], kOne);
lstep1[57] = _mm256_madd_epi16(lstep1[57], kOne);
lstep1[58] = _mm256_madd_epi16(lstep1[58], kOne);
lstep1[59] = _mm256_madd_epi16(lstep1[59], kOne);
lstep1[60] = _mm256_madd_epi16(lstep1[60], kOne);
lstep1[61] = _mm256_madd_epi16(lstep1[61], kOne);
lstep1[62] = _mm256_madd_epi16(lstep1[62], kOne);
lstep1[63] = _mm256_madd_epi16(lstep1[63], kOne);
lstep3[32] = _mm256_add_epi32(lstep2[46], lstep1[32]); lstep3[32] = _mm256_madd_epi16(lstep1[32], kOne);
lstep3[33] = _mm256_add_epi32(lstep2[47], lstep1[33]); lstep3[33] = _mm256_madd_epi16(lstep1[33], kOne);
lstep3[34] = _mm256_madd_epi16(lstep1[34], kOne);
lstep3[35] = _mm256_madd_epi16(lstep1[35], kOne);
lstep3[36] = _mm256_madd_epi16(lstep1[36], kOne);
lstep3[37] = _mm256_madd_epi16(lstep1[37], kOne);
lstep3[38] = _mm256_madd_epi16(lstep1[38], kOne);
lstep3[39] = _mm256_madd_epi16(lstep1[39], kOne);
lstep3[34] = _mm256_add_epi32(lstep2[44], lstep1[34]); lstep3[40] = _mm256_madd_epi16(lstep1[38], k__pOne_mOne);
lstep3[35] = _mm256_add_epi32(lstep2[45], lstep1[35]); lstep3[41] = _mm256_madd_epi16(lstep1[39], k__pOne_mOne);
lstep3[36] = _mm256_add_epi32(lstep2[42], lstep1[36]); lstep3[42] = _mm256_madd_epi16(lstep1[36], k__pOne_mOne);
lstep3[37] = _mm256_add_epi32(lstep2[43], lstep1[37]); lstep3[43] = _mm256_madd_epi16(lstep1[37], k__pOne_mOne);
lstep3[38] = _mm256_add_epi32(lstep2[40], lstep1[38]); lstep3[44] = _mm256_madd_epi16(lstep1[34], k__pOne_mOne);
lstep3[39] = _mm256_add_epi32(lstep2[41], lstep1[39]); lstep3[45] = _mm256_madd_epi16(lstep1[35], k__pOne_mOne);
lstep3[40] = _mm256_sub_epi32(lstep1[38], lstep2[40]); lstep3[46] = _mm256_madd_epi16(lstep1[32], k__pOne_mOne);
lstep3[41] = _mm256_sub_epi32(lstep1[39], lstep2[41]); lstep3[47] = _mm256_madd_epi16(lstep1[33], k__pOne_mOne);
lstep3[42] = _mm256_sub_epi32(lstep1[36], lstep2[42]);
lstep3[43] = _mm256_sub_epi32(lstep1[37], lstep2[43]); lstep3[48] = _mm256_madd_epi16(lstep1[62], k__pOne_mOne);
lstep3[44] = _mm256_sub_epi32(lstep1[34], lstep2[44]); lstep3[49] = _mm256_madd_epi16(lstep1[63], k__pOne_mOne);
lstep3[45] = _mm256_sub_epi32(lstep1[35], lstep2[45]); lstep3[50] = _mm256_madd_epi16(lstep1[60], k__pOne_mOne);
lstep3[46] = _mm256_sub_epi32(lstep1[32], lstep2[46]); lstep3[51] = _mm256_madd_epi16(lstep1[61], k__pOne_mOne);
lstep3[47] = _mm256_sub_epi32(lstep1[33], lstep2[47]); lstep3[52] = _mm256_madd_epi16(lstep1[58], k__pOne_mOne);
lstep3[48] = _mm256_sub_epi32(lstep1[62], lstep2[48]); lstep3[53] = _mm256_madd_epi16(lstep1[59], k__pOne_mOne);
lstep3[49] = _mm256_sub_epi32(lstep1[63], lstep2[49]); lstep3[54] = _mm256_madd_epi16(lstep1[56], k__pOne_mOne);
lstep3[50] = _mm256_sub_epi32(lstep1[60], lstep2[50]); lstep3[55] = _mm256_madd_epi16(lstep1[57], k__pOne_mOne);
lstep3[51] = _mm256_sub_epi32(lstep1[61], lstep2[51]);
lstep3[52] = _mm256_sub_epi32(lstep1[58], lstep2[52]); lstep3[56] = _mm256_madd_epi16(lstep1[56], kOne);
lstep3[53] = _mm256_sub_epi32(lstep1[59], lstep2[53]); lstep3[57] = _mm256_madd_epi16(lstep1[57], kOne);
lstep3[54] = _mm256_sub_epi32(lstep1[56], lstep2[54]); lstep3[58] = _mm256_madd_epi16(lstep1[58], kOne);
lstep3[55] = _mm256_sub_epi32(lstep1[57], lstep2[55]); lstep3[59] = _mm256_madd_epi16(lstep1[59], kOne);
lstep3[56] = _mm256_add_epi32(lstep2[54], lstep1[56]); lstep3[60] = _mm256_madd_epi16(lstep1[60], kOne);
lstep3[57] = _mm256_add_epi32(lstep2[55], lstep1[57]); lstep3[61] = _mm256_madd_epi16(lstep1[61], kOne);
lstep3[58] = _mm256_add_epi32(lstep2[52], lstep1[58]); lstep3[62] = _mm256_madd_epi16(lstep1[62], kOne);
lstep3[59] = _mm256_add_epi32(lstep2[53], lstep1[59]); lstep3[63] = _mm256_madd_epi16(lstep1[63], kOne);
lstep3[60] = _mm256_add_epi32(lstep2[50], lstep1[60]);
lstep3[61] = _mm256_add_epi32(lstep2[51], lstep1[61]);
lstep3[62] = _mm256_add_epi32(lstep2[48], lstep1[62]);
lstep3[63] = _mm256_add_epi32(lstep2[49], lstep1[63]);
} }
// stage 4 // stage 4
{ {
// expanding to 32-bit length priori to addition operations // expanding to 32-bit length prior to addition operations
lstep2[16] = _mm256_unpacklo_epi16(step2[8], kZero); sign[0] = _mm256_cmpgt_epi16(kZero, step2[8]);
lstep2[17] = _mm256_unpackhi_epi16(step2[8], kZero); sign[1] = _mm256_cmpgt_epi16(kZero, step2[9]);
lstep2[18] = _mm256_unpacklo_epi16(step2[9], kZero); sign[2] = _mm256_cmpgt_epi16(kZero, step2[14]);
lstep2[19] = _mm256_unpackhi_epi16(step2[9], kZero); sign[3] = _mm256_cmpgt_epi16(kZero, step2[15]);
lstep2[28] = _mm256_unpacklo_epi16(step2[14], kZero); lstep2[16] = _mm256_unpacklo_epi16(step2[8], sign[0]);
lstep2[29] = _mm256_unpackhi_epi16(step2[14], kZero); lstep2[17] = _mm256_unpackhi_epi16(step2[8], sign[0]);
lstep2[30] = _mm256_unpacklo_epi16(step2[15], kZero); lstep2[18] = _mm256_unpacklo_epi16(step2[9], sign[1]);
lstep2[31] = _mm256_unpackhi_epi16(step2[15], kZero); lstep2[19] = _mm256_unpackhi_epi16(step2[9], sign[1]);
lstep2[16] = _mm256_madd_epi16(lstep2[16], kOne); lstep2[28] = _mm256_unpacklo_epi16(step2[14], sign[2]);
lstep2[17] = _mm256_madd_epi16(lstep2[17], kOne); lstep2[29] = _mm256_unpackhi_epi16(step2[14], sign[2]);
lstep2[18] = _mm256_madd_epi16(lstep2[18], kOne); lstep2[30] = _mm256_unpacklo_epi16(step2[15], sign[3]);
lstep2[19] = _mm256_madd_epi16(lstep2[19], kOne); lstep2[31] = _mm256_unpackhi_epi16(step2[15], sign[3]);
lstep2[28] = _mm256_madd_epi16(lstep2[28], kOne);
lstep2[29] = _mm256_madd_epi16(lstep2[29], kOne);
lstep2[30] = _mm256_madd_epi16(lstep2[30], kOne);
lstep2[31] = _mm256_madd_epi16(lstep2[31], kOne);
lstep1[0] = _mm256_add_epi32(lstep3[6], lstep3[0]); lstep1[0] = _mm256_add_epi32(lstep3[6], lstep3[0]);
lstep1[1] = _mm256_add_epi32(lstep3[7], lstep3[1]); lstep1[1] = _mm256_add_epi32(lstep3[7], lstep3[1]);

View File

@ -101,6 +101,7 @@ void FDCT32x32_2D(const int16_t *input, tran_low_t *output_org, int stride) {
const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
const __m128i kZero = _mm_set1_epi16(0); const __m128i kZero = _mm_set1_epi16(0);
const __m128i kOne = _mm_set1_epi16(1); const __m128i kOne = _mm_set1_epi16(1);
// Do the two transform/transpose passes // Do the two transform/transpose passes
int pass; int pass;
#if DCT_HIGH_BIT_DEPTH #if DCT_HIGH_BIT_DEPTH
@ -1508,59 +1509,37 @@ void FDCT32x32_2D(const int16_t *input, tran_low_t *output_org, int stride) {
__m128i lstep1[64], lstep2[64], lstep3[64]; __m128i lstep1[64], lstep2[64], lstep3[64];
__m128i u[32], v[32], sign[16]; __m128i u[32], v[32], sign[16];
const __m128i K32One = _mm_set_epi32(1, 1, 1, 1); const __m128i K32One = _mm_set_epi32(1, 1, 1, 1);
const __m128i k__pOne_mOne = pair_set_epi16(1, -1);
// start using 32-bit operations // start using 32-bit operations
// stage 3 // stage 3
{ {
// expanding to 32-bit length priori to addition operations // expanding to 32-bit length while adding and subtracting
lstep2[0] = _mm_unpacklo_epi16(step2[0], kZero); lstep2[0] = _mm_unpacklo_epi16(step2[0], step2[7]);
lstep2[1] = _mm_unpackhi_epi16(step2[0], kZero); lstep2[1] = _mm_unpackhi_epi16(step2[0], step2[7]);
lstep2[2] = _mm_unpacklo_epi16(step2[1], kZero); lstep2[2] = _mm_unpacklo_epi16(step2[1], step2[6]);
lstep2[3] = _mm_unpackhi_epi16(step2[1], kZero); lstep2[3] = _mm_unpackhi_epi16(step2[1], step2[6]);
lstep2[4] = _mm_unpacklo_epi16(step2[2], kZero); lstep2[4] = _mm_unpacklo_epi16(step2[2], step2[5]);
lstep2[5] = _mm_unpackhi_epi16(step2[2], kZero); lstep2[5] = _mm_unpackhi_epi16(step2[2], step2[5]);
lstep2[6] = _mm_unpacklo_epi16(step2[3], kZero); lstep2[6] = _mm_unpacklo_epi16(step2[3], step2[4]);
lstep2[7] = _mm_unpackhi_epi16(step2[3], kZero); lstep2[7] = _mm_unpackhi_epi16(step2[3], step2[4]);
lstep2[8] = _mm_unpacklo_epi16(step2[4], kZero);
lstep2[9] = _mm_unpackhi_epi16(step2[4], kZero);
lstep2[10] = _mm_unpacklo_epi16(step2[5], kZero);
lstep2[11] = _mm_unpackhi_epi16(step2[5], kZero);
lstep2[12] = _mm_unpacklo_epi16(step2[6], kZero);
lstep2[13] = _mm_unpackhi_epi16(step2[6], kZero);
lstep2[14] = _mm_unpacklo_epi16(step2[7], kZero);
lstep2[15] = _mm_unpackhi_epi16(step2[7], kZero);
lstep2[0] = _mm_madd_epi16(lstep2[0], kOne);
lstep2[1] = _mm_madd_epi16(lstep2[1], kOne);
lstep2[2] = _mm_madd_epi16(lstep2[2], kOne);
lstep2[3] = _mm_madd_epi16(lstep2[3], kOne);
lstep2[4] = _mm_madd_epi16(lstep2[4], kOne);
lstep2[5] = _mm_madd_epi16(lstep2[5], kOne);
lstep2[6] = _mm_madd_epi16(lstep2[6], kOne);
lstep2[7] = _mm_madd_epi16(lstep2[7], kOne);
lstep2[8] = _mm_madd_epi16(lstep2[8], kOne);
lstep2[9] = _mm_madd_epi16(lstep2[9], kOne);
lstep2[10] = _mm_madd_epi16(lstep2[10], kOne);
lstep2[11] = _mm_madd_epi16(lstep2[11], kOne);
lstep2[12] = _mm_madd_epi16(lstep2[12], kOne);
lstep2[13] = _mm_madd_epi16(lstep2[13], kOne);
lstep2[14] = _mm_madd_epi16(lstep2[14], kOne);
lstep2[15] = _mm_madd_epi16(lstep2[15], kOne);
lstep3[0] = _mm_add_epi32(lstep2[14], lstep2[0]); lstep3[0] = _mm_madd_epi16(lstep2[0], kOne);
lstep3[1] = _mm_add_epi32(lstep2[15], lstep2[1]); lstep3[1] = _mm_madd_epi16(lstep2[1], kOne);
lstep3[2] = _mm_add_epi32(lstep2[12], lstep2[2]); lstep3[2] = _mm_madd_epi16(lstep2[2], kOne);
lstep3[3] = _mm_add_epi32(lstep2[13], lstep2[3]); lstep3[3] = _mm_madd_epi16(lstep2[3], kOne);
lstep3[4] = _mm_add_epi32(lstep2[10], lstep2[4]); lstep3[4] = _mm_madd_epi16(lstep2[4], kOne);
lstep3[5] = _mm_add_epi32(lstep2[11], lstep2[5]); lstep3[5] = _mm_madd_epi16(lstep2[5], kOne);
lstep3[6] = _mm_add_epi32(lstep2[8], lstep2[6]); lstep3[6] = _mm_madd_epi16(lstep2[6], kOne);
lstep3[7] = _mm_add_epi32(lstep2[9], lstep2[7]); lstep3[7] = _mm_madd_epi16(lstep2[7], kOne);
lstep3[8] = _mm_sub_epi32(lstep2[6], lstep2[8]);
lstep3[9] = _mm_sub_epi32(lstep2[7], lstep2[9]); lstep3[8] = _mm_madd_epi16(lstep2[6], k__pOne_mOne);
lstep3[10] = _mm_sub_epi32(lstep2[4], lstep2[10]); lstep3[9] = _mm_madd_epi16(lstep2[7], k__pOne_mOne);
lstep3[11] = _mm_sub_epi32(lstep2[5], lstep2[11]); lstep3[10] = _mm_madd_epi16(lstep2[4], k__pOne_mOne);
lstep3[12] = _mm_sub_epi32(lstep2[2], lstep2[12]); lstep3[11] = _mm_madd_epi16(lstep2[5], k__pOne_mOne);
lstep3[13] = _mm_sub_epi32(lstep2[3], lstep2[13]); lstep3[12] = _mm_madd_epi16(lstep2[2], k__pOne_mOne);
lstep3[14] = _mm_sub_epi32(lstep2[0], lstep2[14]); lstep3[13] = _mm_madd_epi16(lstep2[3], k__pOne_mOne);
lstep3[15] = _mm_sub_epi32(lstep2[1], lstep2[15]); lstep3[14] = _mm_madd_epi16(lstep2[0], k__pOne_mOne);
lstep3[15] = _mm_madd_epi16(lstep2[1], k__pOne_mOne);
} }
{ {
const __m128i s3_10_0 = _mm_unpacklo_epi16(step2[13], step2[10]); const __m128i s3_10_0 = _mm_unpacklo_epi16(step2[13], step2[10]);
@ -1594,126 +1573,76 @@ void FDCT32x32_2D(const int16_t *input, tran_low_t *output_org, int stride) {
lstep3[27] = _mm_srai_epi32(s3_13_5, DCT_CONST_BITS); lstep3[27] = _mm_srai_epi32(s3_13_5, DCT_CONST_BITS);
} }
{ {
lstep2[40] = _mm_unpacklo_epi16(step2[20], kZero); lstep1[32] = _mm_unpacklo_epi16(step1[16], step2[23]);
lstep2[41] = _mm_unpackhi_epi16(step2[20], kZero); lstep1[33] = _mm_unpackhi_epi16(step1[16], step2[23]);
lstep2[42] = _mm_unpacklo_epi16(step2[21], kZero); lstep1[34] = _mm_unpacklo_epi16(step1[17], step2[22]);
lstep2[43] = _mm_unpackhi_epi16(step2[21], kZero); lstep1[35] = _mm_unpackhi_epi16(step1[17], step2[22]);
lstep2[44] = _mm_unpacklo_epi16(step2[22], kZero); lstep1[36] = _mm_unpacklo_epi16(step1[18], step2[21]);
lstep2[45] = _mm_unpackhi_epi16(step2[22], kZero); lstep1[37] = _mm_unpackhi_epi16(step1[18], step2[21]);
lstep2[46] = _mm_unpacklo_epi16(step2[23], kZero); lstep1[38] = _mm_unpacklo_epi16(step1[19], step2[20]);
lstep2[47] = _mm_unpackhi_epi16(step2[23], kZero); lstep1[39] = _mm_unpackhi_epi16(step1[19], step2[20]);
lstep2[48] = _mm_unpacklo_epi16(step2[24], kZero);
lstep2[49] = _mm_unpackhi_epi16(step2[24], kZero);
lstep2[50] = _mm_unpacklo_epi16(step2[25], kZero);
lstep2[51] = _mm_unpackhi_epi16(step2[25], kZero);
lstep2[52] = _mm_unpacklo_epi16(step2[26], kZero);
lstep2[53] = _mm_unpackhi_epi16(step2[26], kZero);
lstep2[54] = _mm_unpacklo_epi16(step2[27], kZero);
lstep2[55] = _mm_unpackhi_epi16(step2[27], kZero);
lstep2[40] = _mm_madd_epi16(lstep2[40], kOne);
lstep2[41] = _mm_madd_epi16(lstep2[41], kOne);
lstep2[42] = _mm_madd_epi16(lstep2[42], kOne);
lstep2[43] = _mm_madd_epi16(lstep2[43], kOne);
lstep2[44] = _mm_madd_epi16(lstep2[44], kOne);
lstep2[45] = _mm_madd_epi16(lstep2[45], kOne);
lstep2[46] = _mm_madd_epi16(lstep2[46], kOne);
lstep2[47] = _mm_madd_epi16(lstep2[47], kOne);
lstep2[48] = _mm_madd_epi16(lstep2[48], kOne);
lstep2[49] = _mm_madd_epi16(lstep2[49], kOne);
lstep2[50] = _mm_madd_epi16(lstep2[50], kOne);
lstep2[51] = _mm_madd_epi16(lstep2[51], kOne);
lstep2[52] = _mm_madd_epi16(lstep2[52], kOne);
lstep2[53] = _mm_madd_epi16(lstep2[53], kOne);
lstep2[54] = _mm_madd_epi16(lstep2[54], kOne);
lstep2[55] = _mm_madd_epi16(lstep2[55], kOne);
lstep1[32] = _mm_unpacklo_epi16(step1[16], kZero); lstep1[56] = _mm_unpacklo_epi16(step1[28], step2[27]);
lstep1[33] = _mm_unpackhi_epi16(step1[16], kZero); lstep1[57] = _mm_unpackhi_epi16(step1[28], step2[27]);
lstep1[34] = _mm_unpacklo_epi16(step1[17], kZero); lstep1[58] = _mm_unpacklo_epi16(step1[29], step2[26]);
lstep1[35] = _mm_unpackhi_epi16(step1[17], kZero); lstep1[59] = _mm_unpackhi_epi16(step1[29], step2[26]);
lstep1[36] = _mm_unpacklo_epi16(step1[18], kZero); lstep1[60] = _mm_unpacklo_epi16(step1[30], step2[25]);
lstep1[37] = _mm_unpackhi_epi16(step1[18], kZero); lstep1[61] = _mm_unpackhi_epi16(step1[30], step2[25]);
lstep1[38] = _mm_unpacklo_epi16(step1[19], kZero); lstep1[62] = _mm_unpacklo_epi16(step1[31], step2[24]);
lstep1[39] = _mm_unpackhi_epi16(step1[19], kZero); lstep1[63] = _mm_unpackhi_epi16(step1[31], step2[24]);
lstep1[56] = _mm_unpacklo_epi16(step1[28], kZero);
lstep1[57] = _mm_unpackhi_epi16(step1[28], kZero);
lstep1[58] = _mm_unpacklo_epi16(step1[29], kZero);
lstep1[59] = _mm_unpackhi_epi16(step1[29], kZero);
lstep1[60] = _mm_unpacklo_epi16(step1[30], kZero);
lstep1[61] = _mm_unpackhi_epi16(step1[30], kZero);
lstep1[62] = _mm_unpacklo_epi16(step1[31], kZero);
lstep1[63] = _mm_unpackhi_epi16(step1[31], kZero);
lstep1[32] = _mm_madd_epi16(lstep1[32], kOne);
lstep1[33] = _mm_madd_epi16(lstep1[33], kOne);
lstep1[34] = _mm_madd_epi16(lstep1[34], kOne);
lstep1[35] = _mm_madd_epi16(lstep1[35], kOne);
lstep1[36] = _mm_madd_epi16(lstep1[36], kOne);
lstep1[37] = _mm_madd_epi16(lstep1[37], kOne);
lstep1[38] = _mm_madd_epi16(lstep1[38], kOne);
lstep1[39] = _mm_madd_epi16(lstep1[39], kOne);
lstep1[56] = _mm_madd_epi16(lstep1[56], kOne);
lstep1[57] = _mm_madd_epi16(lstep1[57], kOne);
lstep1[58] = _mm_madd_epi16(lstep1[58], kOne);
lstep1[59] = _mm_madd_epi16(lstep1[59], kOne);
lstep1[60] = _mm_madd_epi16(lstep1[60], kOne);
lstep1[61] = _mm_madd_epi16(lstep1[61], kOne);
lstep1[62] = _mm_madd_epi16(lstep1[62], kOne);
lstep1[63] = _mm_madd_epi16(lstep1[63], kOne);
lstep3[32] = _mm_add_epi32(lstep2[46], lstep1[32]); lstep3[32] = _mm_madd_epi16(lstep1[32], kOne);
lstep3[33] = _mm_add_epi32(lstep2[47], lstep1[33]); lstep3[33] = _mm_madd_epi16(lstep1[33], kOne);
lstep3[34] = _mm_madd_epi16(lstep1[34], kOne);
lstep3[35] = _mm_madd_epi16(lstep1[35], kOne);
lstep3[36] = _mm_madd_epi16(lstep1[36], kOne);
lstep3[37] = _mm_madd_epi16(lstep1[37], kOne);
lstep3[38] = _mm_madd_epi16(lstep1[38], kOne);
lstep3[39] = _mm_madd_epi16(lstep1[39], kOne);
lstep3[34] = _mm_add_epi32(lstep2[44], lstep1[34]); lstep3[40] = _mm_madd_epi16(lstep1[38], k__pOne_mOne);
lstep3[35] = _mm_add_epi32(lstep2[45], lstep1[35]); lstep3[41] = _mm_madd_epi16(lstep1[39], k__pOne_mOne);
lstep3[36] = _mm_add_epi32(lstep2[42], lstep1[36]); lstep3[42] = _mm_madd_epi16(lstep1[36], k__pOne_mOne);
lstep3[37] = _mm_add_epi32(lstep2[43], lstep1[37]); lstep3[43] = _mm_madd_epi16(lstep1[37], k__pOne_mOne);
lstep3[38] = _mm_add_epi32(lstep2[40], lstep1[38]); lstep3[44] = _mm_madd_epi16(lstep1[34], k__pOne_mOne);
lstep3[39] = _mm_add_epi32(lstep2[41], lstep1[39]); lstep3[45] = _mm_madd_epi16(lstep1[35], k__pOne_mOne);
lstep3[40] = _mm_sub_epi32(lstep1[38], lstep2[40]); lstep3[46] = _mm_madd_epi16(lstep1[32], k__pOne_mOne);
lstep3[41] = _mm_sub_epi32(lstep1[39], lstep2[41]); lstep3[47] = _mm_madd_epi16(lstep1[33], k__pOne_mOne);
lstep3[42] = _mm_sub_epi32(lstep1[36], lstep2[42]);
lstep3[43] = _mm_sub_epi32(lstep1[37], lstep2[43]); lstep3[48] = _mm_madd_epi16(lstep1[62], k__pOne_mOne);
lstep3[44] = _mm_sub_epi32(lstep1[34], lstep2[44]); lstep3[49] = _mm_madd_epi16(lstep1[63], k__pOne_mOne);
lstep3[45] = _mm_sub_epi32(lstep1[35], lstep2[45]); lstep3[50] = _mm_madd_epi16(lstep1[60], k__pOne_mOne);
lstep3[46] = _mm_sub_epi32(lstep1[32], lstep2[46]); lstep3[51] = _mm_madd_epi16(lstep1[61], k__pOne_mOne);
lstep3[47] = _mm_sub_epi32(lstep1[33], lstep2[47]); lstep3[52] = _mm_madd_epi16(lstep1[58], k__pOne_mOne);
lstep3[48] = _mm_sub_epi32(lstep1[62], lstep2[48]); lstep3[53] = _mm_madd_epi16(lstep1[59], k__pOne_mOne);
lstep3[49] = _mm_sub_epi32(lstep1[63], lstep2[49]); lstep3[54] = _mm_madd_epi16(lstep1[56], k__pOne_mOne);
lstep3[50] = _mm_sub_epi32(lstep1[60], lstep2[50]); lstep3[55] = _mm_madd_epi16(lstep1[57], k__pOne_mOne);
lstep3[51] = _mm_sub_epi32(lstep1[61], lstep2[51]);
lstep3[52] = _mm_sub_epi32(lstep1[58], lstep2[52]); lstep3[56] = _mm_madd_epi16(lstep1[56], kOne);
lstep3[53] = _mm_sub_epi32(lstep1[59], lstep2[53]); lstep3[57] = _mm_madd_epi16(lstep1[57], kOne);
lstep3[54] = _mm_sub_epi32(lstep1[56], lstep2[54]); lstep3[58] = _mm_madd_epi16(lstep1[58], kOne);
lstep3[55] = _mm_sub_epi32(lstep1[57], lstep2[55]); lstep3[59] = _mm_madd_epi16(lstep1[59], kOne);
lstep3[56] = _mm_add_epi32(lstep2[54], lstep1[56]); lstep3[60] = _mm_madd_epi16(lstep1[60], kOne);
lstep3[57] = _mm_add_epi32(lstep2[55], lstep1[57]); lstep3[61] = _mm_madd_epi16(lstep1[61], kOne);
lstep3[58] = _mm_add_epi32(lstep2[52], lstep1[58]); lstep3[62] = _mm_madd_epi16(lstep1[62], kOne);
lstep3[59] = _mm_add_epi32(lstep2[53], lstep1[59]); lstep3[63] = _mm_madd_epi16(lstep1[63], kOne);
lstep3[60] = _mm_add_epi32(lstep2[50], lstep1[60]);
lstep3[61] = _mm_add_epi32(lstep2[51], lstep1[61]);
lstep3[62] = _mm_add_epi32(lstep2[48], lstep1[62]);
lstep3[63] = _mm_add_epi32(lstep2[49], lstep1[63]);
} }
// stage 4 // stage 4
{ {
// expanding to 32-bit length priori to addition operations // expanding to 32-bit length prior to addition operations
lstep2[16] = _mm_unpacklo_epi16(step2[8], kZero); sign[0] = _mm_cmpgt_epi16(kZero, step2[8]);
lstep2[17] = _mm_unpackhi_epi16(step2[8], kZero); sign[1] = _mm_cmpgt_epi16(kZero, step2[9]);
lstep2[18] = _mm_unpacklo_epi16(step2[9], kZero); sign[2] = _mm_cmpgt_epi16(kZero, step2[14]);
lstep2[19] = _mm_unpackhi_epi16(step2[9], kZero); sign[3] = _mm_cmpgt_epi16(kZero, step2[15]);
lstep2[28] = _mm_unpacklo_epi16(step2[14], kZero); lstep2[16] = _mm_unpacklo_epi16(step2[8], sign[0]);
lstep2[29] = _mm_unpackhi_epi16(step2[14], kZero); lstep2[17] = _mm_unpackhi_epi16(step2[8], sign[0]);
lstep2[30] = _mm_unpacklo_epi16(step2[15], kZero); lstep2[18] = _mm_unpacklo_epi16(step2[9], sign[1]);
lstep2[31] = _mm_unpackhi_epi16(step2[15], kZero); lstep2[19] = _mm_unpackhi_epi16(step2[9], sign[1]);
lstep2[16] = _mm_madd_epi16(lstep2[16], kOne); lstep2[28] = _mm_unpacklo_epi16(step2[14], sign[2]);
lstep2[17] = _mm_madd_epi16(lstep2[17], kOne); lstep2[29] = _mm_unpackhi_epi16(step2[14], sign[2]);
lstep2[18] = _mm_madd_epi16(lstep2[18], kOne); lstep2[30] = _mm_unpacklo_epi16(step2[15], sign[3]);
lstep2[19] = _mm_madd_epi16(lstep2[19], kOne); lstep2[31] = _mm_unpackhi_epi16(step2[15], sign[3]);
lstep2[28] = _mm_madd_epi16(lstep2[28], kOne);
lstep2[29] = _mm_madd_epi16(lstep2[29], kOne);
lstep2[30] = _mm_madd_epi16(lstep2[30], kOne);
lstep2[31] = _mm_madd_epi16(lstep2[31], kOne);
lstep1[0] = _mm_add_epi32(lstep3[6], lstep3[0]); lstep1[0] = _mm_add_epi32(lstep3[6], lstep3[0]);
lstep1[1] = _mm_add_epi32(lstep3[7], lstep3[1]); lstep1[1] = _mm_add_epi32(lstep3[7], lstep3[1]);