Merge "Fix 32x32 forward transform SSE2 version"

This commit is contained in:
Jingning Han 2013-09-03 08:58:03 -07:00 committed by Gerrit Code Review
commit 010c0ad0eb
3 changed files with 457 additions and 287 deletions

View File

@ -701,7 +701,7 @@ prototype void vp9_quantize_b "int16_t *coeff_ptr, intptr_t n_coeffs, int skip_b
specialize vp9_quantize_b $ssse3_x86_64
prototype void vp9_quantize_b_32x32 "int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, int16_t *zbin_ptr, int16_t *round_ptr, int16_t *quant_ptr, int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"
specialize vp9_quantize_b_32x32 $ssse3_x86_64
specialize vp9_quantize_b_32x32 # $ssse3_x86_64 FIXME(jingning): need a unit test on this before enabled
#
# Structured Similarity (SSIM)

View File

@ -1077,6 +1077,44 @@ static void dct32_1d(int *input, int *output, int round) {
output[30] = step[30];
output[31] = step[31];
// dump the magnitude by 4, hence the intermediate values are within
// the range of 16 bits.
if (round) {
output[0] = half_round_shift(output[0]);
output[1] = half_round_shift(output[1]);
output[2] = half_round_shift(output[2]);
output[3] = half_round_shift(output[3]);
output[4] = half_round_shift(output[4]);
output[5] = half_round_shift(output[5]);
output[6] = half_round_shift(output[6]);
output[7] = half_round_shift(output[7]);
output[8] = half_round_shift(output[8]);
output[9] = half_round_shift(output[9]);
output[10] = half_round_shift(output[10]);
output[11] = half_round_shift(output[11]);
output[12] = half_round_shift(output[12]);
output[13] = half_round_shift(output[13]);
output[14] = half_round_shift(output[14]);
output[15] = half_round_shift(output[15]);
output[16] = half_round_shift(output[16]);
output[17] = half_round_shift(output[17]);
output[18] = half_round_shift(output[18]);
output[19] = half_round_shift(output[19]);
output[20] = half_round_shift(output[20]);
output[21] = half_round_shift(output[21]);
output[22] = half_round_shift(output[22]);
output[23] = half_round_shift(output[23]);
output[24] = half_round_shift(output[24]);
output[25] = half_round_shift(output[25]);
output[26] = half_round_shift(output[26]);
output[27] = half_round_shift(output[27]);
output[28] = half_round_shift(output[28]);
output[29] = half_round_shift(output[29]);
output[30] = half_round_shift(output[30]);
output[31] = half_round_shift(output[31]);
}
// Stage 3
step[0] = output[0] + output[(8 - 1)];
step[1] = output[1] + output[(8 - 2)];
@ -1112,44 +1150,6 @@ static void dct32_1d(int *input, int *output, int round) {
step[30] = output[30] + output[25];
step[31] = output[31] + output[24];
// dump the magnitude by half, hence the intermediate values are within 1108
// the range of 16 bits.
if (round) {
step[0] = half_round_shift(step[0]);
step[1] = half_round_shift(step[1]);
step[2] = half_round_shift(step[2]);
step[3] = half_round_shift(step[3]);
step[4] = half_round_shift(step[4]);
step[5] = half_round_shift(step[5]);
step[6] = half_round_shift(step[6]);
step[7] = half_round_shift(step[7]);
step[8] = half_round_shift(step[8]);
step[9] = half_round_shift(step[9]);
step[10] = half_round_shift(step[10]);
step[11] = half_round_shift(step[11]);
step[12] = half_round_shift(step[12]);
step[13] = half_round_shift(step[13]);
step[14] = half_round_shift(step[14]);
step[15] = half_round_shift(step[15]);
step[16] = half_round_shift(step[16]);
step[17] = half_round_shift(step[17]);
step[18] = half_round_shift(step[18]);
step[19] = half_round_shift(step[19]);
step[20] = half_round_shift(step[20]);
step[21] = half_round_shift(step[21]);
step[22] = half_round_shift(step[22]);
step[23] = half_round_shift(step[23]);
step[24] = half_round_shift(step[24]);
step[25] = half_round_shift(step[25]);
step[26] = half_round_shift(step[26]);
step[27] = half_round_shift(step[27]);
step[28] = half_round_shift(step[28]);
step[29] = half_round_shift(step[29]);
step[30] = half_round_shift(step[30]);
step[31] = half_round_shift(step[31]);
}
// Stage 4
output[0] = step[0] + step[3];
output[1] = step[1] + step[2];

View File

@ -403,6 +403,148 @@ void FDCT32x32_2D(int16_t *input,
step2[26] = _mm_packs_epi32(s2_26_6, s2_26_7);
step2[27] = _mm_packs_epi32(s2_27_6, s2_27_7);
}
#if !FDCT32x32_HIGH_PRECISION
// dump the magnitude by half, hence the intermediate values are within
// the range of 16 bits.
if (1 == pass) {
__m128i s3_00_0 = _mm_cmplt_epi16(step2[ 0], kZero);
__m128i s3_01_0 = _mm_cmplt_epi16(step2[ 1], kZero);
__m128i s3_02_0 = _mm_cmplt_epi16(step2[ 2], kZero);
__m128i s3_03_0 = _mm_cmplt_epi16(step2[ 3], kZero);
__m128i s3_04_0 = _mm_cmplt_epi16(step2[ 4], kZero);
__m128i s3_05_0 = _mm_cmplt_epi16(step2[ 5], kZero);
__m128i s3_06_0 = _mm_cmplt_epi16(step2[ 6], kZero);
__m128i s3_07_0 = _mm_cmplt_epi16(step2[ 7], kZero);
__m128i s2_08_0 = _mm_cmplt_epi16(step2[ 8], kZero);
__m128i s2_09_0 = _mm_cmplt_epi16(step2[ 9], kZero);
__m128i s3_10_0 = _mm_cmplt_epi16(step2[10], kZero);
__m128i s3_11_0 = _mm_cmplt_epi16(step2[11], kZero);
__m128i s3_12_0 = _mm_cmplt_epi16(step2[12], kZero);
__m128i s3_13_0 = _mm_cmplt_epi16(step2[13], kZero);
__m128i s2_14_0 = _mm_cmplt_epi16(step2[14], kZero);
__m128i s2_15_0 = _mm_cmplt_epi16(step2[15], kZero);
__m128i s3_16_0 = _mm_cmplt_epi16(step1[16], kZero);
__m128i s3_17_0 = _mm_cmplt_epi16(step1[17], kZero);
__m128i s3_18_0 = _mm_cmplt_epi16(step1[18], kZero);
__m128i s3_19_0 = _mm_cmplt_epi16(step1[19], kZero);
__m128i s3_20_0 = _mm_cmplt_epi16(step2[20], kZero);
__m128i s3_21_0 = _mm_cmplt_epi16(step2[21], kZero);
__m128i s3_22_0 = _mm_cmplt_epi16(step2[22], kZero);
__m128i s3_23_0 = _mm_cmplt_epi16(step2[23], kZero);
__m128i s3_24_0 = _mm_cmplt_epi16(step2[24], kZero);
__m128i s3_25_0 = _mm_cmplt_epi16(step2[25], kZero);
__m128i s3_26_0 = _mm_cmplt_epi16(step2[26], kZero);
__m128i s3_27_0 = _mm_cmplt_epi16(step2[27], kZero);
__m128i s3_28_0 = _mm_cmplt_epi16(step1[28], kZero);
__m128i s3_29_0 = _mm_cmplt_epi16(step1[29], kZero);
__m128i s3_30_0 = _mm_cmplt_epi16(step1[30], kZero);
__m128i s3_31_0 = _mm_cmplt_epi16(step1[31], kZero);
step2[ 0] = _mm_sub_epi16(step2[ 0], s3_00_0);
step2[ 1] = _mm_sub_epi16(step2[ 1], s3_01_0);
step2[ 2] = _mm_sub_epi16(step2[ 2], s3_02_0);
step2[ 3] = _mm_sub_epi16(step2[ 3], s3_03_0);
step2[ 4] = _mm_sub_epi16(step2[ 4], s3_04_0);
step2[ 5] = _mm_sub_epi16(step2[ 5], s3_05_0);
step2[ 6] = _mm_sub_epi16(step2[ 6], s3_06_0);
step2[ 7] = _mm_sub_epi16(step2[ 7], s3_07_0);
step2[ 8] = _mm_sub_epi16(step2[ 8], s2_08_0);
step2[ 9] = _mm_sub_epi16(step2[ 9], s2_09_0);
step2[10] = _mm_sub_epi16(step2[10], s3_10_0);
step2[11] = _mm_sub_epi16(step2[11], s3_11_0);
step2[12] = _mm_sub_epi16(step2[12], s3_12_0);
step2[13] = _mm_sub_epi16(step2[13], s3_13_0);
step2[14] = _mm_sub_epi16(step2[14], s2_14_0);
step2[15] = _mm_sub_epi16(step2[15], s2_15_0);
step1[16] = _mm_sub_epi16(step1[16], s3_16_0);
step1[17] = _mm_sub_epi16(step1[17], s3_17_0);
step1[18] = _mm_sub_epi16(step1[18], s3_18_0);
step1[19] = _mm_sub_epi16(step1[19], s3_19_0);
step2[20] = _mm_sub_epi16(step2[20], s3_20_0);
step2[21] = _mm_sub_epi16(step2[21], s3_21_0);
step2[22] = _mm_sub_epi16(step2[22], s3_22_0);
step2[23] = _mm_sub_epi16(step2[23], s3_23_0);
step2[24] = _mm_sub_epi16(step2[24], s3_24_0);
step2[25] = _mm_sub_epi16(step2[25], s3_25_0);
step2[26] = _mm_sub_epi16(step2[26], s3_26_0);
step2[27] = _mm_sub_epi16(step2[27], s3_27_0);
step1[28] = _mm_sub_epi16(step1[28], s3_28_0);
step1[29] = _mm_sub_epi16(step1[29], s3_29_0);
step1[30] = _mm_sub_epi16(step1[30], s3_30_0);
step1[31] = _mm_sub_epi16(step1[31], s3_31_0);
step2[ 0] = _mm_add_epi16(step2[ 0], kOne);
step2[ 1] = _mm_add_epi16(step2[ 1], kOne);
step2[ 2] = _mm_add_epi16(step2[ 2], kOne);
step2[ 3] = _mm_add_epi16(step2[ 3], kOne);
step2[ 4] = _mm_add_epi16(step2[ 4], kOne);
step2[ 5] = _mm_add_epi16(step2[ 5], kOne);
step2[ 6] = _mm_add_epi16(step2[ 6], kOne);
step2[ 7] = _mm_add_epi16(step2[ 7], kOne);
step2[ 8] = _mm_add_epi16(step2[ 8], kOne);
step2[ 9] = _mm_add_epi16(step2[ 9], kOne);
step2[10] = _mm_add_epi16(step2[10], kOne);
step2[11] = _mm_add_epi16(step2[11], kOne);
step2[12] = _mm_add_epi16(step2[12], kOne);
step2[13] = _mm_add_epi16(step2[13], kOne);
step2[14] = _mm_add_epi16(step2[14], kOne);
step2[15] = _mm_add_epi16(step2[15], kOne);
step1[16] = _mm_add_epi16(step1[16], kOne);
step1[17] = _mm_add_epi16(step1[17], kOne);
step1[18] = _mm_add_epi16(step1[18], kOne);
step1[19] = _mm_add_epi16(step1[19], kOne);
step2[20] = _mm_add_epi16(step2[20], kOne);
step2[21] = _mm_add_epi16(step2[21], kOne);
step2[22] = _mm_add_epi16(step2[22], kOne);
step2[23] = _mm_add_epi16(step2[23], kOne);
step2[24] = _mm_add_epi16(step2[24], kOne);
step2[25] = _mm_add_epi16(step2[25], kOne);
step2[26] = _mm_add_epi16(step2[26], kOne);
step2[27] = _mm_add_epi16(step2[27], kOne);
step1[28] = _mm_add_epi16(step1[28], kOne);
step1[29] = _mm_add_epi16(step1[29], kOne);
step1[30] = _mm_add_epi16(step1[30], kOne);
step1[31] = _mm_add_epi16(step1[31], kOne);
step2[ 0] = _mm_srai_epi16(step2[ 0], 2);
step2[ 1] = _mm_srai_epi16(step2[ 1], 2);
step2[ 2] = _mm_srai_epi16(step2[ 2], 2);
step2[ 3] = _mm_srai_epi16(step2[ 3], 2);
step2[ 4] = _mm_srai_epi16(step2[ 4], 2);
step2[ 5] = _mm_srai_epi16(step2[ 5], 2);
step2[ 6] = _mm_srai_epi16(step2[ 6], 2);
step2[ 7] = _mm_srai_epi16(step2[ 7], 2);
step2[ 8] = _mm_srai_epi16(step2[ 8], 2);
step2[ 9] = _mm_srai_epi16(step2[ 9], 2);
step2[10] = _mm_srai_epi16(step2[10], 2);
step2[11] = _mm_srai_epi16(step2[11], 2);
step2[12] = _mm_srai_epi16(step2[12], 2);
step2[13] = _mm_srai_epi16(step2[13], 2);
step2[14] = _mm_srai_epi16(step2[14], 2);
step2[15] = _mm_srai_epi16(step2[15], 2);
step1[16] = _mm_srai_epi16(step1[16], 2);
step1[17] = _mm_srai_epi16(step1[17], 2);
step1[18] = _mm_srai_epi16(step1[18], 2);
step1[19] = _mm_srai_epi16(step1[19], 2);
step2[20] = _mm_srai_epi16(step2[20], 2);
step2[21] = _mm_srai_epi16(step2[21], 2);
step2[22] = _mm_srai_epi16(step2[22], 2);
step2[23] = _mm_srai_epi16(step2[23], 2);
step2[24] = _mm_srai_epi16(step2[24], 2);
step2[25] = _mm_srai_epi16(step2[25], 2);
step2[26] = _mm_srai_epi16(step2[26], 2);
step2[27] = _mm_srai_epi16(step2[27], 2);
step1[28] = _mm_srai_epi16(step1[28], 2);
step1[29] = _mm_srai_epi16(step1[29], 2);
step1[30] = _mm_srai_epi16(step1[30], 2);
step1[31] = _mm_srai_epi16(step1[31], 2);
}
#endif
#if FDCT32x32_HIGH_PRECISION
if (pass == 0) {
#endif
// Stage 3
{
step3[0] = _mm_add_epi16(step2[(8 - 1)], step2[0]);
@ -469,144 +611,6 @@ void FDCT32x32_2D(int16_t *input,
step3[31] = _mm_add_epi16(step2[24], step1[31]);
}
#if !FDCT32x32_HIGH_PRECISION
// dump the magnitude by half, hence the intermediate values are within
// the range of 16 bits.
if (1 == pass) {
__m128i s3_00_0 = _mm_cmplt_epi16(step3[ 0], kZero);
__m128i s3_01_0 = _mm_cmplt_epi16(step3[ 1], kZero);
__m128i s3_02_0 = _mm_cmplt_epi16(step3[ 2], kZero);
__m128i s3_03_0 = _mm_cmplt_epi16(step3[ 3], kZero);
__m128i s3_04_0 = _mm_cmplt_epi16(step3[ 4], kZero);
__m128i s3_05_0 = _mm_cmplt_epi16(step3[ 5], kZero);
__m128i s3_06_0 = _mm_cmplt_epi16(step3[ 6], kZero);
__m128i s3_07_0 = _mm_cmplt_epi16(step3[ 7], kZero);
__m128i s2_08_0 = _mm_cmplt_epi16(step2[ 8], kZero);
__m128i s2_09_0 = _mm_cmplt_epi16(step2[ 9], kZero);
__m128i s3_10_0 = _mm_cmplt_epi16(step3[10], kZero);
__m128i s3_11_0 = _mm_cmplt_epi16(step3[11], kZero);
__m128i s3_12_0 = _mm_cmplt_epi16(step3[12], kZero);
__m128i s3_13_0 = _mm_cmplt_epi16(step3[13], kZero);
__m128i s2_14_0 = _mm_cmplt_epi16(step2[14], kZero);
__m128i s2_15_0 = _mm_cmplt_epi16(step2[15], kZero);
__m128i s3_16_0 = _mm_cmplt_epi16(step3[16], kZero);
__m128i s3_17_0 = _mm_cmplt_epi16(step3[17], kZero);
__m128i s3_18_0 = _mm_cmplt_epi16(step3[18], kZero);
__m128i s3_19_0 = _mm_cmplt_epi16(step3[19], kZero);
__m128i s3_20_0 = _mm_cmplt_epi16(step3[20], kZero);
__m128i s3_21_0 = _mm_cmplt_epi16(step3[21], kZero);
__m128i s3_22_0 = _mm_cmplt_epi16(step3[22], kZero);
__m128i s3_23_0 = _mm_cmplt_epi16(step3[23], kZero);
__m128i s3_24_0 = _mm_cmplt_epi16(step3[24], kZero);
__m128i s3_25_0 = _mm_cmplt_epi16(step3[25], kZero);
__m128i s3_26_0 = _mm_cmplt_epi16(step3[26], kZero);
__m128i s3_27_0 = _mm_cmplt_epi16(step3[27], kZero);
__m128i s3_28_0 = _mm_cmplt_epi16(step3[28], kZero);
__m128i s3_29_0 = _mm_cmplt_epi16(step3[29], kZero);
__m128i s3_30_0 = _mm_cmplt_epi16(step3[30], kZero);
__m128i s3_31_0 = _mm_cmplt_epi16(step3[31], kZero);
step3[ 0] = _mm_sub_epi16(step3[ 0], s3_00_0);
step3[ 1] = _mm_sub_epi16(step3[ 1], s3_01_0);
step3[ 2] = _mm_sub_epi16(step3[ 2], s3_02_0);
step3[ 3] = _mm_sub_epi16(step3[ 3], s3_03_0);
step3[ 4] = _mm_sub_epi16(step3[ 4], s3_04_0);
step3[ 5] = _mm_sub_epi16(step3[ 5], s3_05_0);
step3[ 6] = _mm_sub_epi16(step3[ 6], s3_06_0);
step3[ 7] = _mm_sub_epi16(step3[ 7], s3_07_0);
step2[ 8] = _mm_sub_epi16(step2[ 8], s2_08_0);
step2[ 9] = _mm_sub_epi16(step2[ 9], s2_09_0);
step3[10] = _mm_sub_epi16(step3[10], s3_10_0);
step3[11] = _mm_sub_epi16(step3[11], s3_11_0);
step3[12] = _mm_sub_epi16(step3[12], s3_12_0);
step3[13] = _mm_sub_epi16(step3[13], s3_13_0);
step2[14] = _mm_sub_epi16(step2[14], s2_14_0);
step2[15] = _mm_sub_epi16(step2[15], s2_15_0);
step3[16] = _mm_sub_epi16(step3[16], s3_16_0);
step3[17] = _mm_sub_epi16(step3[17], s3_17_0);
step3[18] = _mm_sub_epi16(step3[18], s3_18_0);
step3[19] = _mm_sub_epi16(step3[19], s3_19_0);
step3[20] = _mm_sub_epi16(step3[20], s3_20_0);
step3[21] = _mm_sub_epi16(step3[21], s3_21_0);
step3[22] = _mm_sub_epi16(step3[22], s3_22_0);
step3[23] = _mm_sub_epi16(step3[23], s3_23_0);
step3[24] = _mm_sub_epi16(step3[24], s3_24_0);
step3[25] = _mm_sub_epi16(step3[25], s3_25_0);
step3[26] = _mm_sub_epi16(step3[26], s3_26_0);
step3[27] = _mm_sub_epi16(step3[27], s3_27_0);
step3[28] = _mm_sub_epi16(step3[28], s3_28_0);
step3[29] = _mm_sub_epi16(step3[29], s3_29_0);
step3[30] = _mm_sub_epi16(step3[30], s3_30_0);
step3[31] = _mm_sub_epi16(step3[31], s3_31_0);
step3[ 0] = _mm_add_epi16(step3[ 0], kOne);
step3[ 1] = _mm_add_epi16(step3[ 1], kOne);
step3[ 2] = _mm_add_epi16(step3[ 2], kOne);
step3[ 3] = _mm_add_epi16(step3[ 3], kOne);
step3[ 4] = _mm_add_epi16(step3[ 4], kOne);
step3[ 5] = _mm_add_epi16(step3[ 5], kOne);
step3[ 6] = _mm_add_epi16(step3[ 6], kOne);
step3[ 7] = _mm_add_epi16(step3[ 7], kOne);
step2[ 8] = _mm_add_epi16(step2[ 8], kOne);
step2[ 9] = _mm_add_epi16(step2[ 9], kOne);
step3[10] = _mm_add_epi16(step3[10], kOne);
step3[11] = _mm_add_epi16(step3[11], kOne);
step3[12] = _mm_add_epi16(step3[12], kOne);
step3[13] = _mm_add_epi16(step3[13], kOne);
step2[14] = _mm_add_epi16(step2[14], kOne);
step2[15] = _mm_add_epi16(step2[15], kOne);
step3[16] = _mm_add_epi16(step3[16], kOne);
step3[17] = _mm_add_epi16(step3[17], kOne);
step3[18] = _mm_add_epi16(step3[18], kOne);
step3[19] = _mm_add_epi16(step3[19], kOne);
step3[20] = _mm_add_epi16(step3[20], kOne);
step3[21] = _mm_add_epi16(step3[21], kOne);
step3[22] = _mm_add_epi16(step3[22], kOne);
step3[23] = _mm_add_epi16(step3[23], kOne);
step3[24] = _mm_add_epi16(step3[24], kOne);
step3[25] = _mm_add_epi16(step3[25], kOne);
step3[26] = _mm_add_epi16(step3[26], kOne);
step3[27] = _mm_add_epi16(step3[27], kOne);
step3[28] = _mm_add_epi16(step3[28], kOne);
step3[29] = _mm_add_epi16(step3[29], kOne);
step3[30] = _mm_add_epi16(step3[30], kOne);
step3[31] = _mm_add_epi16(step3[31], kOne);
step3[ 0] = _mm_srai_epi16(step3[ 0], 2);
step3[ 1] = _mm_srai_epi16(step3[ 1], 2);
step3[ 2] = _mm_srai_epi16(step3[ 2], 2);
step3[ 3] = _mm_srai_epi16(step3[ 3], 2);
step3[ 4] = _mm_srai_epi16(step3[ 4], 2);
step3[ 5] = _mm_srai_epi16(step3[ 5], 2);
step3[ 6] = _mm_srai_epi16(step3[ 6], 2);
step3[ 7] = _mm_srai_epi16(step3[ 7], 2);
step2[ 8] = _mm_srai_epi16(step2[ 8], 2);
step2[ 9] = _mm_srai_epi16(step2[ 9], 2);
step3[10] = _mm_srai_epi16(step3[10], 2);
step3[11] = _mm_srai_epi16(step3[11], 2);
step3[12] = _mm_srai_epi16(step3[12], 2);
step3[13] = _mm_srai_epi16(step3[13], 2);
step2[14] = _mm_srai_epi16(step2[14], 2);
step2[15] = _mm_srai_epi16(step2[15], 2);
step3[16] = _mm_srai_epi16(step3[16], 2);
step3[17] = _mm_srai_epi16(step3[17], 2);
step3[18] = _mm_srai_epi16(step3[18], 2);
step3[19] = _mm_srai_epi16(step3[19], 2);
step3[20] = _mm_srai_epi16(step3[20], 2);
step3[21] = _mm_srai_epi16(step3[21], 2);
step3[22] = _mm_srai_epi16(step3[22], 2);
step3[23] = _mm_srai_epi16(step3[23], 2);
step3[24] = _mm_srai_epi16(step3[24], 2);
step3[25] = _mm_srai_epi16(step3[25], 2);
step3[26] = _mm_srai_epi16(step3[26], 2);
step3[27] = _mm_srai_epi16(step3[27], 2);
step3[28] = _mm_srai_epi16(step3[28], 2);
step3[29] = _mm_srai_epi16(step3[29], 2);
step3[30] = _mm_srai_epi16(step3[30], 2);
step3[31] = _mm_srai_epi16(step3[31], 2);
}
#endif
#if FDCT32x32_HIGH_PRECISION
if (pass == 0) {
#endif
// Stage 4
{
step1[ 0] = _mm_add_epi16(step3[ 3], step3[ 0]);
@ -1158,25 +1162,146 @@ void FDCT32x32_2D(int16_t *input,
const __m128i mask16 = _mm_set1_epi32(0x80008000);
const __m128i K32One = _mm_set_epi32(1, 1, 1, 1);
// start using 32-bit operations
// stage 3
{
// expanding to 32-bit length priori to addition operations
lstep2[ 0] = k_cvtlo_epi16(step2[ 0], mask16, kZero);
lstep2[ 1] = k_cvthi_epi16(step2[ 0], mask16, kZero);
lstep2[ 2] = k_cvtlo_epi16(step2[ 1], mask16, kZero);
lstep2[ 3] = k_cvthi_epi16(step2[ 1], mask16, kZero);
lstep2[ 4] = k_cvtlo_epi16(step2[ 2], mask16, kZero);
lstep2[ 5] = k_cvthi_epi16(step2[ 2], mask16, kZero);
lstep2[ 6] = k_cvtlo_epi16(step2[ 3], mask16, kZero);
lstep2[ 7] = k_cvthi_epi16(step2[ 3], mask16, kZero);
lstep2[ 8] = k_cvtlo_epi16(step2[ 4], mask16, kZero);
lstep2[ 9] = k_cvthi_epi16(step2[ 4], mask16, kZero);
lstep2[10] = k_cvtlo_epi16(step2[ 5], mask16, kZero);
lstep2[11] = k_cvthi_epi16(step2[ 5], mask16, kZero);
lstep2[12] = k_cvtlo_epi16(step2[ 6], mask16, kZero);
lstep2[13] = k_cvthi_epi16(step2[ 6], mask16, kZero);
lstep2[14] = k_cvtlo_epi16(step2[ 7], mask16, kZero);
lstep2[15] = k_cvthi_epi16(step2[ 7], mask16, kZero);
lstep3[ 0] = _mm_add_epi32(lstep2[14], lstep2[ 0]);
lstep3[ 1] = _mm_add_epi32(lstep2[15], lstep2[ 1]);
lstep3[ 2] = _mm_add_epi32(lstep2[12], lstep2[ 2]);
lstep3[ 3] = _mm_add_epi32(lstep2[13], lstep2[ 3]);
lstep3[ 4] = _mm_add_epi32(lstep2[10], lstep2[ 4]);
lstep3[ 5] = _mm_add_epi32(lstep2[11], lstep2[ 5]);
lstep3[ 6] = _mm_add_epi32(lstep2[ 8], lstep2[ 6]);
lstep3[ 7] = _mm_add_epi32(lstep2[ 9], lstep2[ 7]);
lstep3[ 8] = _mm_sub_epi32(lstep2[ 6], lstep2[ 8]);
lstep3[ 9] = _mm_sub_epi32(lstep2[ 7], lstep2[ 9]);
lstep3[10] = _mm_sub_epi32(lstep2[ 4], lstep2[10]);
lstep3[11] = _mm_sub_epi32(lstep2[ 5], lstep2[11]);
lstep3[12] = _mm_sub_epi32(lstep2[ 2], lstep2[12]);
lstep3[13] = _mm_sub_epi32(lstep2[ 3], lstep2[13]);
lstep3[14] = _mm_sub_epi32(lstep2[ 0], lstep2[14]);
lstep3[15] = _mm_sub_epi32(lstep2[ 1], lstep2[15]);
}
{
const __m128i s3_10_0 = _mm_unpacklo_epi16(step2[13], step2[10]);
const __m128i s3_10_1 = _mm_unpackhi_epi16(step2[13], step2[10]);
const __m128i s3_11_0 = _mm_unpacklo_epi16(step2[12], step2[11]);
const __m128i s3_11_1 = _mm_unpackhi_epi16(step2[12], step2[11]);
const __m128i s3_10_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_m16);
const __m128i s3_10_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_m16);
const __m128i s3_11_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_m16);
const __m128i s3_11_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_m16);
const __m128i s3_12_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_p16);
const __m128i s3_12_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_p16);
const __m128i s3_13_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_p16);
const __m128i s3_13_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_p16);
// dct_const_round_shift
const __m128i s3_10_4 = _mm_add_epi32(s3_10_2, k__DCT_CONST_ROUNDING);
const __m128i s3_10_5 = _mm_add_epi32(s3_10_3, k__DCT_CONST_ROUNDING);
const __m128i s3_11_4 = _mm_add_epi32(s3_11_2, k__DCT_CONST_ROUNDING);
const __m128i s3_11_5 = _mm_add_epi32(s3_11_3, k__DCT_CONST_ROUNDING);
const __m128i s3_12_4 = _mm_add_epi32(s3_12_2, k__DCT_CONST_ROUNDING);
const __m128i s3_12_5 = _mm_add_epi32(s3_12_3, k__DCT_CONST_ROUNDING);
const __m128i s3_13_4 = _mm_add_epi32(s3_13_2, k__DCT_CONST_ROUNDING);
const __m128i s3_13_5 = _mm_add_epi32(s3_13_3, k__DCT_CONST_ROUNDING);
lstep3[20] = _mm_srai_epi32(s3_10_4, DCT_CONST_BITS);
lstep3[21] = _mm_srai_epi32(s3_10_5, DCT_CONST_BITS);
lstep3[22] = _mm_srai_epi32(s3_11_4, DCT_CONST_BITS);
lstep3[23] = _mm_srai_epi32(s3_11_5, DCT_CONST_BITS);
lstep3[24] = _mm_srai_epi32(s3_12_4, DCT_CONST_BITS);
lstep3[25] = _mm_srai_epi32(s3_12_5, DCT_CONST_BITS);
lstep3[26] = _mm_srai_epi32(s3_13_4, DCT_CONST_BITS);
lstep3[27] = _mm_srai_epi32(s3_13_5, DCT_CONST_BITS);
}
{
lstep2[40] = k_cvtlo_epi16(step2[20], mask16, kZero);
lstep2[41] = k_cvthi_epi16(step2[20], mask16, kZero);
lstep2[42] = k_cvtlo_epi16(step2[21], mask16, kZero);
lstep2[43] = k_cvthi_epi16(step2[21], mask16, kZero);
lstep2[44] = k_cvtlo_epi16(step2[22], mask16, kZero);
lstep2[45] = k_cvthi_epi16(step2[22], mask16, kZero);
lstep2[46] = k_cvtlo_epi16(step2[23], mask16, kZero);
lstep2[47] = k_cvthi_epi16(step2[23], mask16, kZero);
lstep2[48] = k_cvtlo_epi16(step2[24], mask16, kZero);
lstep2[49] = k_cvthi_epi16(step2[24], mask16, kZero);
lstep2[50] = k_cvtlo_epi16(step2[25], mask16, kZero);
lstep2[51] = k_cvthi_epi16(step2[25], mask16, kZero);
lstep2[52] = k_cvtlo_epi16(step2[26], mask16, kZero);
lstep2[53] = k_cvthi_epi16(step2[26], mask16, kZero);
lstep2[54] = k_cvtlo_epi16(step2[27], mask16, kZero);
lstep2[55] = k_cvthi_epi16(step2[27], mask16, kZero);
lstep1[32] = k_cvtlo_epi16(step1[16], mask16, kZero);
lstep1[33] = k_cvthi_epi16(step1[16], mask16, kZero);
lstep1[34] = k_cvtlo_epi16(step1[17], mask16, kZero);
lstep1[35] = k_cvthi_epi16(step1[17], mask16, kZero);
lstep1[36] = k_cvtlo_epi16(step1[18], mask16, kZero);
lstep1[37] = k_cvthi_epi16(step1[18], mask16, kZero);
lstep1[38] = k_cvtlo_epi16(step1[19], mask16, kZero);
lstep1[39] = k_cvthi_epi16(step1[19], mask16, kZero);
lstep1[56] = k_cvtlo_epi16(step1[28], mask16, kZero);
lstep1[57] = k_cvthi_epi16(step1[28], mask16, kZero);
lstep1[58] = k_cvtlo_epi16(step1[29], mask16, kZero);
lstep1[59] = k_cvthi_epi16(step1[29], mask16, kZero);
lstep1[60] = k_cvtlo_epi16(step1[30], mask16, kZero);
lstep1[61] = k_cvthi_epi16(step1[30], mask16, kZero);
lstep1[62] = k_cvtlo_epi16(step1[31], mask16, kZero);
lstep1[63] = k_cvthi_epi16(step1[31], mask16, kZero);
lstep3[32] = _mm_add_epi32(lstep2[46], lstep1[32]);
lstep3[33] = _mm_add_epi32(lstep2[47], lstep1[33]);
lstep3[34] = _mm_add_epi32(lstep2[44], lstep1[34]);
lstep3[35] = _mm_add_epi32(lstep2[45], lstep1[35]);
lstep3[36] = _mm_add_epi32(lstep2[42], lstep1[36]);
lstep3[37] = _mm_add_epi32(lstep2[43], lstep1[37]);
lstep3[38] = _mm_add_epi32(lstep2[40], lstep1[38]);
lstep3[39] = _mm_add_epi32(lstep2[41], lstep1[39]);
lstep3[40] = _mm_sub_epi32(lstep1[38], lstep2[40]);
lstep3[41] = _mm_sub_epi32(lstep1[39], lstep2[41]);
lstep3[42] = _mm_sub_epi32(lstep1[36], lstep2[42]);
lstep3[43] = _mm_sub_epi32(lstep1[37], lstep2[43]);
lstep3[44] = _mm_sub_epi32(lstep1[34], lstep2[44]);
lstep3[45] = _mm_sub_epi32(lstep1[35], lstep2[45]);
lstep3[46] = _mm_sub_epi32(lstep1[32], lstep2[46]);
lstep3[47] = _mm_sub_epi32(lstep1[33], lstep2[47]);
lstep3[48] = _mm_sub_epi32(lstep1[62], lstep2[48]);
lstep3[49] = _mm_sub_epi32(lstep1[63], lstep2[49]);
lstep3[50] = _mm_sub_epi32(lstep1[60], lstep2[50]);
lstep3[51] = _mm_sub_epi32(lstep1[61], lstep2[51]);
lstep3[52] = _mm_sub_epi32(lstep1[58], lstep2[52]);
lstep3[53] = _mm_sub_epi32(lstep1[59], lstep2[53]);
lstep3[54] = _mm_sub_epi32(lstep1[56], lstep2[54]);
lstep3[55] = _mm_sub_epi32(lstep1[57], lstep2[55]);
lstep3[56] = _mm_add_epi32(lstep2[54], lstep1[56]);
lstep3[57] = _mm_add_epi32(lstep2[55], lstep1[57]);
lstep3[58] = _mm_add_epi32(lstep2[52], lstep1[58]);
lstep3[59] = _mm_add_epi32(lstep2[53], lstep1[59]);
lstep3[60] = _mm_add_epi32(lstep2[50], lstep1[60]);
lstep3[61] = _mm_add_epi32(lstep2[51], lstep1[61]);
lstep3[62] = _mm_add_epi32(lstep2[48], lstep1[62]);
lstep3[63] = _mm_add_epi32(lstep2[49], lstep1[63]);
}
// stage 4
{
// expanding to 32-bit length priori to addition operations
lstep3[ 0] = k_cvtlo_epi16(step3[ 0], mask16, kZero);
lstep3[ 1] = k_cvthi_epi16(step3[ 0], mask16, kZero);
lstep3[ 2] = k_cvtlo_epi16(step3[ 1], mask16, kZero);
lstep3[ 3] = k_cvthi_epi16(step3[ 1], mask16, kZero);
lstep3[ 4] = k_cvtlo_epi16(step3[ 2], mask16, kZero);
lstep3[ 5] = k_cvthi_epi16(step3[ 2], mask16, kZero);
lstep3[ 6] = k_cvtlo_epi16(step3[ 3], mask16, kZero);
lstep3[ 7] = k_cvthi_epi16(step3[ 3], mask16, kZero);
lstep3[20] = k_cvtlo_epi16(step3[10], mask16, kZero);
lstep3[21] = k_cvthi_epi16(step3[10], mask16, kZero);
lstep3[22] = k_cvtlo_epi16(step3[11], mask16, kZero);
lstep3[23] = k_cvthi_epi16(step3[11], mask16, kZero);
lstep3[24] = k_cvtlo_epi16(step3[12], mask16, kZero);
lstep3[25] = k_cvthi_epi16(step3[12], mask16, kZero);
lstep3[26] = k_cvtlo_epi16(step3[13], mask16, kZero);
lstep3[27] = k_cvthi_epi16(step3[13], mask16, kZero);
lstep2[16] = k_cvtlo_epi16(step2[ 8], mask16, kZero);
lstep2[17] = k_cvthi_epi16(step2[ 8], mask16, kZero);
lstep2[18] = k_cvtlo_epi16(step2[ 9], mask16, kZero);
@ -1212,88 +1337,150 @@ void FDCT32x32_2D(int16_t *input,
lstep1[31] = _mm_add_epi32(lstep3[25], lstep2[31]);
}
{
const __m128i s1_05_0 = _mm_unpacklo_epi16(step3[6], step3[5]);
const __m128i s1_05_1 = _mm_unpackhi_epi16(step3[6], step3[5]);
const __m128i s1_05_2 = _mm_madd_epi16(s1_05_0, k__cospi_p16_m16);
const __m128i s1_05_3 = _mm_madd_epi16(s1_05_1, k__cospi_p16_m16);
const __m128i s1_06_2 = _mm_madd_epi16(s1_05_0, k__cospi_p16_p16);
const __m128i s1_06_3 = _mm_madd_epi16(s1_05_1, k__cospi_p16_p16);
// dct_const_round_shift
const __m128i s1_05_4 = _mm_add_epi32(s1_05_2, k__DCT_CONST_ROUNDING);
const __m128i s1_05_5 = _mm_add_epi32(s1_05_3, k__DCT_CONST_ROUNDING);
const __m128i s1_06_4 = _mm_add_epi32(s1_06_2, k__DCT_CONST_ROUNDING);
const __m128i s1_06_5 = _mm_add_epi32(s1_06_3, k__DCT_CONST_ROUNDING);
lstep1[10] = _mm_srai_epi32(s1_05_4, DCT_CONST_BITS);
lstep1[11] = _mm_srai_epi32(s1_05_5, DCT_CONST_BITS);
lstep1[12] = _mm_srai_epi32(s1_06_4, DCT_CONST_BITS);
lstep1[13] = _mm_srai_epi32(s1_06_5, DCT_CONST_BITS);
// to be continued...
//
const __m128i k32_p16_p16 = pair_set_epi32(cospi_16_64, cospi_16_64);
const __m128i k32_p16_m16 = pair_set_epi32(cospi_16_64, -cospi_16_64);
u[0] = _mm_unpacklo_epi32(lstep3[12], lstep3[10]);
u[1] = _mm_unpackhi_epi32(lstep3[12], lstep3[10]);
u[2] = _mm_unpacklo_epi32(lstep3[13], lstep3[11]);
u[3] = _mm_unpackhi_epi32(lstep3[13], lstep3[11]);
// TODO(jingning): manually inline k_madd_epi32_ to further hide
// instruction latency.
v[ 0] = k_madd_epi32(u[0], k32_p16_m16);
v[ 1] = k_madd_epi32(u[1], k32_p16_m16);
v[ 2] = k_madd_epi32(u[2], k32_p16_m16);
v[ 3] = k_madd_epi32(u[3], k32_p16_m16);
v[ 4] = k_madd_epi32(u[0], k32_p16_p16);
v[ 5] = k_madd_epi32(u[1], k32_p16_p16);
v[ 6] = k_madd_epi32(u[2], k32_p16_p16);
v[ 7] = k_madd_epi32(u[3], k32_p16_p16);
u[0] = k_packs_epi64(v[0], v[1]);
u[1] = k_packs_epi64(v[2], v[3]);
u[2] = k_packs_epi64(v[4], v[5]);
u[3] = k_packs_epi64(v[6], v[7]);
v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
lstep1[10] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
lstep1[11] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
lstep1[12] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
lstep1[13] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
}
{
const __m128i s1_18_0 = _mm_unpacklo_epi16(step3[18], step3[29]);
const __m128i s1_18_1 = _mm_unpackhi_epi16(step3[18], step3[29]);
const __m128i s1_19_0 = _mm_unpacklo_epi16(step3[19], step3[28]);
const __m128i s1_19_1 = _mm_unpackhi_epi16(step3[19], step3[28]);
const __m128i s1_20_0 = _mm_unpacklo_epi16(step3[20], step3[27]);
const __m128i s1_20_1 = _mm_unpackhi_epi16(step3[20], step3[27]);
const __m128i s1_21_0 = _mm_unpacklo_epi16(step3[21], step3[26]);
const __m128i s1_21_1 = _mm_unpackhi_epi16(step3[21], step3[26]);
const __m128i s1_18_2 = _mm_madd_epi16(s1_18_0, k__cospi_m08_p24);
const __m128i s1_18_3 = _mm_madd_epi16(s1_18_1, k__cospi_m08_p24);
const __m128i s1_19_2 = _mm_madd_epi16(s1_19_0, k__cospi_m08_p24);
const __m128i s1_19_3 = _mm_madd_epi16(s1_19_1, k__cospi_m08_p24);
const __m128i s1_20_2 = _mm_madd_epi16(s1_20_0, k__cospi_m24_m08);
const __m128i s1_20_3 = _mm_madd_epi16(s1_20_1, k__cospi_m24_m08);
const __m128i s1_21_2 = _mm_madd_epi16(s1_21_0, k__cospi_m24_m08);
const __m128i s1_21_3 = _mm_madd_epi16(s1_21_1, k__cospi_m24_m08);
const __m128i s1_26_2 = _mm_madd_epi16(s1_21_0, k__cospi_m08_p24);
const __m128i s1_26_3 = _mm_madd_epi16(s1_21_1, k__cospi_m08_p24);
const __m128i s1_27_2 = _mm_madd_epi16(s1_20_0, k__cospi_m08_p24);
const __m128i s1_27_3 = _mm_madd_epi16(s1_20_1, k__cospi_m08_p24);
const __m128i s1_28_2 = _mm_madd_epi16(s1_19_0, k__cospi_p24_p08);
const __m128i s1_28_3 = _mm_madd_epi16(s1_19_1, k__cospi_p24_p08);
const __m128i s1_29_2 = _mm_madd_epi16(s1_18_0, k__cospi_p24_p08);
const __m128i s1_29_3 = _mm_madd_epi16(s1_18_1, k__cospi_p24_p08);
// dct_const_round_shift
const __m128i s1_18_4 = _mm_add_epi32(s1_18_2, k__DCT_CONST_ROUNDING);
const __m128i s1_18_5 = _mm_add_epi32(s1_18_3, k__DCT_CONST_ROUNDING);
const __m128i s1_19_4 = _mm_add_epi32(s1_19_2, k__DCT_CONST_ROUNDING);
const __m128i s1_19_5 = _mm_add_epi32(s1_19_3, k__DCT_CONST_ROUNDING);
const __m128i s1_20_4 = _mm_add_epi32(s1_20_2, k__DCT_CONST_ROUNDING);
const __m128i s1_20_5 = _mm_add_epi32(s1_20_3, k__DCT_CONST_ROUNDING);
const __m128i s1_21_4 = _mm_add_epi32(s1_21_2, k__DCT_CONST_ROUNDING);
const __m128i s1_21_5 = _mm_add_epi32(s1_21_3, k__DCT_CONST_ROUNDING);
const __m128i s1_26_4 = _mm_add_epi32(s1_26_2, k__DCT_CONST_ROUNDING);
const __m128i s1_26_5 = _mm_add_epi32(s1_26_3, k__DCT_CONST_ROUNDING);
const __m128i s1_27_4 = _mm_add_epi32(s1_27_2, k__DCT_CONST_ROUNDING);
const __m128i s1_27_5 = _mm_add_epi32(s1_27_3, k__DCT_CONST_ROUNDING);
const __m128i s1_28_4 = _mm_add_epi32(s1_28_2, k__DCT_CONST_ROUNDING);
const __m128i s1_28_5 = _mm_add_epi32(s1_28_3, k__DCT_CONST_ROUNDING);
const __m128i s1_29_4 = _mm_add_epi32(s1_29_2, k__DCT_CONST_ROUNDING);
const __m128i s1_29_5 = _mm_add_epi32(s1_29_3, k__DCT_CONST_ROUNDING);
lstep1[36] = _mm_srai_epi32(s1_18_4, DCT_CONST_BITS);
lstep1[37] = _mm_srai_epi32(s1_18_5, DCT_CONST_BITS);
lstep1[38] = _mm_srai_epi32(s1_19_4, DCT_CONST_BITS);
lstep1[39] = _mm_srai_epi32(s1_19_5, DCT_CONST_BITS);
lstep1[40] = _mm_srai_epi32(s1_20_4, DCT_CONST_BITS);
lstep1[41] = _mm_srai_epi32(s1_20_5, DCT_CONST_BITS);
lstep1[42] = _mm_srai_epi32(s1_21_4, DCT_CONST_BITS);
lstep1[43] = _mm_srai_epi32(s1_21_5, DCT_CONST_BITS);
lstep1[52] = _mm_srai_epi32(s1_26_4, DCT_CONST_BITS);
lstep1[53] = _mm_srai_epi32(s1_26_5, DCT_CONST_BITS);
lstep1[54] = _mm_srai_epi32(s1_27_4, DCT_CONST_BITS);
lstep1[55] = _mm_srai_epi32(s1_27_5, DCT_CONST_BITS);
lstep1[56] = _mm_srai_epi32(s1_28_4, DCT_CONST_BITS);
lstep1[57] = _mm_srai_epi32(s1_28_5, DCT_CONST_BITS);
lstep1[58] = _mm_srai_epi32(s1_29_4, DCT_CONST_BITS);
lstep1[59] = _mm_srai_epi32(s1_29_5, DCT_CONST_BITS);
const __m128i k32_m08_p24 = pair_set_epi32(-cospi_8_64, cospi_24_64);
const __m128i k32_m24_m08 = pair_set_epi32(-cospi_24_64, -cospi_8_64);
const __m128i k32_p24_p08 = pair_set_epi32(cospi_24_64, cospi_8_64);
u[ 0] = _mm_unpacklo_epi32(lstep3[36], lstep3[58]);
u[ 1] = _mm_unpackhi_epi32(lstep3[36], lstep3[58]);
u[ 2] = _mm_unpacklo_epi32(lstep3[37], lstep3[59]);
u[ 3] = _mm_unpackhi_epi32(lstep3[37], lstep3[59]);
u[ 4] = _mm_unpacklo_epi32(lstep3[38], lstep3[56]);
u[ 5] = _mm_unpackhi_epi32(lstep3[38], lstep3[56]);
u[ 6] = _mm_unpacklo_epi32(lstep3[39], lstep3[57]);
u[ 7] = _mm_unpackhi_epi32(lstep3[39], lstep3[57]);
u[ 8] = _mm_unpacklo_epi32(lstep3[40], lstep3[54]);
u[ 9] = _mm_unpackhi_epi32(lstep3[40], lstep3[54]);
u[10] = _mm_unpacklo_epi32(lstep3[41], lstep3[55]);
u[11] = _mm_unpackhi_epi32(lstep3[41], lstep3[55]);
u[12] = _mm_unpacklo_epi32(lstep3[42], lstep3[52]);
u[13] = _mm_unpackhi_epi32(lstep3[42], lstep3[52]);
u[14] = _mm_unpacklo_epi32(lstep3[43], lstep3[53]);
u[15] = _mm_unpackhi_epi32(lstep3[43], lstep3[53]);
v[ 0] = k_madd_epi32(u[ 0], k32_m08_p24);
v[ 1] = k_madd_epi32(u[ 1], k32_m08_p24);
v[ 2] = k_madd_epi32(u[ 2], k32_m08_p24);
v[ 3] = k_madd_epi32(u[ 3], k32_m08_p24);
v[ 4] = k_madd_epi32(u[ 4], k32_m08_p24);
v[ 5] = k_madd_epi32(u[ 5], k32_m08_p24);
v[ 6] = k_madd_epi32(u[ 6], k32_m08_p24);
v[ 7] = k_madd_epi32(u[ 7], k32_m08_p24);
v[ 8] = k_madd_epi32(u[ 8], k32_m24_m08);
v[ 9] = k_madd_epi32(u[ 9], k32_m24_m08);
v[10] = k_madd_epi32(u[10], k32_m24_m08);
v[11] = k_madd_epi32(u[11], k32_m24_m08);
v[12] = k_madd_epi32(u[12], k32_m24_m08);
v[13] = k_madd_epi32(u[13], k32_m24_m08);
v[14] = k_madd_epi32(u[14], k32_m24_m08);
v[15] = k_madd_epi32(u[15], k32_m24_m08);
v[16] = k_madd_epi32(u[12], k32_m08_p24);
v[17] = k_madd_epi32(u[13], k32_m08_p24);
v[18] = k_madd_epi32(u[14], k32_m08_p24);
v[19] = k_madd_epi32(u[15], k32_m08_p24);
v[20] = k_madd_epi32(u[ 8], k32_m08_p24);
v[21] = k_madd_epi32(u[ 9], k32_m08_p24);
v[22] = k_madd_epi32(u[10], k32_m08_p24);
v[23] = k_madd_epi32(u[11], k32_m08_p24);
v[24] = k_madd_epi32(u[ 4], k32_p24_p08);
v[25] = k_madd_epi32(u[ 5], k32_p24_p08);
v[26] = k_madd_epi32(u[ 6], k32_p24_p08);
v[27] = k_madd_epi32(u[ 7], k32_p24_p08);
v[28] = k_madd_epi32(u[ 0], k32_p24_p08);
v[29] = k_madd_epi32(u[ 1], k32_p24_p08);
v[30] = k_madd_epi32(u[ 2], k32_p24_p08);
v[31] = k_madd_epi32(u[ 3], k32_p24_p08);
u[ 0] = k_packs_epi64(v[ 0], v[ 1]);
u[ 1] = k_packs_epi64(v[ 2], v[ 3]);
u[ 2] = k_packs_epi64(v[ 4], v[ 5]);
u[ 3] = k_packs_epi64(v[ 6], v[ 7]);
u[ 4] = k_packs_epi64(v[ 8], v[ 9]);
u[ 5] = k_packs_epi64(v[10], v[11]);
u[ 6] = k_packs_epi64(v[12], v[13]);
u[ 7] = k_packs_epi64(v[14], v[15]);
u[ 8] = k_packs_epi64(v[16], v[17]);
u[ 9] = k_packs_epi64(v[18], v[19]);
u[10] = k_packs_epi64(v[20], v[21]);
u[11] = k_packs_epi64(v[22], v[23]);
u[12] = k_packs_epi64(v[24], v[25]);
u[13] = k_packs_epi64(v[26], v[27]);
u[14] = k_packs_epi64(v[28], v[29]);
u[15] = k_packs_epi64(v[30], v[31]);
v[ 0] = _mm_add_epi32(u[ 0], k__DCT_CONST_ROUNDING);
v[ 1] = _mm_add_epi32(u[ 1], k__DCT_CONST_ROUNDING);
v[ 2] = _mm_add_epi32(u[ 2], k__DCT_CONST_ROUNDING);
v[ 3] = _mm_add_epi32(u[ 3], k__DCT_CONST_ROUNDING);
v[ 4] = _mm_add_epi32(u[ 4], k__DCT_CONST_ROUNDING);
v[ 5] = _mm_add_epi32(u[ 5], k__DCT_CONST_ROUNDING);
v[ 6] = _mm_add_epi32(u[ 6], k__DCT_CONST_ROUNDING);
v[ 7] = _mm_add_epi32(u[ 7], k__DCT_CONST_ROUNDING);
v[ 8] = _mm_add_epi32(u[ 8], k__DCT_CONST_ROUNDING);
v[ 9] = _mm_add_epi32(u[ 9], k__DCT_CONST_ROUNDING);
v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
lstep1[36] = _mm_srai_epi32(v[ 0], DCT_CONST_BITS);
lstep1[37] = _mm_srai_epi32(v[ 1], DCT_CONST_BITS);
lstep1[38] = _mm_srai_epi32(v[ 2], DCT_CONST_BITS);
lstep1[39] = _mm_srai_epi32(v[ 3], DCT_CONST_BITS);
lstep1[40] = _mm_srai_epi32(v[ 4], DCT_CONST_BITS);
lstep1[41] = _mm_srai_epi32(v[ 5], DCT_CONST_BITS);
lstep1[42] = _mm_srai_epi32(v[ 6], DCT_CONST_BITS);
lstep1[43] = _mm_srai_epi32(v[ 7], DCT_CONST_BITS);
lstep1[52] = _mm_srai_epi32(v[ 8], DCT_CONST_BITS);
lstep1[53] = _mm_srai_epi32(v[ 9], DCT_CONST_BITS);
lstep1[54] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
lstep1[55] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
lstep1[56] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
lstep1[57] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
lstep1[58] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
lstep1[59] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
}
// stage 5
{
lstep3[ 8] = k_cvtlo_epi16(step3[4], mask16, kZero);
lstep3[ 9] = k_cvthi_epi16(step3[4], mask16, kZero);
lstep3[14] = k_cvtlo_epi16(step3[7], mask16, kZero);
lstep3[15] = k_cvthi_epi16(step3[7], mask16, kZero);
lstep2[ 8] = _mm_add_epi32(lstep1[10], lstep3[ 8]);
lstep2[ 9] = _mm_add_epi32(lstep1[11], lstep3[ 9]);
lstep2[10] = _mm_sub_epi32(lstep3[ 8], lstep1[10]);
  /* NOTE: diff hunk header artifact — the original view elided lines of
   * FDCT32x32_2D here (remaining lstep2 add/sub statements of this stage).
   * The omitted code is not reproducible from this excerpt. */
lstep2[29] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
}
{
lstep3[32] = k_cvtlo_epi16(step3[16], mask16, kZero);
lstep3[33] = k_cvthi_epi16(step3[16], mask16, kZero);
lstep3[34] = k_cvtlo_epi16(step3[17], mask16, kZero);
lstep3[35] = k_cvthi_epi16(step3[17], mask16, kZero);
lstep3[44] = k_cvtlo_epi16(step3[22], mask16, kZero);
lstep3[45] = k_cvthi_epi16(step3[22], mask16, kZero);
lstep3[46] = k_cvtlo_epi16(step3[23], mask16, kZero);
lstep3[47] = k_cvthi_epi16(step3[23], mask16, kZero);
lstep3[48] = k_cvtlo_epi16(step3[24], mask16, kZero);
lstep3[49] = k_cvthi_epi16(step3[24], mask16, kZero);
lstep3[50] = k_cvtlo_epi16(step3[25], mask16, kZero);
lstep3[51] = k_cvthi_epi16(step3[25], mask16, kZero);
lstep3[60] = k_cvtlo_epi16(step3[30], mask16, kZero);
lstep3[61] = k_cvthi_epi16(step3[30], mask16, kZero);
lstep3[62] = k_cvtlo_epi16(step3[31], mask16, kZero);
lstep3[63] = k_cvthi_epi16(step3[31], mask16, kZero);
lstep2[32] = _mm_add_epi32(lstep1[38], lstep3[32]);
lstep2[33] = _mm_add_epi32(lstep1[39], lstep3[33]);
lstep2[34] = _mm_add_epi32(lstep1[36], lstep3[34]);