From 6615706af28290ff2ae8781658b00ccc2b73b026 Mon Sep 17 00:00:00 2001
From: Deb Mukherjee
Date: Wed, 3 Dec 2014 15:18:44 -0800
Subject: [PATCH] sse2 visual studio build fix

Change-Id: Id8c8c3be882bcd92afea3ccec6ebdf3f208d28ef
---
 vp9/encoder/x86/vp9_dct32x32_sse2.c | 245 +++++++++---------
 vp9/encoder/x86/vp9_dct_impl_sse2.c | 203 +++++++--------
 vp9/encoder/x86/vp9_dct_sse2.c      |  28 +--
 vp9/encoder/x86/vp9_dct_sse2.h      | 369 +++++++++++++++++-----------
 4 files changed, 483 insertions(+), 362 deletions(-)

diff --git a/vp9/encoder/x86/vp9_dct32x32_sse2.c b/vp9/encoder/x86/vp9_dct32x32_sse2.c
index 7ec126e4b..099993aa6 100644
--- a/vp9/encoder/x86/vp9_dct32x32_sse2.c
+++ b/vp9/encoder/x86/vp9_dct32x32_sse2.c
@@ -269,8 +269,9 @@ void FDCT32x32_2D(const int16_t *input,
       step1[30] = SUB_EPI16(in01, in30);
       step1[31] = SUB_EPI16(in00, in31);
 #if DCT_HIGH_BIT_DEPTH
-      overflow = check_epi16_overflow_x8(step1[0], step1[1], step1[2],
-          step1[3], step1[28], step1[29], step1[30], step1[31]);
+      overflow = check_epi16_overflow_x8(&step1[0], &step1[1], &step1[2],
+                                         &step1[3], &step1[28], &step1[29],
+                                         &step1[30], &step1[31]);
       if (overflow) {
         HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
         return;
@@ -295,9 +296,9 @@ void FDCT32x32_2D(const int16_t *input,
       step1[26] = SUB_EPI16(in05, in26);
       step1[27] = SUB_EPI16(in04, in27);
 #if DCT_HIGH_BIT_DEPTH
-      overflow = check_epi16_overflow_x8(step1[4], step1[5], step1[6],
-                                         step1[7], step1[24], step1[25],
-                                         step1[26], step1[27]);
+      overflow = check_epi16_overflow_x8(&step1[4], &step1[5], &step1[6],
+                                         &step1[7], &step1[24], &step1[25],
+                                         &step1[26], &step1[27]);
       if (overflow) {
         HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
         return;
@@ -322,9 +323,9 @@ void FDCT32x32_2D(const int16_t *input,
       step1[22] = SUB_EPI16(in09, in22);
       step1[23] = SUB_EPI16(in08, in23);
 #if DCT_HIGH_BIT_DEPTH
-      overflow = check_epi16_overflow_x8(step1[8], step1[9], step1[10],
-                                         step1[11], step1[20], step1[21],
-                                         step1[22], step1[23]);
+      overflow = check_epi16_overflow_x8(&step1[8], &step1[9], &step1[10],
+                                         &step1[11], &step1[20], &step1[21],
+                                         &step1[22], &step1[23]);
       if (overflow) {
         HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
         return;
@@ -349,9 +350,9 @@ void FDCT32x32_2D(const int16_t *input,
       step1[18] = SUB_EPI16(in13, in18);
       step1[19] = SUB_EPI16(in12, in19);
 #if DCT_HIGH_BIT_DEPTH
-      overflow = check_epi16_overflow_x8(step1[12], step1[13], step1[14],
-                                         step1[15], step1[16], step1[17],
-                                         step1[18], step1[19]);
+      overflow = check_epi16_overflow_x8(&step1[12], &step1[13], &step1[14],
+                                         &step1[15], &step1[16], &step1[17],
+                                         &step1[18], &step1[19]);
       if (overflow) {
         HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
         return;
@@ -379,10 +380,10 @@ void FDCT32x32_2D(const int16_t *input,
       step2[15] = SUB_EPI16(step1[0], step1[15]);
 #if DCT_HIGH_BIT_DEPTH
       overflow = check_epi16_overflow_x16(
-          step2[0], step2[1], step2[2], step2[3],
-          step2[4], step2[5], step2[6], step2[7],
-          step2[8], step2[9], step2[10], step2[11],
-          step2[12], step2[13], step2[14], step2[15]);
+          &step2[0], &step2[1], &step2[2], &step2[3],
+          &step2[4], &step2[5], &step2[6], &step2[7],
+          &step2[8], &step2[9], &step2[10], &step2[11],
+          &step2[12], &step2[13], &step2[14], &step2[15]);
       if (overflow) {
         if (pass == 0)
           HIGH_FDCT32x32_2D_C(input, output_org, stride);
@@ -460,9 +461,9 @@ void FDCT32x32_2D(const int16_t *input,
       step2[26] = _mm_packs_epi32(s2_26_6, s2_26_7);
       step2[27] = _mm_packs_epi32(s2_27_6, s2_27_7);
 #if DCT_HIGH_BIT_DEPTH
-      overflow = check_epi16_overflow_x8(step2[20], step2[21], step2[22],
-                                         step2[23], step2[24], step2[25],
-                                         step2[26], step2[27]);
+      overflow = check_epi16_overflow_x8(&step2[20], &step2[21], &step2[22],
+                                         &step2[23], &step2[24], &step2[25],
+                                         &step2[26], &step2[27]);
       if (overflow) {
         if (pass == 0)
           HIGH_FDCT32x32_2D_C(input, output_org, stride);
@@ -544,14 +545,14 @@ void FDCT32x32_2D(const int16_t *input,
       step1[31] = SUB_EPI16(step1[31], s3_31_0);
 #if DCT_HIGH_BIT_DEPTH
       overflow = check_epi16_overflow_x32(
-          step2[0], step2[1], step2[2], step2[3],
-          step2[4], step2[5], step2[6], step2[7],
-          step2[8], step2[9], step2[10], step2[11],
-          step2[12], step2[13], step2[14], step2[15],
-          step1[16], step1[17], step1[18], step1[19],
-          step2[20], step2[21], step2[22], step2[23],
-          step2[24], step2[25], step2[26], step2[27],
-          step1[28], step1[29], step1[30], step1[31]);
+          &step2[0], &step2[1], &step2[2], &step2[3],
+          &step2[4], &step2[5], &step2[6], &step2[7],
+          &step2[8], &step2[9], &step2[10], &step2[11],
+          &step2[12], &step2[13], &step2[14], &step2[15],
+          &step1[16], &step1[17], &step1[18], &step1[19],
+          &step2[20], &step2[21], &step2[22], &step2[23],
+          &step2[24], &step2[25], &step2[26], &step2[27],
+          &step1[28], &step1[29], &step1[30], &step1[31]);
       if (overflow) {
         HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
         return;
@@ -639,9 +640,9 @@ void FDCT32x32_2D(const int16_t *input,
       step3[6] = SUB_EPI16(step2[(8 - 7)], step2[6]);
       step3[7] = SUB_EPI16(step2[(8 - 8)], step2[7]);
 #if DCT_HIGH_BIT_DEPTH
-      overflow = check_epi16_overflow_x8(step3[0], step3[1], step3[2],
-                                         step3[3], step3[4], step3[5],
-                                         step3[6], step3[7]);
+      overflow = check_epi16_overflow_x8(&step3[0], &step3[1], &step3[2],
+                                         &step3[3], &step3[4], &step3[5],
+                                         &step3[6], &step3[7]);
       if (overflow) {
         if (pass == 0)
           HIGH_FDCT32x32_2D_C(input, output_org, stride);
@@ -687,8 +688,8 @@ void FDCT32x32_2D(const int16_t *input,
       step3[12] = _mm_packs_epi32(s3_12_6, s3_12_7);
       step3[13] = _mm_packs_epi32(s3_13_6, s3_13_7);
 #if DCT_HIGH_BIT_DEPTH
-      overflow = check_epi16_overflow_x4(step3[10], step3[11],
-                                         step3[12], step3[13]);
+      overflow = check_epi16_overflow_x4(&step3[10], &step3[11],
+                                         &step3[12], &step3[13]);
       if (overflow) {
         if (pass == 0)
           HIGH_FDCT32x32_2D_C(input, output_org, stride);
@@ -717,10 +718,10 @@ void FDCT32x32_2D(const int16_t *input,
       step3[31] = ADD_EPI16(step2[24], step1[31]);
 #if DCT_HIGH_BIT_DEPTH
       overflow = check_epi16_overflow_x16(
-          step3[16], step3[17], step3[18], step3[19],
-          step3[20], step3[21], step3[22], step3[23],
-          step3[24], step3[25], step3[26], step3[27],
-          step3[28], step3[29], step3[30], step3[31]);
+          &step3[16], &step3[17], &step3[18], &step3[19],
+          &step3[20], &step3[21], &step3[22], &step3[23],
+          &step3[24], &step3[25], &step3[26], &step3[27],
+          &step3[28], &step3[29], &step3[30], &step3[31]);
       if (overflow) {
         if (pass == 0)
           HIGH_FDCT32x32_2D_C(input, output_org, stride);
@@ -747,10 +748,10 @@ void FDCT32x32_2D(const int16_t *input,
       step1[15] = ADD_EPI16(step3[12], step2[15]);
 #if DCT_HIGH_BIT_DEPTH
       overflow = check_epi16_overflow_x16(
-          step1[0], step1[1], step1[2], step1[3],
-          step1[4], step1[5], step1[6], step1[7],
-          step1[8], step1[9], step1[10], step1[11],
-          step1[12], step1[13], step1[14], step1[15]);
+          &step1[0], &step1[1], &step1[2], &step1[3],
+          &step1[4], &step1[5], &step1[6], &step1[7],
+          &step1[8], &step1[9], &step1[10], &step1[11],
+          &step1[12], &step1[13], &step1[14], &step1[15]);
       if (overflow) {
         if (pass == 0)
           HIGH_FDCT32x32_2D_C(input, output_org, stride);
@@ -780,7 +781,7 @@ void FDCT32x32_2D(const int16_t *input,
       step1[5] = _mm_packs_epi32(s1_05_6, s1_05_7);
       step1[6] = _mm_packs_epi32(s1_06_6, s1_06_7);
 #if DCT_HIGH_BIT_DEPTH
-      overflow = check_epi16_overflow_x2(step1[5], step1[6]);
+      overflow = check_epi16_overflow_x2(&step1[5], &step1[6]);
       if (overflow) {
         if (pass == 0)
           HIGH_FDCT32x32_2D_C(input, output_org, stride);
@@ -858,9 +859,9 @@ void FDCT32x32_2D(const int16_t *input,
       step1[28] = _mm_packs_epi32(s1_28_6, s1_28_7);
       step1[29] = _mm_packs_epi32(s1_29_6, s1_29_7);
 #if DCT_HIGH_BIT_DEPTH
-      overflow = check_epi16_overflow_x8(step1[18], step1[19], step1[20],
-                                         step1[21], step1[26], step1[27],
-                                         step1[28], step1[29]);
+      overflow = check_epi16_overflow_x8(&step1[18], &step1[19], &step1[20],
+                                         &step1[21], &step1[26], &step1[27],
+                                         &step1[28], &step1[29]);
       if (overflow) {
         if (pass == 0)
           HIGH_FDCT32x32_2D_C(input, output_org, stride);
@@ -877,8 +878,8 @@ void FDCT32x32_2D(const int16_t *input,
       step2[6] = SUB_EPI16(step3[7], step1[6]);
       step2[7] = ADD_EPI16(step1[6], step3[7]);
 #if DCT_HIGH_BIT_DEPTH
-      overflow = check_epi16_overflow_x4(step2[4], step2[5],
-                                         step2[6], step2[7]);
+      overflow = check_epi16_overflow_x4(&step2[4], &step2[5],
+                                         &step2[6], &step2[7]);
       if (overflow) {
         if (pass == 0)
           HIGH_FDCT32x32_2D_C(input, output_org, stride);
@@ -924,7 +925,8 @@ void FDCT32x32_2D(const int16_t *input,
       out[ 8] = _mm_packs_epi32(out_08_6, out_08_7);
       out[24] = _mm_packs_epi32(out_24_6, out_24_7);
 #if DCT_HIGH_BIT_DEPTH
-      overflow = check_epi16_overflow_x4(out[0], out[16], out[8], out[24]);
+      overflow = check_epi16_overflow_x4(&out[0], &out[16],
+                                         &out[8], &out[24]);
       if (overflow) {
         if (pass == 0)
           HIGH_FDCT32x32_2D_C(input, output_org, stride);
@@ -970,8 +972,8 @@ void FDCT32x32_2D(const int16_t *input,
       step2[13] = _mm_packs_epi32(s2_13_6, s2_13_7);
       step2[14] = _mm_packs_epi32(s2_14_6, s2_14_7);
 #if DCT_HIGH_BIT_DEPTH
-      overflow = check_epi16_overflow_x4(step2[9], step2[10],
-                                         step2[13], step2[14]);
+      overflow = check_epi16_overflow_x4(&step2[9], &step2[10],
+                                         &step2[13], &step2[14]);
       if (overflow) {
         if (pass == 0)
           HIGH_FDCT32x32_2D_C(input, output_org, stride);
@@ -1000,10 +1002,10 @@ void FDCT32x32_2D(const int16_t *input,
       step2[31] = ADD_EPI16(step1[28], step3[31]);
 #if DCT_HIGH_BIT_DEPTH
       overflow = check_epi16_overflow_x16(
-          step2[16], step2[17], step2[18], step2[19],
-          step2[20], step2[21], step2[22], step2[23],
-          step2[24], step2[25], step2[26], step2[27],
-          step2[28], step2[29], step2[30], step2[31]);
+          &step2[16], &step2[17], &step2[18], &step2[19],
+          &step2[20], &step2[21], &step2[22], &step2[23],
+          &step2[24], &step2[25], &step2[26], &step2[27],
+          &step2[28], &step2[29], &step2[30], &step2[31]);
       if (overflow) {
         if (pass == 0)
           HIGH_FDCT32x32_2D_C(input, output_org, stride);
@@ -1054,7 +1056,8 @@ void FDCT32x32_2D(const int16_t *input,
       out[12] = _mm_packs_epi32(out_12_6, out_12_7);
       out[28] = _mm_packs_epi32(out_28_6, out_28_7);
 #if DCT_HIGH_BIT_DEPTH
-      overflow = check_epi16_overflow_x4(out[4], out[20], out[12], out[28]);
+      overflow = check_epi16_overflow_x4(&out[4], &out[20],
+                                         &out[12], &out[28]);
       if (overflow) {
         if (pass == 0)
           HIGH_FDCT32x32_2D_C(input, output_org, stride);
@@ -1074,9 +1077,9 @@ void FDCT32x32_2D(const int16_t *input,
       step3[14] = SUB_EPI16(step1[15], step2[14]);
       step3[15] = ADD_EPI16(step2[14], step1[15]);
 #if DCT_HIGH_BIT_DEPTH
-      overflow = check_epi16_overflow_x8(step3[8], step3[9], step3[10],
-                                         step3[11], step3[12], step3[13],
-                                         step3[14], step3[15]);
+      overflow = check_epi16_overflow_x8(&step3[8], &step3[9], &step3[10],
+                                         &step3[11], &step3[12], &step3[13],
+                                         &step3[14], &step3[15]);
       if (overflow) {
         if (pass == 0)
           HIGH_FDCT32x32_2D_C(input, output_org, stride);
@@ -1155,9 +1158,9 @@ void FDCT32x32_2D(const int16_t *input,
       step3[29] = _mm_packs_epi32(s3_29_6, s3_29_7);
       step3[30] = _mm_packs_epi32(s3_30_6, s3_30_7);
 #if DCT_HIGH_BIT_DEPTH
-      overflow = check_epi16_overflow_x8(step3[17], step3[18], step3[21],
-                                         step3[22], step3[25], step3[26],
-                                         step3[29], step3[30]);
+      overflow = check_epi16_overflow_x8(&step3[17], &step3[18], &step3[21],
+                                         &step3[22], &step3[25], &step3[26],
+                                         &step3[29], &step3[30]);
       if (overflow) {
         if (pass == 0)
           HIGH_FDCT32x32_2D_C(input, output_org, stride);
@@ -1236,8 +1239,9 @@ void FDCT32x32_2D(const int16_t *input,
       out[14] = _mm_packs_epi32(out_14_6, out_14_7);
       out[30] = _mm_packs_epi32(out_30_6, out_30_7);
 #if DCT_HIGH_BIT_DEPTH
-      overflow = check_epi16_overflow_x8(out[2], out[18], out[10], out[26],
-                                         out[6], out[22], out[14], out[30]);
+      overflow = check_epi16_overflow_x8(&out[2], &out[18], &out[10],
+                                         &out[26], &out[6], &out[22],
+                                         &out[14], &out[30]);
       if (overflow) {
         if (pass == 0)
           HIGH_FDCT32x32_2D_C(input, output_org, stride);
@@ -1266,10 +1270,10 @@ void FDCT32x32_2D(const int16_t *input,
       step1[31] = ADD_EPI16(step3[30], step2[31]);
 #if DCT_HIGH_BIT_DEPTH
       overflow = check_epi16_overflow_x16(
-          step1[16], step1[17], step1[18], step1[19],
-          step1[20], step1[21], step1[22], step1[23],
-          step1[24], step1[25], step1[26], step1[27],
-          step1[28], step1[29], step1[30], step1[31]);
+          &step1[16], &step1[17], &step1[18], &step1[19],
+          &step1[20], &step1[21], &step1[22], &step1[23],
+          &step1[24], &step1[25], &step1[26], &step1[27],
+          &step1[28], &step1[29], &step1[30], &step1[31]);
       if (overflow) {
         if (pass == 0)
           HIGH_FDCT32x32_2D_C(input, output_org, stride);
@@ -1348,8 +1352,9 @@ void FDCT32x32_2D(const int16_t *input,
       out[15] = _mm_packs_epi32(out_15_6, out_15_7);
       out[31] = _mm_packs_epi32(out_31_6, out_31_7);
 #if DCT_HIGH_BIT_DEPTH
-      overflow = check_epi16_overflow_x8(out[1], out[17], out[9], out[25],
-                                         out[7], out[23], out[15], out[31]);
+      overflow = check_epi16_overflow_x8(&out[1], &out[17], &out[9],
+                                         &out[25], &out[7], &out[23],
+                                         &out[15], &out[31]);
       if (overflow) {
         if (pass == 0)
           HIGH_FDCT32x32_2D_C(input, output_org, stride);
@@ -1427,8 +1432,9 @@ void FDCT32x32_2D(const int16_t *input,
       out[11] = _mm_packs_epi32(out_11_6, out_11_7);
       out[27] = _mm_packs_epi32(out_27_6, out_27_7);
 #if DCT_HIGH_BIT_DEPTH
-      overflow = check_epi16_overflow_x8(out[5], out[21], out[13], out[29],
-                                         out[3], out[19], out[11], out[27]);
+      overflow = check_epi16_overflow_x8(&out[5], &out[21], &out[13],
+                                         &out[29], &out[3], &out[19],
+                                         &out[11], &out[27]);
       if (overflow) {
         if (pass == 0)
           HIGH_FDCT32x32_2D_C(input, output_org, stride);
@@ -1697,8 +1703,8 @@ void FDCT32x32_2D(const int16_t *input,
       v[6] = k_madd_epi32(u[2], k32_p16_p16);
       v[7] = k_madd_epi32(u[3], k32_p16_p16);
 #if DCT_HIGH_BIT_DEPTH
-      overflow = k_check_epi32_overflow_8(v[0], v[1], v[2], v[3], v[4], v[5],
-                                          v[6], v[7], &kZero);
+      overflow = k_check_epi32_overflow_8(&v[0], &v[1], &v[2], &v[3],
+                                          &v[4], &v[5], &v[6], &v[7], &kZero);
       if (overflow) {
         HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
         return;
@@ -1776,10 +1782,11 @@ void FDCT32x32_2D(const int16_t *input,

 #if DCT_HIGH_BIT_DEPTH
       overflow = k_check_epi32_overflow_32(
-          v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7],
-          v[8], v[9], v[10], v[11], v[12], v[13], v[14], v[15],
-          v[16], v[17], v[18], v[19], v[20], v[21], v[22], v[23],
-          v[24], v[25], v[26], v[27], v[28], v[29], v[30], v[31], &kZero);
+          &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7],
+          &v[8], &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15],
+          &v[16], &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23],
+          &v[24], &v[25], &v[26], &v[27], &v[28], &v[29], &v[30], &v[31],
+          &kZero);
       if (overflow) {
         HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
         return;
@@ -1883,8 +1890,9 @@ void FDCT32x32_2D(const int16_t *input,

 #if DCT_HIGH_BIT_DEPTH
       overflow = k_check_epi32_overflow_16(
-          v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7],
-          v[8], v[9], v[10], v[11], v[12], v[13], v[14], v[15], &kZero);
+          &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7],
+          &v[8], &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15],
+          &kZero);
       if (overflow) {
         HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
         return;
@@ -1959,7 +1967,8 @@ void FDCT32x32_2D(const int16_t *input,
       out[ 8] = _mm_packs_epi32(u[4], u[5]);
       out[24] = _mm_packs_epi32(u[6], u[7]);
 #if DCT_HIGH_BIT_DEPTH
-      overflow = check_epi16_overflow_x4(out[0], out[16], out[8], out[24]);
+      overflow = check_epi16_overflow_x4(&out[0], &out[16],
+                                         &out[8], &out[24]);
       if (overflow) {
         HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
         return;
@@ -1999,8 +2008,9 @@ void FDCT32x32_2D(const int16_t *input,

 #if DCT_HIGH_BIT_DEPTH
       overflow = k_check_epi32_overflow_16(
-          v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7],
-          v[8], v[9], v[10], v[11], v[12], v[13], v[14], v[15], &kZero);
+          &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7],
+          &v[8], &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15],
+          &kZero);
       if (overflow) {
         HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
         return;
@@ -2110,8 +2120,9 @@ void FDCT32x32_2D(const int16_t *input,

 #if DCT_HIGH_BIT_DEPTH
       overflow = k_check_epi32_overflow_16(
-          v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7],
-          v[8], v[9], v[10], v[11], v[12], v[13], v[14], v[15], &kZero);
+          &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7],
+          &v[8], &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15],
+          &kZero);
       if (overflow) {
         HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
         return;
@@ -2185,7 +2196,8 @@ void FDCT32x32_2D(const int16_t *input,
       out[12] = _mm_packs_epi32(u[4], u[5]);
       out[28] = _mm_packs_epi32(u[6], u[7]);
 #if DCT_HIGH_BIT_DEPTH
-      overflow = check_epi16_overflow_x4(out[4], out[20], out[12], out[28]);
+      overflow = check_epi16_overflow_x4(&out[4], &out[20],
+                                         &out[12], &out[28]);
       if (overflow) {
         HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
         return;
@@ -2271,10 +2283,11 @@ void FDCT32x32_2D(const int16_t *input,

 #if DCT_HIGH_BIT_DEPTH
       overflow = k_check_epi32_overflow_32(
-          v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7],
-          v[8], v[9], v[10], v[11], v[12], v[13], v[14], v[15],
-          v[16], v[17], v[18], v[19], v[20], v[21], v[22], v[23],
-          v[24], v[25], v[26], v[27], v[28], v[29], v[30], v[31], &kZero);
+          &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7],
+          &v[8], &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15],
+          &v[16], &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23],
+          &v[24], &v[25], &v[26], &v[27], &v[28], &v[29], &v[30], &v[31],
+          &kZero);
       if (overflow) {
         HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
         return;
@@ -2394,10 +2407,11 @@ void FDCT32x32_2D(const int16_t *input,

 #if DCT_HIGH_BIT_DEPTH
       overflow = k_check_epi32_overflow_32(
-          v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7],
-          v[8], v[9], v[10], v[11], v[12], v[13], v[14], v[15],
-          v[16], v[17], v[18], v[19], v[20], v[21], v[22], v[23],
-          v[24], v[25], v[26], v[27], v[28], v[29], v[30], v[31], &kZero);
+          &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7],
+          &v[8], &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15],
+          &v[16], &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23],
+          &v[24], &v[25], &v[26], &v[27], &v[28], &v[29], &v[30], &v[31],
+          &kZero);
       if (overflow) {
         HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
         return;
@@ -2531,8 +2545,9 @@ void FDCT32x32_2D(const int16_t *input,
       out[14] = _mm_packs_epi32(u[12], u[13]);
       out[30] = _mm_packs_epi32(u[14], u[15]);
 #if DCT_HIGH_BIT_DEPTH
-      overflow = check_epi16_overflow_x8(out[2], out[18], out[10], out[26],
-                                         out[6], out[22], out[14], out[30]);
+      overflow = check_epi16_overflow_x8(&out[2], &out[18], &out[10],
+                                         &out[26], &out[6], &out[22],
+                                         &out[14], &out[30]);
       if (overflow) {
         HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
         return;
@@ -2636,10 +2651,11 @@ void FDCT32x32_2D(const int16_t *input,

 #if DCT_HIGH_BIT_DEPTH
       overflow = k_check_epi32_overflow_32(
-          v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7],
-          v[8], v[9], v[10], v[11], v[12], v[13], v[14], v[15],
-          v[16], v[17], v[18], v[19], v[20], v[21], v[22], v[23],
-          v[24], v[25], v[26], v[27], v[28], v[29], v[30], v[31], &kZero);
+          &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7],
+          &v[8], &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15],
+          &v[16], &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23],
+          &v[24], &v[25], &v[26], &v[27], &v[28], &v[29], &v[30], &v[31],
+          &kZero);
       if (overflow) {
         HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
         return;
@@ -2773,8 +2789,9 @@ void FDCT32x32_2D(const int16_t *input,
       out[15] = _mm_packs_epi32(u[12], u[13]);
       out[31] = _mm_packs_epi32(u[14], u[15]);
 #if DCT_HIGH_BIT_DEPTH
-      overflow = check_epi16_overflow_x8(out[1], out[17], out[9], out[25],
-                                         out[7], out[23], out[15], out[31]);
+      overflow = check_epi16_overflow_x8(&out[1], &out[17], &out[9],
+                                         &out[25], &out[7], &out[23],
+                                         &out[15], &out[31]);
       if (overflow) {
         HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
         return;
@@ -2843,10 +2860,11 @@ void FDCT32x32_2D(const int16_t *input,

 #if DCT_HIGH_BIT_DEPTH
       overflow = k_check_epi32_overflow_32(
-          v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7],
-          v[8], v[9], v[10], v[11], v[12], v[13], v[14], v[15],
-          v[16], v[17], v[18], v[19], v[20], v[21], v[22], v[23],
-          v[24], v[25], v[26], v[27], v[28], v[29], v[30], v[31], &kZero);
+          &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7],
+          &v[8], &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15],
+          &v[16], &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23],
+          &v[24], &v[25], &v[26], &v[27], &v[28], &v[29], &v[30], &v[31],
+          &kZero);
       if (overflow) {
         HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
         return;
@@ -2980,8 +2998,9 @@ void FDCT32x32_2D(const int16_t *input,
       out[11] = _mm_packs_epi32(u[12], u[13]);
       out[27] = _mm_packs_epi32(u[14], u[15]);
 #if DCT_HIGH_BIT_DEPTH
-      overflow = check_epi16_overflow_x8(out[5], out[21], out[13], out[29],
-                                         out[3], out[19], out[11], out[27]);
+      overflow = check_epi16_overflow_x8(&out[5], &out[21], &out[13],
+                                         &out[29], &out[3], &out[19],
+                                         &out[11], &out[27]);
       if (overflow) {
         HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
         return;
@@ -3107,14 +3126,14 @@ void FDCT32x32_2D(const int16_t *input,
         // Process next 8x8
         output0 += 8;
       } else {
-        storeu_output(tr2_0, (output1 + 0 * 32));
-        storeu_output(tr2_1, (output1 + 1 * 32));
-        storeu_output(tr2_2, (output1 + 2 * 32));
-        storeu_output(tr2_3, (output1 + 3 * 32));
-        storeu_output(tr2_4, (output1 + 4 * 32));
-        storeu_output(tr2_5, (output1 + 5 * 32));
-        storeu_output(tr2_6, (output1 + 6 * 32));
-        storeu_output(tr2_7, (output1 + 7 * 32));
+        storeu_output(&tr2_0, (output1 + 0 * 32));
+        storeu_output(&tr2_1, (output1 + 1 * 32));
+        storeu_output(&tr2_2, (output1 + 2 * 32));
+        storeu_output(&tr2_3, (output1 + 3 * 32));
+        storeu_output(&tr2_4, (output1 + 4 * 32));
+        storeu_output(&tr2_5, (output1 + 5 * 32));
+        storeu_output(&tr2_6, (output1 + 6 * 32));
+        storeu_output(&tr2_7, (output1 + 7 * 32));
         // Process next 8x8
         output1 += 8;
       }
diff --git a/vp9/encoder/x86/vp9_dct_impl_sse2.c b/vp9/encoder/x86/vp9_dct_impl_sse2.c
index 3fdde83da..12fa747e8 100644
--- a/vp9/encoder/x86/vp9_dct_impl_sse2.c
+++ b/vp9/encoder/x86/vp9_dct_impl_sse2.c
@@ -75,7 +75,7 @@ void FDCT4x4_2D(const int16_t *input, tran_low_t *output, int stride) {
   // This second rounding constant saves doing some extra adds at the end
   const __m128i k__DCT_CONST_ROUNDING2 = _mm_set1_epi32(DCT_CONST_ROUNDING
                                                  +(DCT_CONST_ROUNDING << 1));
-  const int DCT_CONST_BITS2 = DCT_CONST_BITS+2;
+  const int DCT_CONST_BITS2 = DCT_CONST_BITS + 2;
   const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1);
   const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0);
   __m128i in0, in1;
@@ -170,7 +170,7 @@ void FDCT4x4_2D(const int16_t *input, tran_low_t *output, int stride) {
       const __m128i x0 = _mm_packs_epi32(w0, w1);
       const __m128i x1 = _mm_packs_epi32(w2, w3);
 #if DCT_HIGH_BIT_DEPTH
-      overflow = check_epi16_overflow_x2(x0, x1);
+      overflow = check_epi16_overflow_x2(&x0, &x1);
       if (overflow) {
         vp9_highbd_fdct4x4_c(input, output, stride);
         return;
@@ -192,7 +192,7 @@ void FDCT4x4_2D(const int16_t *input, tran_low_t *output, int stride) {
       // t0 = [c0 c1 c8 c9 c4 c5 cC cD]
       // t1 = [c3 c2 cB cA -c7 -c6 -cF -cE]
 #if DCT_HIGH_BIT_DEPTH
-      overflow = check_epi16_overflow_x2(t0, t1);
+      overflow = check_epi16_overflow_x2(&t0, &t1);
       if (overflow) {
         vp9_highbd_fdct4x4_c(input, output, stride);
         return;
@@ -231,7 +231,7 @@ void FDCT4x4_2D(const int16_t *input, tran_low_t *output, int stride) {
       const __m128i x0 = _mm_packs_epi32(w0, w1);
       const __m128i x1 = _mm_packs_epi32(w2, w3);
 #if DCT_HIGH_BIT_DEPTH
-      overflow = check_epi16_overflow_x2(x0, x1);
+      overflow = check_epi16_overflow_x2(&x0, &x1);
       if (overflow) {
         vp9_highbd_fdct4x4_c(input, output, stride);
         return;
@@ -254,8 +254,8 @@ void FDCT4x4_2D(const int16_t *input, tran_low_t *output, int stride) {
   // Post-condition (v + 1) >> 2 is now incorporated into previous
   // add and right-shift commands.  Only 2 store instructions needed
   // because we are using the fact that 1/3 are stored just after 0/2.
-  storeu_output(in0, output + 0 * 4);
-  storeu_output(in1, output + 2 * 4);
+  storeu_output(&in0, output + 0 * 4);
+  storeu_output(&in1, output + 2 * 4);
 }


@@ -314,7 +314,8 @@ void FDCT8x8_2D(const int16_t *input, tran_low_t *output, int stride) {
     const __m128i q7 = SUB_EPI16(in0, in7);
 #if DCT_HIGH_BIT_DEPTH
     if (pass == 1) {
-      overflow = check_epi16_overflow_x8(q0, q1, q2, q3, q4, q5, q6, q7);
+      overflow = check_epi16_overflow_x8(&q0, &q1, &q2, &q3,
+                                         &q4, &q5, &q6, &q7);
       if (overflow) {
         vp9_highbd_fdct8x8_c(input, output, stride);
         return;
@@ -329,7 +330,7 @@ void FDCT8x8_2D(const int16_t *input, tran_low_t *output, int stride) {
       const __m128i r2 = SUB_EPI16(q1, q2);
       const __m128i r3 = SUB_EPI16(q0, q3);
 #if DCT_HIGH_BIT_DEPTH
-      overflow = check_epi16_overflow_x4(r0, r1, r2, r3);
+      overflow = check_epi16_overflow_x4(&r0, &r1, &r2, &r3);
       if (overflow) {
         vp9_highbd_fdct8x8_c(input, output, stride);
         return;
@@ -372,7 +373,7 @@ void FDCT8x8_2D(const int16_t *input, tran_low_t *output, int stride) {
         res2 = _mm_packs_epi32(w4, w5);
         res6 = _mm_packs_epi32(w6, w7);
 #if DCT_HIGH_BIT_DEPTH
-        overflow = check_epi16_overflow_x4(res0, res4, res2, res6);
+        overflow = check_epi16_overflow_x4(&res0, &res4, &res2, &res6);
         if (overflow) {
           vp9_highbd_fdct8x8_c(input, output, stride);
           return;
@@ -402,7 +403,7 @@ void FDCT8x8_2D(const int16_t *input, tran_low_t *output, int stride) {
         const __m128i r0 = _mm_packs_epi32(s0, s1);
         const __m128i r1 = _mm_packs_epi32(s2, s3);
 #if DCT_HIGH_BIT_DEPTH
-        overflow = check_epi16_overflow_x2(r0, r1);
+        overflow = check_epi16_overflow_x2(&r0, &r1);
         if (overflow) {
           vp9_highbd_fdct8x8_c(input, output, stride);
           return;
@@ -415,7 +416,7 @@ void FDCT8x8_2D(const int16_t *input, tran_low_t *output, int stride) {
       const __m128i x2 = SUB_EPI16(q7, r1);
       const __m128i x3 = ADD_EPI16(q7, r1);
 #if DCT_HIGH_BIT_DEPTH
-      overflow = check_epi16_overflow_x4(x0, x1, x2, x3);
+      overflow = check_epi16_overflow_x4(&x0, &x1, &x2, &x3);
       if (overflow) {
         vp9_highbd_fdct8x8_c(input, output, stride);
         return;
@@ -458,7 +459,7 @@ void FDCT8x8_2D(const int16_t *input, tran_low_t *output, int stride) {
         res5 = _mm_packs_epi32(w4, w5);
         res3 = _mm_packs_epi32(w6, w7);
 #if DCT_HIGH_BIT_DEPTH
-        overflow = check_epi16_overflow_x4(res1, res7, res5, res3);
+        overflow = check_epi16_overflow_x4(&res1, &res7, &res5, &res3);
         if (overflow) {
           vp9_highbd_fdct8x8_c(input, output, stride);
           return;
@@ -557,14 +558,14 @@ void FDCT8x8_2D(const int16_t *input, tran_low_t *output, int stride) {
     in6 = _mm_srai_epi16(in6, 1);
     in7 = _mm_srai_epi16(in7, 1);
     // store results
-    store_output(in0, (output + 0 * 8));
-    store_output(in1, (output + 1 * 8));
-    store_output(in2, (output + 2 * 8));
-    store_output(in3, (output + 3 * 8));
-    store_output(in4, (output + 4 * 8));
-    store_output(in5, (output + 5 * 8));
-    store_output(in6, (output + 6 * 8));
-    store_output(in7, (output + 7 * 8));
+    store_output(&in0, (output + 0 * 8));
+    store_output(&in1, (output + 1 * 8));
+    store_output(&in2, (output + 2 * 8));
+    store_output(&in3, (output + 3 * 8));
+    store_output(&in4, (output + 4 * 8));
+    store_output(&in5, (output + 5 * 8));
+    store_output(&in6, (output + 6 * 8));
+    store_output(&in7, (output + 7 * 8));
   }
 }

@@ -720,8 +721,8 @@ void FDCT16x16_2D(const int16_t *input, tran_low_t *output, int stride) {
       input6 = ADD_EPI16(in06, in09);
       input7 = ADD_EPI16(in07, in08);
 #if DCT_HIGH_BIT_DEPTH
-      overflow = check_epi16_overflow_x8(input0, input1, input2, input3,
-                                         input4, input5, input6, input7);
+      overflow = check_epi16_overflow_x8(&input0, &input1, &input2, &input3,
+                                         &input4, &input5, &input6, &input7);
       if (overflow) {
         vp9_highbd_fdct16x16_c(input, output, stride);
         return;
@@ -739,8 +740,10 @@ void FDCT16x16_2D(const int16_t *input, tran_low_t *output, int stride) {
       step1_6 = SUB_EPI16(in01, in14);
       step1_7 = SUB_EPI16(in00, in15);
 #if DCT_HIGH_BIT_DEPTH
-      overflow = check_epi16_overflow_x8(step1_0, step1_1, step1_2, step1_3,
-                                         step1_4, step1_5, step1_6, step1_7);
+      overflow = check_epi16_overflow_x8(&step1_0, &step1_1,
+                                         &step1_2, &step1_3,
+                                         &step1_4, &step1_5,
+                                         &step1_6, &step1_7);
       if (overflow) {
         vp9_highbd_fdct16x16_c(input, output, stride);
         return;
@@ -759,7 +762,8 @@ void FDCT16x16_2D(const int16_t *input, tran_low_t *output, int stride) {
       const __m128i q6 = SUB_EPI16(input1, input6);
       const __m128i q7 = SUB_EPI16(input0, input7);
 #if DCT_HIGH_BIT_DEPTH
-      overflow = check_epi16_overflow_x8(q0, q1, q2, q3, q4, q5, q6, q7);
+      overflow = check_epi16_overflow_x8(&q0, &q1, &q2, &q3,
+                                         &q4, &q5, &q6, &q7);
       if (overflow) {
         vp9_highbd_fdct16x16_c(input, output, stride);
         return;
@@ -773,7 +777,7 @@ void FDCT16x16_2D(const int16_t *input, tran_low_t *output, int stride) {
         const __m128i r2 = SUB_EPI16(q1, q2);
         const __m128i r3 = SUB_EPI16(q0, q3);
 #if DCT_HIGH_BIT_DEPTH
-        overflow = check_epi16_overflow_x4(r0, r1, r2, r3);
+        overflow = check_epi16_overflow_x4(&r0, &r1, &r2, &r3);
         if (overflow) {
           vp9_highbd_fdct16x16_c(input, output, stride);
           return;
@@ -786,16 +790,16 @@ void FDCT16x16_2D(const int16_t *input, tran_low_t *output, int stride) {
           const __m128i t1 = _mm_unpackhi_epi16(r0, r1);
           const __m128i t2 = _mm_unpacklo_epi16(r2, r3);
           const __m128i t3 = _mm_unpackhi_epi16(r2, r3);
-          res00 = mult_round_shift(t0, t1, k__cospi_p16_p16,
-                                   k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-          res08 = mult_round_shift(t0, t1, k__cospi_p16_m16,
-                                   k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-          res04 = mult_round_shift(t2, t3, k__cospi_p24_p08,
-                                   k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-          res12 = mult_round_shift(t2, t3, k__cospi_m08_p24,
-                                   k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+          res00 = mult_round_shift(&t0, &t1, &k__cospi_p16_p16,
+                                   &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+          res08 = mult_round_shift(&t0, &t1, &k__cospi_p16_m16,
+                                   &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+          res04 = mult_round_shift(&t2, &t3, &k__cospi_p24_p08,
+                                   &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+          res12 = mult_round_shift(&t2, &t3, &k__cospi_m08_p24,
+                                   &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
 #if DCT_HIGH_BIT_DEPTH
-          overflow = check_epi16_overflow_x4(res00, res08, res04, res12);
+          overflow = check_epi16_overflow_x4(&res00, &res08, &res04, &res12);
           if (overflow) {
             vp9_highbd_fdct16x16_c(input, output, stride);
             return;
@@ -809,12 +813,14 @@ void FDCT16x16_2D(const int16_t *input, tran_low_t *output, int stride) {
         // into 32 bits.
         const __m128i d0 = _mm_unpacklo_epi16(q6, q5);
         const __m128i d1 = _mm_unpackhi_epi16(q6, q5);
-        const __m128i r0 = mult_round_shift(d0, d1, k__cospi_p16_m16,
-                                            k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-        const __m128i r1 = mult_round_shift(d0, d1, k__cospi_p16_p16,
-                                            k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+        const __m128i r0 = mult_round_shift(&d0, &d1, &k__cospi_p16_m16,
+                                            &k__DCT_CONST_ROUNDING,
+                                            DCT_CONST_BITS);
+        const __m128i r1 = mult_round_shift(&d0, &d1, &k__cospi_p16_p16,
+                                            &k__DCT_CONST_ROUNDING,
+                                            DCT_CONST_BITS);
 #if DCT_HIGH_BIT_DEPTH
-        overflow = check_epi16_overflow_x2(r0, r1);
+        overflow = check_epi16_overflow_x2(&r0, &r1);
         if (overflow) {
           vp9_highbd_fdct16x16_c(input, output, stride);
           return;
@@ -827,7 +833,7 @@ void FDCT16x16_2D(const int16_t *input, tran_low_t *output, int stride) {
         const __m128i x2 = SUB_EPI16(q7, r1);
         const __m128i x3 = ADD_EPI16(q7, r1);
 #if DCT_HIGH_BIT_DEPTH
-        overflow = check_epi16_overflow_x4(x0, x1, x2, x3);
+        overflow = check_epi16_overflow_x4(&x0, &x1, &x2, &x3);
         if (overflow) {
           vp9_highbd_fdct16x16_c(input, output, stride);
           return;
@@ -840,16 +846,17 @@ void FDCT16x16_2D(const int16_t *input, tran_low_t *output, int stride) {
           const __m128i t1 = _mm_unpackhi_epi16(x0, x3);
           const __m128i t2 = _mm_unpacklo_epi16(x1, x2);
           const __m128i t3 = _mm_unpackhi_epi16(x1, x2);
-          res02 = mult_round_shift(t0, t1, k__cospi_p28_p04,
-                                   k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-          res14 = mult_round_shift(t0, t1, k__cospi_m04_p28,
-                                   k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-          res10 = mult_round_shift(t2, t3, k__cospi_p12_p20,
-                                   k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-          res06 = mult_round_shift(t2, t3, k__cospi_m20_p12,
-                                   k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+          res02 = mult_round_shift(&t0, &t1, &k__cospi_p28_p04,
+                                   &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+          res14 = mult_round_shift(&t0, &t1, &k__cospi_m04_p28,
+                                   &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+          res10 = mult_round_shift(&t2, &t3, &k__cospi_p12_p20,
+                                   &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+          res06 = mult_round_shift(&t2, &t3, &k__cospi_m20_p12,
+                                   &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
 #if DCT_HIGH_BIT_DEPTH
-          overflow = check_epi16_overflow_x4(res02, res14, res10, res06);
+          overflow = check_epi16_overflow_x4(&res02, &res14,
+                                             &res10, &res06);
           if (overflow) {
             vp9_highbd_fdct16x16_c(input, output, stride);
             return;
@@ -867,17 +874,17 @@ void FDCT16x16_2D(const int16_t *input, tran_low_t *output, int stride) {
         const __m128i t1 = _mm_unpackhi_epi16(step1_5, step1_2);
         const __m128i t2 = _mm_unpacklo_epi16(step1_4, step1_3);
         const __m128i t3 = _mm_unpackhi_epi16(step1_4, step1_3);
-        step2_2 = mult_round_shift(t0, t1, k__cospi_p16_m16,
-                                   k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-        step2_3 = mult_round_shift(t2, t3, k__cospi_p16_m16,
-                                   k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-        step2_5 = mult_round_shift(t0, t1, k__cospi_p16_p16,
-                                   k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-        step2_4 = mult_round_shift(t2, t3, k__cospi_p16_p16,
-                                   k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+        step2_2 = mult_round_shift(&t0, &t1, &k__cospi_p16_m16,
+                                   &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+        step2_3 = mult_round_shift(&t2, &t3, &k__cospi_p16_m16,
+                                   &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+        step2_5 = mult_round_shift(&t0, &t1, &k__cospi_p16_p16,
+                                   &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+        step2_4 = mult_round_shift(&t2, &t3, &k__cospi_p16_p16,
+                                   &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
 #if DCT_HIGH_BIT_DEPTH
-        overflow = check_epi16_overflow_x4(step2_2, step2_3, step2_5,
-                                           step2_4);
+        overflow = check_epi16_overflow_x4(&step2_2, &step2_3, &step2_5,
+                                           &step2_4);
         if (overflow) {
           vp9_highbd_fdct16x16_c(input, output, stride);
           return;
@@ -895,8 +902,10 @@ void FDCT16x16_2D(const int16_t *input, tran_low_t *output, int stride) {
        step3_6 = ADD_EPI16(step1_6, step2_5);
        step3_7 = ADD_EPI16(step1_7, step2_4);
 #if DCT_HIGH_BIT_DEPTH
-        overflow = check_epi16_overflow_x8(step3_0, step3_1, step3_2, step3_3,
-                                           step3_4, step3_5, step3_6, step3_7);
+        overflow = check_epi16_overflow_x8(&step3_0, &step3_1,
+                                           &step3_2, &step3_3,
+                                           &step3_4, &step3_5,
+                                           &step3_6, &step3_7);
         if (overflow) {
           vp9_highbd_fdct16x16_c(input, output, stride);
           return;
@@ -909,17 +918,17 @@ void FDCT16x16_2D(const int16_t *input, tran_low_t *output, int stride) {
         const __m128i t1 = _mm_unpackhi_epi16(step3_1, step3_6);
         const __m128i t2 = _mm_unpacklo_epi16(step3_2, step3_5);
         const __m128i t3 = _mm_unpackhi_epi16(step3_2, step3_5);
-        step2_1 = mult_round_shift(t0, t1, k__cospi_m08_p24,
-                                   k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-        step2_2 = mult_round_shift(t2, t3, k__cospi_p24_p08,
-                                   k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-        step2_6 = mult_round_shift(t0, t1, k__cospi_p24_p08,
-                                   k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-        step2_5 = mult_round_shift(t2, t3, k__cospi_p08_m24,
-                                   k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+        step2_1 = mult_round_shift(&t0, &t1, &k__cospi_m08_p24,
+                                   &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+        step2_2 = mult_round_shift(&t2, &t3, &k__cospi_p24_p08,
+                                   &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+        step2_6 = mult_round_shift(&t0, &t1, &k__cospi_p24_p08,
+                                   &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+        step2_5 = mult_round_shift(&t2, &t3, &k__cospi_p08_m24,
+                                   &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
 #if DCT_HIGH_BIT_DEPTH
-        overflow = check_epi16_overflow_x4(step2_1, step2_2, step2_6,
-                                           step2_5);
+        overflow = check_epi16_overflow_x4(&step2_1, &step2_2, &step2_6,
+                                           &step2_5);
         if (overflow) {
           vp9_highbd_fdct16x16_c(input, output, stride);
           return;
@@ -937,8 +946,10 @@ void FDCT16x16_2D(const int16_t *input, tran_low_t *output, int stride) {
         step1_6 = SUB_EPI16(step3_7, step2_6);
         step1_7 = ADD_EPI16(step3_7, step2_6);
 #if DCT_HIGH_BIT_DEPTH
-        overflow = check_epi16_overflow_x8(step1_0, step1_1, step1_2, step1_3,
-                                           step1_4, step1_5, step1_6, step1_7);
+        overflow = check_epi16_overflow_x8(&step1_0, &step1_1,
+                                           &step1_2, &step1_3,
+                                           &step1_4, &step1_5,
+                                           &step1_6, &step1_7);
         if (overflow) {
           vp9_highbd_fdct16x16_c(input, output, stride);
           return;
@@ -951,16 +962,16 @@ void FDCT16x16_2D(const int16_t *input, tran_low_t *output, int stride) {
         const __m128i t1 = _mm_unpackhi_epi16(step1_0, step1_7);
         const __m128i t2 = _mm_unpacklo_epi16(step1_1, step1_6);
         const __m128i t3 = _mm_unpackhi_epi16(step1_1, step1_6);
-        res01 = mult_round_shift(t0, t1, k__cospi_p30_p02,
-                                 k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-        res09 = mult_round_shift(t2, t3, k__cospi_p14_p18,
-                                 k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-        res15 = mult_round_shift(t0, t1, k__cospi_m02_p30,
-                                 k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-        res07 = mult_round_shift(t2, t3, k__cospi_m18_p14,
-                                 k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+        res01 = mult_round_shift(&t0, &t1, &k__cospi_p30_p02,
+                                 &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+        res09 = mult_round_shift(&t2, &t3, &k__cospi_p14_p18,
+                                 &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+        res15 = mult_round_shift(&t0, &t1, &k__cospi_m02_p30,
+                                 &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+        res07 = mult_round_shift(&t2, &t3, &k__cospi_m18_p14,
+                                 &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
 #if DCT_HIGH_BIT_DEPTH
-        overflow = check_epi16_overflow_x4(res01, res09, res15, res07);
+        overflow = check_epi16_overflow_x4(&res01, &res09, &res15, &res07);
         if (overflow) {
           vp9_highbd_fdct16x16_c(input, output, stride);
           return;
@@ -972,16 +983,16 @@ void FDCT16x16_2D(const int16_t *input, tran_low_t *output, int stride) {
         const __m128i t1 = _mm_unpackhi_epi16(step1_2, step1_5);
         const __m128i t2 = _mm_unpacklo_epi16(step1_3, step1_4);
         const __m128i t3 = _mm_unpackhi_epi16(step1_3, step1_4);
-        res05 = mult_round_shift(t0, t1, k__cospi_p22_p10,
-                                 k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-        res13 = mult_round_shift(t2, t3, k__cospi_p06_p26,
-                                 k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-        res11 = mult_round_shift(t0, t1, k__cospi_m10_p22,
-                                 k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-        res03 = mult_round_shift(t2, t3, k__cospi_m26_p06,
-                                 k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+        res05 = mult_round_shift(&t0, &t1, &k__cospi_p22_p10,
+                                 &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+        res13 = mult_round_shift(&t2, &t3, &k__cospi_p06_p26,
+                                 &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+        res11 = mult_round_shift(&t0, &t1, &k__cospi_m10_p22,
+                                 &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+        res03 = mult_round_shift(&t2, &t3, &k__cospi_m26_p06,
+                                 &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
 #if DCT_HIGH_BIT_DEPTH
-        overflow = check_epi16_overflow_x4(res05, res13, res11, res03);
+        overflow = check_epi16_overflow_x4(&res05, &res13, &res11, &res03);
         if (overflow) {
           vp9_highbd_fdct16x16_c(input, output, stride);
           return;
@@ -990,11 +1001,11 @@ void FDCT16x16_2D(const int16_t *input, tran_low_t *output, int stride) {
       }
     }
     // Transpose the results, do it as two 8x8 transposes.
-    transpose_and_output8x8(res00, res01, res02, res03,
-                            res04, res05, res06, res07,
+    transpose_and_output8x8(&res00, &res01, &res02, &res03,
+                            &res04, &res05, &res06, &res07,
                             pass, out0, out1);
-    transpose_and_output8x8(res08, res09, res10, res11,
-                            res12, res13, res14, res15,
+    transpose_and_output8x8(&res08, &res09, &res10, &res11,
+                            &res12, &res13, &res14, &res15,
                             pass, out0 + 8, out1 + 8);
     if (pass == 0) {
       out0 += 8*16;
diff --git a/vp9/encoder/x86/vp9_dct_sse2.c b/vp9/encoder/x86/vp9_dct_sse2.c
index 81da34306..e671f3998 100644
--- a/vp9/encoder/x86/vp9_dct_sse2.c
+++ b/vp9/encoder/x86/vp9_dct_sse2.c
@@ -40,7 +40,7 @@ void vp9_fdct4x4_1_sse2(const int16_t *input, tran_low_t *output, int stride) {

   in1 = _mm_add_epi32(tmp, in0);
   in0 = _mm_slli_epi32(in1, 1);
-  store_output(in0, output);
+  store_output(&in0, output);
 }

 static INLINE void load_buffer_4x4(const int16_t *input, __m128i *in,
@@ -72,8 +72,8 @@ static INLINE void write_buffer_4x4(tran_low_t *output, __m128i *res) {
   __m128i out23 = _mm_add_epi16(in23, kOne);
   out01 = _mm_srai_epi16(out01, 2);
   out23 = _mm_srai_epi16(out23, 2);
-  store_output(out01, (output + 0 * 8));
-  store_output(out23, (output + 1 * 8));
+  store_output(&out01, (output + 0 * 8));
+  store_output(&out23, (output + 1 * 8));
 }

 static INLINE void transpose_4x4(__m128i *res) {
@@ -245,7 +245,7 @@ void vp9_fdct8x8_1_sse2(const int16_t *input, tran_low_t *output, int stride) {
   in0 = _mm_srli_si128(sum, 8);

   in1 = _mm_add_epi32(sum, in0);
-  store_output(in1, output);
+  store_output(&in1, output);
 }

 void vp9_fdct8x8_quant_sse2(const int16_t *input, int stride,
@@ -759,14 +759,14 @@ static INLINE void right_shift_8x8(__m128i *res, int const bit) {
 // write 8x8 array
 static INLINE void write_buffer_8x8(tran_low_t *output, __m128i *res,
                                     int stride) {
-  store_output(res[0], (output + 0 * stride));
-  store_output(res[1], (output + 1 * stride));
-  store_output(res[2], (output + 2 * stride));
-  store_output(res[3], (output + 3 * stride));
-  store_output(res[4], (output + 4 * stride));
-  store_output(res[5], (output + 5 * stride));
-  store_output(res[6], (output + 6 * stride));
-  store_output(res[7], (output + 7 * stride));
+  store_output(&res[0], (output + 0 * stride));
+  store_output(&res[1], (output + 1 * stride));
+  store_output(&res[2], (output + 2 * stride));
+  store_output(&res[3], (output + 3 * stride));
+  store_output(&res[4], (output + 4 * stride));
+  store_output(&res[5], (output + 5 * stride));
+  store_output(&res[6], (output + 6 * stride));
+  store_output(&res[7], (output + 7 * stride));
 }

 // perform in-place transpose
@@ -1292,7 +1292,7 @@ void vp9_fdct16x16_1_sse2(const int16_t *input, tran_low_t *output,

   in1 = _mm_add_epi32(sum, in0);
   in1 = _mm_srai_epi32(in1, 1);
-  store_output(in1, output);
+  store_output(&in1, output);
 }

 static INLINE void load_buffer_16x16(const int16_t* input, __m128i *in0,
@@ -2251,7 +2251,7 @@ void vp9_fdct32x32_1_sse2(const int16_t *input, tran_low_t *output,

   in1 = _mm_add_epi32(sum, in0);
   in1 = _mm_srai_epi32(in1, 3);
-  store_output(in1, output);
+  store_output(&in1, output);
 }

 #if CONFIG_VP9_HIGHBITDEPTH
diff --git a/vp9/encoder/x86/vp9_dct_sse2.h b/vp9/encoder/x86/vp9_dct_sse2.h
index 2d322103e..b99db923e 100644
--- a/vp9/encoder/x86/vp9_dct_sse2.h
+++ b/vp9/encoder/x86/vp9_dct_sse2.h
@@ -43,99 +43,144 @@ static INLINE __m128i k_packs_epi64(__m128i a, __m128i b) {
   return _mm_unpacklo_epi64(buf0, buf1);
 }

-static INLINE int check_epi16_overflow_x2(__m128i reg0, __m128i reg1) {
+static INLINE int check_epi16_overflow_x2(const __m128i *preg0,
+                                          const __m128i *preg1) {
   const __m128i max_overflow = _mm_set1_epi16(0x7fff);
   const __m128i min_overflow = _mm_set1_epi16(0x8000);
-  __m128i cmp0 = _mm_or_si128(_mm_cmpeq_epi16(reg0, max_overflow),
-                              _mm_cmpeq_epi16(reg0, min_overflow));
-  __m128i cmp1 = _mm_or_si128(_mm_cmpeq_epi16(reg1, max_overflow),
-                              _mm_cmpeq_epi16(reg1, min_overflow));
+  __m128i cmp0 = _mm_or_si128(_mm_cmpeq_epi16(*preg0, max_overflow),
+                              _mm_cmpeq_epi16(*preg0, min_overflow));
+  __m128i cmp1 = _mm_or_si128(_mm_cmpeq_epi16(*preg1, max_overflow),
+                              _mm_cmpeq_epi16(*preg1, min_overflow));
   cmp0 = _mm_or_si128(cmp0, cmp1);
   return _mm_movemask_epi8(cmp0);
 }

-static INLINE int check_epi16_overflow_x4(__m128i reg0, __m128i reg1,
-                                          __m128i reg2, __m128i reg3) {
+static INLINE int check_epi16_overflow_x4(const __m128i *preg0,
+                                          const __m128i *preg1,
+                                          const __m128i *preg2,
+                                          const __m128i *preg3) {
   const __m128i max_overflow = _mm_set1_epi16(0x7fff);
   const __m128i min_overflow = _mm_set1_epi16(0x8000);
-  __m128i cmp0 = _mm_or_si128(_mm_cmpeq_epi16(reg0, max_overflow),
-                              _mm_cmpeq_epi16(reg0, min_overflow));
-  __m128i cmp1 = _mm_or_si128(_mm_cmpeq_epi16(reg1, max_overflow),
-                              _mm_cmpeq_epi16(reg1, min_overflow));
-  __m128i cmp2 = _mm_or_si128(_mm_cmpeq_epi16(reg2, max_overflow),
-                              _mm_cmpeq_epi16(reg2, min_overflow));
-  __m128i cmp3 = _mm_or_si128(_mm_cmpeq_epi16(reg3, max_overflow),
-                              _mm_cmpeq_epi16(reg3, min_overflow));
+  __m128i cmp0 = _mm_or_si128(_mm_cmpeq_epi16(*preg0, max_overflow),
+                              _mm_cmpeq_epi16(*preg0, min_overflow));
+  __m128i cmp1 = _mm_or_si128(_mm_cmpeq_epi16(*preg1, max_overflow),
+                              _mm_cmpeq_epi16(*preg1, min_overflow));
+  __m128i cmp2 = _mm_or_si128(_mm_cmpeq_epi16(*preg2, max_overflow),
+                              _mm_cmpeq_epi16(*preg2, min_overflow));
+  __m128i cmp3 = _mm_or_si128(_mm_cmpeq_epi16(*preg3, max_overflow),
+                              _mm_cmpeq_epi16(*preg3, min_overflow));
   cmp0 = _mm_or_si128(_mm_or_si128(cmp0, cmp1), _mm_or_si128(cmp2, cmp3));
   return _mm_movemask_epi8(cmp0);
 }

-static INLINE int check_epi16_overflow_x8(__m128i reg0, __m128i reg1,
-                                          __m128i reg2, __m128i reg3,
-                                          __m128i reg4, __m128i reg5,
-                                          __m128i reg6, __m128i reg7) {
+static INLINE int check_epi16_overflow_x8(const __m128i *preg0,
+                                          const __m128i *preg1,
+                                          const __m128i *preg2,
+                                          const __m128i *preg3,
+                                          const __m128i *preg4,
+                                          const __m128i *preg5,
+                                          const __m128i *preg6,
+                                          const __m128i *preg7) {
   int res0, res1;
-  res0 = check_epi16_overflow_x4(reg0, reg1, reg2, reg3);
-  res1 = check_epi16_overflow_x4(reg4, reg5, reg6, reg7);
+  res0 = check_epi16_overflow_x4(preg0, preg1, preg2, preg3);
+  res1 = check_epi16_overflow_x4(preg4, preg5, preg6, preg7);
   return res0 + res1;
 }

-static INLINE int check_epi16_overflow_x12(__m128i reg0, __m128i reg1,
-    __m128i reg2, __m128i reg3, __m128i reg4,
-    __m128i reg5, __m128i reg6, __m128i reg7,
-    __m128i reg8, __m128i reg9, __m128i reg10,
-    __m128i reg11) {
+static INLINE int check_epi16_overflow_x12(const __m128i *preg0,
+                                           const __m128i *preg1,
+                                           const __m128i *preg2,
+                                           const __m128i *preg3,
+                                           const __m128i *preg4,
+                                           const __m128i *preg5,
+                                           const __m128i *preg6,
+                                           const __m128i *preg7,
+                                           const __m128i *preg8,
+                                           const __m128i *preg9,
+                                           const __m128i *preg10,
+                                           const __m128i *preg11) {
   int res0, res1;
-  res0 = check_epi16_overflow_x4(reg0, reg1, reg2, reg3);
-  res1 = check_epi16_overflow_x4(reg4, reg5, reg6, reg7);
+  res0 = check_epi16_overflow_x4(preg0, preg1, preg2, preg3);
+  res1 = check_epi16_overflow_x4(preg4, preg5, preg6, preg7);
   if (!res0)
-    res0 = check_epi16_overflow_x4(reg8, reg9, reg10, reg11);
+    res0 = check_epi16_overflow_x4(preg8, preg9, preg10, preg11);
   return res0 + res1;
 }

-static INLINE int check_epi16_overflow_x16(__m128i reg0, __m128i reg1,
-    __m128i reg2, __m128i reg3, __m128i reg4,
-    __m128i reg5, __m128i reg6, __m128i reg7,
-    __m128i reg8, __m128i reg9, __m128i reg10,
-    __m128i reg11, __m128i reg12, __m128i reg13,
-    __m128i reg14, __m128i reg15) {
+static INLINE int check_epi16_overflow_x16(const __m128i *preg0,
+                                           const __m128i *preg1,
+                                           const __m128i *preg2,
+                                           const __m128i *preg3,
+                                           const __m128i *preg4,
+                                           const __m128i *preg5,
+                                           const __m128i *preg6,
+                                           const __m128i *preg7,
+                                           const __m128i *preg8,
+                                           const __m128i *preg9,
+                                           const __m128i *preg10,
+                                           const __m128i *preg11,
+                                           const __m128i *preg12,
+                                           const __m128i *preg13,
+                                           const __m128i *preg14,
+                                           const __m128i *preg15) {
   int res0, res1;
-  res0 = check_epi16_overflow_x4(reg0, reg1, reg2, reg3);
-  res1 = check_epi16_overflow_x4(reg4, reg5, reg6, reg7);
+  res0 = check_epi16_overflow_x4(preg0, preg1, preg2, preg3);
+  res1 = check_epi16_overflow_x4(preg4, preg5, preg6, preg7);
   if (!res0) {
-    res0 = check_epi16_overflow_x4(reg8, reg9, reg10, reg11);
+    res0 = check_epi16_overflow_x4(preg8, preg9, preg10, preg11);
     if (!res1)
-      res1 = check_epi16_overflow_x4(reg12, reg13, reg14, reg15);
+      res1 = check_epi16_overflow_x4(preg12, preg13, preg14, preg15);
   }
   return res0 + res1;
 }

-static INLINE int check_epi16_overflow_x32(__m128i reg0, __m128i reg1,
-    __m128i reg2, __m128i reg3, __m128i reg4,
-    __m128i reg5, __m128i reg6, __m128i reg7,
-    __m128i reg8, __m128i reg9, __m128i reg10,
-    __m128i reg11, __m128i reg12, __m128i reg13,
-    __m128i reg14, __m128i reg15, __m128i reg16,
-    __m128i reg17, __m128i reg18, __m128i reg19,
-    __m128i reg20, __m128i reg21, __m128i reg22,
-    __m128i reg23, __m128i reg24, __m128i reg25,
-    __m128i reg26, __m128i reg27, __m128i reg28,
-    __m128i reg29, __m128i reg30, __m128i reg31) {
+static INLINE int check_epi16_overflow_x32(const __m128i *preg0,
+                                           const __m128i *preg1,
+                                           const __m128i *preg2,
+                                           const __m128i *preg3,
+                                           const __m128i *preg4,
+                                           const __m128i *preg5,
+                                           const __m128i *preg6,
+                                           const __m128i *preg7,
+                                           const __m128i *preg8,
+                                           const __m128i *preg9,
+                                           const __m128i *preg10,
+                                           const __m128i *preg11,
+                                           const __m128i *preg12,
+                                           const __m128i *preg13,
+                                           const __m128i *preg14,
+                                           const __m128i *preg15,
+                                           const __m128i *preg16,
+                                           const __m128i *preg17,
+                                           const __m128i *preg18,
+                                           const __m128i *preg19,
+                                           const __m128i *preg20,
+                                           const __m128i *preg21,
+                                           const __m128i *preg22,
+                                           const __m128i *preg23,
+                                           const __m128i *preg24,
+                                           const __m128i *preg25,
+                                           const __m128i *preg26,
+                                           const __m128i *preg27,
+                                           const __m128i *preg28,
+                                           const __m128i *preg29,
+                                           const __m128i *preg30,
+                                           const __m128i *preg31) {
   int res0, res1;
-  res0 = check_epi16_overflow_x4(reg0, reg1, reg2, reg3);
-  res1 = check_epi16_overflow_x4(reg4, reg5, reg6, reg7);
+  res0 = check_epi16_overflow_x4(preg0, preg1, preg2, preg3);
+  res1 = check_epi16_overflow_x4(preg4, preg5, preg6, preg7);
   if (!res0) {
-    res0 = check_epi16_overflow_x4(reg8, reg9, reg10, reg11);
+    res0 = check_epi16_overflow_x4(preg8, preg9, preg10, preg11);
     if (!res1) {
-      res1 = check_epi16_overflow_x4(reg12, reg13, reg14, reg15);
+      res1 = check_epi16_overflow_x4(preg12, preg13, preg14, preg15);
       if (!res0) {
-        res0 = check_epi16_overflow_x4(reg16, reg17, reg18, reg19);
+        res0 = check_epi16_overflow_x4(preg16, preg17, preg18, preg19);
        if (!res1) {
-          res1 = check_epi16_overflow_x4(reg20, reg21, reg22, reg23);
+          res1 = check_epi16_overflow_x4(preg20, preg21, preg22, preg23);
           if (!res0) {
-            res0 = check_epi16_overflow_x4(reg24, reg25, reg26, reg27);
+            res0 = check_epi16_overflow_x4(preg24, preg25, preg26, preg27);
             if (!res1)
-              res1 = check_epi16_overflow_x4(reg28, reg29, reg30, reg31);
+              res1 = check_epi16_overflow_x4(preg28, preg29, preg30, preg31);
           }
         }
       }
@@ -144,14 +189,17 @@ static INLINE int check_epi16_overflow_x32(__m128i reg0, __m128i reg1,
   return res0 + res1;
 }

-static INLINE int k_check_epi32_overflow_4(__m128i reg0, __m128i reg1,
-    __m128i reg2, __m128i reg3, const __m128i* zero) {
+static INLINE int k_check_epi32_overflow_4(const __m128i *preg0,
+                                           const __m128i *preg1,
+                                           const __m128i *preg2,
+                                           const __m128i *preg3,
+                                           const __m128i *zero) {
   __m128i minus_one = _mm_set1_epi32(-1);
   // Check for overflows
-  __m128i reg0_shifted = _mm_slli_epi64(reg0, 1);
-  __m128i reg1_shifted = _mm_slli_epi64(reg1, 1);
-  __m128i reg2_shifted = _mm_slli_epi64(reg2, 1);
-  __m128i reg3_shifted = _mm_slli_epi64(reg3, 1);
+  __m128i reg0_shifted = _mm_slli_epi64(*preg0, 1);
+  __m128i reg1_shifted = _mm_slli_epi64(*preg1, 1);
+  __m128i reg2_shifted = _mm_slli_epi64(*preg2, 1);
+  __m128i reg3_shifted = _mm_slli_epi64(*preg3, 1);
   __m128i reg0_top_dwords = _mm_shuffle_epi32(
       reg0_shifted, _MM_SHUFFLE(0, 0, 3, 1));
   __m128i reg1_top_dwords = _mm_shuffle_epi32(
@@ -173,65 +221,107 @@ static INLINE int k_check_epi32_overflow_4(__m128i reg0, __m128i reg1,
   return (overflow_01 + overflow_23);
 }

-static INLINE int k_check_epi32_overflow_8(__m128i reg0, __m128i reg1,
-                                           __m128i reg2, __m128i reg3,
-                                           __m128i reg4, __m128i reg5,
-                                           __m128i reg6, __m128i reg7,
-                                           const __m128i* zero) {
-  int overflow = k_check_epi32_overflow_4(reg0, reg1, reg2, reg3, zero);
+static INLINE int k_check_epi32_overflow_8(const __m128i *preg0,
+                                           const __m128i *preg1,
+                                           const __m128i *preg2,
+                                           const __m128i *preg3,
+                                           const __m128i *preg4,
+                                           const __m128i *preg5,
+                                           const __m128i *preg6,
+                                           const __m128i *preg7,
+                                           const __m128i *zero) {
+  int overflow = k_check_epi32_overflow_4(preg0, preg1, preg2, preg3, zero);
   if (!overflow) {
-    overflow = k_check_epi32_overflow_4(reg4, reg5, reg6, reg7, zero);
+    overflow = k_check_epi32_overflow_4(preg4, preg5, preg6, preg7, zero);
   }
   return overflow;
 }

-static INLINE int k_check_epi32_overflow_16(
-    __m128i reg0, __m128i reg1, __m128i reg2, __m128i reg3,
-    __m128i reg4, __m128i reg5, __m128i reg6, __m128i reg7,
-    __m128i reg8, __m128i reg9, __m128i reg10, __m128i reg11,
-    __m128i reg12, __m128i reg13, __m128i reg14, __m128i reg15,
-    const __m128i* zero) {
-  int overflow = k_check_epi32_overflow_4(reg0, reg1, reg2, reg3, zero);
+static INLINE int k_check_epi32_overflow_16(const __m128i *preg0,
+                                            const __m128i *preg1,
+                                            const __m128i *preg2,
+                                            const __m128i *preg3,
+                                            const __m128i *preg4,
+                                            const __m128i *preg5,
+                                            const __m128i *preg6,
+                                            const __m128i *preg7,
+                                            const __m128i *preg8,
+                                            const __m128i *preg9,
+                                            const __m128i *preg10,
+                                            const __m128i *preg11,
+                                            const __m128i *preg12,
+                                            const __m128i *preg13,
+                                            const __m128i *preg14,
+                                            const __m128i *preg15,
+                                            const __m128i *zero) {
+  int overflow = k_check_epi32_overflow_4(preg0, preg1, preg2, preg3, zero);
   if (!overflow) {
-    overflow = k_check_epi32_overflow_4(reg4, reg5, reg6, reg7, zero);
+    overflow = k_check_epi32_overflow_4(preg4, preg5, preg6, preg7, zero);
     if (!overflow) {
-      overflow = k_check_epi32_overflow_4(reg8, reg9, reg10, reg11, zero);
+      overflow = k_check_epi32_overflow_4(preg8, preg9, preg10, preg11,
+                                          zero);
       if (!overflow) {
-        overflow = k_check_epi32_overflow_4(reg12, reg13, reg14, reg15, zero);
+        overflow = k_check_epi32_overflow_4(preg12, preg13, preg14, preg15,
+                                            zero);
       }
     }
   }
   return overflow;
 }

-static INLINE int k_check_epi32_overflow_32(
-    __m128i reg0, __m128i reg1, __m128i reg2, __m128i reg3,
-    __m128i reg4, __m128i reg5, __m128i reg6, __m128i reg7,
-    __m128i reg8, __m128i reg9, __m128i reg10, __m128i reg11,
-    __m128i reg12, __m128i reg13, __m128i reg14, __m128i reg15,
-    __m128i reg16, __m128i reg17, __m128i reg18, __m128i reg19,
-    __m128i reg20, __m128i reg21, __m128i reg22, __m128i reg23,
-    __m128i reg24, __m128i reg25, __m128i reg26, __m128i reg27,
-    __m128i reg28, __m128i reg29, __m128i reg30, __m128i reg31,
-    const __m128i* zero) {
-  int overflow = k_check_epi32_overflow_4(reg0, reg1, reg2, reg3, zero);
+static INLINE int k_check_epi32_overflow_32(const __m128i *preg0,
+                                            const __m128i *preg1,
+                                            const __m128i *preg2,
+                                            const __m128i *preg3,
+                                            const __m128i *preg4,
+                                            const __m128i *preg5,
+                                            const __m128i *preg6,
+                                            const __m128i *preg7,
+                                            const __m128i *preg8,
+                                            const __m128i *preg9,
+                                            const __m128i *preg10,
+                                            const __m128i *preg11,
+                                            const __m128i *preg12,
+                                            const __m128i *preg13,
+                                            const __m128i *preg14,
+                                            const __m128i *preg15,
+                                            const __m128i *preg16,
+                                            const __m128i *preg17,
+                                            const __m128i *preg18,
+                                            const __m128i *preg19,
+                                            const __m128i *preg20,
+                                            const __m128i *preg21,
+                                            const __m128i *preg22,
+                                            const __m128i *preg23,
+                                            const __m128i *preg24,
+                                            const __m128i *preg25,
+                                            const __m128i *preg26,
+                                            const __m128i *preg27,
+                                            const __m128i *preg28,
+                                            const __m128i *preg29,
+                                            const __m128i *preg30,
+                                            const __m128i *preg31,
+                                            const __m128i *zero) {
+  int overflow = k_check_epi32_overflow_4(preg0, preg1, preg2, preg3, zero);
   if (!overflow) {
-    overflow = k_check_epi32_overflow_4(reg4, reg5, reg6, reg7, zero);
+    overflow = k_check_epi32_overflow_4(preg4, preg5, preg6, preg7, zero);
     if (!overflow) {
-      overflow = k_check_epi32_overflow_4(reg8, reg9, reg10, reg11, zero);
+      overflow = k_check_epi32_overflow_4(preg8, preg9, preg10, preg11, zero);
       if (!overflow) {
-        overflow = k_check_epi32_overflow_4(reg12, reg13, reg14, reg15, zero);
+        overflow = k_check_epi32_overflow_4(preg12, preg13, preg14, preg15,
+                                            zero);
         if (!overflow) {
-          overflow = k_check_epi32_overflow_4(reg16, reg17, reg18, reg19, zero);
+          overflow = k_check_epi32_overflow_4(preg16, preg17, preg18, preg19,
+                                              zero);
           if (!overflow) {
-            overflow = k_check_epi32_overflow_4(reg20, reg21,
-                                                reg22, reg23, zero);
+            overflow = k_check_epi32_overflow_4(preg20, preg21,
+                                                preg22, preg23, zero);
             if (!overflow) {
-              overflow = k_check_epi32_overflow_4(reg24, reg25,
-                                                  reg26, reg27, zero);
+              overflow = k_check_epi32_overflow_4(preg24, preg25,
+                                                  preg26, preg27, zero);
               if (!overflow) {
-                overflow = k_check_epi32_overflow_4(reg28, reg29,
-                                                    reg30, reg31, zero);
+                overflow = k_check_epi32_overflow_4(preg28, preg29,
+                                                    preg30, preg31, zero);
               }
             }
           }
@@ -242,51 +332,52 @@ static INLINE int k_check_epi32_overflow_32(
   return overflow;
 }

-static INLINE void store_output(const __m128i output, tran_low_t* dst_ptr) {
+static INLINE void store_output(const __m128i *poutput, tran_low_t* dst_ptr) {
 #if CONFIG_VP9_HIGHBITDEPTH
   const __m128i zero = _mm_setzero_si128();
-  const __m128i sign_bits = _mm_cmplt_epi16(output, zero);
-  __m128i out0 = _mm_unpacklo_epi16(output, sign_bits);
-  __m128i out1 = _mm_unpackhi_epi16(output, sign_bits);
+  const __m128i sign_bits = _mm_cmplt_epi16(*poutput, zero);
+  __m128i out0 = _mm_unpacklo_epi16(*poutput, sign_bits);
+  __m128i out1 = _mm_unpackhi_epi16(*poutput, sign_bits);
   _mm_store_si128((__m128i *)(dst_ptr), out0);
   _mm_store_si128((__m128i *)(dst_ptr + 4), out1);
 #else
-  _mm_store_si128((__m128i *)(dst_ptr), output);
+  _mm_store_si128((__m128i *)(dst_ptr), *poutput);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 }

-static INLINE void storeu_output(const __m128i output, tran_low_t* dst_ptr) {
+static INLINE void storeu_output(const __m128i *poutput, tran_low_t* dst_ptr) {
 #if CONFIG_VP9_HIGHBITDEPTH
   const __m128i zero = _mm_setzero_si128();
-  const __m128i sign_bits = _mm_cmplt_epi16(output, zero);
-  __m128i out0 = _mm_unpacklo_epi16(output, sign_bits);
-  __m128i out1 = _mm_unpackhi_epi16(output, sign_bits);
+  const __m128i sign_bits = _mm_cmplt_epi16(*poutput, zero);
+  __m128i out0 = _mm_unpacklo_epi16(*poutput, sign_bits);
+  __m128i out1 = _mm_unpackhi_epi16(*poutput, sign_bits);
   _mm_storeu_si128((__m128i *)(dst_ptr), out0);
   _mm_storeu_si128((__m128i *)(dst_ptr + 4), out1);
 #else
-  _mm_storeu_si128((__m128i *)(dst_ptr), output);
+  _mm_storeu_si128((__m128i *)(dst_ptr), *poutput);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 }

-static INLINE __m128i mult_round_shift(const __m128i in0, const __m128i in1,
-                                       const __m128i multiplier,
-                                       const __m128i rounding,
+static INLINE __m128i mult_round_shift(const __m128i *pin0,
+                                       const __m128i *pin1,
+                                       const __m128i *pmultiplier,
+                                       const __m128i *prounding,
                                        const int shift) {
-  const __m128i u0 = _mm_madd_epi16(in0, multiplier);
-  const __m128i u1 = _mm_madd_epi16(in1, multiplier);
-  const __m128i v0 = _mm_add_epi32(u0, rounding);
-  const __m128i v1 = _mm_add_epi32(u1, rounding);
+  const __m128i u0 = _mm_madd_epi16(*pin0, *pmultiplier);
+  const __m128i u1 = _mm_madd_epi16(*pin1, *pmultiplier);
+  const __m128i v0 = _mm_add_epi32(u0, *prounding);
+  const __m128i v1 = _mm_add_epi32(u1, *prounding);
   const __m128i w0 = _mm_srai_epi32(v0, shift);
   const __m128i w1 = _mm_srai_epi32(v1, shift);
   return _mm_packs_epi32(w0, w1);
 }

 static INLINE void transpose_and_output8x8(
-    const __m128i in00, const __m128i in01,
-    const __m128i in02, const __m128i in03,
-    const __m128i in04, const __m128i in05,
-    const __m128i in06, const __m128i in07,
+    const __m128i *pin00, const __m128i *pin01,
+    const __m128i *pin02, const __m128i *pin03,
+    const __m128i *pin04, const __m128i *pin05,
+    const __m128i *pin06, const __m128i *pin07,
     const int pass, int16_t* out0_ptr,
     tran_low_t* out1_ptr) {
   // 00 01 02 03 04 05 06 07
@@ -297,14 +388,14 @@ static INLINE void transpose_and_output8x8(
   // 50 51 52 53 54 55 56 57
   // 60 61 62 63 64 65 66 67
   // 70 71 72 73 74 75 76 77
-  const __m128i tr0_0 = _mm_unpacklo_epi16(in00, in01);
-  const __m128i tr0_1 = _mm_unpacklo_epi16(in02, in03);
-  const __m128i tr0_2 = _mm_unpackhi_epi16(in00, in01);
-  const __m128i tr0_3 = _mm_unpackhi_epi16(in02, in03);
-  const __m128i tr0_4 = _mm_unpacklo_epi16(in04, in05);
-  const __m128i tr0_5 = _mm_unpacklo_epi16(in06, in07);
-  const __m128i tr0_6 = _mm_unpackhi_epi16(in04, in05);
-  const __m128i tr0_7 = _mm_unpackhi_epi16(in06, in07);
+  const __m128i tr0_0 = _mm_unpacklo_epi16(*pin00, *pin01);
+  const __m128i tr0_1 = _mm_unpacklo_epi16(*pin02, *pin03);
+  const __m128i tr0_2 = _mm_unpackhi_epi16(*pin00, *pin01);
+  const __m128i tr0_3 = _mm_unpackhi_epi16(*pin02, *pin03);
+  const __m128i tr0_4 = _mm_unpacklo_epi16(*pin04, *pin05);
+  const __m128i tr0_5 = _mm_unpacklo_epi16(*pin06, *pin07);
+  const __m128i tr0_6 = _mm_unpackhi_epi16(*pin04, *pin05);
+  const __m128i tr0_7 = _mm_unpackhi_epi16(*pin06, *pin07);
   // 00 10 01 11 02 12 03 13
   // 20 30 21 31 22 32 23 33
   // 04 14 05 15 06 16 07 17
@@ -355,14 +446,14 @@ static INLINE void transpose_and_output8x8(
     _mm_storeu_si128((__m128i*)(out0_ptr + 6 * 16), tr2_6);
     _mm_storeu_si128((__m128i*)(out0_ptr + 7 * 16), tr2_7);
   } else {
-    storeu_output(tr2_0, (out1_ptr + 0 * 16));
-    storeu_output(tr2_1, (out1_ptr + 1 * 16));
-    storeu_output(tr2_2, (out1_ptr + 2 * 16));
-    storeu_output(tr2_3, (out1_ptr + 3 * 16));
-    storeu_output(tr2_4, (out1_ptr + 4 * 16));
-    storeu_output(tr2_5, (out1_ptr + 5 * 16));
-    storeu_output(tr2_6, (out1_ptr + 6 * 16));
-    storeu_output(tr2_7, (out1_ptr + 7 * 16));
+    storeu_output(&tr2_0, (out1_ptr + 0 * 16));
+    storeu_output(&tr2_1, (out1_ptr + 1 * 16));
+    storeu_output(&tr2_2, (out1_ptr + 2 * 16));
+    storeu_output(&tr2_3, (out1_ptr + 3 * 16));
+    storeu_output(&tr2_4, (out1_ptr + 4 * 16));
+    storeu_output(&tr2_5, (out1_ptr + 5 * 16));
+    storeu_output(&tr2_6, (out1_ptr + 6 * 16));
+    storeu_output(&tr2_7, (out1_ptr + 7 * 16));
   }
 }
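
---
Editor's note (not part of the patch): the reason this commit fixes the
Visual Studio build is an ABI limitation, not a logic bug. On 32-bit x86,
Visual C++ passes at most the first three __m128i arguments in XMM
registers; any further by-value __m128i would have to live on the 4-byte
aligned stack, where its required 16-byte alignment cannot be guaranteed,
so compilation fails with error C2719 ("formal parameter with requested
alignment of 16 won't be aligned"). GCC and Clang accept the old by-value
signatures, which is why the breakage surfaced only under Visual Studio.
The sketch below is illustrative only (the file and function names are
invented, not from libvpx) and contrasts the two calling styles under
that assumption:

    /* msvc_m128i_params.c -- minimal sketch of the pattern this patch
     * adopts.  Build: cl /arch:SSE2 msvc_m128i_params.c
     *              or gcc -msse2 msvc_m128i_params.c */
    #include <emmintrin.h>
    #include <stdio.h>

    #if 0
    /* By-value form: 32-bit MSVC rejects this with error C2719 once the
     * fourth 16-byte-aligned argument would spill to the stack. */
    static int any_saturated(__m128i r0, __m128i r1, __m128i r2, __m128i r3);
    #endif

    /* By-pointer form, as used throughout the patch.  After inlining the
     * compiler still keeps the values in XMM registers, so the change is
     * about the calling convention, not about extra memory traffic. */
    static int any_saturated(const __m128i *r0, const __m128i *r1,
                             const __m128i *r2, const __m128i *r3) {
      /* A lane equal to INT16_MAX or INT16_MIN may have saturated in an
       * earlier pack/add, mirroring what check_epi16_overflow_x4() tests. */
      const __m128i max16 = _mm_set1_epi16(0x7fff);
      const __m128i min16 = _mm_set1_epi16((short)0x8000);
      const __m128i hit01 = _mm_or_si128(
          _mm_or_si128(_mm_cmpeq_epi16(*r0, max16),
                       _mm_cmpeq_epi16(*r0, min16)),
          _mm_or_si128(_mm_cmpeq_epi16(*r1, max16),
                       _mm_cmpeq_epi16(*r1, min16)));
      const __m128i hit23 = _mm_or_si128(
          _mm_or_si128(_mm_cmpeq_epi16(*r2, max16),
                       _mm_cmpeq_epi16(*r2, min16)),
          _mm_or_si128(_mm_cmpeq_epi16(*r3, max16),
                       _mm_cmpeq_epi16(*r3, min16)));
      return _mm_movemask_epi8(_mm_or_si128(hit01, hit23));
    }

    int main(void) {
      __m128i a = _mm_set1_epi16(0x7fff);  /* saturated lanes */
      __m128i b = _mm_set1_epi16(42);
      __m128i c = _mm_setzero_si128();
      __m128i d = _mm_set1_epi16(-42);
      /* Nonzero mask because of 'a': callers just add '&', callees
       * dereference, exactly as in the diff above. */
      printf("overflow mask: 0x%04x\n", any_saturated(&a, &b, &c, &d));
      return 0;
    }

Two-argument helpers such as check_epi16_overflow_x2 compiled fine even
by value; converting them too, presumably for consistency, is what makes
the diff large, but every call site remains a mechanical, easily reviewed
rewrite.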