diff --git a/src/dsp/rescaler_sse2.c b/src/dsp/rescaler_sse2.c index 57f4c818..4ed6d37b 100644 --- a/src/dsp/rescaler_sse2.c +++ b/src/dsp/rescaler_sse2.c @@ -27,7 +27,7 @@ #define MULT_FIX(x, y) (((uint64_t)(x) * (y) + ROUNDER) >> WEBP_RESCALER_RFIX) // input: 8 bytes ABCDEFGH -> output: A0E0B0F0C0G0D0H0 -static void LoadTwoPixels(const uint8_t* const src, __m128i* out) { +static void LoadTwoPixels_SSE2(const uint8_t* const src, __m128i* out) { const __m128i zero = _mm_setzero_si128(); const __m128i A = _mm_loadl_epi64((const __m128i*)(src)); // ABCDEFGH const __m128i B = _mm_unpacklo_epi8(A, zero); // A0B0C0D0E0F0G0H0 @@ -36,7 +36,7 @@ static void LoadTwoPixels(const uint8_t* const src, __m128i* out) { } // input: 8 bytes ABCDEFGH -> output: A0B0C0D0E0F0G0H0 -static void LoadHeightPixels(const uint8_t* const src, __m128i* out) { +static void LoadHeightPixels_SSE2(const uint8_t* const src, __m128i* out) { const __m128i zero = _mm_setzero_si128(); const __m128i A = _mm_loadl_epi64((const __m128i*)(src)); // ABCDEFGH *out = _mm_unpacklo_epi8(A, zero); @@ -57,7 +57,7 @@ static void RescalerImportRowExpand_SSE2(WebPRescaler* const wrk, WebPRescalerImportRowExpand_C(wrk, src); return; } - LoadTwoPixels(src, &cur_pixels); + LoadTwoPixels_SSE2(src, &cur_pixels); src += 4; while (1) { const __m128i mult = _mm_set1_epi32(((x_add - accum) << 16) | accum); @@ -67,7 +67,7 @@ static void RescalerImportRowExpand_SSE2(WebPRescaler* const wrk, if (frow >= frow_end) break; accum -= wrk->x_sub; if (accum < 0) { - LoadTwoPixels(src, &cur_pixels); + LoadTwoPixels_SSE2(src, &cur_pixels); src += 4; accum += x_add; } @@ -79,7 +79,7 @@ static void RescalerImportRowExpand_SSE2(WebPRescaler* const wrk, WebPRescalerImportRowExpand_C(wrk, src); return; } - LoadHeightPixels(src, &cur_pixels); + LoadHeightPixels_SSE2(src, &cur_pixels); src += 7; left = 7; while (1) { @@ -94,7 +94,7 @@ static void RescalerImportRowExpand_SSE2(WebPRescaler* const wrk, if (--left) { cur_pixels = _mm_srli_si128(cur_pixels, 2); } else if (src <= src_limit) { - LoadHeightPixels(src, &cur_pixels); + LoadHeightPixels_SSE2(src, &cur_pixels); src += 7; left = 7; } else { // tail @@ -169,12 +169,12 @@ static void RescalerImportRowShrink_SSE2(WebPRescaler* const wrk, // Row export // load *src as epi64, multiply by mult and store result in [out0 ... out3] -static WEBP_INLINE void LoadDispatchAndMult(const rescaler_t* const src, - const __m128i* const mult, - __m128i* const out0, - __m128i* const out1, - __m128i* const out2, - __m128i* const out3) { +static WEBP_INLINE void LoadDispatchAndMult_SSE2(const rescaler_t* const src, + const __m128i* const mult, + __m128i* const out0, + __m128i* const out1, + __m128i* const out2, + __m128i* const out3) { const __m128i A0 = _mm_loadu_si128((const __m128i*)(src + 0)); const __m128i A1 = _mm_loadu_si128((const __m128i*)(src + 4)); const __m128i A2 = _mm_srli_epi64(A0, 32); @@ -192,12 +192,12 @@ static WEBP_INLINE void LoadDispatchAndMult(const rescaler_t* const src, } } -static WEBP_INLINE void ProcessRow(const __m128i* const A0, - const __m128i* const A1, - const __m128i* const A2, - const __m128i* const A3, - const __m128i* const mult, - uint8_t* const dst) { +static WEBP_INLINE void ProcessRow_SSE2(const __m128i* const A0, + const __m128i* const A1, + const __m128i* const A2, + const __m128i* const A3, + const __m128i* const mult, + uint8_t* const dst) { const __m128i rounder = _mm_set_epi32(0, ROUNDER, 0, ROUNDER); const __m128i mask = _mm_set_epi32(0xffffffffu, 0, 0xffffffffu, 0); const __m128i B0 = _mm_mul_epu32(*A0, *mult); @@ -240,8 +240,8 @@ static void RescalerExportRowExpand_SSE2(WebPRescaler* const wrk) { if (wrk->y_accum == 0) { for (x_out = 0; x_out + 8 <= x_out_max; x_out += 8) { __m128i A0, A1, A2, A3; - LoadDispatchAndMult(frow + x_out, NULL, &A0, &A1, &A2, &A3); - ProcessRow(&A0, &A1, &A2, &A3, &mult, dst + x_out); + LoadDispatchAndMult_SSE2(frow + x_out, NULL, &A0, &A1, &A2, &A3); + ProcessRow_SSE2(&A0, &A1, &A2, &A3, &mult, dst + x_out); } for (; x_out < x_out_max; ++x_out) { const uint32_t J = frow[x_out]; @@ -257,8 +257,8 @@ static void RescalerExportRowExpand_SSE2(WebPRescaler* const wrk) { const __m128i rounder = _mm_set_epi32(0, ROUNDER, 0, ROUNDER); for (x_out = 0; x_out + 8 <= x_out_max; x_out += 8) { __m128i A0, A1, A2, A3, B0, B1, B2, B3; - LoadDispatchAndMult(frow + x_out, &mA, &A0, &A1, &A2, &A3); - LoadDispatchAndMult(irow + x_out, &mB, &B0, &B1, &B2, &B3); + LoadDispatchAndMult_SSE2(frow + x_out, &mA, &A0, &A1, &A2, &A3); + LoadDispatchAndMult_SSE2(irow + x_out, &mB, &B0, &B1, &B2, &B3); { const __m128i C0 = _mm_add_epi64(A0, B0); const __m128i C1 = _mm_add_epi64(A1, B1); @@ -272,7 +272,7 @@ static void RescalerExportRowExpand_SSE2(WebPRescaler* const wrk) { const __m128i E1 = _mm_srli_epi64(D1, WEBP_RESCALER_RFIX); const __m128i E2 = _mm_srli_epi64(D2, WEBP_RESCALER_RFIX); const __m128i E3 = _mm_srli_epi64(D3, WEBP_RESCALER_RFIX); - ProcessRow(&E0, &E1, &E2, &E3, &mult, dst + x_out); + ProcessRow_SSE2(&E0, &E1, &E2, &E3, &mult, dst + x_out); } } for (; x_out < x_out_max; ++x_out) { @@ -303,8 +303,8 @@ static void RescalerExportRowShrink_SSE2(WebPRescaler* const wrk) { const __m128i rounder = _mm_set_epi32(0, ROUNDER, 0, ROUNDER); for (x_out = 0; x_out + 8 <= x_out_max; x_out += 8) { __m128i A0, A1, A2, A3, B0, B1, B2, B3; - LoadDispatchAndMult(irow + x_out, NULL, &A0, &A1, &A2, &A3); - LoadDispatchAndMult(frow + x_out, &mult_y, &B0, &B1, &B2, &B3); + LoadDispatchAndMult_SSE2(irow + x_out, NULL, &A0, &A1, &A2, &A3); + LoadDispatchAndMult_SSE2(frow + x_out, &mult_y, &B0, &B1, &B2, &B3); { const __m128i C0 = _mm_add_epi64(B0, rounder); const __m128i C1 = _mm_add_epi64(B1, rounder); @@ -324,7 +324,7 @@ static void RescalerExportRowShrink_SSE2(WebPRescaler* const wrk) { const __m128i G1 = _mm_or_si128(D1, F3); _mm_storeu_si128((__m128i*)(irow + x_out + 0), G0); _mm_storeu_si128((__m128i*)(irow + x_out + 4), G1); - ProcessRow(&E0, &E1, &E2, &E3, &mult_xy, dst + x_out); + ProcessRow_SSE2(&E0, &E1, &E2, &E3, &mult_xy, dst + x_out); } } for (; x_out < x_out_max; ++x_out) { @@ -340,10 +340,10 @@ static void RescalerExportRowShrink_SSE2(WebPRescaler* const wrk) { const __m128i zero = _mm_setzero_si128(); for (x_out = 0; x_out + 8 <= x_out_max; x_out += 8) { __m128i A0, A1, A2, A3; - LoadDispatchAndMult(irow + x_out, NULL, &A0, &A1, &A2, &A3); + LoadDispatchAndMult_SSE2(irow + x_out, NULL, &A0, &A1, &A2, &A3); _mm_storeu_si128((__m128i*)(irow + x_out + 0), zero); _mm_storeu_si128((__m128i*)(irow + x_out + 4), zero); - ProcessRow(&A0, &A1, &A2, &A3, &mult, dst + x_out); + ProcessRow_SSE2(&A0, &A1, &A2, &A3, &mult, dst + x_out); } for (; x_out < x_out_max; ++x_out) { const int v = (int)MULT_FIX(irow[x_out], scale);