From 1cb638010c5c2927cb7c8f41f41403cac6f36a27 Mon Sep 17 00:00:00 2001 From: Vincent Rabaud Date: Wed, 21 Dec 2016 14:25:54 +0100 Subject: [PATCH] Call the C function to finish off lossless SSE loops only when necessary. Change-Id: I4e221d80879dc9c90c24d69a40bc5811d73787ad --- src/dsp/lossless_enc_sse2.c | 40 +++++++++++++++++++------- src/dsp/lossless_enc_sse41.c | 4 ++- src/dsp/lossless_sse2.c | 56 +++++++++++++++++++++++++++--------- 3 files changed, 75 insertions(+), 25 deletions(-) diff --git a/src/dsp/lossless_enc_sse2.c b/src/dsp/lossless_enc_sse2.c index 2c725253..a7c67f45 100644 --- a/src/dsp/lossless_enc_sse2.c +++ b/src/dsp/lossless_enc_sse2.c @@ -37,7 +37,9 @@ static void SubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixels) { _mm_storeu_si128((__m128i*)&argb_data[i], out); } // fallthrough and finish off with plain-C - VP8LSubtractGreenFromBlueAndRed_C(argb_data + i, num_pixels - i); + if (i != num_pixels) { + VP8LSubtractGreenFromBlueAndRed_C(argb_data + i, num_pixels - i); + } } //------------------------------------------------------------------------------ @@ -71,7 +73,9 @@ static void TransformColor(const VP8LMultipliers* const m, _mm_storeu_si128((__m128i*)&argb_data[i], out); } // fallthrough and finish off with plain-C - VP8LTransformColor_C(m, argb_data + i, num_pixels - i); + if (i != num_pixels) { + VP8LTransformColor_C(m, argb_data + i, num_pixels - i); + } } //------------------------------------------------------------------------------ @@ -477,7 +481,9 @@ static void PredictorSub0_SSE2(const uint32_t* in, const uint32_t* upper, const __m128i res = _mm_sub_epi8(src, black); _mm_storeu_si128((__m128i*)&out[i], res); } - VP8LPredictorsSub_C[0](in + i, upper + i, num_pixels - i, out + i); + if (i != num_pixels) { + VP8LPredictorsSub_C[0](in + i, upper + i, num_pixels - i, out + i); + } } #define GENERATE_PREDICTOR_1(X, IN) \ @@ -490,7 +496,9 @@ static void PredictorSub##X##_SSE2(const uint32_t* in, const uint32_t* upper, \ const __m128i res = _mm_sub_epi8(src, pred); \ _mm_storeu_si128((__m128i*)&out[i], res); \ } \ - VP8LPredictorsSub_C[(X)](in + i, upper + i, num_pixels - i, out + i); \ + if (i != num_pixels) { \ + VP8LPredictorsSub_C[(X)](in + i, upper + i, num_pixels - i, out + i); \ + } \ } GENERATE_PREDICTOR_1(1, in[i - 1]) // Predictor1: L @@ -514,7 +522,9 @@ static void PredictorSub5_SSE2(const uint32_t* in, const uint32_t* upper, res = _mm_sub_epi8(src, pred); _mm_storeu_si128((__m128i*)&out[i], res); } - VP8LPredictorsSub_C[5](in + i, upper + i, num_pixels - i, out + i); + if (i != num_pixels) { + VP8LPredictorsSub_C[5](in + i, upper + i, num_pixels - i, out + i); + } } #define GENERATE_PREDICTOR_2(X, A, B) \ @@ -530,7 +540,9 @@ static void PredictorSub##X##_SSE2(const uint32_t* in, const uint32_t* upper, \ res = _mm_sub_epi8(src, pred); \ _mm_storeu_si128((__m128i*)&out[i], res); \ } \ - VP8LPredictorsSub_C[(X)](in + i, upper + i, num_pixels - i, out + i); \ + if (i != num_pixels) { \ + VP8LPredictorsSub_C[(X)](in + i, upper + i, num_pixels - i, out + i); \ + } \ } GENERATE_PREDICTOR_2(6, in[i - 1], upper[i - 1]) // Predictor6: avg(L, TL) @@ -556,7 +568,9 @@ static void PredictorSub10_SSE2(const uint32_t* in, const uint32_t* upper, res = _mm_sub_epi8(src, avg); _mm_storeu_si128((__m128i*)&out[i], res); } - VP8LPredictorsSub_C[10](in + i, upper + i, num_pixels - i, out + i); + if (i != num_pixels) { + VP8LPredictorsSub_C[10](in + i, upper + i, num_pixels - i, out + i); + } } // Predictor11: select. @@ -593,7 +607,9 @@ static void PredictorSub11_SSE2(const uint32_t* in, const uint32_t* upper, _mm_storeu_si128((__m128i*)&out[i], res); } } - VP8LPredictorsSub_C[11](in + i, upper + i, num_pixels - i, out + i); + if (i != num_pixels) { + VP8LPredictorsSub_C[11](in + i, upper + i, num_pixels - i, out + i); + } } // Predictor12: ClampedSubSubtractFull. @@ -620,7 +636,9 @@ static void PredictorSub12_SSE2(const uint32_t* in, const uint32_t* upper, const __m128i res = _mm_sub_epi8(src, pred); _mm_storeu_si128((__m128i*)&out[i], res); } - VP8LPredictorsSub_C[12](in + i, upper + i, num_pixels - i, out + i); + if (i != num_pixels) { + VP8LPredictorsSub_C[12](in + i, upper + i, num_pixels - i, out + i); + } } // Predictors13: ClampedAddSubtractHalf @@ -648,7 +666,9 @@ static void PredictorSub13_SSE2(const uint32_t* in, const uint32_t* upper, const __m128i res = _mm_sub_epi8(src, pred); _mm_storel_epi64((__m128i*)&out[i], res); } - VP8LPredictorsSub_C[13](in + i, upper + i, num_pixels - i, out + i); + if (i != num_pixels) { + VP8LPredictorsSub_C[13](in + i, upper + i, num_pixels - i, out + i); + } } //------------------------------------------------------------------------------ diff --git a/src/dsp/lossless_enc_sse41.c b/src/dsp/lossless_enc_sse41.c index 3e493198..821057cc 100644 --- a/src/dsp/lossless_enc_sse41.c +++ b/src/dsp/lossless_enc_sse41.c @@ -32,7 +32,9 @@ static void SubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixels) { _mm_storeu_si128((__m128i*)&argb_data[i], out); } // fallthrough and finish off with plain-C - VP8LSubtractGreenFromBlueAndRed_C(argb_data + i, num_pixels - i); + if (i != num_pixels) { + VP8LSubtractGreenFromBlueAndRed_C(argb_data + i, num_pixels - i); + } } //------------------------------------------------------------------------------ diff --git a/src/dsp/lossless_sse2.c b/src/dsp/lossless_sse2.c index 23a7e2d0..15aae938 100644 --- a/src/dsp/lossless_sse2.c +++ b/src/dsp/lossless_sse2.c @@ -186,7 +186,9 @@ static void PredictorAdd0_SSE2(const uint32_t* in, const uint32_t* upper, const __m128i res = _mm_add_epi8(src, black); _mm_storeu_si128((__m128i*)&out[i], res); } - VP8LPredictorsAdd_C[0](in + i, upper + i, num_pixels - i, out + i); + if (i != num_pixels) { + VP8LPredictorsAdd_C[0](in + i, upper + i, num_pixels - i, out + i); + } } // Predictor1: left. @@ -210,7 +212,9 @@ static void PredictorAdd1_SSE2(const uint32_t* in, const uint32_t* upper, // replicate prev output on the four lanes prev = _mm_shuffle_epi32(res, (3 << 0) | (3 << 2) | (3 << 4) | (3 << 6)); } - VP8LPredictorsAdd_C[1](in + i, upper + i, num_pixels - i, out + i); + if (i != num_pixels) { + VP8LPredictorsAdd_C[1](in + i, upper + i, num_pixels - i, out + i); + } } // Macro that adds 32-bit integers from IN using mod 256 arithmetic @@ -225,7 +229,9 @@ static void PredictorAdd##X##_SSE2(const uint32_t* in, const uint32_t* upper, \ const __m128i res = _mm_add_epi8(src, other); \ _mm_storeu_si128((__m128i*)&out[i], res); \ } \ - VP8LPredictorsAdd_C[(X)](in + i, upper + i, num_pixels - i, out + i); \ + if (i != num_pixels) { \ + VP8LPredictorsAdd_C[(X)](in + i, upper + i, num_pixels - i, out + i); \ + } \ } // Predictor2: Top. @@ -255,7 +261,9 @@ static void PredictorAdd##X##_SSE2(const uint32_t* in, const uint32_t* upper, \ res = _mm_add_epi8(avg, src); \ _mm_storeu_si128((__m128i*)&out[i], res); \ } \ - VP8LPredictorsAdd_C[(X)](in + i, upper + i, num_pixels - i, out + i); \ + if (i != num_pixels) { \ + VP8LPredictorsAdd_C[(X)](in + i, upper + i, num_pixels - i, out + i); \ + } \ } // Predictor8: average TL T. GENERATE_PREDICTOR_2(8, upper[i - 1]) @@ -287,7 +295,9 @@ static void PredictorAdd10_SSE2(const uint32_t* in, const uint32_t* upper, src = _mm_srli_si128(src, 4); } } - VP8LPredictorsAdd_C[10](in + i, upper + i, num_pixels - i, out + i); + if (i != num_pixels) { + VP8LPredictorsAdd_C[10](in + i, upper + i, num_pixels - i, out + i); + } } // Predictor11: select. @@ -331,7 +341,9 @@ static void PredictorAdd11_SSE2(const uint32_t* in, const uint32_t* upper, pa = _mm_srli_si128(pa, 4); } } - VP8LPredictorsAdd_C[11](in + i, upper + i, num_pixels - i, out + i); + if (i != num_pixels) { + VP8LPredictorsAdd_C[11](in + i, upper + i, num_pixels - i, out + i); + } } // Predictor12: ClampedAddSubtractFull. @@ -369,7 +381,9 @@ static void PredictorAdd12_SSE2(const uint32_t* in, const uint32_t* upper, DO_PRED12(diff_hi, 0, 2); DO_PRED12(diff_hi, 1, 3); } - VP8LPredictorsAdd_C[12](in + i, upper + i, num_pixels - i, out + i); + if (i != num_pixels) { + VP8LPredictorsAdd_C[12](in + i, upper + i, num_pixels - i, out + i); + } } #undef DO_PRED12 @@ -392,7 +406,9 @@ static void AddGreenToBlueAndRed(const uint32_t* const src, int num_pixels, _mm_storeu_si128((__m128i*)&dst[i], out); } // fallthrough and finish off with plain-C - VP8LAddGreenToBlueAndRed_C(src + i, num_pixels - i, dst + i); + if (i != num_pixels) { + VP8LAddGreenToBlueAndRed_C(src + i, num_pixels - i, dst + i); + } } //------------------------------------------------------------------------------ @@ -430,7 +446,9 @@ static void TransformColorInverse(const VP8LMultipliers* const m, _mm_storeu_si128((__m128i*)&dst[i], out); } // Fall-back to C-version for left-overs. - VP8LTransformColorInverse_C(m, src + i, num_pixels - i, dst + i); + if (i != num_pixels) { + VP8LTransformColorInverse_C(m, src + i, num_pixels - i, dst + i); + } } //------------------------------------------------------------------------------ @@ -467,7 +485,9 @@ static void ConvertBGRAToRGB(const uint32_t* src, int num_pixels, num_pixels -= 32; } // left-overs - VP8LConvertBGRAToRGB_C((const uint32_t*)in, num_pixels, (uint8_t*)out); + if (num_pixels > 0) { + VP8LConvertBGRAToRGB_C((const uint32_t*)in, num_pixels, (uint8_t*)out); + } } static void ConvertBGRAToRGBA(const uint32_t* src, @@ -494,7 +514,9 @@ static void ConvertBGRAToRGBA(const uint32_t* src, num_pixels -= 8; } // left-overs - VP8LConvertBGRAToRGBA_C((const uint32_t*)in, num_pixels, (uint8_t*)out); + if (num_pixels > 0) { + VP8LConvertBGRAToRGBA_C((const uint32_t*)in, num_pixels, (uint8_t*)out); + } } static void ConvertBGRAToRGBA4444(const uint32_t* src, @@ -528,7 +550,9 @@ static void ConvertBGRAToRGBA4444(const uint32_t* src, num_pixels -= 8; } // left-overs - VP8LConvertBGRAToRGBA4444_C((const uint32_t*)in, num_pixels, (uint8_t*)out); + if (num_pixels > 0) { + VP8LConvertBGRAToRGBA4444_C((const uint32_t*)in, num_pixels, (uint8_t*)out); + } } static void ConvertBGRAToRGB565(const uint32_t* src, @@ -567,7 +591,9 @@ static void ConvertBGRAToRGB565(const uint32_t* src, num_pixels -= 8; } // left-overs - VP8LConvertBGRAToRGB565_C((const uint32_t*)in, num_pixels, (uint8_t*)out); + if (num_pixels > 0) { + VP8LConvertBGRAToRGB565_C((const uint32_t*)in, num_pixels, (uint8_t*)out); + } } static void ConvertBGRAToBGR(const uint32_t* src, @@ -598,7 +624,9 @@ static void ConvertBGRAToBGR(const uint32_t* src, num_pixels -= 8; } // left-overs - VP8LConvertBGRAToBGR_C((const uint32_t*)in, num_pixels, dst); + if (num_pixels > 0) { + VP8LConvertBGRAToBGR_C((const uint32_t*)in, num_pixels, dst); + } } //------------------------------------------------------------------------------