From 2835089d6a92aacb84505b22232a5b86b1640c61 Mon Sep 17 00:00:00 2001
From: Vincent Rabaud
Date: Fri, 11 Dec 2015 15:12:19 +0100
Subject: [PATCH] Provide an SSE2 implementation of CombinedShannonEntropy.

CombinedShannonEntropy takes 30% for lossless compression.
This implementation speeds up the overall process by 2 to 3 %.

Change-Id: I04a71743284c38814fd0726034d51a02b1b6ba8f
---
 src/dsp/lossless.h          |  3 ++
 src/dsp/lossless_enc.c      | 14 ++++---
 src/dsp/lossless_enc_sse2.c | 75 +++++++++++++++++++++++++++++++++++++
 3 files changed, 86 insertions(+), 6 deletions(-)

diff --git a/src/dsp/lossless.h b/src/dsp/lossless.h
index b59d0fba..a44659cf 100644
--- a/src/dsp/lossless.h
+++ b/src/dsp/lossless.h
@@ -199,9 +199,12 @@ static WEBP_INLINE float VP8LFastSLog2(uint32_t v) {
 typedef double (*VP8LCostFunc)(const uint32_t* population, int length);
 typedef double (*VP8LCostCombinedFunc)(const uint32_t* X, const uint32_t* Y,
                                        int length);
+typedef float (*VP8LCombinedShannonEntropyFunc)(const int X[256],
+                                                const int Y[256]);
 
 extern VP8LCostFunc VP8LExtraCost;
 extern VP8LCostCombinedFunc VP8LExtraCostCombined;
+extern VP8LCombinedShannonEntropyFunc VP8LCombinedShannonEntropy;
 
 typedef struct {        // small struct to hold counters
   int counts[2];        // index: 0=zero steak, 1=non-zero streak
diff --git a/src/dsp/lossless_enc.c b/src/dsp/lossless_enc.c
index d75b782c..1be75e24 100644
--- a/src/dsp/lossless_enc.c
+++ b/src/dsp/lossless_enc.c
@@ -413,15 +413,15 @@ static float CombinedShannonEntropy(const int X[256], const int Y[256]) {
   int sumX = 0, sumXY = 0;
   for (i = 0; i < 256; ++i) {
     const int x = X[i];
-    const int xy = x + Y[i];
     if (x != 0) {
+      const int xy = x + Y[i];
       sumX += x;
       retval -= VP8LFastSLog2(x);
       sumXY += xy;
       retval -= VP8LFastSLog2(xy);
-    } else if (xy != 0) {
-      sumXY += xy;
-      retval -= VP8LFastSLog2(xy);
+    } else if (Y[i] != 0) {
+      sumXY += Y[i];
+      retval -= VP8LFastSLog2(Y[i]);
     }
   }
   retval += VP8LFastSLog2(sumX) + VP8LFastSLog2(sumXY);
@@ -435,7 +435,7 @@ static float PredictionCostSpatialHistogram(const int accumulated[4][256],
   for (i = 0; i < 4; ++i) {
     const double kExpValue = 0.94;
     retval += PredictionCostSpatial(tile[i], 1, kExpValue);
-    retval += CombinedShannonEntropy(tile[i], accumulated[i]);
+    retval += VP8LCombinedShannonEntropy(tile[i], accumulated[i]);
   }
   return (float)retval;
 }
@@ -894,7 +894,7 @@ static float PredictionCostCrossColor(const int accumulated[256],
   // Favor low entropy, locally and globally.
   // Favor small absolute values for PredictionCostSpatial
   static const double kExpValue = 2.4;
-  return CombinedShannonEntropy(counts, accumulated) +
+  return VP8LCombinedShannonEntropy(counts, accumulated) +
          PredictionCostSpatial(counts, 3, kExpValue);
 }
 
@@ -1269,6 +1269,7 @@ VP8LFastLog2SlowFunc VP8LFastSLog2Slow;
 
 VP8LCostFunc VP8LExtraCost;
 VP8LCostCombinedFunc VP8LExtraCostCombined;
+VP8LCombinedShannonEntropyFunc VP8LCombinedShannonEntropy;
 
 VP8LCostCountFunc VP8LHuffmanCostCount;
 
@@ -1300,6 +1301,7 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInit(void) {
 
   VP8LExtraCost = ExtraCost;
   VP8LExtraCostCombined = ExtraCostCombined;
+  VP8LCombinedShannonEntropy = CombinedShannonEntropy;
 
   VP8LHuffmanCostCount = HuffmanCostCount;
 
diff --git a/src/dsp/lossless_enc_sse2.c b/src/dsp/lossless_enc_sse2.c
index 1374b3ef..e8c98341 100644
--- a/src/dsp/lossless_enc_sse2.c
+++ b/src/dsp/lossless_enc_sse2.c
@@ -250,6 +250,80 @@ static void HistogramAdd(const VP8LHistogram* const a,
   }
 }
 
+//------------------------------------------------------------------------------
+// Entropy
+
+// Checks whether the X or Y contribution is worth computing and adding.
+// Used in loop unrolling.
+#define ANALYZE_X_OR_Y(x_or_y, j)                                   \
+  do {                                                              \
+    if (x_or_y[i + j] != 0) retval -= VP8LFastSLog2(x_or_y[i + j]); \
+  } while (0)
+
+// Checks whether the X + Y contribution is worth computing and adding.
+// Used in loop unrolling.
+#define ANALYZE_XY(j)                  \
+  do {                                 \
+    if (tmp[j] != 0) {                 \
+      retval -= VP8LFastSLog2(tmp[j]); \
+      ANALYZE_X_OR_Y(X, j);            \
+    }                                  \
+  } while (0)
+
+static float CombinedShannonEntropy(const int X[256], const int Y[256]) {
+  int i;
+  double retval = 0.;
+  int sumX, sumXY;
+  int32_t tmp[4];
+  __m128i zero = _mm_setzero_si128();
+  // Sums up X + Y, 4 ints at a time (and will merge it at the end for sumXY).
+  __m128i sumXY_128 = zero;
+  __m128i sumX_128 = zero;
+
+  for (i = 0; i < 256; i += 4) {
+    const __m128i x = _mm_loadu_si128((const __m128i*)(X + i));
+    const __m128i y = _mm_loadu_si128((const __m128i*)(Y + i));
+
+    // Check if any X is non-zero: this actually provides a speedup as X is
+    // usually sparse.
+    if (_mm_movemask_epi8(_mm_cmpeq_epi32(x, zero)) != 0xFFFF) {
+      const __m128i xy_128 = _mm_add_epi32(x, y);
+      sumXY_128 = _mm_add_epi32(sumXY_128, xy_128);
+
+      sumX_128 = _mm_add_epi32(sumX_128, x);
+
+      // Analyze the different X + Y.
+      _mm_storeu_si128((__m128i*)tmp, xy_128);
+
+      ANALYZE_XY(0);
+      ANALYZE_XY(1);
+      ANALYZE_XY(2);
+      ANALYZE_XY(3);
+    } else {
+      // X is fully 0, so only deal with Y.
+      sumXY_128 = _mm_add_epi32(sumXY_128, y);
+
+      ANALYZE_X_OR_Y(Y, 0);
+      ANALYZE_X_OR_Y(Y, 1);
+      ANALYZE_X_OR_Y(Y, 2);
+      ANALYZE_X_OR_Y(Y, 3);
+    }
+  }
+
+  // Sum up sumX_128 to get sumX.
+  _mm_storeu_si128((__m128i*)tmp, sumX_128);
+  sumX = tmp[3] + tmp[2] + tmp[1] + tmp[0];
+
+  // Sum up sumXY_128 to get sumXY.
+  _mm_storeu_si128((__m128i*)tmp, sumXY_128);
+  sumXY = tmp[3] + tmp[2] + tmp[1] + tmp[0];
+
+  retval += VP8LFastSLog2(sumX) + VP8LFastSLog2(sumXY);
+  return (float)retval;
+}
+#undef ANALYZE_X_OR_Y
+#undef ANALYZE_XY
+
 //------------------------------------------------------------------------------
 // Entry point
 
@@ -261,6 +335,7 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInitSSE2(void) {
   VP8LCollectColorBlueTransforms = CollectColorBlueTransforms;
   VP8LCollectColorRedTransforms = CollectColorRedTransforms;
   VP8LHistogramAdd = HistogramAdd;
+  VP8LCombinedShannonEntropy = CombinedShannonEntropy;
 }
 
 #else  // !WEBP_USE_SSE2