diff --git a/src/dsp/lossless.c b/src/dsp/lossless.c index 88762cfa..98b44ac7 100644 --- a/src/dsp/lossless.c +++ b/src/dsp/lossless.c @@ -332,14 +332,14 @@ const uint8_t kPrefixEncodeExtraBitsValue[PREFIX_LOOKUP_IDX_MAX] = { #define APPROX_LOG_WITH_CORRECTION_MAX 65536 #define APPROX_LOG_MAX 4096 #define LOG_2_RECIPROCAL 1.44269504088896338700465094007086 -static float FastSLog2Slow(int v) { +static float FastSLog2Slow(uint32_t v) { assert(v >= LOG_LOOKUP_IDX_MAX); if (v < APPROX_LOG_WITH_CORRECTION_MAX) { int log_cnt = 0; - int y = 1; + uint32_t y = 1; int correction = 0; const float v_f = (float)v; - const int orig_v = v; + const uint32_t orig_v = v; do { ++log_cnt; v = v >> 1; @@ -358,12 +358,12 @@ static float FastSLog2Slow(int v) { } } -static float FastLog2Slow(int v) { +static float FastLog2Slow(uint32_t v) { assert(v >= LOG_LOOKUP_IDX_MAX); if (v < APPROX_LOG_WITH_CORRECTION_MAX) { int log_cnt = 0; - int y = 1; - const int orig_v = v; + uint32_t y = 1; + const uint32_t orig_v = v; double log_2; do { ++log_cnt; @@ -1437,6 +1437,7 @@ void VP8LConvertFromBGRA(const uint32_t* const in_data, int num_pixels, } } +//------------------------------------------------------------------------------ // Bundles multiple (1, 2, 4 or 8) pixels into a single pixel. void VP8LBundleColorMap(const uint8_t* const row, int width, int xbits, uint32_t* const dst) { @@ -1458,14 +1459,17 @@ void VP8LBundleColorMap(const uint8_t* const row, int width, } } -static double ExtraCost(const int* population, int length) { +//------------------------------------------------------------------------------ + +static double ExtraCost(const uint32_t* population, int length) { int i; double cost = 0.; for (i = 2; i < length - 2; ++i) cost += (i >> 1) * population[i + 2]; return cost; } -static double ExtraCostCombined(const int* X, const int* Y, int length) { +static double ExtraCostCombined(const uint32_t* X, const uint32_t* Y, + int length) { int i; double cost = 0.; for (i = 2; i < length - 2; ++i) { @@ -1476,7 +1480,7 @@ static double ExtraCostCombined(const int* X, const int* Y, int length) { } // Returns the various RLE counts -static VP8LStreaks HuffmanCostCount(const int* population, int length) { +static VP8LStreaks HuffmanCostCount(const uint32_t* population, int length) { int i; int streak = 0; VP8LStreaks stats; @@ -1496,8 +1500,8 @@ static VP8LStreaks HuffmanCostCount(const int* population, int length) { return stats; } -static VP8LStreaks HuffmanCostCombinedCount(const int* X, const int* Y, - int length) { +static VP8LStreaks HuffmanCostCombinedCount(const uint32_t* X, + const uint32_t* Y, int length) { int i; int streak = 0; VP8LStreaks stats; @@ -1524,6 +1528,41 @@ static VP8LStreaks HuffmanCostCombinedCount(const int* X, const int* Y, //------------------------------------------------------------------------------ +static void HistogramAdd(const VP8LHistogram* const a, + const VP8LHistogram* const b, + VP8LHistogram* const out) { + int i; + const int literal_size = VP8LHistogramNumCodes(a->palette_code_bits_); + assert(a->palette_code_bits_ == b->palette_code_bits_); + if (b != out) { + for (i = 0; i < literal_size; ++i) { + out->literal_[i] = a->literal_[i] + b->literal_[i]; + } + for (i = 0; i < NUM_DISTANCE_CODES; ++i) { + out->distance_[i] = a->distance_[i] + b->distance_[i]; + } + for (i = 0; i < NUM_LITERAL_CODES; ++i) { + out->red_[i] = a->red_[i] + b->red_[i]; + out->blue_[i] = a->blue_[i] + b->blue_[i]; + out->alpha_[i] = a->alpha_[i] + b->alpha_[i]; + } + } else { + for (i = 0; i < literal_size; ++i) { + out->literal_[i] += a->literal_[i]; + } + for (i = 0; i < NUM_DISTANCE_CODES; ++i) { + out->distance_[i] += a->distance_[i]; + } + for (i = 0; i < NUM_LITERAL_CODES; ++i) { + out->red_[i] += a->red_[i]; + out->blue_[i] += a->blue_[i]; + out->alpha_[i] += a->alpha_[i]; + } + } +} + +//------------------------------------------------------------------------------ + VP8LProcessBlueAndRedFunc VP8LSubtractGreenFromBlueAndRed; VP8LProcessBlueAndRedFunc VP8LAddGreenToBlueAndRed; VP8LPredictorFunc VP8LPredictors[16]; @@ -1546,6 +1585,8 @@ VP8LCostCombinedFunc VP8LExtraCostCombined; VP8LCostCountFunc VP8LHuffmanCostCount; VP8LCostCombinedCountFunc VP8LHuffmanCostCombinedCount; +VP8LHistogramAddFunc VP8LHistogramAdd; + extern void VP8LDspInitSSE2(void); extern void VP8LDspInitNEON(void); extern void VP8LDspInitMIPS32(void); @@ -1574,6 +1615,8 @@ void VP8LDspInit(void) { VP8LHuffmanCostCount = HuffmanCostCount; VP8LHuffmanCostCombinedCount = HuffmanCostCombinedCount; + VP8LHistogramAdd = HistogramAdd; + // If defined, use CPUInfo() to overwrite some pointers with faster versions. if (VP8GetCPUInfo != NULL) { #if defined(WEBP_USE_SSE2) diff --git a/src/dsp/lossless.h b/src/dsp/lossless.h index 8839eca6..b99cc09b 100644 --- a/src/dsp/lossless.h +++ b/src/dsp/lossless.h @@ -18,6 +18,8 @@ #include "../webp/types.h" #include "../webp/decode.h" +#include "../enc/histogram.h" + #ifdef __cplusplus extern "C" { #endif @@ -123,24 +125,25 @@ static WEBP_INLINE uint32_t VP8LSubSampleSize(uint32_t size, #define LOG_LOOKUP_IDX_MAX 256 extern const float kLog2Table[LOG_LOOKUP_IDX_MAX]; extern const float kSLog2Table[LOG_LOOKUP_IDX_MAX]; -typedef float (*VP8LFastLog2SlowFunc)(int v); +typedef float (*VP8LFastLog2SlowFunc)(uint32_t v); extern VP8LFastLog2SlowFunc VP8LFastLog2Slow; extern VP8LFastLog2SlowFunc VP8LFastSLog2Slow; -static WEBP_INLINE float VP8LFastLog2(int v) { +static WEBP_INLINE float VP8LFastLog2(uint32_t v) { return (v < LOG_LOOKUP_IDX_MAX) ? kLog2Table[v] : VP8LFastLog2Slow(v); } // Fast calculation of v * log2(v) for integer input. -static WEBP_INLINE float VP8LFastSLog2(int v) { +static WEBP_INLINE float VP8LFastSLog2(uint32_t v) { return (v < LOG_LOOKUP_IDX_MAX) ? kSLog2Table[v] : VP8LFastSLog2Slow(v); } // ----------------------------------------------------------------------------- // Huffman-cost related functions. -typedef double (*VP8LCostFunc)(const int* population, int length); -typedef double (*VP8LCostCombinedFunc)(const int* X, const int* Y, int length); +typedef double (*VP8LCostFunc)(const uint32_t* population, int length); +typedef double (*VP8LCostCombinedFunc)(const uint32_t* X, const uint32_t* Y, + int length); extern VP8LCostFunc VP8LExtraCost; extern VP8LCostCombinedFunc VP8LExtraCostCombined; @@ -150,13 +153,19 @@ typedef struct { // small struct to hold counters int streaks[2][2]; // [zero/non-zero][streak<3 / streak>=3] } VP8LStreaks; -typedef VP8LStreaks (*VP8LCostCountFunc)(const int* population, int length); -typedef VP8LStreaks (*VP8LCostCombinedCountFunc)(const int* X, const int* Y, - int length); +typedef VP8LStreaks (*VP8LCostCountFunc)(const uint32_t* population, + int length); +typedef VP8LStreaks (*VP8LCostCombinedCountFunc)(const uint32_t* X, + const uint32_t* Y, int length); extern VP8LCostCountFunc VP8LHuffmanCostCount; extern VP8LCostCombinedCountFunc VP8LHuffmanCostCombinedCount; +typedef void (*VP8LHistogramAddFunc)(const VP8LHistogram* const a, + const VP8LHistogram* const b, + VP8LHistogram* const out); +extern VP8LHistogramAddFunc VP8LHistogramAdd; + // ----------------------------------------------------------------------------- // PrefixEncode() diff --git a/src/dsp/lossless_mips32.c b/src/dsp/lossless_mips32.c index a8d3ca7f..737744cb 100644 --- a/src/dsp/lossless_mips32.c +++ b/src/dsp/lossless_mips32.c @@ -26,13 +26,13 @@ #define APPROX_LOG_MAX 4096 #define LOG_2_RECIPROCAL 1.44269504088896338700465094007086 -static float FastSLog2Slow(int v) { +static float FastSLog2Slow(uint32_t v) { assert(v >= LOG_LOOKUP_IDX_MAX); if (v < APPROX_LOG_WITH_CORRECTION_MAX) { - int log_cnt, y, correction; + uint32_t log_cnt, y, correction; const int c24 = 24; const float v_f = (float)v; - int temp; + uint32_t temp; // Xf = 256 = 2^8 // log_cnt is index of leading one in upper 24 bits @@ -62,13 +62,13 @@ static float FastSLog2Slow(int v) { } } -static float FastLog2Slow(int v) { +static float FastLog2Slow(uint32_t v) { assert(v >= LOG_LOOKUP_IDX_MAX); if (v < APPROX_LOG_WITH_CORRECTION_MAX) { - int log_cnt, y; + uint32_t log_cnt, y; const int c24 = 24; double log_2; - int temp; + uint32_t temp; __asm__ volatile( "clz %[log_cnt], %[v] \n\t" @@ -86,7 +86,7 @@ static float FastLog2Slow(int v) { // Since the division is still expensive, add this correction factor only // for large values of 'v'. - const int correction = (23 * (v & (y - 1))) >> 4; + const uint32_t correction = (23 * (v & (y - 1))) >> 4; log_2 += (double)correction / v; } return (float)log_2; @@ -98,8 +98,8 @@ static float FastLog2Slow(int v) { // C version of this function: // int i = 0; // int64_t cost = 0; -// int* pop = (int*)&population[4]; -// const int* LoopEnd = (int*)&population[length]; +// const uint32_t* pop = &population[4]; +// const uint32_t* LoopEnd = &population[length]; // while (pop != LoopEnd) { // ++i; // cost += i * *pop; @@ -107,10 +107,10 @@ static float FastLog2Slow(int v) { // pop += 2; // } // return (double)cost; -static double ExtraCost(const int* const population, int length) { +static double ExtraCost(const uint32_t* const population, int length) { int i, temp0, temp1; - const int* pop = &population[4]; - const int* const LoopEnd = &population[length]; + const uint32_t* pop = &population[4]; + const uint32_t* const LoopEnd = &population[length]; __asm__ volatile( "mult $zero, $zero \n\t" @@ -139,12 +139,12 @@ static double ExtraCost(const int* const population, int length) { // C version of this function: // int i = 0; // int64_t cost = 0; -// int* pX = (int*)&X[4]; -// int* pY = (int*)&Y[4]; -// const int* LoopEnd = (int*)&X[length]; +// const uint32_t* pX = &X[4]; +// const uint32_t* pY = &Y[4]; +// const uint32_t* LoopEnd = &X[length]; // while (pX != LoopEnd) { -// const int xy0 = *pX + *pY; -// const int xy1 = *(pX + 1) + *(pY + 1); +// const uint32_t xy0 = *pX + *pY; +// const uint32_t xy1 = *(pX + 1) + *(pY + 1); // ++i; // cost += i * xy0; // cost += i * xy1; @@ -152,12 +152,12 @@ static double ExtraCost(const int* const population, int length) { // pY += 2; // } // return (double)cost; -static double ExtraCostCombined(const int* const X, const int* const Y, - int length) { +static double ExtraCostCombined(const uint32_t* const X, + const uint32_t* const Y, int length) { int i, temp0, temp1, temp2, temp3; - const int* pX = &X[4]; - const int* pY = &Y[4]; - const int* const LoopEnd = &X[length]; + const uint32_t* pX = &X[4]; + const uint32_t* pY = &Y[4]; + const uint32_t* const LoopEnd = &X[length]; __asm__ volatile( "mult $zero, $zero \n\t" @@ -217,7 +217,7 @@ static double ExtraCostCombined(const int* const X, const int* const Y, ); // Returns the various RLE counts -static VP8LStreaks HuffmanCostCount(const int* population, int length) { +static VP8LStreaks HuffmanCostCount(const uint32_t* population, int length) { int i; int streak = 0; VP8LStreaks stats; @@ -230,19 +230,19 @@ static VP8LStreaks HuffmanCostCount(const int* population, int length) { if (population[i] == population[i + 1]) { continue; } - temp0 = population[i] != 0; + temp0 = (population[i] != 0); HUFFMAN_COST_PASS streak = 0; } ++streak; - temp0 = population[i] != 0; + temp0 = (population[i] != 0); HUFFMAN_COST_PASS return stats; } -static VP8LStreaks HuffmanCostCombinedCount(const int* X, const int* Y, - int length) { +static VP8LStreaks HuffmanCostCombinedCount(const uint32_t* X, + const uint32_t* Y, int length) { int i; int streak = 0; VP8LStreaks stats; @@ -251,20 +251,20 @@ static VP8LStreaks HuffmanCostCombinedCount(const int* X, const int* Y, int temp0, temp1, temp2, temp3; memset(&stats, 0, sizeof(stats)); for (i = 0; i < length - 1; ++i) { - const int xy = X[i] + Y[i]; - const int xy_next = X[i + 1] + Y[i + 1]; + const uint32_t xy = X[i] + Y[i]; + const uint32_t xy_next = X[i + 1] + Y[i + 1]; ++streak; if (xy == xy_next) { continue; } - temp0 = xy != 0; + temp0 = (xy != 0); HUFFMAN_COST_PASS streak = 0; } { - const int xy = X[i] + Y[i]; + const uint32_t xy = X[i] + Y[i]; ++streak; - temp0 = xy != 0; + temp0 = (xy != 0); HUFFMAN_COST_PASS } diff --git a/src/dsp/lossless_sse2.c b/src/dsp/lossless_sse2.c index 0e37cf1f..0ac9bebe 100644 --- a/src/dsp/lossless_sse2.c +++ b/src/dsp/lossless_sse2.c @@ -383,6 +383,88 @@ static void ConvertBGRAToBGR(const uint32_t* src, VP8LConvertBGRAToBGR_C((const uint32_t*)in, num_pixels, dst); } +//------------------------------------------------------------------------------ + +#define LINE_SIZE 16 // 8 or 16 +static void AddVector(const uint32_t* a, const uint32_t* b, uint32_t* out, + int size) { + int i; + assert(size % LINE_SIZE == 0); + for (i = 0; i < size; i += LINE_SIZE) { + const __m128i a0 = _mm_loadu_si128((__m128i*)&a[i + 0]); + const __m128i a1 = _mm_loadu_si128((__m128i*)&a[i + 4]); +#if (LINE_SIZE == 16) + const __m128i a2 = _mm_loadu_si128((__m128i*)&a[i + 8]); + const __m128i a3 = _mm_loadu_si128((__m128i*)&a[i + 12]); +#endif + const __m128i b0 = _mm_loadu_si128((__m128i*)&b[i + 0]); + const __m128i b1 = _mm_loadu_si128((__m128i*)&b[i + 4]); +#if (LINE_SIZE == 16) + const __m128i b2 = _mm_loadu_si128((__m128i*)&b[i + 8]); + const __m128i b3 = _mm_loadu_si128((__m128i*)&b[i + 12]); +#endif + _mm_storeu_si128((__m128i*)&out[i + 0], _mm_add_epi32(a0, b0)); + _mm_storeu_si128((__m128i*)&out[i + 4], _mm_add_epi32(a1, b1)); +#if (LINE_SIZE == 16) + _mm_storeu_si128((__m128i*)&out[i + 8], _mm_add_epi32(a2, b2)); + _mm_storeu_si128((__m128i*)&out[i + 12], _mm_add_epi32(a3, b3)); +#endif + } +} + +static void AddVectorEq(const uint32_t* a, uint32_t* out, int size) { + int i; + assert(size % LINE_SIZE == 0); + for (i = 0; i < size; i += LINE_SIZE) { + const __m128i a0 = _mm_loadu_si128((__m128i*)&a[i + 0]); + const __m128i a1 = _mm_loadu_si128((__m128i*)&a[i + 4]); +#if (LINE_SIZE == 16) + const __m128i a2 = _mm_loadu_si128((__m128i*)&a[i + 8]); + const __m128i a3 = _mm_loadu_si128((__m128i*)&a[i + 12]); +#endif + const __m128i b0 = _mm_loadu_si128((__m128i*)&out[i + 0]); + const __m128i b1 = _mm_loadu_si128((__m128i*)&out[i + 4]); +#if (LINE_SIZE == 16) + const __m128i b2 = _mm_loadu_si128((__m128i*)&out[i + 8]); + const __m128i b3 = _mm_loadu_si128((__m128i*)&out[i + 12]); +#endif + _mm_storeu_si128((__m128i*)&out[i + 0], _mm_add_epi32(a0, b0)); + _mm_storeu_si128((__m128i*)&out[i + 4], _mm_add_epi32(a1, b1)); +#if (LINE_SIZE == 16) + _mm_storeu_si128((__m128i*)&out[i + 8], _mm_add_epi32(a2, b2)); + _mm_storeu_si128((__m128i*)&out[i + 12], _mm_add_epi32(a3, b3)); +#endif + } +} +#undef LINE_SIZE + +// Note we are adding uint32_t's as *signed* int32's (using _mm_add_epi32). But +// that's ok since the histogram values are less than 1<<28 (max picture size). +static void HistogramAdd(const VP8LHistogram* const a, + const VP8LHistogram* const b, + VP8LHistogram* const out) { + int i; + const int literal_size = VP8LHistogramNumCodes(a->palette_code_bits_); + assert(a->palette_code_bits_ == b->palette_code_bits_); + if (b != out) { + AddVector(a->literal_, b->literal_, out->literal_, NUM_LITERAL_CODES); + AddVector(a->red_, b->red_, out->red_, NUM_LITERAL_CODES); + AddVector(a->blue_, b->blue_, out->blue_, NUM_LITERAL_CODES); + AddVector(a->alpha_, b->alpha_, out->alpha_, NUM_LITERAL_CODES); + } else { + AddVectorEq(a->literal_, out->literal_, NUM_LITERAL_CODES); + AddVectorEq(a->red_, out->red_, NUM_LITERAL_CODES); + AddVectorEq(a->blue_, out->blue_, NUM_LITERAL_CODES); + AddVectorEq(a->alpha_, out->alpha_, NUM_LITERAL_CODES); + } + for (i = NUM_LITERAL_CODES; i < literal_size; ++i) { + out->literal_[i] = a->literal_[i] + b->literal_[i]; + } + for (i = 0; i < NUM_DISTANCE_CODES; ++i) { + out->distance_[i] = a->distance_[i] + b->distance_[i]; + } +} + #endif // WEBP_USE_SSE2 //------------------------------------------------------------------------------ @@ -405,6 +487,8 @@ void VP8LDspInitSSE2(void) { VP8LConvertBGRAToRGBA4444 = ConvertBGRAToRGBA4444; VP8LConvertBGRAToRGB565 = ConvertBGRAToRGB565; VP8LConvertBGRAToBGR = ConvertBGRAToBGR; + + VP8LHistogramAdd = HistogramAdd; #endif // WEBP_USE_SSE2 } diff --git a/src/enc/backward_references.c b/src/enc/backward_references.c index 8e489a62..dd00fff9 100644 --- a/src/enc/backward_references.c +++ b/src/enc/backward_references.c @@ -405,8 +405,8 @@ static int BackwardReferencesTraceBackwards( VP8LBackwardRefs* const refs); static void ConvertPopulationCountTableToBitEstimates( - int num_symbols, const int population_counts[], double output[]) { - int sum = 0; + int num_symbols, const uint32_t population_counts[], double output[]) { + uint32_t sum = 0; int nonzeros = 0; int i; for (i = 0; i < num_symbols; ++i) { diff --git a/src/enc/histogram.c b/src/enc/histogram.c index 08f8d6fd..511b71c4 100644 --- a/src/enc/histogram.c +++ b/src/enc/histogram.c @@ -29,7 +29,7 @@ #define BIN_SIZE (NUM_PARTITIONS * NUM_PARTITIONS * NUM_PARTITIONS) static void HistogramClear(VP8LHistogram* const p) { - int* const literal = p->literal_; + uint32_t* const literal = p->literal_; const int cache_bits = p->palette_code_bits_; const uint64_t histo_size = VP8LGetHistogramSize(cache_bits); memset(p, 0, histo_size); @@ -39,7 +39,7 @@ static void HistogramClear(VP8LHistogram* const p) { static void HistogramCopy(const VP8LHistogram* const src, VP8LHistogram* const dst) { - int* const dst_literal = dst->literal_; + uint32_t* const dst_literal = dst->literal_; const int dst_cache_bits = dst->palette_code_bits_; const uint64_t histo_size = VP8LGetHistogramSize(dst_cache_bits); assert(src->palette_code_bits_ == dst_cache_bits); @@ -92,7 +92,7 @@ VP8LHistogram* VP8LAllocateHistogram(int cache_bits) { if (memory == NULL) return NULL; histo = (VP8LHistogram*)memory; // literal_ won't necessary be aligned. - histo->literal_ = (int*)(memory + sizeof(VP8LHistogram)); + histo->literal_ = (uint32_t*)(memory + sizeof(VP8LHistogram)); VP8LHistogramInit(histo, cache_bits); return histo; } @@ -115,7 +115,7 @@ VP8LHistogramSet* VP8LAllocateHistogramSet(int size, int cache_bits) { for (i = 0; i < size; ++i) { set->histograms[i] = (VP8LHistogram*)memory; // literal_ won't necessary be aligned. - set->histograms[i]->literal_ = (int*)(memory + sizeof(VP8LHistogram)); + set->histograms[i]->literal_ = (uint32_t*)(memory + sizeof(VP8LHistogram)); VP8LHistogramInit(set->histograms[i], cache_bits); // There's no padding/alignment between successive histograms. memory += VP8LGetHistogramSize(cache_bits); @@ -133,7 +133,7 @@ void VP8LHistogramAddSinglePixOrCopy(VP8LHistogram* const histo, ++histo->literal_[PixOrCopyLiteral(v, 1)]; ++histo->blue_[PixOrCopyLiteral(v, 0)]; } else if (PixOrCopyIsCacheIdx(v)) { - int literal_ix = + const int literal_ix = NUM_LITERAL_CODES + NUM_LENGTH_CODES + PixOrCopyCacheIdx(v); ++histo->literal_[literal_ix]; } else { @@ -178,11 +178,11 @@ static WEBP_INLINE double BitsEntropyRefine(int nonzeros, int sum, int max_val, } } -static double BitsEntropy(const int* const array, int n) { +static double BitsEntropy(const uint32_t* const array, int n) { double retval = 0.; - int sum = 0; + uint32_t sum = 0; int nonzeros = 0; - int max_val = 0; + uint32_t max_val = 0; int i; for (i = 0; i < n; ++i) { if (array[i] != 0) { @@ -198,8 +198,8 @@ static double BitsEntropy(const int* const array, int n) { return BitsEntropyRefine(nonzeros, sum, max_val, retval); } -static double BitsEntropyCombined(const int* const X, const int* const Y, - int n) { +static double BitsEntropyCombined(const uint32_t* const X, + const uint32_t* const Y, int n) { double retval = 0.; int sum = 0; int nonzeros = 0; @@ -239,24 +239,24 @@ static double FinalHuffmanCost(const VP8LStreaks* const stats) { } // Trampolines -static double HuffmanCost(const int* const population, int length) { +static double HuffmanCost(const uint32_t* const population, int length) { const VP8LStreaks stats = VP8LHuffmanCostCount(population, length); return FinalHuffmanCost(&stats); } -static double HuffmanCostCombined(const int* const X, const int* const Y, - int length) { +static double HuffmanCostCombined(const uint32_t* const X, + const uint32_t* const Y, int length) { const VP8LStreaks stats = VP8LHuffmanCostCombinedCount(X, Y, length); return FinalHuffmanCost(&stats); } // Aggregated costs -static double PopulationCost(const int* const population, int length) { +static double PopulationCost(const uint32_t* const population, int length) { return BitsEntropy(population, length) + HuffmanCost(population, length); } -static double GetCombinedEntropy(const int* const X, const int* const Y, - int length) { +static double GetCombinedEntropy(const uint32_t* const X, + const uint32_t* const Y, int length) { return BitsEntropyCombined(X, Y, length) + HuffmanCostCombined(X, Y, length); } @@ -286,25 +286,6 @@ double VP8LHistogramEstimateBitsBulk(const VP8LHistogram* const p) { // ----------------------------------------------------------------------------- // Various histogram combine/cost-eval functions -// Adds 'in' histogram to 'out' -static void HistogramAdd(const VP8LHistogram* const in, - VP8LHistogram* const out) { - int i; - const int literal_size = VP8LHistogramNumCodes(in->palette_code_bits_); - assert(in->palette_code_bits_ == out->palette_code_bits_); - for (i = 0; i < literal_size; ++i) { - out->literal_[i] += in->literal_[i]; - } - for (i = 0; i < NUM_DISTANCE_CODES; ++i) { - out->distance_[i] += in->distance_[i]; - } - for (i = 0; i < NUM_LITERAL_CODES; ++i) { - out->red_[i] += in->red_[i]; - out->blue_[i] += in->blue_[i]; - out->alpha_[i] += in->alpha_[i]; - } -} - static int GetCombinedHistogramEntropy(const VP8LHistogram* const a, const VP8LHistogram* const b, double cost_threshold, @@ -347,23 +328,10 @@ static double HistogramAddEval(const VP8LHistogram* const a, double cost_threshold) { double cost = 0; const double sum_cost = a->bit_cost_ + b->bit_cost_; - int i; - assert(a->palette_code_bits_ == b->palette_code_bits_); cost_threshold += sum_cost; if (GetCombinedHistogramEntropy(a, b, cost_threshold, &cost)) { - const int literal_size = VP8LHistogramNumCodes(a->palette_code_bits_); - for (i = 0; i < literal_size; ++i) { - out->literal_[i] = a->literal_[i] + b->literal_[i]; - } - for (i = 0; i < NUM_DISTANCE_CODES; ++i) { - out->distance_[i] = a->distance_[i] + b->distance_[i]; - } - for (i = 0; i < NUM_LITERAL_CODES; ++i) { - out->red_[i] = a->red_[i] + b->red_[i]; - out->blue_[i] = a->blue_[i] + b->blue_[i]; - out->alpha_[i] = a->alpha_[i] + b->alpha_[i]; - } + VP8LHistogramAdd(a, b, out); out->bit_cost_ = cost; out->palette_code_bits_ = a->palette_code_bits_; } @@ -697,8 +665,9 @@ static void HistogramRemap(const VP8LHistogramSet* const init_histo, } for (i = 0; i < init_histo->size; ++i) { - HistogramAdd(init_histo->histograms[i], - histo_image->histograms[symbols[i]]); + VP8LHistogramAdd(init_histo->histograms[i], + histo_image->histograms[symbols[i]], + histo_image->histograms[symbols[i]]); } } diff --git a/src/enc/histogram.h b/src/enc/histogram.h index e1b52eba..9a4bb0ec 100644 --- a/src/enc/histogram.h +++ b/src/enc/histogram.h @@ -32,12 +32,12 @@ extern "C" { typedef struct { // literal_ contains green literal, palette-code and // copy-length-prefix histogram - int* literal_; // Pointer to the allocated buffer for literal. - int red_[256]; - int blue_[256]; - int alpha_[256]; + uint32_t* literal_; // Pointer to the allocated buffer for literal. + uint32_t red_[NUM_LITERAL_CODES]; + uint32_t blue_[NUM_LITERAL_CODES]; + uint32_t alpha_[NUM_LITERAL_CODES]; // Backward reference prefix-code histogram. - int distance_[NUM_DISTANCE_CODES]; + uint32_t distance_[NUM_DISTANCE_CODES]; int palette_code_bits_; double bit_cost_; // cached value of VP8LHistogramEstimateBits(this) double literal_cost_; // Cached values of dominant entropy costs: diff --git a/src/enc/vp8l.c b/src/enc/vp8l.c index e0bb4332..76a771d0 100644 --- a/src/enc/vp8l.c +++ b/src/enc/vp8l.c @@ -330,7 +330,7 @@ static void StoreFullHuffmanCode(VP8LBitWriter* const bw, VP8LWriteBits(bw, 1, 0); num_tokens = VP8LCreateCompressedHuffmanTree(tree, tokens, max_tokens); { - int histogram[CODE_LENGTH_CODES] = { 0 }; + uint32_t histogram[CODE_LENGTH_CODES] = { 0 }; uint8_t buf_rle[CODE_LENGTH_CODES] = { 0 }; int i; for (i = 0; i < num_tokens; ++i) { diff --git a/src/utils/huffman_encode.c b/src/utils/huffman_encode.c index 6cb433df..6421c2be 100644 --- a/src/utils/huffman_encode.c +++ b/src/utils/huffman_encode.c @@ -29,7 +29,7 @@ static int ValuesShouldBeCollapsedToStrideAverage(int a, int b) { // Change the population counts in a way that the consequent // Huffman tree compression, especially its RLE-part, give smaller output. static void OptimizeHuffmanForRle(int length, uint8_t* const good_for_rle, - int* const counts) { + uint32_t* const counts) { // 1) Let's make the Huffman code more compatible with rle encoding. int i; for (; length >= 0; --length) { @@ -47,7 +47,7 @@ static void OptimizeHuffmanForRle(int length, uint8_t* const good_for_rle, // Let's not spoil any of the existing good rle codes. // Mark any seq of 0's that is longer as 5 as a good_for_rle. // Mark any seq of non-0's that is longer as 7 as a good_for_rle. - int symbol = counts[0]; + uint32_t symbol = counts[0]; int stride = 0; for (i = 0; i < length + 1; ++i) { if (i == length || counts[i] != symbol) { @@ -69,17 +69,17 @@ static void OptimizeHuffmanForRle(int length, uint8_t* const good_for_rle, } // 3) Let's replace those population counts that lead to more rle codes. { - int stride = 0; - int limit = counts[0]; - int sum = 0; + uint32_t stride = 0; + uint32_t limit = counts[0]; + uint32_t sum = 0; for (i = 0; i < length + 1; ++i) { if (i == length || good_for_rle[i] || (i != 0 && good_for_rle[i - 1]) || !ValuesShouldBeCollapsedToStrideAverage(counts[i], limit)) { if (stride >= 4 || (stride >= 3 && sum == 0)) { - int k; + uint32_t k; // The stride must end, collapse what we have, if we have enough (4). - int count = (sum + stride / 2) / stride; + uint32_t count = (sum + stride / 2) / stride; if (count < 1) { count = 1; } @@ -162,10 +162,11 @@ static void SetBitDepths(const HuffmanTree* const tree, // we are not planning to use this with extremely long blocks. // // See http://en.wikipedia.org/wiki/Huffman_coding -static void GenerateOptimalTree(const int* const histogram, int histogram_size, +static void GenerateOptimalTree(const uint32_t* const histogram, + int histogram_size, HuffmanTree* tree, int tree_depth_limit, uint8_t* const bit_depths) { - int count_min; + uint32_t count_min; HuffmanTree* tree_pool; int tree_size_orig = 0; int i; @@ -195,7 +196,7 @@ static void GenerateOptimalTree(const int* const histogram, int histogram_size, int j; for (j = 0; j < histogram_size; ++j) { if (histogram[j] != 0) { - const int count = + const uint32_t count = (histogram[j] < count_min) ? count_min : histogram[j]; tree[idx].total_count_ = count; tree[idx].value_ = j; @@ -211,7 +212,7 @@ static void GenerateOptimalTree(const int* const histogram, int histogram_size, if (tree_size > 1) { // Normal case. int tree_pool_size = 0; while (tree_size > 1) { // Finish when we have only one root. - int count; + uint32_t count; tree_pool[tree_pool_size++] = tree[tree_size - 1]; tree_pool[tree_pool_size++] = tree[tree_size - 2]; count = tree_pool[tree_pool_size - 1].total_count_ + @@ -402,7 +403,7 @@ static void ConvertBitDepthsToSymbols(HuffmanTreeCode* const tree) { // ----------------------------------------------------------------------------- // Main entry point -void VP8LCreateHuffmanTree(int* const histogram, int tree_depth_limit, +void VP8LCreateHuffmanTree(uint32_t* const histogram, int tree_depth_limit, uint8_t* const buf_rle, HuffmanTree* const huff_tree, HuffmanTreeCode* const huff_code) { diff --git a/src/utils/huffman_encode.h b/src/utils/huffman_encode.h index 44e09128..91aa18f4 100644 --- a/src/utils/huffman_encode.h +++ b/src/utils/huffman_encode.h @@ -36,7 +36,7 @@ typedef struct { // Struct to represent the Huffman tree. // TODO(vikasa): Add comment for the fields of the Struct. typedef struct { - int total_count_; + uint32_t total_count_; int value_; int pool_index_left_; // Index for the left sub-tree. int pool_index_right_; // Index for the right sub-tree. @@ -50,7 +50,7 @@ int VP8LCreateCompressedHuffmanTree(const HuffmanTreeCode* const tree, // Create an optimized tree, and tokenize it. // 'buf_rle' and 'huff_tree' are pre-allocated and the 'tree' is the constructed // huffman code tree. -void VP8LCreateHuffmanTree(int* const histogram, int tree_depth_limit, +void VP8LCreateHuffmanTree(uint32_t* const histogram, int tree_depth_limit, uint8_t* const buf_rle, HuffmanTree* const huff_tree, HuffmanTreeCode* const tree);