Remove ReadOneBit() and ReadSymbolUnsafe()

Simplify and re-organize the VP8L bit-reader functions
(e.g.: the 40-bit look-ahead code wasn't helping much)

Speed-up with LBITS=64, on arm7-a:

=> before:
./dwebp_justify_24_neon -v bryce_ll.webp
Time to decode picture: 11.393s
File bryce_ll.webp can be decoded (dimensions: 11158 x 2156).
...

=> after (LBITS=64):	Time to decode picture: 9.953s

Making the VP8L bit-reader work in 32-bit mode is going to be
harder (because we need to be able to read two symbols
at a time, each with a max length of 15 bits)

Change-Id: I89746fb103b87b5e2fd40a3208a6fbc584b88297
This commit is contained in:
skal 2013-02-20 00:13:23 +01:00
parent b7490f8553
commit 1667bded67
3 changed files with 53 additions and 93 deletions

View File

@ -149,29 +149,21 @@ static WEBP_INLINE int PlaneCodeToDistance(int xsize, int plane_code) {
//------------------------------------------------------------------------------
// Decodes the next Huffman code from bit-stream.
// FillBitWindow(br) needs to be called at minimum every second call
// to ReadSymbolUnsafe.
static int ReadSymbolUnsafe(const HuffmanTree* tree, VP8LBitReader* const br) {
const HuffmanTreeNode* node = tree->root_;
assert(node != NULL);
while (!HuffmanTreeNodeIsLeaf(node)) {
node = HuffmanTreeNextNode(node, VP8LReadOneBitUnsafe(br));
}
return node->symbol_;
}
// to ReadSymbol, in order to pre-fetch enough bits.
static WEBP_INLINE int ReadSymbol(const HuffmanTree* tree,
VP8LBitReader* const br) {
const int read_safe = (br->pos_ + 8 > br->len_);
if (!read_safe) {
return ReadSymbolUnsafe(tree, br);
} else {
const HuffmanTreeNode* node = tree->root_;
assert(node != NULL);
while (!HuffmanTreeNodeIsLeaf(node)) {
node = HuffmanTreeNextNode(node, VP8LReadOneBit(br));
}
return node->symbol_;
const HuffmanTreeNode* node = tree->root_;
int num_bits = 0;
uint32_t bits;
bits = VP8LPrefetchBits(br);
assert(node != NULL);
while (!HuffmanTreeNodeIsLeaf(node)) {
node = HuffmanTreeNextNode(node, bits & 1);
bits >>= 1;
++num_bits;
}
VP8LDiscardBits(br, num_bits);
return node->symbol_;
}
static int ReadHuffmanCodeLengths(

View File

@ -113,6 +113,10 @@ int32_t VP8GetSignedValue(VP8BitReader* const br, int bits) {
#define MAX_NUM_BIT_READ 25
#define LBITS 64 // Number of bits prefetched.
#define WBITS 32 // Minimum number of bits needed after VP8LFillBitWindow.
#define LOG8_WBITS 4 // Number of bytes needed to store WBITS bits.
static const uint32_t kBitMask[MAX_NUM_BIT_READ] = {
0, 1, 3, 7, 15, 31, 63, 127, 255, 511, 1023, 2047, 4095, 8191, 16383, 32767,
65535, 131071, 262143, 524287, 1048575, 2097151, 4194303, 8388607, 16777215
@ -134,7 +138,7 @@ void VP8LInitBitReader(VP8LBitReader* const br,
br->eos_ = 0;
br->error_ = 0;
for (i = 0; i < sizeof(br->val_) && i < br->len_; ++i) {
br->val_ |= ((uint64_t)br->buf_[br->pos_]) << (8 * i);
br->val_ |= ((vp8l_val_t)br->buf_[br->pos_]) << (8 * i);
++br->pos_;
}
}
@ -149,91 +153,56 @@ void VP8LBitReaderSetBuffer(VP8LBitReader* const br,
br->len_ = len;
}
// If not at EOS, reload up to LBITS byte-by-byte
static void ShiftBytes(VP8LBitReader* const br) {
while (br->bit_pos_ >= 8 && br->pos_ < br->len_) {
br->val_ >>= 8;
br->val_ |= ((uint64_t)br->buf_[br->pos_]) << 56;
br->val_ |= ((vp8l_val_t)br->buf_[br->pos_]) << (LBITS - 8);
++br->pos_;
br->bit_pos_ -= 8;
}
}
void VP8LFillBitWindow(VP8LBitReader* const br) {
if (br->bit_pos_ >= 32) {
#if defined(__x86_64__) || defined(_M_X64)
if (br->pos_ + 8 < br->len_) {
br->val_ >>= 32;
if (br->bit_pos_ >= WBITS) {
#if (defined(__x86_64__) || defined(_M_X64))
if (br->pos_ + sizeof(br->val_) < br->len_) {
br->val_ >>= WBITS;
br->bit_pos_ -= WBITS;
// The expression below needs a little-endian arch to work correctly.
// This gives a large speedup for decoding speed.
br->val_ |= *(const uint64_t *)(br->buf_ + br->pos_) << 32;
br->pos_ += 4;
br->bit_pos_ -= 32;
} else {
// Slow path.
ShiftBytes(br);
br->val_ |= *(const vp8l_val_t*)(br->buf_ + br->pos_) << (LBITS - WBITS);
br->pos_ += LOG8_WBITS;
return;
}
#else
// Always the slow path.
ShiftBytes(br);
#endif
}
if (br->pos_ == br->len_ && br->bit_pos_ == 64) {
br->eos_ = 1;
}
}
uint32_t VP8LReadOneBit(VP8LBitReader* const br) {
const uint32_t val = (uint32_t)((br->val_ >> br->bit_pos_) & 1);
// Flag an error at end_of_stream.
if (!br->eos_) {
++br->bit_pos_;
if (br->bit_pos_ >= 32) {
ShiftBytes(br);
}
// After this last bit is read, check if eos needs to be flagged.
if (br->pos_ == br->len_ && br->bit_pos_ == 64) {
ShiftBytes(br); // Slow path.
if (br->pos_ == br->len_ && br->bit_pos_ == LBITS) {
br->eos_ = 1;
}
} else {
br->error_ = 1;
}
return val;
}
uint32_t VP8LReadBits(VP8LBitReader* const br, int n_bits) {
uint32_t val = 0;
assert(n_bits >= 0);
// Flag an error if end_of_stream or n_bits is more than allowed limit.
if (!br->eos_ && n_bits < MAX_NUM_BIT_READ) {
const uint32_t val =
(uint32_t)(br->val_ >> br->bit_pos_) & kBitMask[n_bits];
const int new_bits = br->bit_pos_ + n_bits;
br->bit_pos_ = new_bits;
// If this read is going to cross the read buffer, set the eos flag.
if (br->pos_ == br->len_) {
if ((br->bit_pos_ + n_bits) >= 64) {
if (new_bits >= LBITS) {
br->eos_ = 1;
if ((br->bit_pos_ + n_bits) > 64) return val;
}
}
val = (uint32_t)((br->val_ >> br->bit_pos_) & kBitMask[n_bits]);
br->bit_pos_ += n_bits;
if (br->bit_pos_ >= 40) {
if (br->pos_ + 5 < br->len_) {
br->val_ >>= 40;
br->val_ |=
(((uint64_t)br->buf_[br->pos_ + 0]) << 24) |
(((uint64_t)br->buf_[br->pos_ + 1]) << 32) |
(((uint64_t)br->buf_[br->pos_ + 2]) << 40) |
(((uint64_t)br->buf_[br->pos_ + 3]) << 48) |
(((uint64_t)br->buf_[br->pos_ + 4]) << 56);
br->pos_ += 5;
br->bit_pos_ -= 40;
}
if (br->bit_pos_ >= 8) {
ShiftBytes(br);
}
}
ShiftBytes(br);
return val;
} else {
br->error_ = 1;
return 0;
}
return val;
}
//------------------------------------------------------------------------------

View File

@ -258,14 +258,16 @@ static WEBP_INLINE int VP8GetSigned(VP8BitReader* const br, int v) {
// -----------------------------------------------------------------------------
// Bitreader for lossless format
typedef uint64_t vp8l_val_t; // right now, this bit-reader can only use 64bit.
typedef struct {
uint64_t val_;
const uint8_t* buf_;
size_t len_;
size_t pos_;
int bit_pos_;
int eos_;
int error_;
vp8l_val_t val_; // pre-fetched bits
const uint8_t* buf_; // input byte buffer
size_t len_; // buffer length
size_t pos_; // byte position in buf_
int bit_pos_; // current bit-reading position in val_
int eos_; // bitstream is finished
int error_; // an error occurred (buffer overflow attempt...)
} VP8LBitReader;
void VP8LInitBitReader(VP8LBitReader* const br,
@ -281,17 +283,14 @@ void VP8LBitReaderSetBuffer(VP8LBitReader* const br,
// Flags eos if this read attempt is going to cross the read buffer.
uint32_t VP8LReadBits(VP8LBitReader* const br, int n_bits);
// Reads one bit from Read Buffer. Flags an error in case end_of_stream.
// Flags eos after reading last bit from the buffer.
uint32_t VP8LReadOneBit(VP8LBitReader* const br);
// Returns the prefetched bits starting at the current bit position (truncated
// to 32 bits), so they can be looked up. The bits are not consumed: bit_pos_
// is left unchanged; call VP8LDiscardBits() to advance past the bits used.
static WEBP_INLINE uint32_t VP8LPrefetchBits(VP8LBitReader* const br) {
return (uint32_t)(br->val_ >> br->bit_pos_);
}
// VP8LReadOneBitUnsafe is faster than VP8LReadOneBit, but it can be called only
// 32 times after the last VP8LFillBitWindow. Any subsequent calls
// (without VP8LFillBitWindow) will return invalid data.
static WEBP_INLINE uint32_t VP8LReadOneBitUnsafe(VP8LBitReader* const br) {
const uint32_t val = (uint32_t)((br->val_ >> br->bit_pos_) & 1);
++br->bit_pos_;
return val;
// Discards 'num_bits' bits from the cache by advancing bit_pos_.
// This function itself performs no refill and no end-of-stream check.
static WEBP_INLINE void VP8LDiscardBits(VP8LBitReader* const br, int num_bits) {
br->bit_pos_ += num_bits;
}
// Advances the Read buffer by 4 bytes to make room for reading next 32 bits.