VP8LBitWriter: use a bit-accumulator

* simplify the endian logic
* remove the need for memset()
* write 16 or 32 bits at a time (likely aligned)

Makes the code a bit faster on ARM (~1%)

Change-Id: I650bc5654e8d0b0454318b7a78206b301c5f6c2c
2014-02-11 09:12:45 -08:00
parent 3f40b4a581
commit bf182e837e
2 changed files with 92 additions and 70 deletions
--- a/src/utils/bit_writer.c
+++ b/src/utils/bit_writer.c
@@ -194,19 +194,54 @@ void VP8BitWriterWipeOut(VP8BitWriter* const bw) {
 //------------------------------------------------------------------------------
 // VP8LBitWriter

+// This is the minimum number of bytes by which the memory buffer is guaranteed
+// to grow when extra space is needed.
+#define MIN_EXTRA_SIZE  (32768ULL)
+
+#define VP8L_WRITER_BYTES ((int)sizeof(vp8l_wtype_t))
+#define VP8L_WRITER_BITS (VP8L_WRITER_BYTES * 8)
+
+// Endian-specific htoleXX() definitions.
+// TODO(skal): move this to config.h, and collect all the endian-related code
+// in a proper .h file
+#if defined(_WIN32)
+#if !defined(_M_PPC)
+#define htole32(x) (x)
+#define htole16(x) (x)
+#else     // PPC is BIG_ENDIAN
+#include <stdlib.h>
+#define htole32(x) (_byteswap_ulong((unsigned long)(x)))
+#define htole16(x) (_byteswap_ushort((unsigned short)(x)))
+#endif    // _M_PPC
+#elif defined(__OpenBSD__) || defined(__NetBSD__) || defined(__FreeBSD__) || \
+      defined(__DragonFly__)
+#include <sys/endian.h>
+#elif defined(__APPLE__)
+#include <libkern/OSByteOrder.h>
+#define htole32 OSSwapHostToLittleInt32
+#define htole16 OSSwapHostToLittleInt16
+#elif defined(__native_client__) && !defined(__GLIBC__)
+// NaCl without glibc is assumed to be little-endian
+#define htole32(x) (x)
+#define htole16(x) (x)
+#else     // pretty much all linux and/or glibc
+#include <endian.h>
+#endif
+
 // Returns 1 on success.
 static int VP8LBitWriterResize(VP8LBitWriter* const bw, size_t extra_size) {
  uint8_t* allocated_buf;
  size_t allocated_size;
-  const size_t current_size = VP8LBitWriterNumBytes(bw);
+  const size_t max_bytes = bw->end_ - bw->buf_;
+  const size_t current_size = bw->cur_ - bw->buf_;
  const uint64_t size_required_64b = (uint64_t)current_size + extra_size;
  const size_t size_required = (size_t)size_required_64b;
  if (size_required != size_required_64b) {
    bw->error_ = 1;
    return 0;
  }
-  if (bw->max_bytes_ > 0 && size_required <= bw->max_bytes_) return 1;
-  allocated_size = (3 * bw->max_bytes_) >> 1;
+  if (max_bytes > 0 && size_required <= max_bytes) return 1;
+  allocated_size = (3 * max_bytes) >> 1;
  if (allocated_size < size_required) allocated_size = size_required;
  // make allocated size multiple of 1k
  allocated_size = (((allocated_size >> 10) + 1) << 10);
@@ -215,11 +250,13 @@ static int VP8LBitWriterResize(VP8LBitWriter* const bw, size_t extra_size) {
    bw->error_ = 1;
    return 0;
  }
-  memcpy(allocated_buf, bw->buf_, current_size);
+  if (current_size > 0) {
+    memcpy(allocated_buf, bw->buf_, current_size);
+  }
  free(bw->buf_);
  bw->buf_ = allocated_buf;
-  bw->max_bytes_ = allocated_size;
-  memset(allocated_buf + current_size, 0, allocated_size - current_size);
+  bw->cur_ = bw->buf_ + current_size;
+  bw->end_ = bw->buf_ + allocated_size;
  return 1;
 }

@@ -236,46 +273,37 @@ void VP8LBitWriterDestroy(VP8LBitWriter* const bw) {
 }

 void VP8LWriteBits(VP8LBitWriter* const bw, int n_bits, uint32_t bits) {
-  if (n_bits < 1) return;
-#if !defined(__BIG_ENDIAN__)
-  // Technically, this branch of the code can write up to 25 bits at a time,
-  // but in prefix encoding, the maximum number of bits written is 18 at a time.
-  {
-    uint8_t* const p = &bw->buf_[bw->bit_pos_ >> 3];
-    uint32_t v = *(const uint32_t*)p;
-    v |= bits << (bw->bit_pos_ & 7);
-    *(uint32_t*)p = v;
-    bw->bit_pos_ += n_bits;
-  }
-#else  // BIG_ENDIAN
-  {
-    uint8_t* p = &bw->buf_[bw->bit_pos_ >> 3];
-    const int bits_reserved_in_first_byte = bw->bit_pos_ & 7;
-    const int bits_left_to_write = n_bits - 8 + bits_reserved_in_first_byte;
-    // implicit & 0xff is assumed for uint8_t arithmetic
-    *p++ |= bits << bits_reserved_in_first_byte;
-    bits >>= 8 - bits_reserved_in_first_byte;
-    if (bits_left_to_write >= 1) {
-      *p++ = bits;
-      bits >>= 8;
-      if (bits_left_to_write >= 9) {
-        *p++ = bits;
-        bits >>= 8;
+  if (n_bits <= 0) return;
+  bw->bits_ |= (vp8l_atype_t)bits << bw->used_;
+  bw->used_ += n_bits;
+  if (bw->used_ > VP8L_WRITER_BITS) {
+    if (bw->cur_ + VP8L_WRITER_BYTES > bw->end_) {
+      const uint64_t extra_size = (bw->end_ - bw->buf_) + MIN_EXTRA_SIZE;
+      if (extra_size != (size_t)extra_size ||
+          !VP8LBitWriterResize(bw, (size_t)extra_size)) {
+        bw->cur_ = bw->buf_;
+        bw->error_ = 1;
+        return;
      }
    }
-    assert(n_bits <= 25);
-    *p = bits;
-    bw->bit_pos_ += n_bits;
+    *(vp8l_wtype_t*)bw->cur_ = (vp8l_wtype_t)WSWAP((vp8l_wtype_t)bw->bits_);
+    bw->cur_ += VP8L_WRITER_BYTES;
+    bw->bits_ >>= VP8L_WRITER_BITS;
+    bw->used_ -= VP8L_WRITER_BITS;
  }
-#endif
-  if ((bw->bit_pos_ >> 3) > (bw->max_bytes_ - 8)) {
-    const uint64_t extra_size = 32768ULL + bw->max_bytes_;
-    if (extra_size != (size_t)extra_size ||
-        !VP8LBitWriterResize(bw, (size_t)extra_size)) {
-      bw->bit_pos_ = 0;
-      bw->error_ = 1;
+}
+
+uint8_t* VP8LBitWriterFinish(VP8LBitWriter* const bw) {
+  // flush leftover bits
+  if (VP8LBitWriterResize(bw, (bw->used_ + 7) >> 3)) {
+    while (bw->used_ > 0) {
+      *bw->cur_++ = bw->bits_;   // & 0xff is implied here
+      bw->bits_ >>= 8;
+      bw->used_ -= 8;
    }
+    bw->used_ = 0;
  }
+  return bw->buf_;
 }

 //------------------------------------------------------------------------------
--- a/src/utils/bit_writer.h
+++ b/src/utils/bit_writer.h
@@ -68,51 +68,45 @@ static WEBP_INLINE size_t VP8BitWriterSize(const VP8BitWriter* const bw) {

 //------------------------------------------------------------------------------
 // VP8LBitWriter
-// TODO(vikasa): VP8LBitWriter is copied as-is from lossless code. There's scope
-// of re-using VP8BitWriter. Will evaluate once basic lossless encoder is
-// implemented.
+
+#if defined(__x86_64__) || defined(_M_X64)   // 64bit
+typedef uint64_t vp8l_atype_t;   // accumulator type
+typedef uint32_t vp8l_wtype_t;   // writing type
+#define WSWAP htole32
+#else
+typedef uint32_t vp8l_atype_t;
+typedef uint16_t vp8l_wtype_t;
+#define WSWAP htole16
+#endif

 typedef struct {
-  uint8_t* buf_;
-  size_t bit_pos_;
-  size_t max_bytes_;
+  vp8l_atype_t bits_;   // bit accumulator
+  int          used_;   // number of bits used in accumulator
+  uint8_t*     buf_;    // start of buffer
+  uint8_t*     cur_;    // current write position
+  uint8_t*     end_;    // end of buffer

-  // After all bits are written, the caller must observe the state of
-  // error_. A value of 1 indicates that a memory allocation failure
-  // has happened during bit writing. A value of 0 indicates successful
+  // After all bits are written (VP8LBitWriterFinish()), the caller must observe
+  // the state of error_. A value of 1 indicates that a memory allocation
+  // failure has happened during bit writing. A value of 0 indicates successful
  // writing of bits.
  int error_;
 } VP8LBitWriter;

 static WEBP_INLINE size_t VP8LBitWriterNumBytes(VP8LBitWriter* const bw) {
-  return (bw->bit_pos_ + 7) >> 3;
+  return (bw->cur_ - bw->buf_) + ((bw->used_ + 7) >> 3);
 }

-static WEBP_INLINE uint8_t* VP8LBitWriterFinish(VP8LBitWriter* const bw) {
-  return bw->buf_;
-}
+uint8_t* VP8LBitWriterFinish(VP8LBitWriter* const bw);

 // Returns 0 in case of memory allocation error.
 int VP8LBitWriterInit(VP8LBitWriter* const bw, size_t expected_size);

 void VP8LBitWriterDestroy(VP8LBitWriter* const bw);

-// This function writes bits into bytes in increasing addresses, and within
-// a byte least-significant-bit first.
-//
-// The function can write up to 16 bits in one go with WriteBits
-// Example: let's assume that 3 bits (Rs below) have been written already:
-//
-// BYTE-0     BYTE+1       BYTE+2
-//
-// 0000 0RRR    0000 0000    0000 0000
-//
-// Now, we could write 5 or less bits in MSB by just sifting by 3
-// and OR'ing to BYTE-0.
-//
-// For n bits, we take the last 5 bytes, OR that with high bits in BYTE-0,
-// and locate the rest in BYTE+1 and BYTE+2.
-//
+// This function writes bits into bytes in increasing addresses (little endian),
+// and within a byte least-significant-bit first.
+// The function can write up to 8*sizeof(vp8l_wtype_t) bits in one go.
 // VP8LBitWriter's error_ flag is set in case of  memory allocation error.
 void VP8LWriteBits(VP8LBitWriter* const bw, int n_bits, uint32_t bits);