webvttparser: added LineReader class

Previously the Parser class had an internal function to parse the character stream into separate lines. This functionality was separated out into its own class, LineReader, in order to make this functionality available to clients too. Change-Id: Ic5a1b0b73d7a253cf21cb6b4804b4941fd69c8ab
2012-09-28 15:32:40 -07:00
parent 4f494f6dd4
commit 49078292b4
2 changed files with 224 additions and 214 deletions
--- a/webvttparser.cc
+++ b/webvttparser.cc
@@ -21,212 +21,13 @@ enum {
  kCR = '\x0D'
 };

-Reader::Reader() {
-}
-
 Reader::~Reader() {
 }

-Parser::Parser(Reader* r) : reader_(r), unget_(-1) {
+LineReader::~LineReader() {
 }

-int Parser::Init() {
-  int e = ParseBOM();
-
-  if (e < 0)  // error
-    return e;
-
-  if (e > 0)  // EOF
-    return -1;
-
-  // Parse "WEBVTT".  We read from the stream one character at-a-time, in
-  // order to defend against non-WebVTT streams (e.g. binary files) that don't
-  // happen to comprise lines of text demarcated with line terminators.
-
-  const char kId[] = "WEBVTT";
-
-  for (const char* p = kId; *p; ++p) {
-    char c;
-    e = GetChar(&c);
-
-    if (e < 0)  // error
-      return e;
-
-    if (e > 0)  // EOF
-      return -1;
-
-    if (c != *p)
-      return -1;
-  }
-
-  string line;
-
-  e = ParseLine(&line);
-
-  if (e < 0)  // error
-    return e;
-
-  if (e > 0)  // EOF
-    return 0;  // weird but valid
-
-  if (!line.empty()) {
-    // Parse optional characters that follow "WEBVTT"
-
-    const char c = line[0];
-
-    if (c != kSPACE && c != kTAB)
-      return -1;
-  }
-
-  // The WebVTT spec requires that the "WEBVTT" line
-  // be followed by an empty line (to separate it from
-  // first cue).
-
-  e = ParseLine(&line);
-
-  if (e < 0)  // error
-    return e;
-
-  if (e > 0)  // EOF
-    return 0;  // weird but we allow it
-
-  if (!line.empty())
-    return -1;
-
-  return 0;  // success
-}
-
-int Parser::Parse(Cue* cue) {
-  if (cue == NULL)
-    return -1;
-
-  // Parse first non-blank line
-
-  string line;
-  int e;
-
-  for (;;) {
-    e = ParseLine(&line);
-
-    if (e)  // EOF is OK here
-      return e;
-
-    if (!line.empty())
-      break;
-  }
-
-  // A WebVTT cue comprises an optional cue identifier line followed
-  // by a (non-optional) timings line.  You determine whether you have
-  // a timings line by scanning for the arrow token, the lexeme of which
-  // may not appear in the cue identifier line.
-
-  const char kArrow[] = "-->";
-  string::size_type arrow_pos = line.find(kArrow);
-
-  if (arrow_pos != string::npos) {
-    // We found a timings line, which implies that we don't have a cue
-    // identifier.
-
-    cue->identifier.clear();
-  } else {
-    // We did not find a timings line, so we assume that we have a cue
-    // identifier line, and then try again to find the cue timings on
-    // the next line.
-
-    cue->identifier.swap(line);
-
-    e = ParseLine(&line);
-
-    if (e < 0)  // error
-      return e;
-
-    if (e > 0)  // EOF
-      return -1;
-
-    arrow_pos = line.find(kArrow);
-
-    if (arrow_pos == string::npos)  // not a timings line
-      return -1;
-  }
-
-  e = ParseTimingsLine(&line,
-                       arrow_pos,
-                       &cue->start_time,
-                       &cue->stop_time,
-                       &cue->settings);
-
-  if (e)  // error
-    return e;
-
-  // The cue payload comprises all the non-empty
-  // lines that follow the timings line.
-
-  Cue::payload_t& p = cue->payload;
-  p.clear();
-
-  for (;;) {
-    e = ParseLine(&line);
-
-    if (e < 0)  // error
-      return e;
-
-    if (line.empty())
-      break;
-
-    p.push_back(line);
-  }
-
-  if (p.empty())
-    return -1;
-
-  return 0;  // success
-}
-
-int Parser::GetChar(char* c) {
-  if (unget_ >= 0) {
-    *c = static_cast<char>(unget_);
-    unget_ = -1;
-    return 0;
-  }
-
-  return reader_->GetChar(c);
-}
-
-void Parser::UngetChar(char c) {
-  unget_ = static_cast<unsigned char>(c);
-}
-
-int Parser::ParseBOM() {
-  // Explanation of UTF-8 BOM:
-  // http://en.wikipedia.org/wiki/Byte_order_mark
-
-  static const char BOM[] = "\xEF\xBB\xBF";  // UTF-8 BOM
-
-  for (int i = 0; i < 3; ++i) {
-    char c;
-    int e = GetChar(&c);
-
-    if (e < 0)  // error
-      return e;
-
-    if (e > 0)  // EOF
-      return 1;
-
-    if (c != BOM[i]) {
-      if (i == 0) {  // we don't have a BOM
-        UngetChar(c);
-        return 0;  // success
-      }
-
-      // We started a BOM, so we must finish the BOM.
-      return -1;  // error
-    }
-  }
-
-  return 0;  // success
-}
-
-int Parser::ParseLine(string* line_ptr) {
+int LineReader::GetLine(string* line_ptr) {
  if (line_ptr == NULL)
    return -1;

@@ -299,6 +100,208 @@ int Parser::ParseLine(string* line_ptr) {
  return 0;
 }

+Parser::Parser(Reader* r) : reader_(r), unget_(-1) {
+}
+
+Parser::~Parser() {
+}
+
+int Parser::Init() {
+  int e = ParseBOM();
+
+  if (e < 0)  // error
+    return e;
+
+  if (e > 0)  // EOF
+    return -1;
+
+  // Parse "WEBVTT".  We read from the stream one character at-a-time, in
+  // order to defend against non-WebVTT streams (e.g. binary files) that don't
+  // happen to comprise lines of text demarcated with line terminators.
+
+  const char kId[] = "WEBVTT";
+
+  for (const char* p = kId; *p; ++p) {
+    char c;
+    e = GetChar(&c);
+
+    if (e < 0)  // error
+      return e;
+
+    if (e > 0)  // EOF
+      return -1;
+
+    if (c != *p)
+      return -1;
+  }
+
+  string line;
+
+  e = GetLine(&line);
+
+  if (e < 0)  // error
+    return e;
+
+  if (e > 0)  // EOF
+    return 0;  // weird but valid
+
+  if (!line.empty()) {
+    // Parse optional characters that follow "WEBVTT"
+
+    const char c = line[0];
+
+    if (c != kSPACE && c != kTAB)
+      return -1;
+  }
+
+  // The WebVTT spec requires that the "WEBVTT" line
+  // be followed by an empty line (to separate it from
+  // first cue).
+
+  e = GetLine(&line);
+
+  if (e < 0)  // error
+    return e;
+
+  if (e > 0)  // EOF
+    return 0;  // weird but we allow it
+
+  if (!line.empty())
+    return -1;
+
+  return 0;  // success
+}
+
+int Parser::Parse(Cue* cue) {
+  if (cue == NULL)
+    return -1;
+
+  // Parse first non-blank line
+
+  string line;
+  int e;
+
+  for (;;) {
+    e = GetLine(&line);
+
+    if (e)  // EOF is OK here
+      return e;
+
+    if (!line.empty())
+      break;
+  }
+
+  // A WebVTT cue comprises an optional cue identifier line followed
+  // by a (non-optional) timings line.  You determine whether you have
+  // a timings line by scanning for the arrow token, the lexeme of which
+  // may not appear in the cue identifier line.
+
+  const char kArrow[] = "-->";
+  string::size_type arrow_pos = line.find(kArrow);
+
+  if (arrow_pos != string::npos) {
+    // We found a timings line, which implies that we don't have a cue
+    // identifier.
+
+    cue->identifier.clear();
+  } else {
+    // We did not find a timings line, so we assume that we have a cue
+    // identifier line, and then try again to find the cue timings on
+    // the next line.
+
+    cue->identifier.swap(line);
+
+    e = GetLine(&line);
+
+    if (e < 0)  // error
+      return e;
+
+    if (e > 0)  // EOF
+      return -1;
+
+    arrow_pos = line.find(kArrow);
+
+    if (arrow_pos == string::npos)  // not a timings line
+      return -1;
+  }
+
+  e = ParseTimingsLine(&line,
+                       arrow_pos,
+                       &cue->start_time,
+                       &cue->stop_time,
+                       &cue->settings);
+
+  if (e)  // error
+    return e;
+
+  // The cue payload comprises all the non-empty
+  // lines that follow the timings line.
+
+  Cue::payload_t& p = cue->payload;
+  p.clear();
+
+  for (;;) {
+    e = GetLine(&line);
+
+    if (e < 0)  // error
+      return e;
+
+    if (line.empty())
+      break;
+
+    p.push_back(line);
+  }
+
+  if (p.empty())
+    return -1;
+
+  return 0;  // success
+}
+
+int Parser::GetChar(char* c) {
+  if (unget_ >= 0) {
+    *c = static_cast<char>(unget_);
+    unget_ = -1;
+    return 0;
+  }
+
+  return reader_->GetChar(c);
+}
+
+void Parser::UngetChar(char c) {
+  unget_ = static_cast<unsigned char>(c);
+}
+
+int Parser::ParseBOM() {
+  // Explanation of UTF-8 BOM:
+  // http://en.wikipedia.org/wiki/Byte_order_mark
+
+  static const char BOM[] = "\xEF\xBB\xBF";  // UTF-8 BOM
+
+  for (int i = 0; i < 3; ++i) {
+    char c;
+    int e = GetChar(&c);
+
+    if (e < 0)  // error
+      return e;
+
+    if (e > 0)  // EOF
+      return 1;
+
+    if (c != BOM[i]) {
+      if (i == 0) {  // we don't have a BOM
+        UngetChar(c);
+        return 0;  // success
+      }
+
+      // We started a BOM, so we must finish the BOM.
+      return -1;  // error
+    }
+  }
+
+  return 0;  // success
+}
+
 int Parser::ParseTimingsLine(
    string* line_ptr,
    string::size_type arrow_pos,
--- a/webvttparser.h
+++ b/webvttparser.h
@@ -19,14 +19,26 @@ class Reader {
  // Fetch a character from the stream. Return
  // negative if error, positive if end-of-stream,
  // and 0 if a character is available.
-  //
  virtual int GetChar(char* c) = 0;

 protected:
-  Reader();
  virtual ~Reader();
 };

+class LineReader : protected Reader {
+ public:
+  // Consume a line of text from the stream, stripping off
+  // the line terminator characters.  Returns negative if error,
+  // 0 on success, and positive at end-of-stream.
+  int GetLine(std::string* line);
+
+ protected:
+  virtual ~LineReader();
+
+  // Puts a character back into the stream.
+  virtual void UngetChar(char c) = 0;
+};
+
 // As measured in thousandths of a second,
 // e.g. a duration of 1 equals 0.001 seconds,
 // and a duration of 1000 equals 1 second.
@@ -72,9 +84,10 @@ struct Cue {
  payload_t payload;
 };

-class Parser {
+class Parser : private LineReader {
 public:
  explicit Parser(Reader* r);
+  virtual ~Parser();

  // Pre-parse enough of the stream to determine whether
  // this is really a WEBVTT file. Returns 0 on success,
@@ -88,22 +101,16 @@ class Parser {

 private:
  // Returns the next character in the stream, using the look-back character
-  // if present.
-  int GetChar(char* c);
+  // if present (as per Reader::GetChar).
+  virtual int GetChar(char* c);

-  // Puts a character back into the stream.
-  void UngetChar(char c);
+  // Puts a character back into the stream (as per LineReader::UngetChar).
+  virtual void UngetChar(char c);

  // Check for presence of a UTF-8 BOM in the stream.  Returns
  // negative if error, 0 on success, and positive at end-of-stream.
  int ParseBOM();

-  // Consume a line of text from the stream, stripping off
-  // the line terminator characters.  Returns negative if error,
-  // 0 on success, and positive at end-of-stream.
-  //
-  int ParseLine(std::string* line);
-
  // Parse the distinguished "cue timings" line, which includes the start
  // and stop times and settings.  Argument |line| contains the complete
  // line of text (as returned by ParseLine()), which the function is free