added webvtt parser

Change-Id: Icef6d484e8fe6e2c63dc69ab02b6ab37ffcabbd8
2012-07-25 15:15:33 -07:00 · 2012-07-25 15:15:33 -07:00 · adebb53754
commit adebb53754
parent a320f5be63
2 changed files with 830 additions and 0 deletions
--- a/webvttparser.cc
+++ b/webvttparser.cc
@ -0,0 +1,671 @@
+// Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the LICENSE file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS.  All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+
+#include "webvttparser.h"  // NOLINT
+#include <climits>
+
+using std::string;
+
+namespace libwebvtt {
+
+// Character codes that the scanner compares against.  kNUL is also the
+// sentinel that the timings-line parser writes into its line buffer to
+// mark where scanning must stop.
+enum {
+  kNUL = 0x00,
+  kTAB = 0x09,
+  kLF = 0x0A,
+  kCR = 0x0D,
+  kSPACE = ' '
+};
+
+// Reader carries no state of its own here, so both its constructor and
+// its destructor have empty bodies.
+Reader::Reader() {}
+
+Reader::~Reader() {}
+
+// Binds the parser to reader |r| and starts with an empty look-back slot
+// (a negative unget_ means that no character has been pushed back).
+Parser::Parser(Reader* r) : reader_(r), unget_(-1) {}
+
+// Verifies that the stream starts like a WebVTT file: an optional UTF-8
+// BOM, the literal "WEBVTT", an optional remainder of that line (which
+// must begin with a space or a tab), and then an empty line.  Returns 0
+// on success and a negative value on error.  Reaching end-of-stream once
+// the signature has been matched is accepted as success.
+int Parser::Init() {
+  int status = ParseBOM();
+
+  if (status != 0)  // propagate errors; EOF this early is an error too
+    return (status < 0) ? status : -1;
+
+  // Match the signature one character at a time instead of reading a
+  // whole line first, so that a non-WebVTT stream (e.g. a binary file
+  // that contains no line terminators) is rejected right away.
+
+  static const char kSignature[] = "WEBVTT";
+
+  for (const char* sig = kSignature; *sig != '\0'; ++sig) {
+    char ch;
+    status = GetChar(&ch);
+
+    if (status < 0)  // error
+      return status;
+
+    if (status > 0 || ch != *sig)  // EOF, or not the expected character
+      return -1;
+  }
+
+  // Anything that follows "WEBVTT" on the first line is optional, but
+  // if it is present it must start with a space or a tab.
+
+  string text;
+  status = ParseLine(&text);
+
+  if (status < 0)  // error
+    return status;
+
+  if (status > 0)  // EOF: weird but valid
+    return 0;
+
+  if (!text.empty() && text[0] != kSPACE && text[0] != kTAB)
+    return -1;
+
+  // The WebVTT spec requires an empty line between the "WEBVTT" line
+  // and the first cue.
+
+  status = ParseLine(&text);
+
+  if (status < 0)  // error
+    return status;
+
+  if (status > 0)  // EOF: weird but we allow it
+    return 0;
+
+  return text.empty() ? 0 : -1;
+}
+
+// Parses the next cue from the stream into |cue|: an optional identifier
+// line, the timings line (the one that contains the "-->" token), and one
+// or more payload lines terminated by an empty line or by end-of-stream.
+//
+// Returns 0 when a complete cue was parsed, a positive value when the
+// stream ends before any cue starts, and a negative value on error (a NULL
+// |cue|, a missing or malformed timings line, or a cue without payload).
+int Parser::Parse(Cue* cue) {
+  if (cue == NULL)
+    return -1;
+
+  // Parse first non-blank line
+
+  string line;
+  int e;
+
+  for (;;) {
+    e = ParseLine(&line);
+
+    if (e)  // error, or EOF (which is fine here: there are no more cues)
+      return e;
+
+    if (!line.empty())
+      break;
+  }
+
+  // A WebVTT cue comprises an optional cue identifier line followed
+  // by a (non-optional) timings line.  You determine whether you have
+  // a timings line by scanning for the arrow token, the lexeme of which
+  // may not appear in the cue identifier line.
+
+  string::size_type off = line.find("-->");
+
+  if (off != string::npos) {  // timings line
+    cue->identifier.clear();
+  } else {
+    cue->identifier.swap(line);
+
+    e = ParseLine(&line);
+
+    if (e < 0)  // error
+      return e;
+
+    // An identifier line must be followed by a timings line.  If the
+    // stream ends here the cue is truncated, which is an error rather
+    // than a clean end-of-stream.
+
+    if (e > 0)  // EOF
+      return -1;
+
+    off = line.find("-->");
+
+    if (off == string::npos)  // not a timings line
+      return -1;
+  }
+
+  e = ParseTimingsLine(line,
+                       off,
+                       &cue->start_time,
+                       &cue->stop_time,
+                       &cue->settings);
+
+  if (e)
+    return e;
+
+  // The cue payload comprises all the non-empty
+  // lines that follow the timings line.
+
+  Cue::payload_t& p = cue->payload;
+  p.clear();
+
+  for (;;) {
+    e = ParseLine(&line);
+
+    if (e < 0)  // error
+      return e;
+
+    // ParseLine() reports EOF only together with an empty line, so this
+    // test ends the payload both at a blank line and at end-of-stream.
+
+    if (line.empty())
+      break;
+
+    p.push_back(line);
+  }
+
+  if (p.empty())  // a cue must have at least one payload line
+    return -1;
+
+  return 0;  // success
+}
+
+// Fetches the next character into |*c|.  A character that was handed to
+// UngetChar() is delivered first; otherwise the request is forwarded to
+// the reader and its status is returned unchanged.
+int Parser::GetChar(char* c) {
+  if (unget_ < 0)  // nothing has been pushed back
+    return reader_->GetChar(c);
+
+  *c = static_cast<char>(unget_);
+  unget_ = -1;  // the look-back slot is empty again
+  return 0;
+}
+
+// Stores |c| in the one-character look-back slot.  The value is converted
+// through unsigned char so that it is never negative and therefore cannot
+// be mistaken for the "empty" marker (-1).
+void Parser::UngetChar(char c) {
+  const unsigned char byte = static_cast<unsigned char>(c);
+  unget_ = byte;
+}
+
+// Consumes an optional UTF-8 byte order mark from the start of the stream.
+// Returns 0 if a complete BOM was consumed or if there is no BOM at all
+// (the first character is then pushed back), a positive value at
+// end-of-stream, and a negative value on a read error or a partial BOM.
+int Parser::ParseBOM() {
+  // The UTF-8 encoding of the byte order mark; see
+  // http://en.wikipedia.org/wiki/Byte_order_mark
+
+  static const char kBom[] = "\xEF\xBB\xBF";
+  const int kBomLength = 3;
+
+  int matched = 0;
+
+  while (matched < kBomLength) {
+    char ch;
+    const int status = GetChar(&ch);
+
+    if (status < 0)  // error
+      return status;
+
+    if (status > 0)  // EOF
+      return 1;
+
+    if (ch == kBom[matched]) {
+      ++matched;
+      continue;
+    }
+
+    if (matched > 0)  // a BOM that has been started must be finished
+      return -1;
+
+    UngetChar(ch);  // no BOM: return the character to the stream
+    return 0;
+  }
+
+  return 0;  // the complete BOM was consumed
+}
+
+// Completes a line terminator whose first character |c| has already been
+// read.  The WebVTT spec allows LF, CR, and CR LF.  Returns 0 on success
+// (also when the stream ends right after a CR) and a negative value if
+// |c| is not a line terminator or if reading the stream fails.
+int Parser::ParseLineTerminator(char c) {
+  if (c != kLF && c != kCR)
+    return -1;  // not a line terminator at all
+
+  if (c == kLF)
+    return 0;  // a bare LF is a complete terminator
+
+  // We have a CR.  It may stand alone or be the first half of CR LF, so
+  // the next character in the stream has to be examined.
+
+  char next;
+  const int status = GetChar(&next);
+
+  if (status < 0)  // error
+    return status;
+
+  // At EOF there is nothing to give back.  Otherwise a LF is swallowed as
+  // part of the terminator, and any other character belongs to the next
+  // line and is returned to the stream.
+
+  if (status == 0 && next != kLF)
+    UngetChar(next);
+
+  return 0;  // success
+}
+
+// Reads one line of text into |*line|, without its line terminator.
+// Returns 0 if a line was read (possibly an empty one), a positive value
+// if the stream ended before any character of a line could be read, and a
+// negative value on error.
+int Parser::ParseLine(string* line) {
+  string& text = *line;
+  text.clear();
+
+  char ch;
+  int status;
+
+  while ((status = GetChar(&ch)) == 0) {
+    if (ch == kLF || ch == kCR) {
+      status = ParseLineTerminator(ch);
+      return (status < 0) ? status : 0;
+    }
+
+    text.push_back(ch);
+  }
+
+  if (status < 0)  // error
+    return status;
+
+  // EOF: a final line that lacks a terminator still counts as a line.
+  return text.empty() ? 1 : 0;
+}
+
+// Parses a cue timings line: start time, arrow token, stop time, and the
+// optional settings.  |arrow_pos| is the offset of the arrow token within
+// |line|.  |line| is modified: NUL sentinels are written at |arrow_pos|
+// and appended to the end, so that the scanners know where to stop.
+// Returns 0 on success and a negative value on error.
+int Parser::ParseTimingsLine(
+  string& line,
+  string::size_type arrow_pos,
+  Time* start_time,
+  Time* stop_time,
+  Cue::settings_t* settings) {
+  if (arrow_pos == string::npos || arrow_pos >= line.length())
+    return -1;
+
+  // Overwrite the first character of the arrow token with NUL, which
+  // separates the start time from the remainder of the line.
+
+  line[arrow_pos] = kNUL;
+
+  string::size_type pos = 0;
+  int status = ParseTime(line, pos, start_time);
+
+  if (status)
+    return status;
+
+  // Only spaces and tabs may appear between the start time and the
+  // arrow token; anything else is junk.
+
+  for (; line[pos] != kNUL; ++pos) {
+    if (line[pos] != kSPACE && line[pos] != kTAB)
+      return -1;
+  }
+
+  // Terminate the line with a NUL as well, then resume scanning just
+  // beyond the three characters of the arrow token.
+
+  line.push_back(kNUL);
+  pos = arrow_pos + 3;
+
+  status = ParseTime(line, pos, stop_time);
+
+  if (status)
+    return status;
+
+  // Whatever follows the stop time is the list of cue settings.
+  return ParseSettings(line, pos, settings);
+}
+
+// Parses one timestamp from |line|, starting at offset |idx| and skipping
+// any spaces and tabs that precede it, and stores the result in |*time|.
+// On success |idx| refers to the character that follows the timestamp,
+// which is guaranteed to be a NUL, a space, or a tab.
+//
+// The fractional part, when present, may have one to three digits and is
+// interpreted positionally (".5" is 500 ms, ".05" is 50 ms, ".005" is
+// 5 ms).  Returns 0 on success and a negative value on error; |*time| may
+// have been partially written when an error is returned.
+int Parser::ParseTime(
+  const string& line,
+  string::size_type& idx,
+  Time* time) {
+  //
+  // WebVTT timestamp syntax comes in three flavors:
+  //  SS[.sss]
+  //  MM:SS[.sss]
+  //  HH:MM:SS[.sss]
+
+  if (idx == string::npos || idx >= line.length())
+    return -1;
+
+  // Consume any whitespace that precedes the timestamp.
+
+  while (char c = line[idx]) {
+    if (c != kSPACE && c != kTAB)
+      break;
+    ++idx;
+  }
+
+  Time& t = *time;
+
+  // Parse a generic number value.  We don't know which component
+  // of the time we have yet, until we do more parsing.
+
+  int val = ParseNumber(line, idx);
+
+  if (val < 0)  // error
+    return val;
+
+  // The presence of a colon character indicates that we have
+  // an [HH:]MM:SS style syntax.
+
+  if (line[idx] == ':') {
+    // We have either HH:MM:SS or MM:SS
+
+    // The value we just parsed is either the hours or minutes.
+    // It must be followed by another number value (that is
+    // either minutes or seconds).
+
+    const int first_val = val;
+
+    ++idx;  // consume colon
+
+    // Parse second value
+
+    val = ParseNumber(line, idx);
+
+    if (val < 0)
+      return val;
+
+    if (val >= 60)  // either MM or SS
+      return -1;
+
+    if (line[idx] == ':') {
+      // We have HH:MM:SS
+
+      t.hours = first_val;
+      t.minutes = val;  // vetted above
+
+      ++idx;  // consume MM:SS colon
+
+      // We have parsed the hours and minutes.
+      // We must now parse the seconds.
+
+      val = ParseNumber(line, idx);
+
+      if (val < 0)
+        return val;
+
+      if (val >= 60)  // SS part of HH:MM:SS
+        return -1;
+
+      t.seconds = val;
+    } else {
+      // We have MM:SS
+
+      // The implication here is that the hour value was omitted
+      // from the timestamp (because it was 0).
+
+      if (first_val >= 60)  // minutes
+        return -1;
+
+      t.hours = 0;
+      t.minutes = first_val;
+      t.seconds = val;  // vetted above
+    }
+  } else {
+    // We have SS (only)
+
+    // The time is expressed as total number of seconds,
+    // so the seconds value has no upper bound.
+
+    t.seconds = val;
+
+    // Convert SS to HH:MM:SS
+
+    t.minutes = t.seconds / 60;
+    t.seconds -= t.minutes * 60;
+
+    t.hours = t.minutes / 60;
+    t.minutes -= t.hours * 60;
+  }
+
+  // We have parsed the hours, minutes, and seconds.
+  // We must now parse the milliseconds.
+
+  if (line[idx] != '.') {  // no milliseconds
+    t.milliseconds = 0;
+  } else {
+    ++idx;  // consume FULL STOP
+
+    const string::size_type first_digit = idx;
+
+    val = ParseNumber(line, idx);
+
+    if (val < 0)
+      return val;
+
+    // The digits denote a decimal fraction of a second, so their weight
+    // depends on how many of them there are, not on the numeric value:
+    // leading zeros are significant (".050" is 50 ms, not 500 ms).
+
+    const string::size_type digit_count = idx - first_digit;
+
+    if (digit_count > 3)  // finer than a millisecond
+      return -1;
+
+    for (string::size_type i = digit_count; i < 3; ++i)
+      val *= 10;
+
+    t.milliseconds = val;
+  }
+
+  // We have parsed the time proper.  We must check for any
+  // junk that immediately follows the time specifier.
+
+  const char c = line[idx];
+
+  if (c != kNUL && c != kSPACE && c != kTAB)
+    return -1;
+
+  return 0;  // success
+}
+
+// Parses the whitespace-separated NAME:VALUE pairs that follow the stop
+// time, starting at offset |idx| and stopping at the first NUL character.
+// |*settings| is cleared first and then receives one entry per pair.
+// Returns 0 on success and a negative value on error (empty name, empty
+// value, a pair without a colon, or a colon inside the value).
+int Parser::ParseSettings(
+  const string& line,
+  string::size_type idx,
+  Cue::settings_t* settings) {
+  settings->clear();
+
+  if (idx == string::npos || idx >= line.length())
+    return -1;
+
+  for (;;) {
+    // Skip the whitespace that precedes the next pair.
+
+    while (line[idx] == kSPACE || line[idx] == kTAB)
+      ++idx;
+
+    if (line[idx] == kNUL)  // nothing left to scan
+      return 0;  // success
+
+    // A pair starts here; append an empty entry and fill it in place.
+
+    settings->push_back(Setting());
+    Setting& setting = settings->back();
+
+    // The NAME part runs up to (but does not include) the colon.
+
+    for (char ch = line[idx]; ch != ':'; ch = line[++idx]) {
+      if (ch == kNUL || ch == kSPACE || ch == kTAB)
+        return -1;  // the pair ended before a colon was seen
+
+      setting.name.push_back(ch);
+    }
+
+    if (setting.name.empty())
+      return -1;
+
+    ++idx;  // consume colon
+
+    // The VALUE part runs up to the next whitespace or the final NUL.
+
+    for (char ch = line[idx];
+         ch != kNUL && ch != kSPACE && ch != kTAB;
+         ch = line[++idx]) {
+      if (ch == ':')  // suspicious when part of VALUE
+        return -1;    // TODO(matthewjheaney): verify this behavior
+
+      setting.value.push_back(ch);
+    }
+
+    if (setting.value.empty())
+      return -1;
+  }
+}
+
+// Parses a run of ASCII decimal digits from |line|, starting at offset
+// |idx|, and advances |idx| past the digits that were consumed.  Returns
+// the non-negative value, or a negative value if |idx| is out of range,
+// if no digit is present at |idx|, or if the value exceeds INT_MAX.
+int Parser::ParseNumber(const std::string& line,
+                        std::string::size_type& idx) {
+  const string::size_type length = line.length();
+
+  if (idx == string::npos || idx >= length)
+    return -1;
+
+  // Digits are recognized by explicit range comparison rather than with
+  // isdigit(): passing a negative char (any non-ASCII byte where char is
+  // signed) to isdigit() is undefined behavior, and the result would
+  // depend on the current locale.
+
+  if (line[idx] < '0' || line[idx] > '9')
+    return -1;
+
+  long long val = 0;  // NOLINT
+
+  while (idx < length && line[idx] >= '0' && line[idx] <= '9') {
+    val *= 10;
+    val += static_cast<int>(line[idx] - '0');
+
+    // Checking after every digit keeps the accumulator far below the
+    // range of long long, no matter how many digits follow.
+
+    if (val > INT_MAX)
+      return -1;
+
+    ++idx;
+  }
+
+  return static_cast<int>(val);
+}
+
+// Two times are equal when all four of their fields are equal.
+bool Time::operator==(const Time& rhs) const {
+  return hours == rhs.hours &&
+         minutes == rhs.minutes &&
+         seconds == rhs.seconds &&
+         milliseconds == rhs.milliseconds;
+}
+
+// Orders times field by field, from the most significant field (hours)
+// down to the least significant one (milliseconds); the first field that
+// differs decides the result.
+bool Time::operator<(const Time& rhs) const {
+  if (hours != rhs.hours)
+    return hours < rhs.hours;
+
+  if (minutes != rhs.minutes)
+    return minutes < rhs.minutes;
+
+  if (seconds != rhs.seconds)
+    return seconds < rhs.seconds;
+
+  return milliseconds < rhs.milliseconds;
+}
+
+// a > b is defined as b < a.
+bool Time::operator>(const Time& rhs) const {
+  return rhs < *this;
+}
+
+// a <= b is defined as !(a > b).
+bool Time::operator<=(const Time& rhs) const {
+  return !(*this > rhs);
+}
+
+// a >= b is defined as !(a < b).
+bool Time::operator>=(const Time& rhs) const {
+  return !(*this < rhs);
+}
+
+// Collapses the four fields into a single count of milliseconds.  The
+// accumulation is carried out in presentation_t from the first step on.
+presentation_t Time::presentation() const {
+  presentation_t total = hours;          // hours
+  total = 60 * total + minutes;          // minutes
+  total = 60 * total + seconds;          // seconds
+  total = 1000 * total + milliseconds;   // milliseconds
+  return total;
+}
+
+// Sets this time from |d|, a count of milliseconds, and returns *this.
+// A negative |d| is treated as an error and yields the zero time.
+//
+// The value is taken apart in presentation_t arithmetic and only the
+// individual fields are narrowed to int.  Narrowing the seconds total
+// first, and multiplying it back in int, overflows once |d| reaches
+// 2^31 ms (about 596 hours).  The hours field itself is still limited to
+// the range of int.
+Time& Time::presentation(presentation_t d) {
+  if (d < 0)  // error
+    d = 0;
+
+  presentation_t rest = d;
+
+  milliseconds = static_cast<int>(rest % 1000);
+  rest /= 1000;  // whole seconds
+
+  seconds = static_cast<int>(rest % 60);
+  rest /= 60;  // whole minutes
+
+  minutes = static_cast<int>(rest % 60);
+  rest /= 60;  // whole hours
+
+  hours = static_cast<int>(rest);
+
+  return *this;
+}
+
+// Advances this time by |rhs| milliseconds (which may be negative) by
+// converting to a millisecond count, adding, and converting back.
+Time& Time::operator+=(presentation_t rhs) {
+  return presentation(presentation() + rhs);
+}
+
+// Returns a copy of this time advanced by |d| milliseconds; *this is
+// left unchanged.
+Time Time::operator+(presentation_t d) const {
+  Time sum = *this;
+  sum.presentation(sum.presentation() + d);
+  return sum;
+}
+
+// Moves this time back by |d| milliseconds, expressed as adding -d.
+Time& Time::operator-=(presentation_t d) {
+  return *this += -d;
+}
+
+// Returns the signed distance from |t| to this time, in milliseconds.
+presentation_t Time::operator-(const Time& t) const {
+  return presentation() - t.presentation();
+}
+
+}  // namespace libwebvtt
--- a/webvttparser.h
+++ b/webvttparser.h
@ -0,0 +1,159 @@
+// Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the LICENSE file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS.  All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+
+#ifndef WEBVTTPARSER_H_  // NOLINT
+#define WEBVTTPARSER_H_
+
+#include <string>
+#include <list>
+
+namespace libwebvtt {
+
+// Abstract source of characters for the parser.  The constructor and the
+// destructor are protected, so the class can only be used through a
+// derived class that implements GetChar().
+class Reader {
+ public:
+  // Fetch a character from the stream and store it in |*c|. Return
+  // negative if error, positive if end-of-stream,
+  // and 0 if a character is available.
+  //
+  virtual int GetChar(char* c) = 0;
+
+ protected:
+  Reader();
+  virtual ~Reader();
+};
+
+// A point in time or a duration, measured in thousandths of a second,
+// e.g. a value of 1 equals 0.001 seconds,
+// and a value of 1000 equals 1 second.
+typedef long long presentation_t;  // NOLINT
+
+// A timestamp that is split into its components.  The struct declares no
+// constructor, so the fields have to be assigned before they are read.
+struct Time {
+  int hours;
+  int minutes;
+  int seconds;
+  int milliseconds;
+
+  // Comparisons work field by field, with hours as the most significant
+  // field and milliseconds as the least significant one.
+  bool operator==(const Time& rhs) const;
+  bool operator<(const Time& rhs) const;
+  bool operator>(const Time& rhs) const;
+  bool operator<=(const Time& rhs) const;
+  bool operator>=(const Time& rhs) const;
+
+  // Returns this time as a single count of milliseconds.
+  presentation_t presentation() const;
+
+  // Sets this time from a count of milliseconds and returns *this.
+  // A negative count yields the zero time.
+  Time& presentation(presentation_t);
+
+  // Add a number of milliseconds to this time.
+  Time& operator+=(presentation_t);
+  Time operator+(presentation_t) const;
+
+  // Subtract a number of milliseconds from this time.
+  Time& operator-=(presentation_t);
+
+  // Returns the difference between two times, in milliseconds.
+  presentation_t operator-(const Time&) const;
+};
+
+// One NAME:VALUE pair from the cue timings line.  The parser stores the
+// text before the colon in |name| and the text after it in |value|.
+struct Setting {
+  std::string name;
+  std::string value;
+};
+
+// A single WebVTT cue, as filled in by Parser::Parse().
+struct Cue {
+  // The cue identifier line; empty if the cue has no identifier.
+  std::string identifier;
+
+  // The times on either side of the arrow token of the timings line.
+  Time start_time;
+  Time stop_time;
+
+  // The settings that follow the stop time, in order of appearance.
+  typedef std::list<Setting> settings_t;
+  settings_t settings;
+
+  // The lines of cue text that follow the timings line, one entry per
+  // line, without line terminators.
+  typedef std::list<std::string> payload_t;
+  payload_t payload;
+};
+
+// Reads WebVTT cues, one at a time, from a character stream supplied by
+// a Reader.  Call Init() once, then call Parse() repeatedly.  The class
+// is not copyable.
+class Parser {
+ public:
+  // Creates a parser that pulls its characters from |r|.  Only the
+  // pointer is stored.
+  explicit Parser(Reader* r);
+
+  // The reader that was passed to the constructor.
+  Reader* const reader_;
+
+  // Pre-parse enough of the stream to determine whether
+  // this is really a WEBVTT file. Returns 0 on success,
+  // negative if error.
+  int Init();
+
+  // Parse the next WebVTT cue from the stream. Returns 0 if
+  // an entire cue was parsed, negative if error, and positive
+  // at end-of-stream.
+  int Parse(Cue* cue);
+
+ private:
+  // Provides one character's worth of look-back, to facilitate scanning.
+  // Holds the pushed-back character, or a negative value if there is none.
+  int unget_;
+
+  // Returns the next character in the stream, using the look-back character
+  // if present.  The return value follows the convention of
+  // Reader::GetChar().
+  int GetChar(char* c);
+
+  // Puts a character back into the stream.  Only a single character can be
+  // held at any one time.
+  void UngetChar(char c);
+
+  // Check for presence of a UTF-8 BOM in the stream.  Returns
+  // negative if error, 0 on success, and positive at end-of-stream.
+  int ParseBOM();
+
+  // Character |c| was present in the stream, indicating end-of-line;
+  // consume the full end-of-line indication from the stream.  Returns
+  // negative if error (including when |c| is neither LF nor CR) and 0 on
+  // success; reaching end-of-stream right after a CR counts as success.
+  //
+  int ParseLineTerminator(char c);
+
+  // Consume a line of text from the stream, stripping off
+  // the line terminator characters.  Returns negative if error,
+  // 0 on success, and positive at end-of-stream.
+  //
+  int ParseLine(std::string* line);
+
+  // Parse the distinguished "cue timings" line, which includes the start
+  // and stop times and settings.  Argument |line| contains the complete
+  // line of text (as returned by ParseLine()), which the function is free
+  // to modify as it sees fit, to facilitate scanning.  Argument |arrow_pos|
+  // is the offset of the arrow token ("-->"), which indicates that this is
+  // the timings line.  Returns negative if error, 0 on success.
+  //
+  static int ParseTimingsLine(std::string& line,  // NOLINT
+                              std::string::size_type arrow_pos,
+                              Time* start_time,
+                              Time* stop_time,
+                              Cue::settings_t* settings);
+
+  // Parse a single time specifier (from the timings line), starting
+  // at the given offset; lexical scanning stops when a NUL character
+  // is detected. The function modifies offset |off| by the number of
+  // characters consumed.  Returns negative if error, 0 on success.
+  //
+  static int ParseTime(const std::string& line,
+                       std::string::size_type& off,
+                       Time* time);
+
+  // Parse the cue settings from the timings line, starting at the
+  // given offset; scanning stops when a NUL character is detected.
+  // Returns negative if error, 0 on success.
+  //
+  static int ParseSettings(const std::string& line,
+                           std::string::size_type off,
+                           Cue::settings_t* settings);
+
+  // Parse a non-negative integer from the characters in |line| beginning
+  // at offset |off|.  The function increments |off| by the number
+  // of characters consumed.  Returns the value, or negative if error.
+  static int ParseNumber(const std::string& line,
+                         std::string::size_type& off);
+
+ private:
+  // Declared but not defined here, in order to disable copying.
+  Parser(const Parser&);
+  Parser& operator=(const Parser&);
+};
+
+}  // namespace libwebvtt
+
+#endif  // WEBVTTPARSER_H_  // NOLINT