diff --git a/webvttparser.cc b/webvttparser.cc new file mode 100644 index 0000000..1828b44 --- /dev/null +++ b/webvttparser.cc @@ -0,0 +1,671 @@ +// Copyright (c) 2012 The WebM project authors. All Rights Reserved. +// +// Use of this source code is governed by a BSD-style license +// that can be found in the LICENSE file in the root of the source +// tree. An additional intellectual property rights grant can be found +// in the file PATENTS. All contributing project authors may +// be found in the AUTHORS file in the root of the source tree. + +#include "webvttparser.h" // NOLINT +#include + +using std::string; + +namespace libwebvtt { + +enum { + kNUL = '\x00', + kSPACE = ' ', + kTAB = '\x09', + kLF = '\x0A', + kCR = '\x0D' +}; + +Reader::Reader() { +} + +Reader::~Reader() { +} + +Parser::Parser(Reader* r) + : reader_(r), unget_(-1) { +} + +int Parser::Init() { + int e = ParseBOM(); + + if (e < 0) // error + return e; + + if (e > 0) // EOF + return -1; + + // Parse "WEBVTT". We read from the stream one character at-a-time, in + // order to defend against non-WebVTT streams (e.g. binary files) that don't + // happen to comprise lines of text demarcated with line terminators. + + const char idstr[] = "WEBVTT"; + const char* p = idstr; + + while (*p) { + char c; + e = GetChar(&c); + + if (e < 0) // error + return e; + + if (e > 0) // EOF + return -1; + + if (c != *p) + return -1; + + ++p; + } + + string line; + + e = ParseLine(&line); + + if (e < 0) // error + return e; + + if (e > 0) // EOF + return 0; // weird but valid + + if (!line.empty()) { + // Parse optional characters that follow "WEBVTT" + + const char c = line[0]; + + if (c != kSPACE && c != kTAB) + return -1; + } + + // The WebVTT spec requires that the "WEBVTT" line + // be followed by an empty line (to separate it from + // first cue). + + e = ParseLine(&line); + + if (e < 0) // error + return e; + + if (e > 0) // EOF + return 0; // weird but we allow it + + if (!line.empty()) + return -1; + + return 0; // success +} + +int Parser::Parse(Cue* cue) { + if (cue == NULL) + return -1; + + // Parse first non-blank line + + string line; + int e; + + for (;;) { + e = ParseLine(&line); + + if (e) + return e; + + if (!line.empty()) + break; + } + + // A WebVTT cue comprises an optional cue identifier line followed + // by a (non-optional) timings line. You determine whether you have + // a timings line by scanning for the arrow token, the lexeme of which + // may not appear in the cue identifier line. + + string::size_type off = line.find("-->"); + + if (off != string::npos) { // timings line + cue->identifier.clear(); + } else { + cue->identifier.swap(line); + + e = ParseLine(&line); + + if (e) + return e; + + off = line.find("-->"); + + if (off == string::npos) // not a timings line + return -1; + } + + e = ParseTimingsLine(line, + off, + &cue->start_time, + &cue->stop_time, + &cue->settings); + + if (e) + return e; + + // The cue payload comprises all the non-empty + // lines that follow the timings line. + + Cue::payload_t& p = cue->payload; + p.clear(); + + for (;;) { + e = ParseLine(&line); + + if (e < 0) // error + return e; + + if (line.empty()) + break; + + p.push_back(line); + } + + if (p.empty()) + return -1; + + return 0; // success +} + +int Parser::GetChar(char* c) { + if (unget_ >= 0) { + *c = static_cast(unget_); + unget_ = -1; + return 0; + } + + return reader_->GetChar(c); +} + +void Parser::UngetChar(char c) { + unget_ = static_cast(c); +} + +int Parser::ParseBOM() { + // Explanation of UTF-8 BOM: + // http://en.wikipedia.org/wiki/Byte_order_mark + + static const char BOM[] = "\xEF\xBB\xBF"; // UTF-8 BOM + + for (int i = 0; i < 3; ++i) { + char c; + int e = GetChar(&c); + + if (e < 0) // error + return e; + + if (e > 0) // EOF + return 1; + + if (c != BOM[i]) { + if (i == 0) { // we don't have a BOM + UngetChar(c); + return 0; // success + } + + // We started a BOM, so we must finish the BOM. + return -1; // error + } + } + + return 0; // success +} + +int Parser::ParseLineTerminator(char c) { + // The WebVTT spec states that lines may be + // terminated in any of these three ways: + // LF + // CR + // CR LF + + if (c == kLF) + return 0; // success + + if (c != kCR) + return -1; // error + + // We detected a CR. We must interrogate the next character + // in the stream, to determine whether we have a LF. + + int e = GetChar(&c); + + if (e < 0) // error + return e; + + if (e > 0) // EOF + return 0; // success + + if (c == kLF) + return 0; // success + + // The next character in the stream is not a LF, so + // return it to the stream; this completes this line. + + UngetChar(c); + return 0; // success +} + +int Parser::ParseLine(string* line) { + line->clear(); + + for (;;) { + char c; + int e = GetChar(&c); + + if (e < 0) // error + return e; + + if (e > 0) // EOF + return (line->empty()) ? 1 : 0; + + if (c == kLF || c == kCR) { + e = ParseLineTerminator(c); + + if (e < 0) // error + return e; + + return 0; + } + + line->push_back(c); + } +} + +int Parser::ParseTimingsLine( + string& line, + string::size_type arrow_pos, + Time* start_time, + Time* stop_time, + Cue::settings_t* settings) { + // + // Place a NUL character at the start of the arrow token, in + // order to demarcate the start time from remainder of line. + + if (arrow_pos == string::npos || arrow_pos >= line.length()) + return -1; + + line[arrow_pos] = kNUL; + string::size_type idx = 0; + + int e = ParseTime(line, idx, start_time); + + if (e) + return e; + + // Detect any junk that follows the start time, + // but precedes the arrow symbol. + + while (char c = line[idx]) { + if (c != kSPACE && c != kTAB) + return -1; + ++idx; + } + + // Place a NUL character at the end of the line, + // so the scanner has a place to stop, and begin + // the scan just beyond the arrow token. + + line.push_back(kNUL); + idx = arrow_pos + 3; + + e = ParseTime(line, idx, stop_time); + + if (e) + return e; + + e = ParseSettings(line, idx, settings); + + if (e) + return e; + + return 0; // success +} + +int Parser::ParseTime( + const string& line, + string::size_type& idx, + Time* time) { + // + // WebVTT timestamp syntax comes in three flavors: + // SS[.sss] + // MM:SS[.sss] + // HH:MM:SS[.sss] + + if (idx == string::npos || idx >= line.length()) + return -1; + + // Consume any whitespace that precedes the timestamp. + + while (char c = line[idx]) { + if (c != kSPACE && c != kTAB) + break; + ++idx; + } + + Time& t = *time; + + // Parse a generic number value. We don't know which component + // of the time we have yet, until we do more parsing. + + int val = ParseNumber(line, idx); + + if (val < 0) // error + return val; + + // The presence of a colon character indicates that we have + // an [HH:]MM:SS style syntax. + + if (line[idx] == ':') { + // We have either HH:MM:SS or MM:SS + + // The value we just parsed is either the hours or minutes. + // It must be followed by another number value (that is + // either minutes or seconds). + + const int first_val = val; + + ++idx; // consume colon + + // Parse second value + + val = ParseNumber(line, idx); + + if (val < 0) + return val; + + if (val >= 60) // either MM or SS + return -1; + + if (line[idx] == ':') { + // We have HH:MM:SS + + t.hours = first_val; + t.minutes = val; // vetted above + + ++idx; // consume MM:SS colon + + // We have parsed the hours and minutes. + // We must now parse the seconds. + + val = ParseNumber(line, idx); + + if (val < 0) + return val; + + if (val >= 60) // SS part of HH:MM:SS + return -1; + + t.seconds = val; + } else { + // We have MM:SS + + // The implication here is that the hour value was omitted + // from the timestamp (because it was 0). + + if (first_val >= 60) // minutes + return -1; + + t.hours = 0; + t.minutes = first_val; + t.seconds = val; // vetted above + } + } else { + // We have SS (only) + + // The time is expressed as total number of seconds, + // so the seconds value has no upper bound. + + t.seconds = val; + + // Convert SS to HH:MM:SS + + t.minutes = t.seconds / 60; + t.seconds -= t.minutes * 60; + + t.hours = t.minutes / 60; + t.minutes -= t.hours * 60; + } + + // We have parsed the hours, minutes, and seconds. + // We must now parse the milliseconds. + + if (line[idx] != '.') { // no milliseconds + t.milliseconds = 0; + } else { + ++idx; // consume FULL STOP + + val = ParseNumber(line, idx); + + if (val < 0) + return val; + + if (val >= 1000) + return -1; + + if (val < 10) + t.milliseconds = val * 100; + else if (val < 100) + t.milliseconds = val * 10; + else + t.milliseconds = val; + } + + // We have parsed the time proper. We must check for any + // junk that immediately follows the time specifier. + + const char c = line[idx]; + + if (c != kNUL && c != kSPACE && c != kTAB) + return -1; + + return 0; // success +} + +int Parser::ParseSettings( + const string& line, + string::size_type idx, + Cue::settings_t* settings) { + // + // Scanning starts at position idx, and stops when + // we consume a NUL character. + + settings->clear(); + + if (idx == string::npos || idx >= line.length()) + return -1; + + for (;;) { + // Parse the whitespace that precedes the NAME:VALUE pair. + + for (;;) { + const char c = line[idx]; + + if (c == kNUL) + return 0; // success + + if (c != kSPACE && c != kTAB) + break; + + ++idx; // consume whitespace + } + + // There is something on the line for us to scan. + + settings->push_back(Setting()); + Setting& s = settings->back(); + + // Parse the NAME part of the settings pair. + + for (;;) { + const char c = line[idx]; + + if (c == ':') // we have reached end of NAME part + break; + + if (c == kNUL || c == kSPACE || c == kTAB) + return -1; + + s.name.push_back(c); + + ++idx; + } + + if (s.name.empty()) + return -1; + + ++idx; // consume colon + + // Parse the VALUE part of the settings pair. + + for (;;) { + const char c = line[idx]; + + if (c == kNUL || c == kSPACE || c == kTAB) + break; + + if (c == ':') // suspicious when part of VALUE + return -1; // TODO(matthewjheaney): verify this behavior + + s.value.push_back(c); + + ++idx; + } + + if (s.value.empty()) + return -1; + } +} + +int Parser::ParseNumber(const std::string& line, + std::string::size_type& idx) { + if (idx == string::npos || idx >= line.length()) + return -1; + + if (!isdigit(line[idx])) + return -1; + + long long val = 0; // NOLINT + + while (isdigit(line[idx])) { + val *= 10; + val += static_cast(line[idx] - '0'); + + if (val > INT_MAX) + return -1; + + ++idx; + } + + return static_cast(val); +} + +bool Time::operator==(const Time& rhs) const { + if (hours != rhs.hours) + return false; + + if (minutes != rhs.minutes) + return false; + + if (seconds != rhs.seconds) + return false; + + return (milliseconds == rhs.milliseconds); +} + +bool Time::operator<(const Time& rhs) const { + if (hours < rhs.hours) + return true; + + if (hours > rhs.hours) + return false; + + if (minutes < rhs.minutes) + return true; + + if (minutes > rhs.minutes) + return false; + + if (seconds < rhs.seconds) + return true; + + if (seconds > rhs.seconds) + return false; + + return (milliseconds < rhs.milliseconds); +} + +bool Time::operator>(const Time& rhs) const { + return rhs.operator<(*this); +} + +bool Time::operator<=(const Time& rhs) const { + return !this->operator>(rhs); +} + +bool Time::operator>=(const Time& rhs) const { + return !this->operator<(rhs); +} + +presentation_t Time::presentation() const { + const presentation_t h = 1000LL * 3600LL * presentation_t(hours); + const presentation_t m = 1000LL * 60LL * presentation_t(minutes); + const presentation_t s = 1000LL * presentation_t(seconds); + const presentation_t result = h + m + s + milliseconds; + return result; +} + +Time& Time::presentation(presentation_t d) { + if (d < 0) { // error + hours = 0; + minutes = 0; + seconds = 0; + milliseconds = 0; + + return *this; + } + + seconds = d / 1000; + milliseconds = d - 1000 * seconds; + + minutes = seconds / 60; + seconds -= 60 * minutes; + + hours = minutes / 60; + minutes -= 60 * hours; + + return *this; +} + +Time& Time::operator+=(presentation_t rhs) { + const presentation_t d = this->presentation(); + const presentation_t dd = d + rhs; + this->presentation(dd); + return *this; +} + +Time Time::operator+(presentation_t d) const { + Time t(*this); + t += d; + return t; +} + +Time& Time::operator-=(presentation_t d) { + return this->operator+=(-d); +} + +presentation_t Time::operator-(const Time& t) const { + const presentation_t rhs = t.presentation(); + const presentation_t lhs = this->presentation(); + const presentation_t result = lhs - rhs; + return result; +} + +} // namespace libwebvtt diff --git a/webvttparser.h b/webvttparser.h new file mode 100644 index 0000000..ed47449 --- /dev/null +++ b/webvttparser.h @@ -0,0 +1,159 @@ +// Copyright (c) 2012 The WebM project authors. All Rights Reserved. +// +// Use of this source code is governed by a BSD-style license +// that can be found in the LICENSE file in the root of the source +// tree. An additional intellectual property rights grant can be found +// in the file PATENTS. All contributing project authors may +// be found in the AUTHORS file in the root of the source tree. + +#ifndef WEBVTTPARSER_H_ // NOLINT +#define WEBVTTPARSER_H_ + +#include +#include + +namespace libwebvtt { + +class Reader { + public: + // Fetch a character from the stream. Return + // negative if error, positive if end-of-stream, + // and 0 if a character is available. + // + virtual int GetChar(char* c) = 0; + + protected: + Reader(); + virtual ~Reader(); +}; + +// As measured in thousandths of a second, +// e.g. a duration of 1 equals 0.001 seconds, +// and a duration of 1000 equals 1 second. +typedef long long presentation_t; // NOLINT + +struct Time { + int hours; + int minutes; + int seconds; + int milliseconds; + + bool operator==(const Time& rhs) const; + bool operator<(const Time& rhs) const; + bool operator>(const Time& rhs) const; + bool operator<=(const Time& rhs) const; + bool operator>=(const Time& rhs) const; + + presentation_t presentation() const; + Time& presentation(presentation_t); + + Time& operator+=(presentation_t); + Time operator+(presentation_t) const; + + Time& operator-=(presentation_t); + presentation_t operator-(const Time&) const; +}; + +struct Setting { + std::string name; + std::string value; +}; + +struct Cue { + std::string identifier; + + Time start_time; + Time stop_time; + + typedef std::list settings_t; + settings_t settings; + + typedef std::list payload_t; + payload_t payload; +}; + +class Parser { + public: + explicit Parser(Reader* r); + Reader* const reader_; + + // Pre-parse enough of the stream to determine whether + // this is really a WEBVTT file. Returns 0 on success, + // negative if error. + int Init(); + + // Parse the next WebVTT cue from the stream. Returns 0 if + // an entire cue was parsed, negative if error, and positive + // at end-of-stream. + int Parse(Cue* cue); + + private: + // Provides one character's worth of look-back, to facilitate scanning. + int unget_; + + // Returns the next character in the stream, using the look-back character + // if present. + int GetChar(char* c); + + // Puts a character back into the stream. + void UngetChar(char c); + + // Check for presence of a UTF-8 BOM in the stream. Returns + // negative if error, 0 on success, and positive at end-of-stream. + int ParseBOM(); + + // Character |c| was present in the stream, indicating end-of-line; + // consume the full end-of-line indication from the stream. Returns + // negative if error, 0 on success, and positive at end-of-stream. + // + int ParseLineTerminator(char c); + + // Consume a line of text from the stream, stripping off + // the line terminator characters. Returns negative if error, + // 0 on success, and positive at end-of-stream. + // + int ParseLine(std::string* line); + + // Parse the distinguished "cue timings" line, which includes the start + // and stop times and settings. Argument |line| contains the complete + // line of text (as returned by ParseLine()), which the function is free + // to modify as it sees fit, to facilitate scanning. Argument |arrow_pos| + // is the offset of the arrow token ("-->"), which indicates that this is + // the timings line. Returns negative if error, 0 on success. + // + static int ParseTimingsLine(std::string& line, // NOLINT + std::string::size_type arrow_pos, + Time* start_time, + Time* stop_time, + Cue::settings_t* settings); + + // Parse a single time specifier (from the timings line), starting + // at the given offset; lexical scanning stops when a NUL character + // is detected. The function modifies offset |off| by the number of + // characters consumed. Returns negative if error, 0 on success. + // + static int ParseTime(const std::string& line, + std::string::size_type& off, + Time* time); + + // Parse the cue settings from the timings line, starting at the + // given offset. Returns negative if error, 0 on success. + // + static int ParseSettings(const std::string& line, + std::string::size_type off, + Cue::settings_t* settings); + + // Parse a non-negative integer from the characters in |line| beginning + // at offset |off|. The function increments |off| by the number + // of characters consumed. Returns the value, or negative if error. + static int ParseNumber(const std::string& line, + std::string::size_type& off); + + private: + Parser(const Parser&); + Parser& operator=(const Parser&); +}; + +} // namespace libwebvtt + +#endif // WEBVTTPARSER_H_ // NOLINT