// Copyright (c) 2012 The WebM project authors. All Rights Reserved. // // Use of this source code is governed by a BSD-style license // that can be found in the LICENSE file in the root of the source // tree. An additional intellectual property rights grant can be found // in the file PATENTS. All contributing project authors may // be found in the AUTHORS file in the root of the source tree. #include "./webvttparser.h" // NOLINT #include using std::string; namespace libwebvtt { enum { kNUL = '\x00', kSPACE = ' ', kTAB = '\x09', kLF = '\x0A', kCR = '\x0D' }; Reader::~Reader() { } LineReader::~LineReader() { } int LineReader::GetLine(string* line_ptr) { if (line_ptr == NULL) return -1; string& ln = *line_ptr; ln.clear(); // Consume characters from the stream, until we // reach end-of-line (or end-of-stream). // The WebVTT spec states that lines may be // terminated in any of these three ways: // LF // CR // CR LF // We interrogate each character as we read it from the stream. // If we detect an end-of-line character, we consume the full // end-of-line indication, and we're done; otherwise, accumulate // the character and repeat. for (;;) { char c; const int e = GetChar(&c); if (e < 0) // error return e; if (e > 0) // EOF return (ln.empty()) ? 1 : 0; // We have a character, so we must first determine // whether we have reached end-of-line. if (c == kLF) return 0; // handle the easy end-of-line case immediately if (c == kCR) break; // handle the hard end-of-line case outside of loop if (c == '\xFE' || c == '\xFF') // not UTF-8 return -1; // To defend against pathological or malicious streams, we // cap the line length at some arbitrarily-large value: enum { kMaxLineLength = 10000 }; // arbitrary if (ln.length() >= kMaxLineLength) return -1; // We don't have an end-of-line character, so accumulate // the character in our line buffer. ln.push_back(c); } // We detected a CR. We must interrogate the next character // in the stream, to determine whether we have a LF (which // would make it part of this same line). char c; const int e = GetChar(&c); if (e < 0) // error return e; if (e > 0) // EOF return 0; // If next character in the stream is not a LF, return it // to the stream (because it's part of the next line). if (c != kLF) UngetChar(c); return 0; } Parser::Parser(Reader* r) : reader_(r), unget_(-1) { } Parser::~Parser() { } int Parser::Init() { int e = ParseBOM(); if (e < 0) // error return e; if (e > 0) // EOF return -1; // Parse "WEBVTT". We read from the stream one character at-a-time, in // order to defend against non-WebVTT streams (e.g. binary files) that don't // happen to comprise lines of text demarcated with line terminators. const char kId[] = "WEBVTT"; for (const char* p = kId; *p; ++p) { char c; e = GetChar(&c); if (e < 0) // error return e; if (e > 0) // EOF return -1; if (c != *p) return -1; } string line; e = GetLine(&line); if (e < 0) // error return e; if (e > 0) // EOF return 0; // weird but valid if (!line.empty()) { // Parse optional characters that follow "WEBVTT" const char c = line[0]; if (c != kSPACE && c != kTAB) return -1; } // The WebVTT spec requires that the "WEBVTT" line // be followed by an empty line (to separate it from // first cue). e = GetLine(&line); if (e < 0) // error return e; if (e > 0) // EOF return 0; // weird but we allow it if (!line.empty()) return -1; return 0; // success } int Parser::Parse(Cue* cue) { if (cue == NULL) return -1; // Parse first non-blank line string line; int e; for (;;) { e = GetLine(&line); if (e) // EOF is OK here return e; if (!line.empty()) break; } // A WebVTT cue comprises an optional cue identifier line followed // by a (non-optional) timings line. You determine whether you have // a timings line by scanning for the arrow token, the lexeme of which // may not appear in the cue identifier line. const char kArrow[] = "-->"; string::size_type arrow_pos = line.find(kArrow); if (arrow_pos != string::npos) { // We found a timings line, which implies that we don't have a cue // identifier. cue->identifier.clear(); } else { // We did not find a timings line, so we assume that we have a cue // identifier line, and then try again to find the cue timings on // the next line. cue->identifier.swap(line); e = GetLine(&line); if (e < 0) // error return e; if (e > 0) // EOF return -1; arrow_pos = line.find(kArrow); if (arrow_pos == string::npos) // not a timings line return -1; } e = ParseTimingsLine(&line, arrow_pos, &cue->start_time, &cue->stop_time, &cue->settings); if (e) // error return e; // The cue payload comprises all the non-empty // lines that follow the timings line. Cue::payload_t& p = cue->payload; p.clear(); for (;;) { e = GetLine(&line); if (e < 0) // error return e; if (line.empty()) break; p.push_back(line); } if (p.empty()) return -1; return 0; // success } int Parser::GetChar(char* c) { if (unget_ >= 0) { *c = static_cast(unget_); unget_ = -1; return 0; } return reader_->GetChar(c); } void Parser::UngetChar(char c) { unget_ = static_cast(c); } int Parser::ParseBOM() { // Explanation of UTF-8 BOM: // http://en.wikipedia.org/wiki/Byte_order_mark static const char BOM[] = "\xEF\xBB\xBF"; // UTF-8 BOM for (int i = 0; i < 3; ++i) { char c; int e = GetChar(&c); if (e < 0) // error return e; if (e > 0) // EOF return 1; if (c != BOM[i]) { if (i == 0) { // we don't have a BOM UngetChar(c); return 0; // success } // We started a BOM, so we must finish the BOM. return -1; // error } } return 0; // success } int Parser::ParseTimingsLine( string* line_ptr, string::size_type arrow_pos, Time* start_time, Time* stop_time, Cue::settings_t* settings) { if (line_ptr == NULL) return -1; string& line = *line_ptr; if (arrow_pos == string::npos || arrow_pos >= line.length()) return -1; // Place a NUL character at the start of the arrow token, in // order to demarcate the start time from remainder of line. line[arrow_pos] = kNUL; string::size_type idx = 0; int e = ParseTime(line, &idx, start_time); if (e) // error return e; // Detect any junk that follows the start time, // but precedes the arrow symbol. while (char c = line[idx]) { if (c != kSPACE && c != kTAB) return -1; ++idx; } // Place a NUL character at the end of the line, // so the scanner has a place to stop, and begin // the scan just beyond the arrow token. line.push_back(kNUL); idx = arrow_pos + 3; e = ParseTime(line, &idx, stop_time); if (e) // error return e; e = ParseSettings(line, idx, settings); if (e) // error return e; return 0; // success } int Parser::ParseTime( const string& line, string::size_type* idx_ptr, Time* time) { if (idx_ptr == NULL) return -1; string::size_type& idx = *idx_ptr; if (idx == string::npos || idx >= line.length()) return -1; if (time == NULL) return -1; // Consume any whitespace that precedes the timestamp. while (char c = line[idx]) { if (c != kSPACE && c != kTAB) break; ++idx; } // WebVTT timestamp syntax comes in three flavors: // SS[.sss] // MM:SS[.sss] // HH:MM:SS[.sss] // Parse a generic number value. We don't know which component // of the time we have yet, until we do more parsing. int val = ParseNumber(line, &idx); if (val < 0) // error return val; Time& t = *time; // The presence of a colon character indicates that we have // an [HH:]MM:SS style syntax. if (line[idx] == ':') { // We have either HH:MM:SS or MM:SS // The value we just parsed is either the hours or minutes. // It must be followed by another number value (that is // either minutes or seconds). const int first_val = val; ++idx; // consume colon // Parse second value val = ParseNumber(line, &idx); if (val < 0) return val; if (val >= 60) // either MM or SS return -1; if (line[idx] == ':') { // We have HH:MM:SS t.hours = first_val; t.minutes = val; // vetted above ++idx; // consume MM:SS colon // We have parsed the hours and minutes. // We must now parse the seconds. val = ParseNumber(line, &idx); if (val < 0) return val; if (val >= 60) // SS part of HH:MM:SS return -1; t.seconds = val; } else { // We have MM:SS // The implication here is that the hour value was omitted // from the timestamp (because it was 0). if (first_val >= 60) // minutes return -1; t.hours = 0; t.minutes = first_val; t.seconds = val; // vetted above } } else { // We have SS (only) // The time is expressed as total number of seconds, // so the seconds value has no upper bound. t.seconds = val; // Convert SS to HH:MM:SS t.minutes = t.seconds / 60; t.seconds -= t.minutes * 60; t.hours = t.minutes / 60; t.minutes -= t.hours * 60; } // We have parsed the hours, minutes, and seconds. // We must now parse the milliseconds. char c = line[idx]; // TODO(matthewjheaney): one option here is to slightly relax the // syntax rules for WebVTT timestamps, to permit the comma character // to also be used as the seconds/milliseconds separator. This // would handle streams that use localization conventions for // countries in Western Europe. For now we obey the rules specified // in the WebVTT spec (allow "full stop" only). const bool have_milliseconds = (c == '.'); if (!have_milliseconds) { t.milliseconds = 0; } else { ++idx; // consume FULL STOP val = ParseNumber(line, &idx); if (val < 0) return val; if (val >= 1000) return -1; if (val < 10) t.milliseconds = val * 100; else if (val < 100) t.milliseconds = val * 10; else t.milliseconds = val; } // We have parsed the time proper. We must check for any // junk that immediately follows the time specifier. c = line[idx]; if (c != kNUL && c != kSPACE && c != kTAB) return -1; return 0; // success } int Parser::ParseSettings( const string& line, string::size_type idx, Cue::settings_t* settings) { settings->clear(); if (idx == string::npos || idx >= line.length()) return -1; for (;;) { // We must parse a line comprising a sequence of 0 or more // NAME:VALUE pairs, separated by whitespace. The line iself is // terminated with a NUL char (indicating end-of-line). for (;;) { const char c = line[idx]; if (c == kNUL) // end-of-line return 0; // success if (c != kSPACE && c != kTAB) break; ++idx; // consume whitespace } // We have consumed the whitespace, and have not yet reached // end-of-line, so there is something on the line for us to parse. settings->push_back(Setting()); Setting& s = settings->back(); // Parse the NAME part of the settings pair. for (;;) { const char c = line[idx]; if (c == ':') // we have reached end of NAME part break; if (c == kNUL || c == kSPACE || c == kTAB) return -1; s.name.push_back(c); ++idx; } if (s.name.empty()) return -1; ++idx; // consume colon // Parse the VALUE part of the settings pair. for (;;) { const char c = line[idx]; if (c == kNUL || c == kSPACE || c == kTAB) break; if (c == ':') // suspicious when part of VALUE return -1; // TODO(matthewjheaney): verify this behavior s.value.push_back(c); ++idx; } if (s.value.empty()) return -1; } } int Parser::ParseNumber( const string& line, string::size_type* idx_ptr) { if (idx_ptr == NULL) return -1; string::size_type& idx = *idx_ptr; if (idx == string::npos || idx >= line.length()) return -1; if (!isdigit(line[idx])) return -1; int result = 0; while (isdigit(line[idx])) { const char c = line[idx]; const int i = c - '0'; if (result > INT_MAX / 10) return -1; result *= 10; if (result > INT_MAX - i) return -1; result += i; ++idx; } return result; } bool Time::operator==(const Time& rhs) const { if (hours != rhs.hours) return false; if (minutes != rhs.minutes) return false; if (seconds != rhs.seconds) return false; return (milliseconds == rhs.milliseconds); } bool Time::operator<(const Time& rhs) const { if (hours < rhs.hours) return true; if (hours > rhs.hours) return false; if (minutes < rhs.minutes) return true; if (minutes > rhs.minutes) return false; if (seconds < rhs.seconds) return true; if (seconds > rhs.seconds) return false; return (milliseconds < rhs.milliseconds); } bool Time::operator>(const Time& rhs) const { return rhs.operator<(*this); } bool Time::operator<=(const Time& rhs) const { return !this->operator>(rhs); } bool Time::operator>=(const Time& rhs) const { return !this->operator<(rhs); } presentation_t Time::presentation() const { const presentation_t h = 1000LL * 3600LL * presentation_t(hours); const presentation_t m = 1000LL * 60LL * presentation_t(minutes); const presentation_t s = 1000LL * presentation_t(seconds); const presentation_t result = h + m + s + milliseconds; return result; } Time& Time::presentation(presentation_t d) { if (d < 0) { // error hours = 0; minutes = 0; seconds = 0; milliseconds = 0; return *this; } seconds = d / 1000; milliseconds = d - 1000 * seconds; minutes = seconds / 60; seconds -= 60 * minutes; hours = minutes / 60; minutes -= 60 * hours; return *this; } Time& Time::operator+=(presentation_t rhs) { const presentation_t d = this->presentation(); const presentation_t dd = d + rhs; this->presentation(dd); return *this; } Time Time::operator+(presentation_t d) const { Time t(*this); t += d; return t; } Time& Time::operator-=(presentation_t d) { return this->operator+=(-d); } presentation_t Time::operator-(const Time& t) const { const presentation_t rhs = t.presentation(); const presentation_t lhs = this->presentation(); const presentation_t result = lhs - rhs; return result; } } // namespace libwebvtt