webvttparser: added LineReader class

Previously the Parser class had an internal function that split
the character stream into separate lines.  That function has been
moved into its own class, LineReader, so that clients can use
line-oriented reading too.

Change-Id: Ic5a1b0b73d7a253cf21cb6b4804b4941fd69c8ab
This commit is contained in:
Matthew Heaney
2012-09-28 15:32:40 -07:00
parent 4f494f6dd4
commit 49078292b4
2 changed files with 224 additions and 214 deletions

View File

@@ -21,212 +21,13 @@ enum {
kCR = '\x0D'
};
// Default constructor; there is nothing to initialize.
Reader::Reader() {}
// Destructor; the body is intentionally empty.
Reader::~Reader() {}
Parser::Parser(Reader* r) : reader_(r), unget_(-1) {
// Destructor; the body is intentionally empty.
LineReader::~LineReader() {}
int Parser::Init() {
int e = ParseBOM();
if (e < 0) // error
return e;
if (e > 0) // EOF
return -1;
// Parse "WEBVTT". We read from the stream one character at-a-time, in
// order to defend against non-WebVTT streams (e.g. binary files) that don't
// happen to comprise lines of text demarcated with line terminators.
const char kId[] = "WEBVTT";
for (const char* p = kId; *p; ++p) {
char c;
e = GetChar(&c);
if (e < 0) // error
return e;
if (e > 0) // EOF
return -1;
if (c != *p)
return -1;
}
string line;
e = ParseLine(&line);
if (e < 0) // error
return e;
if (e > 0) // EOF
return 0; // weird but valid
if (!line.empty()) {
// Parse optional characters that follow "WEBVTT"
const char c = line[0];
if (c != kSPACE && c != kTAB)
return -1;
}
// The WebVTT spec requires that the "WEBVTT" line
// be followed by an empty line (to separate it from
// first cue).
e = ParseLine(&line);
if (e < 0) // error
return e;
if (e > 0) // EOF
return 0; // weird but we allow it
if (!line.empty())
return -1;
return 0; // success
}
// Reads the next cue from the stream into |cue|.
//
// Leading blank lines are skipped.  The first non-blank line is either the
// timings line (it contains "-->") or a cue identifier, in which case the
// timings line must follow immediately.  The non-empty lines after the
// timings line become the payload.
//
// Returns 0 on success, a positive value if the stream ended before a cue
// was found, and a negative value on error (including a NULL |cue|, a
// missing timings line, or an empty payload).
int Parser::Parse(Cue* cue) {
  if (cue == NULL)
    return -1;

  string text;
  int status;

  // Advance to the first non-blank line.  Any non-zero result is handed
  // straight back to the caller; EOF is OK here.
  do {
    status = ParseLine(&text);

    if (status)
      return status;
  } while (text.empty());

  // The arrow token may not appear in a cue identifier line, so its
  // presence is what distinguishes a timings line.
  const char kArrow[] = "-->";
  string::size_type arrow_at = text.find(kArrow);

  if (arrow_at == string::npos) {
    // No arrow: treat this line as the cue identifier and require the
    // timings on the very next line.
    cue->identifier.swap(text);

    status = ParseLine(&text);

    if (status < 0)  // error
      return status;

    if (status > 0)  // EOF
      return -1;

    arrow_at = text.find(kArrow);

    if (arrow_at == string::npos)  // not a timings line
      return -1;
  } else {
    // The first line is already the timings line, so there is no
    // identifier.
    cue->identifier.clear();
  }

  status = ParseTimingsLine(&text,
                            arrow_at,
                            &cue->start_time,
                            &cue->stop_time,
                            &cue->settings);

  if (status)  // error
    return status;

  // The payload is every non-empty line following the timings line; the
  // first empty line ends it.
  Cue::payload_t& payload = cue->payload;
  payload.clear();

  for (status = ParseLine(&text);
       status >= 0 && !text.empty();
       status = ParseLine(&text)) {
    payload.push_back(text);
  }

  if (status < 0)  // error
    return status;

  return payload.empty() ? -1 : 0;
}
// Fetches the next character into |*c|.  A character previously handed to
// UngetChar() is returned first (with result 0); otherwise the request is
// forwarded to the underlying reader and its result is returned.
int Parser::GetChar(char* c) {
  if (unget_ < 0)  // nothing pushed back
    return reader_->GetChar(c);

  *c = static_cast<char>(unget_);
  unget_ = -1;  // the look-back slot is now empty

  return 0;
}
// Stores |c| in the look-back slot so that the next GetChar() returns it.
// The value is converted through unsigned char so that the stored value is
// never negative (a negative slot means "empty").
void Parser::UngetChar(char c) {
  const unsigned char byte = static_cast<unsigned char>(c);
  unget_ = byte;
}
// Consumes an optional UTF-8 byte order mark from the front of the stream.
// Explanation of the UTF-8 BOM:
//   http://en.wikipedia.org/wiki/Byte_order_mark
//
// Returns 0 if a complete BOM was consumed or if no BOM is present (the
// character that was read is pushed back), 1 at end-of-stream, and a
// negative value on a read error or a partially-present BOM.
int Parser::ParseBOM() {
  static const char kBom[] = "\xEF\xBB\xBF";  // UTF-8 BOM

  int idx = 0;

  while (idx < 3) {
    char ch;
    const int status = GetChar(&ch);

    if (status < 0)  // error
      return status;

    if (status > 0)  // EOF
      return 1;

    if (ch == kBom[idx]) {
      ++idx;
      continue;
    }

    // We started a BOM, so we must finish the BOM.
    if (idx > 0)
      return -1;  // error

    // The very first character differs: we don't have a BOM.
    UngetChar(ch);
    return 0;  // success
  }

  return 0;  // success
}
int Parser::ParseLine(string* line_ptr) {
int LineReader::GetLine(string* line_ptr) {
if (line_ptr == NULL)
return -1;
@@ -299,6 +100,208 @@ int Parser::ParseLine(string* line_ptr) {
return 0;
}
// Binds the parser to the character source |r| and marks the look-back
// slot as empty (-1).
Parser::Parser(Reader* r)
    : reader_(r),
      unget_(-1) {}
// Destructor; the body is intentionally empty.
Parser::~Parser() {}
int Parser::Init() {
int e = ParseBOM();
if (e < 0) // error
return e;
if (e > 0) // EOF
return -1;
// Parse "WEBVTT". We read from the stream one character at-a-time, in
// order to defend against non-WebVTT streams (e.g. binary files) that don't
// happen to comprise lines of text demarcated with line terminators.
const char kId[] = "WEBVTT";
for (const char* p = kId; *p; ++p) {
char c;
e = GetChar(&c);
if (e < 0) // error
return e;
if (e > 0) // EOF
return -1;
if (c != *p)
return -1;
}
string line;
e = GetLine(&line);
if (e < 0) // error
return e;
if (e > 0) // EOF
return 0; // weird but valid
if (!line.empty()) {
// Parse optional characters that follow "WEBVTT"
const char c = line[0];
if (c != kSPACE && c != kTAB)
return -1;
}
// The WebVTT spec requires that the "WEBVTT" line
// be followed by an empty line (to separate it from
// first cue).
e = GetLine(&line);
if (e < 0) // error
return e;
if (e > 0) // EOF
return 0; // weird but we allow it
if (!line.empty())
return -1;
return 0; // success
}
// Reads the next cue from the stream into |cue|.
//
// Leading blank lines are skipped.  The first non-blank line is either the
// timings line (it contains "-->") or a cue identifier, in which case the
// timings line must follow immediately.  The non-empty lines after the
// timings line become the payload.
//
// Returns 0 on success, a positive value if the stream ended before a cue
// was found, and a negative value on error (including a NULL |cue|, a
// missing timings line, or an empty payload).
int Parser::Parse(Cue* cue) {
  if (cue == NULL)
    return -1;

  string text;
  int status;

  // Advance to the first non-blank line.  Any non-zero result is handed
  // straight back to the caller; EOF is OK here.
  do {
    status = GetLine(&text);

    if (status)
      return status;
  } while (text.empty());

  // The arrow token may not appear in a cue identifier line, so its
  // presence is what distinguishes a timings line.
  const char kArrow[] = "-->";
  string::size_type arrow_at = text.find(kArrow);

  if (arrow_at == string::npos) {
    // No arrow: treat this line as the cue identifier and require the
    // timings on the very next line.
    cue->identifier.swap(text);

    status = GetLine(&text);

    if (status < 0)  // error
      return status;

    if (status > 0)  // EOF
      return -1;

    arrow_at = text.find(kArrow);

    if (arrow_at == string::npos)  // not a timings line
      return -1;
  } else {
    // The first line is already the timings line, so there is no
    // identifier.
    cue->identifier.clear();
  }

  status = ParseTimingsLine(&text,
                            arrow_at,
                            &cue->start_time,
                            &cue->stop_time,
                            &cue->settings);

  if (status)  // error
    return status;

  // The payload is every non-empty line following the timings line; the
  // first empty line ends it.
  Cue::payload_t& payload = cue->payload;
  payload.clear();

  for (status = GetLine(&text);
       status >= 0 && !text.empty();
       status = GetLine(&text)) {
    payload.push_back(text);
  }

  if (status < 0)  // error
    return status;

  return payload.empty() ? -1 : 0;
}
// Fetches the next character into |*c|.  A character previously handed to
// UngetChar() is returned first (with result 0); otherwise the request is
// forwarded to the underlying reader and its result is returned.
int Parser::GetChar(char* c) {
  if (unget_ < 0)  // nothing pushed back
    return reader_->GetChar(c);

  *c = static_cast<char>(unget_);
  unget_ = -1;  // the look-back slot is now empty

  return 0;
}
// Stores |c| in the look-back slot so that the next GetChar() returns it.
// The value is converted through unsigned char so that the stored value is
// never negative (a negative slot means "empty").
void Parser::UngetChar(char c) {
  const unsigned char byte = static_cast<unsigned char>(c);
  unget_ = byte;
}
// Consumes an optional UTF-8 byte order mark from the front of the stream.
// Explanation of the UTF-8 BOM:
//   http://en.wikipedia.org/wiki/Byte_order_mark
//
// Returns 0 if a complete BOM was consumed or if no BOM is present (the
// character that was read is pushed back), 1 at end-of-stream, and a
// negative value on a read error or a partially-present BOM.
int Parser::ParseBOM() {
  static const char kBom[] = "\xEF\xBB\xBF";  // UTF-8 BOM

  int idx = 0;

  while (idx < 3) {
    char ch;
    const int status = GetChar(&ch);

    if (status < 0)  // error
      return status;

    if (status > 0)  // EOF
      return 1;

    if (ch == kBom[idx]) {
      ++idx;
      continue;
    }

    // We started a BOM, so we must finish the BOM.
    if (idx > 0)
      return -1;  // error

    // The very first character differs: we don't have a BOM.
    UngetChar(ch);
    return 0;  // success
  }

  return 0;  // success
}
int Parser::ParseTimingsLine(
string* line_ptr,
string::size_type arrow_pos,

View File

@@ -19,14 +19,26 @@ class Reader {
// Fetch a character from the stream. Return
// negative if error, positive if end-of-stream,
// and 0 if a character is available.
//
virtual int GetChar(char* c) = 0;
protected:
Reader();
virtual ~Reader();
};
// LineReader layers line-oriented reading on top of the character-stream
// interface it inherits (with protected access) from Reader.  It declares
// GetLine() and leaves UngetChar() pure virtual, so a derived class must
// supply the push-back operation.
class LineReader : protected Reader {
public:
// Consume a line of text from the stream, stripping off
// the line terminator characters. Returns negative if error,
// 0 on success, and positive at end-of-stream.
// The result is written to |*line|.
int GetLine(std::string* line);
protected:
// The destructor is protected, so code outside the class hierarchy
// cannot destroy an object through a LineReader pointer.
virtual ~LineReader();
// Puts a character back into the stream.
virtual void UngetChar(char c) = 0;
};
// As measured in thousandths of a second,
// e.g. a duration of 1 equals 0.001 seconds,
// and a duration of 1000 equals 1 second.
@@ -72,9 +84,10 @@ struct Cue {
payload_t payload;
};
class Parser {
class Parser : private LineReader {
public:
explicit Parser(Reader* r);
virtual ~Parser();
// Pre-parse enough of the stream to determine whether
// this is really a WEBVTT file. Returns 0 on success,
@@ -88,22 +101,16 @@ class Parser {
private:
// Returns the next character in the stream, using the look-back character
// if present.
int GetChar(char* c);
// if present (as per Reader::GetChar).
virtual int GetChar(char* c);
// Puts a character back into the stream.
void UngetChar(char c);
// Puts a character back into the stream (as per LineReader::UngetChar).
virtual void UngetChar(char c);
// Check for presence of a UTF-8 BOM in the stream. Returns
// negative if error, 0 on success, and positive at end-of-stream.
int ParseBOM();
// Consume a line of text from the stream, stripping off
// the line terminator characters. Returns negative if error,
// 0 on success, and positive at end-of-stream.
//
int ParseLine(std::string* line);
// Parse the distinguished "cue timings" line, which includes the start
// and stop times and settings. Argument |line| contains the complete
// line of text (as returned by ParseLine()), which the function is free