sample_muxer: added WebVTT support

Change-Id: If72d31ca4828adf39e4637003979a314e5dda98e
2012-08-14 16:40:33 -07:00
parent 8f0c3333d1
commit 7ef225de9f
7 changed files with 496 additions and 45 deletions
--- a/5
+++ b/5
@@ -8,6 +8,7 @@ OBJSSO    := $(WEBMOBJS:.o=_so.o)
 OBJECTS1  := sample.o
 OBJECTS2  := sample_muxer.o
 OBJECTS3  := dumpvtt.o vttreader.o webvttparser.o
+OBJECTS4  := vttreader.o webvttparser.o sample_muxer_metadata.o
 INCLUDES  := -I.
 EXES      := samplemuxer sample dumpvtt

@@ -16,7 +17,7 @@ all: $(EXES)
 sample: sample.o $(LIBWEBMA)
 	$(CXX) $^ -o $@

-samplemuxer: sample_muxer.o $(LIBWEBMA)
+samplemuxer: sample_muxer.o $(LIBWEBMA) $(OBJECTS4)
 	$(CXX) $^ -o $@

 dumpvtt: $(OBJECTS3)
@@ -40,4 +41,4 @@ libwebm.so: $(OBJSSO)
 	$(CXX) -c $(CXXFLAGS) -fPIC $(INCLUDES) $< -o $@

 clean:
-	$(RM) -f $(OBJECTS1) $(OBJECTS2) $(OBJECTS3) $(OBJSA) $(OBJSSO) $(LIBWEBMA) $(LIBWEBMSO) $(EXES) Makefile.bak
+	$(RM) -f $(OBJECTS1) $(OBJECTS2) $(OBJECTS3) $(OBJECTS4) $(OBJSA) $(OBJSSO) $(LIBWEBMA) $(LIBWEBMSO) $(EXES) Makefile.bak
--- a/mkvmuxer.cpp
+++ b/mkvmuxer.cpp
@@ -1965,27 +1965,15 @@ int Segment::TestFrame(uint64 track_number,
  if (frame_timecode < last_cluster_timecode)  // should never happen
    return -1;  // error

-  // Handle the case when the frame we are testing has a timestamp
-  // equal to the cluster's timestamp.  This can happen if some
-  // non-video keyframe (that is, a WebVTT cue or audio block) first
-  // creates the initial cluster (at t=0), and then we test a video
-  // keyframe.  We don't want to create a new cluster just yet (see
-  // the predicate below, which specifies the creation of a new
-  // cluster when a video keyframe is detected); instead we want to
-  // force the frame to be written to the existing cluster.
-
-  if (frame_timecode == last_cluster_timecode)
-    return 0;
-
  // If the frame has a timestamp significantly larger than the last
  // cluster (in Matroska, cluster-relative timestamps are serialized
  // using a 16-bit signed integer), then we cannot write this frame
-  // that cluster, and so we must create a new cluster.
+  // to that cluster, and so we must create a new cluster.

  const int64 delta_timecode = frame_timecode - last_cluster_timecode;

  if (delta_timecode > std::numeric_limits<int16>::max())
-    return 1;
+    return 2;

  // We decide to create a new cluster when we have a video keyframe.
  // This will flush queued (audio) frames, and write the keyframe
@@ -2095,24 +2083,31 @@ bool Segment::MakeNewCluster(uint64 frame_timestamp_ns) {
 bool Segment::DoNewClusterProcessing(uint64 track_number,
                                     uint64 frame_timestamp_ns,
                                     bool is_key) {
-  // Based on the characteristics of the current frame and current
-  // cluster, decide whether to create a new cluster.
-  const int result = TestFrame(track_number, frame_timestamp_ns, is_key);
-  if (result < 0)  // error
-    return false;
+  for (;;) {
+    // Based on the characteristics of the current frame and current
+    // cluster, decide whether to create a new cluster.
+    const int result = TestFrame(track_number, frame_timestamp_ns, is_key);
+    if (result < 0)  // error
+      return false;

-  // A non-zero result means create a new cluster.
-  if (result > 0 && !MakeNewCluster(frame_timestamp_ns))
-    return false;
+    // A non-zero result means create a new cluster.
+    if (result > 0 && !MakeNewCluster(frame_timestamp_ns))
+      return false;

-  // Write queued (audio) frames.
-  const int frame_count = WriteFramesAll();
-  if (frame_count < 0)  // error
-    return false;
+    // Write queued (audio) frames.
+    const int frame_count = WriteFramesAll();
+    if (frame_count < 0)  // error
+      return false;

-  // Write the current frame to the current cluster (if TestFrame
-  // returns 0) or to a newly created cluster (TestFrame returns 1).
-  return true;
+    // Write the current frame to the current cluster (if TestFrame
+    // returns 0) or to a newly created cluster (TestFrame returns 1).
+    if (result <= 1)
+      return true;
+
+    // TestFrame returned 2, which means there was a large time
+    // difference between the cluster and the frame itself.  Do the
+    // test again, comparing the frame to the new cluster.
+  }
 }

 bool Segment::CheckHeaderInfo() {
--- a/mkvmuxer.hpp
+++ b/mkvmuxer.hpp
@@ -834,6 +834,7 @@ class Segment {
  //  -1 = error: an out-of-order frame was detected
  //  0 = do not create a new cluster, and write frame to the existing cluster
  //  1 = create a new cluster, and write frame to that new cluster
+  //  2 = create a new cluster, and re-run test
  int TestFrame(uint64 track_num, uint64 timestamp_ns, bool key) const;

  // Create a new cluster, using the earlier of the first enqueued
--- a/mkvmuxerutil.cpp
+++ b/mkvmuxerutil.cpp
@@ -387,6 +387,8 @@ uint64 WriteMetadataBlock(IMkvWriter* writer,
  // We use a single byte for the track number of the block, which
  // means the block header is exactly 4 bytes.

+  // TODO(matthewjheaney): use EbmlMasterElementSize and WriteEbmlMasterElement
+
  const uint64 block_payload_size = 4 + length;
  const int32 block_size = GetCodedUIntSize(block_payload_size);
  const uint64 block_elem_size = 1 + block_size + block_payload_size;
@@ -437,7 +439,7 @@ uint64 WriteMetadataBlock(IMkvWriter* writer,

  // Write Duration element

-  if (WriteID(writer, kMkvDuration))  // 1-byte ID size
+  if (WriteID(writer, kMkvBlockDuration))  // 1-byte ID size
    return 0;

  if (WriteUInt(writer, duration_payload_size))
--- a/sample_muxer.cpp
+++ b/sample_muxer.cpp
@@ -9,6 +9,8 @@
 #include <cstdio>
 #include <cstdlib>
 #include <cstring>
+#include <list>
+#include <string>

 // libwebm parser includes
 #include "mkvreader.hpp"
@@ -19,6 +21,10 @@
 #include "mkvwriter.hpp"
 #include "mkvmuxerutil.hpp"

+#include "sample_muxer_metadata.h"
+
+using mkvmuxer::uint64;
+
 namespace {

 void Usage() {
@@ -47,13 +53,87 @@ void Usage() {
  printf("\n");
  printf("Cues options:\n");
  printf("  -output_cues_block_number <int> >0 outputs cue block number\n");
+  printf("\n");
+  printf("Metadata options:\n");
+  printf("  -webvtt-subtitles <vttfile>    "
+         "add WebVTT subtitles as metadata track\n");
+  printf("  -webvtt-captions <vttfile>     "
+         "add WebVTT captions as metadata track\n");
+  printf("  -webvtt-descriptions <vttfile> "
+         "add WebVTT descriptions as metadata track\n");
+  printf("  -webvtt-metadata <vttfile>     "
+         "add WebVTT subtitles as metadata track\n");
 }

-} //end namespace
+// Pairs a WebVTT input file named on the command line with the kind of
+// metadata track that should be created for it.
+struct MetadataFile {
+  // File name of the WebVTT input.  ParseArgWebVTT stores the argv
+  // pointer directly, so the string is not owned by this struct.
+  const char* name;
+  // Which flavor of WebVTT track (subtitles, captions, ...) to create.
+  SampleMuxerMetadata::Kind kind;
+};
+
+typedef std::list<MetadataFile> metadata_files_t;
+
+// Loads each WebVTT file listed in |files| into |metadata|, in list
+// order, by calling SampleMuxerMetadata::Load with the entry's name
+// and kind.  Stops at the first entry that fails to load and returns
+// false; returns true when every entry loaded (or |files| is empty).
+bool LoadMetadataFiles(
+    const metadata_files_t& files,
+    SampleMuxerMetadata* metadata) {
+  for (metadata_files_t::const_iterator it = files.begin();
+       it != files.end();
+       ++it) {
+    const MetadataFile& entry = *it;
+
+    if (!metadata->Load(entry.name, entry.kind))
+      return false;
+  }
+
+  return true;
+}
+
+// Tries to interpret argv[*argv_index] as one of the WebVTT options
+// (-webvtt-subtitles, -webvtt-captions, -webvtt-descriptions,
+// -webvtt-metadata).
+//
+// Returns:
+//   0  the argument is not a WebVTT option; nothing is modified.
+//   1  the option was recognized; |*argv_index| now refers to the
+//      option's value and a MetadataFile entry was appended to
+//      |metadata_files|.
+//  -1  the option was recognized but has no value following it; a
+//      message is printed and |*argv_index| has already been advanced.
+//
+// |argc_check| is the index of the last valid element of |argv|.
+int ParseArgWebVTT(
+    char* argv[],
+    int* argv_index,
+    int argc_check,
+    metadata_files_t* metadata_files) {
+  struct Option {
+    const char* flag;
+    SampleMuxerMetadata::Kind kind;
+  };
+
+  const Option options[] = {
+    { "-webvtt-subtitles", SampleMuxerMetadata::kSubtitles },
+    { "-webvtt-captions", SampleMuxerMetadata::kCaptions },
+    { "-webvtt-descriptions", SampleMuxerMetadata::kDescriptions },
+    { "-webvtt-metadata", SampleMuxerMetadata::kMetadata }
+  };
+  const int option_count = sizeof(options) / sizeof(options[0]);
+
+  // Look for the option whose flag text matches the current argument.
+  const Option* match = NULL;
+
+  for (int n = 0; n < option_count && match == NULL; ++n) {
+    if (strcmp(options[n].flag, argv[*argv_index]) == 0)
+      match = &options[n];
+  }
+
+  if (match == NULL)
+    return 0;  // not a WebVTT arg
+
+  ++*argv_index;  // step past the flag, onto its value
+
+  if (*argv_index > argc_check) {
+    printf("missing value for %s\n", match->flag);
+    return -1;  // error
+  }
+
+  // The value itself is stepped over by the increment in the caller's
+  // argument loop.
+  MetadataFile entry;
+  entry.name = argv[*argv_index];
+  entry.kind = match->kind;
+  metadata_files->push_back(entry);
+
+  return 1;  // successfully parsed WebVTT arg
+}
+
+} // end namespace

 int main(int argc, char* argv[]) {
-  using mkvmuxer::uint64;
-
  char* input = NULL;
  char* output = NULL;

@@ -78,6 +158,8 @@ int main(int argc, char* argv[]) {
  uint64 display_height = 0;
  uint64 stereo_mode = 0;

+  metadata_files_t metadata_files;
+
  const int argc_check = argc - 1;
  for (int i = 1; i < argc; ++i) {
    char* end;
@@ -130,6 +212,9 @@ int main(int argc, char* argv[]) {
               i < argc_check) {
      output_cues_block_number =
          strtol(argv[++i], &end, 10) == 0 ? false : true;
+    } else if (int e = ParseArgWebVTT(argv, &i, argc_check, &metadata_files)) {
+      if (e < 0)
+        return EXIT_FAILURE;
    }
  }

@@ -204,12 +289,13 @@ int main(int argc, char* argv[]) {
  info->set_writing_app("sample_muxer");

  // Set Tracks element attributes
-  enum { kVideoTrack = 1, kAudioTrack = 2 };
  const mkvparser::Tracks* const parser_tracks = parser_segment->GetTracks();
  unsigned long i = 0;
  uint64 vid_track = 0; // no track added
  uint64 aud_track = 0; // no track added

+  using mkvparser::Track;
+
  while (i != parser_tracks->GetTracksCount()) {
    int track_num = i++;
    if (switch_tracks)
@@ -226,7 +312,7 @@ int main(int argc, char* argv[]) {

    const long long track_type = parser_track->GetType();

-    if (track_type == kVideoTrack && output_video) {
+    if (track_type == Track::kVideo && output_video) {
      // Get the video track from the parser
      const mkvparser::VideoTrack* const pVideoTrack =
          static_cast<const mkvparser::VideoTrack*>(parser_track);
@@ -264,7 +350,7 @@ int main(int argc, char* argv[]) {
      if (rate > 0.0) {
        video->set_frame_rate(rate);
      }
-    } else if (track_type == kAudioTrack && output_audio) {
+    } else if (track_type == Track::kAudio && output_audio) {
      // Get the audio track from the parser
      const mkvparser::AudioTrack* const pAudioTrack =
          static_cast<const mkvparser::AudioTrack*>(parser_track);
@@ -307,6 +393,17 @@ int main(int argc, char* argv[]) {
    }
  }

+  // We have created all the video and audio tracks.  If any WebVTT
+  // files were specified as command-line args, then parse them and
+  // add a track to the output file corresponding to each metadata
+  // input file.
+
+  SampleMuxerMetadata metadata;
+  metadata.Init(&muxer_segment);
+
+  if (!LoadMetadataFiles(metadata_files, &metadata))
+    return EXIT_FAILURE;
+
  // Set Cues element attributes
  mkvmuxer::Cues* const cues = muxer_segment.GetCues();
  cues->set_output_block_number(output_cues_block_number);
@@ -339,11 +436,16 @@ int main(int argc, char* argv[]) {
          parser_tracks->GetTrackByNumber(
              static_cast<unsigned long>(trackNum));
      const long long track_type = parser_track->GetType();
+      const long long time_ns = block->GetTime(cluster);

-      if ((track_type == kAudioTrack && output_audio) ||
-          (track_type == kVideoTrack && output_video)) {
+      // Flush any metadata frames to the output file, before we write
+      // the current block.
+      if (!metadata.Write(time_ns))
+        return EXIT_FAILURE;
+
+      if ((track_type == Track::kAudio && output_audio) ||
+          (track_type == Track::kVideo && output_video)) {
        const int frame_count = block->GetFrameCount();
-        const long long time_ns = block->GetTime(cluster);
        const bool is_key = block->IsKey();

        for (int i = 0; i < frame_count; ++i) {
@@ -361,7 +463,7 @@ int main(int argc, char* argv[]) {
            return EXIT_FAILURE;

          uint64 track_num = vid_track;
-          if (track_type == kAudioTrack)
+          if (track_type == Track::kAudio)
            track_num = aud_track;

          if (!muxer_segment.AddFrame(data,
@@ -387,6 +489,11 @@ int main(int argc, char* argv[]) {
    cluster = parser_segment->GetNext(cluster);
  }

+  // We have exhausted all video and audio frames in the input file.
+  // Flush any remaining metadata frames to the output file.
+  if (!metadata.Write(-1))
+    return EXIT_FAILURE;
+
  muxer_segment.Finalize();

  delete [] data;
@@ -397,6 +504,3 @@ int main(int argc, char* argv[]) {

  return EXIT_SUCCESS;
 }
-
-
-
--- a/sample_muxer_metadata.cc
+++ b/sample_muxer_metadata.cc
@@ -0,0 +1,236 @@
+#include "sample_muxer_metadata.h"
+#include <cstdio>
+#include <string>
+#include "vttreader.h"
+
+using std::string;
+
+// Constructs an object that is not yet bound to a muxer segment:
+// segment_ stays NULL until Init() is called.
+SampleMuxerMetadata::SampleMuxerMetadata() : segment_(NULL) {
+}
+
+// Records |s| as the segment that tracks are added to and cues are
+// written to.  The pointer is stored as-is; no copy is made.
+void SampleMuxerMetadata::Init(mkvmuxer::Segment* s) {
+  segment_ = s;
+}
+
+// Creates a track of the requested |kind| and then parses |file| into
+// the pending cue set, tagging each cue with that track's number.
+// Returns false (after printing a message) if the track could not be
+// added, or whatever Parse() returns otherwise.
+bool SampleMuxerMetadata::Load(const char* file, Kind kind) {
+  mkvmuxer::uint64 track_num;
+  const bool track_added = AddTrack(kind, &track_num);
+
+  if (track_added)
+    return Parse(file, kind, track_num);
+
+  printf("Unable to add track for WebVTT file \"%s\"\n", file);
+  return false;
+}
+
+// Writes pending cues to the segment in set order, removing each one
+// once it has been written.  When |time_ns| is non-negative, writing
+// stops at the first cue whose start time is later than |time_ns|; a
+// negative |time_ns| flushes every remaining cue.  Returns false as
+// soon as a cue fails to write (that cue stays in the set).
+bool SampleMuxerMetadata::Write(mkvmuxer::int64 time_ns) {
+  const bool flush_all = (time_ns < 0);
+
+  // Every written cue is erased, so the next candidate is always the
+  // first element of the ordered set.
+  while (!cues_set_.empty()) {
+    const cues_set_t::iterator front = cues_set_.begin();
+    const SortableCue& next_cue = *front;
+
+    if (!flush_all && next_cue > time_ns)
+      return true;  // nothing else to do just yet
+
+    if (!next_cue.Write(segment_)) {
+      printf("\nCould not add metadata.\n");
+      return false;  // error
+    }
+
+    cues_set_.erase(front);
+  }
+
+  return true;
+}
+
+// Adds a track for WebVTT content of the given |kind| to the bound
+// segment and reports the track number chosen by the muxer through
+// |track_num|.
+//
+// |*track_num| is set to 0 on every failure path.  Returns false when
+// no segment has been bound via Init(), when |kind| is not one of the
+// Kind enumerators, or when the segment refuses to add a track.
+//
+// |kind| is validated before the segment is touched, so an invalid
+// kind never leaves a half-configured track registered in the segment.
+bool SampleMuxerMetadata::AddTrack(
+    Kind kind,
+    mkvmuxer::uint64* track_num) {
+  *track_num = 0;
+
+  if (segment_ == NULL)  // Init() has not been called
+    return false;
+
+  // Track type value and codec id used for each kind: subtitles and
+  // captions share type 0x11, descriptions and metadata share 0x21.
+  int type;
+  const char* codec_id;
+
+  switch (kind) {
+  case kSubtitles:
+    type = 0x11;
+    codec_id = "D_WEBVTT/SUBTITLES";
+    break;
+
+  case kCaptions:
+    type = 0x11;
+    codec_id = "D_WEBVTT/CAPTIONS";
+    break;
+
+  case kDescriptions:
+    type = 0x21;
+    codec_id = "D_WEBVTT/DESCRIPTIONS";
+    break;
+
+  case kMetadata:
+    type = 0x21;
+    codec_id = "D_WEBVTT/METADATA";
+    break;
+
+  default:
+    return false;
+  }
+
+  // Track number value 0 means "let muxer choose track number"
+  mkvmuxer::Track* const track = segment_->AddTrack(0);
+
+  if (track == NULL)  // error
+    return false;
+
+  track->set_type(type);
+  track->set_codec_id(codec_id);
+
+  // TODO(matthewjheaney): set name and language
+
+  // Return the track number value chosen by the muxer
+  *track_num = track->number();
+
+  return true;
+}
+
+// Reads the WebVTT file |file| and inserts every cue it contains into
+// cues_set_, tagged with |track_num|.  The kind argument is unused.
+//
+// Returns true once the parser reports end of file.  Returns false
+// (after printing a message) if the file cannot be opened, the parser
+// reports an error, a cue starts earlier than the one before it, or a
+// cue stops before it starts.  Cues inserted before a failure are not
+// removed from cues_set_.
+bool SampleMuxerMetadata::Parse(
+    const char* file,
+    Kind /* kind */,
+    mkvmuxer::uint64 track_num) {
+  libwebvtt::VttReader r;
+  int e = r.Open(file);
+
+  // A non-zero result from Open is treated as failure.
+  if (e) {
+    printf("Unable to open WebVTT file: \"%s\"\n", file);
+    return false;
+  }
+
+  libwebvtt::Parser p(&r);
+
+  e = p.Init();
+
+  if (e < 0) {  // error
+    printf("Error parsing WebVTT file: \"%s\"\n", file);
+    return false;
+  }
+
+  // One SortableCue is reused for every cue in the file; the multiset
+  // stores a copy on each insert.
+  SortableCue cue;
+  cue.track_num = track_num;
+
+  // |t| tracks the start time of the previous cue.  Only the hours
+  // field is seeded; presumably -1 makes it compare below any real cue
+  // time so the first cue passes the ordering check -- TODO confirm
+  // against libwebvtt::Time's comparison operators.
+  libwebvtt::Time t;
+  t.hours = -1;
+
+  for (;;) {
+    cue_t& c = cue.cue;
+    e = p.Parse(&c);
+
+    if (e < 0) {  // error
+      printf("Error parsing WebVTT file: \"%s\"\n", file);
+      return false;
+    }
+
+    if (e > 0)  // EOF
+      return true;
+
+    // Cues within one file must have non-decreasing start times.
+    if (c.start_time >= t) {
+      t = c.start_time;
+    } else {
+      printf("bad WebVTT cue timestamp (out-of-order)\n");
+      return false;
+    }
+
+    // Reject cues that would yield a negative block duration.
+    if (c.stop_time < c.start_time) {
+      printf("bad WebVTT cue timestamp (stop < start)\n");
+      return false;
+    }
+
+    cues_set_.insert(cue);
+  }
+}
+
+// Builds the payload of a metadata block for cue |c| in |*pf|.  Any
+// previous contents of |*pf| are discarded; the result is the cue
+// identifier, then the cue settings, then the cue payload, each
+// emitted by the corresponding Write* helper in that order.
+void SampleMuxerMetadata::MakeFrame(const cue_t& c, string* pf) {
+  pf->clear();
+  WriteCueIdentifier(c.identifier, pf);
+  WriteCueSettings(c.settings, pf);
+  WriteCuePayload(c.payload, pf);
+}
+
+// Appends the cue identifier followed by a single LF to |*pf|.  An
+// empty identifier therefore contributes just the LF.
+void SampleMuxerMetadata::WriteCueIdentifier(
+    const string& identifier,
+    string* pf) {
+  *pf += identifier;
+  *pf += '\x0A';  // LF
+}
+
+// Appends the cue settings to |*pf| as "name:value" pairs separated by
+// single spaces, terminated by one LF.  With no settings only the LF
+// is appended.
+void SampleMuxerMetadata::WriteCueSettings(
+    const cue_t::settings_t& settings,
+    string* pf) {
+  typedef cue_t::settings_t::const_iterator iter_t;
+
+  const iter_t first = settings.begin();
+  const iter_t last = settings.end();
+
+  for (iter_t it = first; it != last; ++it) {
+    if (it != first)
+      pf->push_back(' ');  // separate settings with whitespace
+
+    pf->append(it->name);
+    pf->push_back(':');
+    pf->append(it->value);
+  }
+
+  pf->push_back('\x0A');  // LF
+}
+
+// Appends every line of the cue payload to |*pf|, each followed by a
+// single LF.  An empty payload appends nothing.
+void SampleMuxerMetadata::WriteCuePayload(
+    const cue_t::payload_t& payload,
+    string* pf) {
+  typedef cue_t::payload_t::const_iterator iter_t;
+
+  for (iter_t it = payload.begin(); it != payload.end(); ++it) {
+    *pf += *it;
+    *pf += '\x0A';  // LF
+  }
+}
+
+// Serializes this cue with MakeFrame() and hands it to |segment| as a
+// metadata block on track |track_num|.  The block's timestamp is the
+// cue start time and its duration is stop minus start, both converted
+// from milliseconds to nanoseconds.  Returns the result of
+// Segment::AddMetadata.
+bool SampleMuxerMetadata::SortableCue::Write(
+    mkvmuxer::Segment* segment) const {
+  // presentation() yields milliseconds; MKV time is in nanoseconds.
+  const mkvmuxer::int64 kNanosPerMilli = 1000000;
+
+  const mkvmuxer::int64 start_ms = cue.start_time.presentation();
+  const mkvmuxer::int64 start_ns = start_ms * kNanosPerMilli;
+
+  const mkvmuxer::int64 stop_ms = cue.stop_time.presentation();
+  const mkvmuxer::int64 stop_ns = stop_ms * kNanosPerMilli;
+
+  // Metadata blocks always specify the block duration.
+  const mkvmuxer::int64 duration_ns = stop_ns - start_ns;
+
+  string payload;
+  MakeFrame(cue, &payload);
+
+  const mkvmuxer::uint8* const bytes =
+      reinterpret_cast<const mkvmuxer::uint8*>(payload.data());
+  const mkvmuxer::uint64 byte_count = payload.length();
+
+  return segment->AddMetadata(bytes,
+                              byte_count,
+                              track_num,
+                              start_ns,
+                              duration_ns);
+}
--- a/sample_muxer_metadata.h
+++ b/sample_muxer_metadata.h
@@ -0,0 +1,112 @@
+// Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the LICENSE file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS.  All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+
+#ifndef SAMPLE_MUXER_METADATA_H_  // NOLINT
+#define SAMPLE_MUXER_METADATA_H_
+
+#include <list>
+#include <set>
+
+#include "mkvmuxer.hpp"
+#include "webvttparser.h"
+
+// Helper used by sample_muxer to add WebVTT files to a muxed segment.
+// Each loaded file gets its own track in the bound segment; the file's
+// cues are held in a time-ordered set and are written out as metadata
+// blocks by successive calls to Write().
+class SampleMuxerMetadata {
+ public:
+  // The flavor of WebVTT content in a file; selects the track type and
+  // codec id assigned to the track created for that file.
+  enum Kind {
+    kSubtitles,
+    kCaptions,
+    kDescriptions,
+    kMetadata
+  };
+
+  // Creates an object with no segment bound; call Init() before use.
+  SampleMuxerMetadata();
+
+  // Bind this metadata object to the muxer instance.
+  void Init(mkvmuxer::Segment* segment);
+
+  // Parse the WebVTT file |filename| having the indicated |kind|, and
+  // create a corresponding track in the segment.  Returns false on
+  // error.
+  bool Load(const char* filename, Kind kind);
+
+  // Write any WebVTT cues whose time is less or equal to |time_ns| as
+  // a metadata block in its corresponding track.  If |time_ns| is
+  // negative, write all remaining cues. Returns false on error.
+  bool Write(mkvmuxer::int64 time_ns);
+
+ private:
+  typedef libwebvtt::Cue cue_t;
+
+  // Used to sort cues as they are loaded.
+  struct SortableCue {
+    // Returns true if this cue starts strictly after |time_ns|.
+    bool operator>(mkvmuxer::int64 time_ns) const {
+      // Cue start time expressed in milliseconds
+      const mkvmuxer::int64 start_ms = cue.start_time.presentation();
+
+      // Cue start time expressed in nanoseconds (MKV time)
+      const mkvmuxer::int64 start_ns = start_ms * 1000000;
+
+      return (start_ns > time_ns);
+    }
+
+    // Orders cues by start time, breaking ties by track number.
+    bool operator<(const SortableCue& rhs) const {
+      if (cue.start_time < rhs.cue.start_time)
+        return true;
+
+      if (cue.start_time > rhs.cue.start_time)
+        return false;
+
+      return (track_num < rhs.track_num);
+    }
+
+    // Write this cue as a metablock to |segment|.  Returns false on
+    // error.
+    bool Write(mkvmuxer::Segment* segment) const;
+
+    // Number of the track this cue belongs to.
+    mkvmuxer::uint64 track_num;
+    // The parsed WebVTT cue itself.
+    cue_t cue;
+  };
+
+  typedef std::multiset<SortableCue> cues_set_t;
+
+  // Add a metadata track to the segment having the indicated |kind|,
+  // returning the |track_num| that has been chosen for this track.
+  // Returns false on error.
+  bool AddTrack(Kind kind, mkvmuxer::uint64* track_num);
+
+  // Parse the WebVTT |file| having the indicated |kind| and
+  // |track_num|, adding each parsed cue to cues set.  Returns false
+  // on error.
+  bool Parse(const char* file, Kind kind, mkvmuxer::uint64 track_num);
+
+  // Converts a WebVTT cue to a Matroska metadata block.
+  static void MakeFrame(const cue_t& cue, std::string* frame);
+
+  // Populate the cue identifier part of the metadata block.
+  static void WriteCueIdentifier(const std::string& identifier,
+                                 std::string* frame);
+
+  // Populate the cue settings part of the metadata block.
+  static void WriteCueSettings(const cue_t::settings_t& settings,
+                               std::string* frame);
+
+  // Populate the payload part of the metadata block.
+  static void WriteCuePayload(const cue_t::payload_t& payload,
+                              std::string* frame);
+
+  // Segment bound by Init(); NULL until then.  Stored as a plain
+  // pointer.
+  mkvmuxer::Segment* segment_;
+
+  // Set of cues ordered by time and then by track number.
+  cues_set_t cues_set_;
+
+  // Disable copy ctor and copy assign.
+  SampleMuxerMetadata(const SampleMuxerMetadata&);
+  SampleMuxerMetadata& operator=(const SampleMuxerMetadata&);
+};
+
+#endif  // SAMPLE_MUXER_METADATA_H_  // NOLINT