From ababb64e0c792ad2a314245233db0833ba12036b Mon Sep 17 00:00:00 2001
From: Moritz Bunkus <moritz@bunkus.org>
Date: Tue, 20 Oct 2015 11:27:52 +0200
Subject: [PATCH] EbmlUnicodeString: don't read beyond end of string

The conversion from a UTF-8 encoded string into a wchar_t one was
reading beyond the end of the source buffer if the length indicated
by a UTF-8 character's first byte exceeded the number of bytes actually
present afterwards.

Fixes the issue reported as Cisco TALOS-CAN-0036.
---
 ChangeLog                 |  9 ++++++
 src/EbmlUnicodeString.cpp | 65 ++++++++++++++++++++++++---------------
 2 files changed, 49 insertions(+), 25 deletions(-)
diff --git a/ChangeLog b/ChangeLog
index a1c95dc..a5139c4 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,12 @@
+2015-10-20  Moritz Bunkus  <moritz@bunkus.org>
+
+        * UTFstring::UpdateFromUTF8(): Fixed an invalid memory access.
+        When reading from a UTF-8 string in which the length indicated
+        by a UTF-8 character's first byte exceeds the number of bytes
+        remaining in the string, the parser would access memory beyond
+        the end of the string, resulting in a heap information leak.
+        Fixes the issue reported as Cisco TALOS-CAN-0036.
+
 2015-10-17  Moritz Bunkus  <moritz@bunkus.org>
 
         * Released v1.3.2.
diff --git a/src/EbmlUnicodeString.cpp b/src/EbmlUnicodeString.cpp
index 3255add..7b7cd8e 100644
--- a/src/EbmlUnicodeString.cpp
+++ b/src/EbmlUnicodeString.cpp
@@ -47,6 +47,21 @@ START_LIBEBML_NAMESPACE
 
 // ===================== UTFstring class ===================
 
+static unsigned int UTFCharLength(uint8 lead) // byte length of the UTF-8 sequence that `lead` starts; 0 if `lead` cannot start one
+{
+  if (lead < 0x80)                // 0xxxxxxx: single-byte sequence
+    return 1;
+  else if ((lead >> 5) == 0x6)    // 110xxxxx: lead byte of a two-byte sequence
+    return 2;
+  else if ((lead >> 4) == 0xe)    // 1110xxxx: lead byte of a three-byte sequence
+    return 3;
+  else if ((lead >> 3) == 0x1e)   // 11110xxx: lead byte of a four-byte sequence
+    return 4;
+  else
+    // Remaining patterns (10xxxxxx or 11111xxx) are not valid lead bytes.
+    return 0;
+}
+
 UTFstring::UTFstring()
   :_Length(0)
   ,_Data(NULL)
@@ -143,39 +158,39 @@ void UTFstring::UpdateFromUTF8()
   delete [] _Data;
   // find the size of the final UCS-2 string
   size_t i;
-  for (_Length=0, i=0; i<UTF8string.length(); _Length++) {
-    uint8 lead = static_cast<uint8>(UTF8string[i]);
-    if (lead < 0x80)
-      i++;
-    else if ((lead >> 5) == 0x6)
-      i += 2;
-    else if ((lead >> 4) == 0xe)
-      i += 3;
-    else if ((lead >> 3) == 0x1e)
-      i += 4;
+  const size_t SrcLength = UTF8string.length();
+  for (_Length=0, i=0; i<SrcLength; _Length++) {
+    const unsigned int CharLength = UTFCharLength(static_cast<uint8>(UTF8string[i]));
+    if ((CharLength >= 1) && (CharLength <= 4))
+      i += CharLength;
     else
       // Invalid size?
       break;
   }
   _Data = new wchar_t[_Length+1];
   size_t j;
-  for (j=0, i=0; i<UTF8string.length(); j++) {
-    uint8 lead = static_cast<uint8>(UTF8string[i]);
-    if (lead < 0x80) {
-      _Data[j] = lead;
-      i++;
-    } else if ((lead >> 5) == 0x6) {
-      _Data[j] = ((lead & 0x1F) << 6) + (UTF8string[i+1] & 0x3F);
-      i += 2;
-    } else if ((lead >> 4) == 0xe) {
-      _Data[j] = ((lead & 0x0F) << 12) + ((UTF8string[i+1] & 0x3F) << 6) + (UTF8string[i+2] & 0x3F);
-      i += 3;
-    } else if ((lead >> 3) == 0x1e) {
-      _Data[j] = ((lead & 0x07) << 18) + ((UTF8string[i+1] & 0x3F) << 12) + ((UTF8string[i+2] & 0x3F) << 6) + (UTF8string[i+3] & 0x3F);
-      i += 4;
-    } else
+  for (j=0, i=0; i<SrcLength; j++) {
+    const uint8 lead              = static_cast<uint8>(UTF8string[i]);
+    const unsigned int CharLength = UTFCharLength(lead);
+    if ((CharLength < 1) || (CharLength > 4))
       // Invalid char?
       break;
+
+    if ((i + CharLength) > SrcLength)
+      // Guard against invalid memory access beyond the end of the
+      // source buffer.
+      break;
+
+    if (CharLength == 1)
+      _Data[j] = lead;
+    else if (CharLength == 2)
+      _Data[j] = ((lead & 0x1F) << 6) + (UTF8string[i+1] & 0x3F);
+    else if (CharLength == 3)
+      _Data[j] = ((lead & 0x0F) << 12) + ((UTF8string[i+1] & 0x3F) << 6) + (UTF8string[i+2] & 0x3F);
+    else if (CharLength == 4)
+      _Data[j] = ((lead & 0x07) << 18) + ((UTF8string[i+1] & 0x3F) << 12) + ((UTF8string[i+2] & 0x3F) << 6) + (UTF8string[i+3] & 0x3F);
+
+    i += CharLength;
   }
   _Data[j] = 0;
 }