Merge pull request #197 from tyler92/fix-buffer-overflow

Fix buffer overflow in u8_strlen
2024-12-12 10:13:51 +01:00 · 2024-10-22 10:03:24 +11:00 · 2024-10-22 10:03:24 +11:00 · cc6ca369d3
commit cc6ca369d3
parent ad1e184b1c b7c051fbc1
3 changed files with 72 additions and 31 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -126,6 +126,7 @@ if(valijson_BUILD_TESTS)
        tests/test_validator.cpp
        tests/test_validator_with_custom_regular_expression_engine.cpp
        tests/test_yaml_cpp_adapter.cpp
        tests/test_utf8_utils.cpp
    )
    set(TEST_LIBS gtest gtest_main jsoncpp json11 yamlcpp)
--- a/include/valijson/utils/utf8_utils.hpp
+++ b/include/valijson/utils/utf8_utils.hpp
@ -14,45 +14,34 @@
 namespace valijson {
 namespace utils {
 static const uint32_t offsetsFromUTF8[6] = {
    0x00000000UL, 0x00003080UL, 0x000E2080UL,
    0x03C82080UL, 0xFA082080UL, 0x82082080UL
 };
 /* is c the start of a utf8 sequence? */
-inline bool isutf(char c) {
+inline bool isutf(char c)
    return ((c & 0xC0) != 0x80);
 }
 /* reads the next utf-8 sequence out of a string, updating an index */
 inline uint64_t u8_nextchar(const char *s, uint64_t *i)
 {
-    uint64_t ch = 0;
+    return ((c & 0xC0) != 0x80);
    int sz = 0;
    do {
        ch <<= 6;
        ch += static_cast<unsigned char>(s[(*i)++]);
        sz++;
    } while (s[*i] && !isutf(s[*i]));
    ch -= offsetsFromUTF8[sz-1];
    return ch;
 }
 /* number of characters */
 inline uint64_t u8_strlen(const char *s)
 {
    constexpr auto maxLength = std::numeric_limits<uint64_t>::max();
    uint64_t count = 0;
    uint64_t i = 0;
-    while (s[i] != 0 && u8_nextchar(s, &i) != 0) {
+    while (*s) {
-        if (i == maxLength) {
+        unsigned char p = static_cast<unsigned char>(*s);
-            throwRuntimeError(
+
-                    "String exceeded maximum size of " +
+        size_t seqLen = p < 0x80   ? 1  // 0xxxxxxx: 1-byte (ASCII)
-                    std::to_string(maxLength) + " bytes.");
+                        : p < 0xE0 ? 2  // 110xxxxx: 2-byte sequence
                        : p < 0xF0 ? 3  // 1110xxxx: 3-byte sequence
                        : p < 0xF8 ? 4  // 11110xxx: 4-byte sequence
                                   : 1; // treat as a single character
        for (size_t i = 1; i < seqLen; ++i) {
            if (s[i] == 0 || isutf(s[i])) {
                seqLen = i;
                break;
            }
        }
        s += seqLen;
        count++;
    }
--- a/tests/test_utf8_utils.cpp
+++ b/tests/test_utf8_utils.cpp
@ -0,0 +1,51 @@
 #include <gtest/gtest.h>
 #include <valijson/utils/utf8_utils.hpp>
 // Empty fixture; it only provides the suite name used by the TEST_F cases below.
 class TestUtf8Utils : public testing::Test {};
 // Compares u8_strlen's result with a hand-counted character total for
 // well-formed, truncated, and otherwise irregular UTF-8 byte strings.
 // Each case is a NUL-terminated literal, so the expected value is the number
 // of characters u8_strlen should report before the terminator.
 TEST_F(TestUtf8Utils, Utf8StringLength)
 {
    using valijson::utils::u8_strlen;
    // Empty string and plain ASCII
    EXPECT_EQ(u8_strlen(""), 0);
    EXPECT_EQ(u8_strlen("a"), 1);
    EXPECT_EQ(u8_strlen("abc"), 3);
    // U+0416 (two-byte sequence)
    EXPECT_EQ(u8_strlen("\xD0\x96"), 1);
    // U+0915 (three-byte sequence)
    EXPECT_EQ(u8_strlen("\xE0\xA4\x95"), 1);
    // U+10348 (four-byte sequence)
    EXPECT_EQ(u8_strlen("\xF0\x90\x8D\x88"), 1);
    // U+0915 + U+0416
    EXPECT_EQ(u8_strlen("\xE0\xA4\x95\xD0\x96"), 2);
    // incomplete U+0416 at the end
    EXPECT_EQ(u8_strlen("\xD0"), 1);
    // incomplete U+0416 in the middle
    // (\320 is the octal spelling of 0xD0; a hex escape here would absorb the
    // following 'a', 'b', 'c' as additional hex digits)
    EXPECT_EQ(u8_strlen("\320abc"), 4);
    // incomplete U+0915 at the end
    EXPECT_EQ(u8_strlen("\xE0\xA4"), 1);
    // incomplete U+0915 in the middle
    // (\244 is the octal spelling of 0xA4, used for the same reason as above)
    EXPECT_EQ(u8_strlen("\xE0\244abc"), 4);
    // U+DFFF (a surrogate code point, still expected to count as one character)
    EXPECT_EQ(u8_strlen("\xED\xBF\xBF"), 1);
    // Overlong encoding for U+0000
    EXPECT_EQ(u8_strlen("\xC0\x80"), 1);
    // U+110000 (out of Unicode range)
    EXPECT_EQ(u8_strlen("\xF5\x80\x80\x80"), 1);
    // 0xE0 + 0xA4 repeating 9 times (\340 = 0xE0, \244 = 0xA4 in octal)
    EXPECT_EQ(u8_strlen("\340\244\244\244\244\244\244\244\244\244"), 5);
 }