diff --git a/CMakeLists.txt b/CMakeLists.txt index fb0ffa3..f166159 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -126,6 +126,7 @@ if(valijson_BUILD_TESTS) tests/test_validator.cpp tests/test_validator_with_custom_regular_expression_engine.cpp tests/test_yaml_cpp_adapter.cpp + tests/test_utf8_utils.cpp ) set(TEST_LIBS gtest gtest_main jsoncpp json11 yamlcpp) diff --git a/include/valijson/utils/utf8_utils.hpp b/include/valijson/utils/utf8_utils.hpp index f1e01a5..74c49d0 100644 --- a/include/valijson/utils/utf8_utils.hpp +++ b/include/valijson/utils/utf8_utils.hpp @@ -14,50 +14,39 @@ namespace valijson { namespace utils { -static const uint32_t offsetsFromUTF8[6] = { - 0x00000000UL, 0x00003080UL, 0x000E2080UL, - 0x03C82080UL, 0xFA082080UL, 0x82082080UL -}; - /* is c the start of a utf8 sequence? */ -inline bool isutf(char c) { - return ((c & 0xC0) != 0x80); -} - -/* reads the next utf-8 sequence out of a string, updating an index */ -inline uint64_t u8_nextchar(const char *s, uint64_t *i) +inline bool isutf(char c) { - uint64_t ch = 0; - int sz = 0; - - do { - ch <<= 6; - ch += static_cast(s[(*i)++]); - sz++; - } while (s[*i] && !isutf(s[*i])); - ch -= offsetsFromUTF8[sz-1]; - - return ch; + return ((c & 0xC0) != 0x80); } /* number of characters */ inline uint64_t u8_strlen(const char *s) { - constexpr auto maxLength = std::numeric_limits::max(); uint64_t count = 0; - uint64_t i = 0; - while (s[i] != 0 && u8_nextchar(s, &i) != 0) { - if (i == maxLength) { - throwRuntimeError( - "String exceeded maximum size of " + - std::to_string(maxLength) + " bytes."); + while (*s) { + unsigned char p = static_cast(*s); + + size_t seqLen = p < 0x80 ? 1 // 0xxxxxxx: 1-byte (ASCII) + : p < 0xE0 ? 2 // 110xxxxx: 2-byte sequence (a stray 10xxxxxx byte also lands here) + : p < 0xF0 ? 3 // 1110xxxx: 3-byte sequence + : p < 0xF8 ? 
4 // 11110xxx: 4-byte sequence + : 1; // treat as a single character + + for (size_t i = 1; i < seqLen; ++i) { + if (s[i] == 0 || isutf(s[i])) { + seqLen = i; + break; + } } + + s += seqLen; count++; } return count; } -} // namespace utils -} // namespace valijson +} // namespace utils +} // namespace valijson diff --git a/tests/test_utf8_utils.cpp b/tests/test_utf8_utils.cpp new file mode 100644 index 0000000..5001243 --- /dev/null +++ b/tests/test_utf8_utils.cpp @@ -0,0 +1,51 @@ +#include +#include + +class TestUtf8Utils : public testing::Test +{ +}; + +TEST_F(TestUtf8Utils, Utf8StringLength) +{ + using valijson::utils::u8_strlen; + + EXPECT_EQ(u8_strlen(""), 0); + EXPECT_EQ(u8_strlen("a"), 1); + EXPECT_EQ(u8_strlen("abc"), 3); + + // U+0416 + EXPECT_EQ(u8_strlen("\xD0\x96"), 1); + + // U+0915 + EXPECT_EQ(u8_strlen("\xE0\xA4\x95"), 1); + + // U+10348 + EXPECT_EQ(u8_strlen("\xF0\x90\x8D\x88"), 1); + + // U+0915 + U+0416 + EXPECT_EQ(u8_strlen("\xE0\xA4\x95\xD0\x96"), 2); + + // incomplete U+0416 at the end + EXPECT_EQ(u8_strlen("\xD0"), 1); + + // incomplete U+0416 in the middle + EXPECT_EQ(u8_strlen("\320abc"), 4); + + // incomplete U+0915 at the end + EXPECT_EQ(u8_strlen("\xE0\xA4"), 1); + + // incomplete U+0915 in the middle + EXPECT_EQ(u8_strlen("\xE0\244abc"), 4); + + // U+DFFF + EXPECT_EQ(u8_strlen("\xED\xBF\xBF"), 1); + + // Overlong encoding for U+0000 + EXPECT_EQ(u8_strlen("\xC0\x80"), 1); + + // U+110000 (out of Unicode range) + EXPECT_EQ(u8_strlen("\xF5\x80\x80\x80"), 1); + + // 0xE0 + 0xA4 repeating 9 times + EXPECT_EQ(u8_strlen("\340\244\244\244\244\244\244\244\244\244"), 5); +}