diff --git a/Foundation/include/Poco/Bugcheck.h b/Foundation/include/Poco/Bugcheck.h index 8b4f42c4a..741a68285 100644 --- a/Foundation/include/Poco/Bugcheck.h +++ b/Foundation/include/Poco/Bugcheck.h @@ -129,7 +129,7 @@ protected: #if defined(_DEBUG) # define poco_stdout_dbg(outstr) \ - std::cout << __FILE__ << '(' << __LINE__ << "):" << outstr << std::endl; + std::cout << __FILE__ << '(' << std::dec << __LINE__ << "):" << outstr << std::endl; #else # define poco_stdout_dbg(outstr) #endif @@ -137,7 +137,7 @@ protected: #if defined(_DEBUG) # define poco_stderr_dbg(outstr) \ - std::cerr << __FILE__ << '(' << __LINE__ << "):" << outstr << std::endl; + std::cerr << __FILE__ << '(' << std::dec << __LINE__ << "):" << outstr << std::endl; #else # define poco_stderr_dbg(outstr) #endif diff --git a/Foundation/include/Poco/UTF8Encoding.h b/Foundation/include/Poco/UTF8Encoding.h index 457e76b37..23fb0c7ae 100644 --- a/Foundation/include/Poco/UTF8Encoding.h +++ b/Foundation/include/Poco/UTF8Encoding.h @@ -59,6 +59,16 @@ public: int convert(const unsigned char* bytes) const; int convert(int ch, unsigned char* bytes, int length) const; + static bool isLegal(const unsigned char *bytes, int length); + /// Utility routine to tell whether a sequence of bytes is legal UTF-8. + /// This must be called with the length pre-determined by the first byte. + /// The sequence is illegal right away if there aren't enough bytes + /// available. If presented with a length > 4, this function returns false. + /// The Unicode definition of UTF-8 goes up to 4-byte sequences. + /// + /// Adapted from ftp://ftp.unicode.org/Public/PROGRAMS/CVTUTF/ConvertUTF.c + /// Copyright 2001-2004 Unicode, Inc. + private: static const char* _names[]; static const CharacterMap _charMap; diff --git a/Foundation/src/StreamConverter.cpp b/Foundation/src/StreamConverter.cpp index d90d655e0..9e58f4c85 100644 --- a/Foundation/src/StreamConverter.cpp +++ b/Foundation/src/StreamConverter.cpp @@ -101,7 +101,11 @@ int StreamConverterBuf::readFromDevice() if (_pIstr->gcount() == -n - 1) { uc = _inEncoding.convert(_buffer); - if (uc == -1) uc = _defaultChar; + if (uc == -1) + { + uc = _defaultChar; + ++_errors; + } } else { diff --git a/Foundation/src/UTF8Encoding.cpp b/Foundation/src/UTF8Encoding.cpp index 4628b59f1..af06e31bf 100644 --- a/Foundation/src/UTF8Encoding.cpp +++ b/Foundation/src/UTF8Encoding.cpp @@ -103,28 +103,55 @@ const TextEncoding::CharacterMap& UTF8Encoding::characterMap() const } +bool UTF8Encoding::isLegal(const unsigned char *bytes, int length) +{ + if (0 == bytes || 0 == length) return false; + + unsigned char a; + const unsigned char* srcptr = bytes + length; + switch (length) + { + default: return false; + case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false; + case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false; + case 2: if ((a = (*--srcptr)) > 0xBF) return false; + + switch (*bytes) + { + case 0xE0: if (a < 0xA0) return false; break; + case 0xED: if (a > 0x9F) return false; break; + case 0xF0: if (a < 0x90) return false; break; + case 0xF4: if (a > 0x8F) return false; break; + default: if (a < 0x80) return false; + } + + case 1: if (*bytes >= 0x80 && *bytes < 0xC2) return false; + } + + if (*bytes > 0xF4) return false; + + return true; +} + + int UTF8Encoding::convert(const unsigned char* bytes) const { int n = _charMap[*bytes]; int uc; + switch (n) { - case -6: - uc = *bytes & 0x01; break; - case -5: - uc = *bytes & 0x03; break; - case -4: - uc = *bytes & 0x07; break; - case -3: - uc = *bytes & 0x0F; break; - case -2: - uc = *bytes & 0x1F; break; - default: - uc = n; + case -6: case -5: case -1: return -1; + + case -4: case -3: case -2: + if (!isLegal(bytes, -n)) return -1; + uc = *bytes & ((0x07 << (n + 4)) | 0x03); break; + + default: return n; } + while (n++ < -1) { - // TODO: check for malformed or overlong sequences uc <<= 6; uc |= (*++bytes & 0x3F); } @@ -134,6 +161,10 @@ int UTF8Encoding::convert(const unsigned char* bytes) const int UTF8Encoding::convert(int ch, unsigned char* bytes, int length) const { +#ifdef _DEBUG + unsigned char* lb = bytes; +#endif + if (ch <= 0x7F) { if (bytes && length >= 1) @@ -147,6 +178,7 @@ int UTF8Encoding::convert(int ch, unsigned char* bytes, int length) const *bytes++ = (unsigned char) ((ch >> 6) & 0x1F | 0xC0); *bytes = (unsigned char) ((ch & 0x3F) | 0x80); } + poco_assert_dbg (isLegal(lb, 2)); return 2; } else if (ch <= 0xFFFF) @@ -157,9 +189,10 @@ int UTF8Encoding::convert(int ch, unsigned char* bytes, int length) const *bytes++ = (unsigned char) ((ch >> 6) & 0x3F | 0x80); *bytes = (unsigned char) ((ch & 0x3F) | 0x80); } + poco_assert_dbg (isLegal(lb, 3)); return 3; } - else if (ch <= 0x1FFFFF) + else if (ch <= 0x10FFFF) { if (bytes && length >= 4) { @@ -168,33 +201,9 @@ int UTF8Encoding::convert(int ch, unsigned char* bytes, int length) const *bytes++ = (unsigned char) ((ch >> 6) & 0x3F | 0x80); *bytes = (unsigned char) ((ch & 0x3F) | 0x80); } + poco_assert_dbg (isLegal(lb, 4)); return 4; } - else if (ch <= 0x3FFFFFF) - { - if (bytes && length >= 5) - { - *bytes++ = (unsigned char) ((ch >> 24) & 0x03 | 0xF8); - *bytes++ = (unsigned char) ((ch >> 18) & 0x3F | 0x80); - *bytes++ = (unsigned char) ((ch >> 12) & 0x3F | 0x80); - *bytes++ = (unsigned char) ((ch >> 6) & 0x3F | 0x80); - *bytes = (unsigned char) ((ch & 0x3F) | 0x80); - } - return 5; - } - else if (ch <= 0x7FFFFFFF) - { - if (bytes && length >= 6) - { - *bytes++ = (unsigned char) ((ch >> 30) & 0x01 | 0xFC); - *bytes++ = (unsigned char) ((ch >> 24) & 0x3F | 0x80); - *bytes++ = (unsigned char) ((ch >> 18) & 0x3F | 0x80); - *bytes++ = (unsigned char) ((ch >> 12) & 0x3F | 0x80); - *bytes++ = (unsigned char) ((ch >> 6) & 0x3F | 0x80); - *bytes = (unsigned char) ((ch & 0x3F) | 0x80); - } - return 6; - } else return 0; } diff --git a/Foundation/testsuite/src/TextIteratorTest.cpp b/Foundation/testsuite/src/TextIteratorTest.cpp index 3dc638f6b..30d85c317 100644 --- a/Foundation/testsuite/src/TextIteratorTest.cpp +++ b/Foundation/testsuite/src/TextIteratorTest.cpp @@ -166,27 +166,13 @@ void TextIteratorTest::testOneUTF8() assert (*it++ == 0xabcde); assert (it == end); - // 5 byte sequence + // 5 byte sequence - not supported n = encoding.convert(0xabcdef, data, sizeof(data)); - assert (n == 5); - text.assign((char*) data, n); - it = TextIterator(text, encoding); - end = TextIterator(text); - - assert (it != end); - assert (*it++ == 0xabcdef); - assert (it == end); + assert (n == 0); - // 6 byte sequence + // 6 byte sequence - not supported n = encoding.convert(0xfabcdef, data, sizeof(data)); - assert (n == 6); - text.assign((char*) data, n); - it = TextIterator(text, encoding); - end = TextIterator(text); - - assert (it != end); - assert (*it++ == 0xfabcdef); - assert (it == end); + assert (n == 0); } diff --git a/Foundation/testsuite/src/UTF8StringTest.cpp b/Foundation/testsuite/src/UTF8StringTest.cpp index 5c2c82f71..5d7f51418 100644 --- a/Foundation/testsuite/src/UTF8StringTest.cpp +++ b/Foundation/testsuite/src/UTF8StringTest.cpp @@ -76,8 +76,8 @@ void UTF8StringTest::testCompare() assert (UTF8::icompare(a5, b5) < 0); - std::string a6("\303\274\303\266\303\244"); // "u"a"o - std::string b6("\303\234\303\226\303\204"); // "U"A"O + std::string a6("\303\274\303\266\303\244"); // "u"o"a + std::string b6("\303\234\303\226\303\204"); // "U"O"A assert (UTF8::icompare(a6, b6) == 0); } @@ -93,11 +93,15 @@ void UTF8StringTest::testTransform() UTF8::toUpperInPlace(s2); assert (s2 == "ABCDE123"); - std::string s3("\303\274\303\266\303\244"); // "u"a"o + std::string s3("\303\274\303\266\303\244"); // "u"o"a UTF8::toUpperInPlace(s3); - assert (s3 == "\303\234\303\226\303\204"); // "U"A"O + assert (s3 == "\303\234\303\226\303\204"); // "U"O"A UTF8::toLowerInPlace(s3); - assert (s3 == "\303\274\303\266\303\244"); // "U"A"O + assert (s3 == "\303\274\303\266\303\244"); // "u"o"a + + // a mix of invalid sequences + std::string str = "\xC2\xE5\xF0\xF8\xE8\xED\xFB+-++"; + assert ("???" == UTF8::toLower(str)); }