mirror of
https://github.com/pocoproject/poco.git
synced 2025-10-15 07:14:46 +02:00
- [SF 2513643] Seg fault in Poco::UTF8::toLower on 64-bit Linux
- removed support for 5- and 6-byte sequences
- fixed error counting in StreamConverterBuf::readFromDevice()
- added std::dec to poco_stdout_dbg and poco_stderr_dbg macros
This commit is contained in:
@@ -129,7 +129,7 @@ protected:
|
|||||||
|
|
||||||
#if defined(_DEBUG)
|
#if defined(_DEBUG)
|
||||||
# define poco_stdout_dbg(outstr) \
|
# define poco_stdout_dbg(outstr) \
|
||||||
std::cout << __FILE__ << '(' << __LINE__ << "):" << outstr << std::endl;
|
std::cout << __FILE__ << '(' << std::dec << __LINE__ << "):" << outstr << std::endl;
|
||||||
#else
|
#else
|
||||||
# define poco_stdout_dbg(outstr)
|
# define poco_stdout_dbg(outstr)
|
||||||
#endif
|
#endif
|
||||||
@@ -137,7 +137,7 @@ protected:
|
|||||||
|
|
||||||
#if defined(_DEBUG)
|
#if defined(_DEBUG)
|
||||||
# define poco_stderr_dbg(outstr) \
|
# define poco_stderr_dbg(outstr) \
|
||||||
std::cerr << __FILE__ << '(' << __LINE__ << "):" << outstr << std::endl;
|
std::cerr << __FILE__ << '(' << std::dec << __LINE__ << "):" << outstr << std::endl;
|
||||||
#else
|
#else
|
||||||
# define poco_stderr_dbg(outstr)
|
# define poco_stderr_dbg(outstr)
|
||||||
#endif
|
#endif
|
||||||
|
@@ -59,6 +59,16 @@ public:
|
|||||||
int convert(const unsigned char* bytes) const;
|
int convert(const unsigned char* bytes) const;
|
||||||
int convert(int ch, unsigned char* bytes, int length) const;
|
int convert(int ch, unsigned char* bytes, int length) const;
|
||||||
|
|
||||||
|
static bool isLegal(const unsigned char *bytes, int length);
|
||||||
|
/// Utility routine to tell whether a sequence of bytes is legal UTF-8.
|
||||||
|
/// This must be called with the length pre-determined by the first byte.
|
||||||
|
/// The sequence is illegal right away if there aren't enough bytes
|
||||||
|
/// available. If presented with a length > 4, this function returns false.
|
||||||
|
/// The Unicode definition of UTF-8 goes up to 4-byte sequences.
|
||||||
|
///
|
||||||
|
/// Adapted from ftp://ftp.unicode.org/Public/PROGRAMS/CVTUTF/ConvertUTF.c
|
||||||
|
/// Copyright 2001-2004 Unicode, Inc.
|
||||||
|
|
||||||
private:
|
private:
|
||||||
static const char* _names[];
|
static const char* _names[];
|
||||||
static const CharacterMap _charMap;
|
static const CharacterMap _charMap;
|
||||||
|
@@ -101,7 +101,11 @@ int StreamConverterBuf::readFromDevice()
|
|||||||
if (_pIstr->gcount() == -n - 1)
|
if (_pIstr->gcount() == -n - 1)
|
||||||
{
|
{
|
||||||
uc = _inEncoding.convert(_buffer);
|
uc = _inEncoding.convert(_buffer);
|
||||||
if (uc == -1) uc = _defaultChar;
|
if (uc == -1)
|
||||||
|
{
|
||||||
|
uc = _defaultChar;
|
||||||
|
++_errors;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
|
@@ -103,28 +103,55 @@ const TextEncoding::CharacterMap& UTF8Encoding::characterMap() const
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
// Tells whether the first `length` bytes at `bytes` form one well-formed
// UTF-8 sequence.
//
// `length` must be the sequence length already derived from the lead byte;
// this function does not compute it. Returns false for a null pointer and
// for any length outside 1..4.
//
// The accepted byte ranges follow the "Well-Formed UTF-8 Byte Sequences"
// table of the Unicode Standard:
//   - bytes 3 and 4 must be 0x80..0xBF;
//   - byte 2 must be 0x80..0xBF, narrowed for four lead bytes:
//       0xE0 -> 0xA0..0xBF   (excludes overlong 3-byte forms)
//       0xED -> 0x80..0x9F   (excludes surrogates U+D800..U+DFFF)
//       0xF0 -> 0x90..0xBF   (excludes overlong 4-byte forms)
//       0xF4 -> 0x80..0x8F   (excludes code points above U+10FFFF)
//   - the lead byte must not be 0x80..0xC1 and must not exceed 0xF4.
//
// Unlike the ConvertUTF.c routine this was adapted from, the lower bound
// 0x80 is also enforced on the second byte after 0xED and 0xF4; previously
// e.g. ED 41 80 was reported as legal.
bool UTF8Encoding::isLegal(const unsigned char *bytes, int length)
{
	if (0 == bytes || length < 1 || length > 4) return false;

	const unsigned char lead = bytes[0];

	// Third and fourth bytes are plain continuation bytes.
	for (int i = length - 1; i >= 2; --i)
	{
		if (bytes[i] < 0x80 || bytes[i] > 0xBF) return false;
	}

	if (length >= 2)
	{
		// The permitted range of the second byte depends on the lead byte.
		unsigned char lowest  = 0x80;
		unsigned char highest = 0xBF;
		switch (lead)
		{
		case 0xE0: lowest  = 0xA0; break;
		case 0xED: highest = 0x9F; break;
		case 0xF0: lowest  = 0x90; break;
		case 0xF4: highest = 0x8F; break;
		default: break;
		}
		if (bytes[1] < lowest || bytes[1] > highest) return false;
	}

	// A lead byte in 0x80..0xBF is a stray continuation byte; 0xC0 and 0xC1
	// could only start overlong 2-byte forms.
	if (lead >= 0x80 && lead < 0xC2) return false;

	// Lead bytes above 0xF4 would encode values beyond U+10FFFF.
	if (lead > 0xF4) return false;

	return true;
}
|
||||||
|
|
||||||
|
|
||||||
int UTF8Encoding::convert(const unsigned char* bytes) const
|
int UTF8Encoding::convert(const unsigned char* bytes) const
|
||||||
{
|
{
|
||||||
int n = _charMap[*bytes];
|
int n = _charMap[*bytes];
|
||||||
int uc;
|
int uc;
|
||||||
|
|
||||||
switch (n)
|
switch (n)
|
||||||
{
|
{
|
||||||
case -6:
|
case -6: case -5: case -1: return -1;
|
||||||
uc = *bytes & 0x01; break;
|
|
||||||
case -5:
|
case -4: case -3: case -2:
|
||||||
uc = *bytes & 0x03; break;
|
if (!isLegal(bytes, -n)) return -1;
|
||||||
case -4:
|
uc = *bytes & ((0x07 << (n + 4)) | 0x03); break;
|
||||||
uc = *bytes & 0x07; break;
|
|
||||||
case -3:
|
default: return n;
|
||||||
uc = *bytes & 0x0F; break;
|
|
||||||
case -2:
|
|
||||||
uc = *bytes & 0x1F; break;
|
|
||||||
default:
|
|
||||||
uc = n;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
while (n++ < -1)
|
while (n++ < -1)
|
||||||
{
|
{
|
||||||
// TODO: check for malformed or overlong sequences
|
|
||||||
uc <<= 6;
|
uc <<= 6;
|
||||||
uc |= (*++bytes & 0x3F);
|
uc |= (*++bytes & 0x3F);
|
||||||
}
|
}
|
||||||
@@ -134,6 +161,10 @@ int UTF8Encoding::convert(const unsigned char* bytes) const
|
|||||||
|
|
||||||
int UTF8Encoding::convert(int ch, unsigned char* bytes, int length) const
|
int UTF8Encoding::convert(int ch, unsigned char* bytes, int length) const
|
||||||
{
|
{
|
||||||
|
#ifdef _DEBUG
|
||||||
|
unsigned char* lb = bytes;
|
||||||
|
#endif
|
||||||
|
|
||||||
if (ch <= 0x7F)
|
if (ch <= 0x7F)
|
||||||
{
|
{
|
||||||
if (bytes && length >= 1)
|
if (bytes && length >= 1)
|
||||||
@@ -147,6 +178,7 @@ int UTF8Encoding::convert(int ch, unsigned char* bytes, int length) const
|
|||||||
*bytes++ = (unsigned char) ((ch >> 6) & 0x1F | 0xC0);
|
*bytes++ = (unsigned char) ((ch >> 6) & 0x1F | 0xC0);
|
||||||
*bytes = (unsigned char) ((ch & 0x3F) | 0x80);
|
*bytes = (unsigned char) ((ch & 0x3F) | 0x80);
|
||||||
}
|
}
|
||||||
|
poco_assert_dbg (isLegal(lb, 2));
|
||||||
return 2;
|
return 2;
|
||||||
}
|
}
|
||||||
else if (ch <= 0xFFFF)
|
else if (ch <= 0xFFFF)
|
||||||
@@ -157,9 +189,10 @@ int UTF8Encoding::convert(int ch, unsigned char* bytes, int length) const
|
|||||||
*bytes++ = (unsigned char) ((ch >> 6) & 0x3F | 0x80);
|
*bytes++ = (unsigned char) ((ch >> 6) & 0x3F | 0x80);
|
||||||
*bytes = (unsigned char) ((ch & 0x3F) | 0x80);
|
*bytes = (unsigned char) ((ch & 0x3F) | 0x80);
|
||||||
}
|
}
|
||||||
|
poco_assert_dbg (isLegal(lb, 3));
|
||||||
return 3;
|
return 3;
|
||||||
}
|
}
|
||||||
else if (ch <= 0x1FFFFF)
|
else if (ch <= 0x10FFFF)
|
||||||
{
|
{
|
||||||
if (bytes && length >= 4)
|
if (bytes && length >= 4)
|
||||||
{
|
{
|
||||||
@@ -168,33 +201,9 @@ int UTF8Encoding::convert(int ch, unsigned char* bytes, int length) const
|
|||||||
*bytes++ = (unsigned char) ((ch >> 6) & 0x3F | 0x80);
|
*bytes++ = (unsigned char) ((ch >> 6) & 0x3F | 0x80);
|
||||||
*bytes = (unsigned char) ((ch & 0x3F) | 0x80);
|
*bytes = (unsigned char) ((ch & 0x3F) | 0x80);
|
||||||
}
|
}
|
||||||
|
poco_assert_dbg (isLegal(lb, 4));
|
||||||
return 4;
|
return 4;
|
||||||
}
|
}
|
||||||
else if (ch <= 0x3FFFFFF)
|
|
||||||
{
|
|
||||||
if (bytes && length >= 5)
|
|
||||||
{
|
|
||||||
*bytes++ = (unsigned char) ((ch >> 24) & 0x03 | 0xF8);
|
|
||||||
*bytes++ = (unsigned char) ((ch >> 18) & 0x3F | 0x80);
|
|
||||||
*bytes++ = (unsigned char) ((ch >> 12) & 0x3F | 0x80);
|
|
||||||
*bytes++ = (unsigned char) ((ch >> 6) & 0x3F | 0x80);
|
|
||||||
*bytes = (unsigned char) ((ch & 0x3F) | 0x80);
|
|
||||||
}
|
|
||||||
return 5;
|
|
||||||
}
|
|
||||||
else if (ch <= 0x7FFFFFFF)
|
|
||||||
{
|
|
||||||
if (bytes && length >= 6)
|
|
||||||
{
|
|
||||||
*bytes++ = (unsigned char) ((ch >> 30) & 0x01 | 0xFC);
|
|
||||||
*bytes++ = (unsigned char) ((ch >> 24) & 0x3F | 0x80);
|
|
||||||
*bytes++ = (unsigned char) ((ch >> 18) & 0x3F | 0x80);
|
|
||||||
*bytes++ = (unsigned char) ((ch >> 12) & 0x3F | 0x80);
|
|
||||||
*bytes++ = (unsigned char) ((ch >> 6) & 0x3F | 0x80);
|
|
||||||
*bytes = (unsigned char) ((ch & 0x3F) | 0x80);
|
|
||||||
}
|
|
||||||
return 6;
|
|
||||||
}
|
|
||||||
else return 0;
|
else return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@@ -166,27 +166,13 @@ void TextIteratorTest::testOneUTF8()
|
|||||||
assert (*it++ == 0xabcde);
|
assert (*it++ == 0xabcde);
|
||||||
assert (it == end);
|
assert (it == end);
|
||||||
|
|
||||||
// 5 byte sequence
|
// 5 byte sequence - not supported
|
||||||
n = encoding.convert(0xabcdef, data, sizeof(data));
|
n = encoding.convert(0xabcdef, data, sizeof(data));
|
||||||
assert (n == 5);
|
assert (n == 0);
|
||||||
text.assign((char*) data, n);
|
|
||||||
it = TextIterator(text, encoding);
|
|
||||||
end = TextIterator(text);
|
|
||||||
|
|
||||||
assert (it != end);
|
|
||||||
assert (*it++ == 0xabcdef);
|
|
||||||
assert (it == end);
|
|
||||||
|
|
||||||
// 6 byte sequence
|
// 6 byte sequence - not supported
|
||||||
n = encoding.convert(0xfabcdef, data, sizeof(data));
|
n = encoding.convert(0xfabcdef, data, sizeof(data));
|
||||||
assert (n == 6);
|
assert (n == 0);
|
||||||
text.assign((char*) data, n);
|
|
||||||
it = TextIterator(text, encoding);
|
|
||||||
end = TextIterator(text);
|
|
||||||
|
|
||||||
assert (it != end);
|
|
||||||
assert (*it++ == 0xfabcdef);
|
|
||||||
assert (it == end);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@@ -76,8 +76,8 @@ void UTF8StringTest::testCompare()
|
|||||||
|
|
||||||
assert (UTF8::icompare(a5, b5) < 0);
|
assert (UTF8::icompare(a5, b5) < 0);
|
||||||
|
|
||||||
std::string a6("\303\274\303\266\303\244"); // "u"a"o
|
std::string a6("\303\274\303\266\303\244"); // "u"o"a
|
||||||
std::string b6("\303\234\303\226\303\204"); // "U"A"O
|
std::string b6("\303\234\303\226\303\204"); // "U"O"A
|
||||||
|
|
||||||
assert (UTF8::icompare(a6, b6) == 0);
|
assert (UTF8::icompare(a6, b6) == 0);
|
||||||
}
|
}
|
||||||
@@ -93,11 +93,15 @@ void UTF8StringTest::testTransform()
|
|||||||
UTF8::toUpperInPlace(s2);
|
UTF8::toUpperInPlace(s2);
|
||||||
assert (s2 == "ABCDE123");
|
assert (s2 == "ABCDE123");
|
||||||
|
|
||||||
std::string s3("\303\274\303\266\303\244"); // "u"a"o
|
std::string s3("\303\274\303\266\303\244"); // "u"o"a
|
||||||
UTF8::toUpperInPlace(s3);
|
UTF8::toUpperInPlace(s3);
|
||||||
assert (s3 == "\303\234\303\226\303\204"); // "U"A"O
|
assert (s3 == "\303\234\303\226\303\204"); // "U"O"A
|
||||||
UTF8::toLowerInPlace(s3);
|
UTF8::toLowerInPlace(s3);
|
||||||
assert (s3 == "\303\274\303\266\303\244"); // "U"A"O
|
assert (s3 == "\303\274\303\266\303\244"); // "u"o"a
|
||||||
|
|
||||||
|
// a mix of invalid sequences
|
||||||
|
std::string str = "\xC2\xE5\xF0\xF8\xE8\xED\xFB+-++";
|
||||||
|
assert ("???" == UTF8::toLower(str));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
Reference in New Issue
Block a user