- [SF 2513643] Seg fault in Poco::UTF8::toLower on 64-bit Linux

- removed support for 5- and 6-byte sequences
- fixed error counting in StreamConverterBuf::readFromDevice()
- added std::dec to poco_stdout_dbg and poco_stderr_dbg macros
This commit is contained in:
Aleksandar Fabijanic
2009-04-01 02:33:51 +00:00
parent 7007646ea2
commit d77ef57588
6 changed files with 78 additions and 65 deletions

View File

@@ -129,7 +129,7 @@ protected:
#if defined(_DEBUG)
# define poco_stdout_dbg(outstr) \
std::cout << __FILE__ << '(' << __LINE__ << "):" << outstr << std::endl;
std::cout << __FILE__ << '(' << std::dec << __LINE__ << "):" << outstr << std::endl;
#else
# define poco_stdout_dbg(outstr)
#endif
@@ -137,7 +137,7 @@ protected:
#if defined(_DEBUG)
# define poco_stderr_dbg(outstr) \
std::cerr << __FILE__ << '(' << __LINE__ << "):" << outstr << std::endl;
std::cerr << __FILE__ << '(' << std::dec << __LINE__ << "):" << outstr << std::endl;
#else
# define poco_stderr_dbg(outstr)
#endif

View File

@@ -59,6 +59,16 @@ public:
int convert(const unsigned char* bytes) const;
int convert(int ch, unsigned char* bytes, int length) const;
static bool isLegal(const unsigned char *bytes, int length);
/// Utility routine to tell whether a sequence of bytes is legal UTF-8.
/// This must be called with the length pre-determined by the first byte.
/// The sequence is illegal right away if there aren't enough bytes
/// available. If presented with a length > 4, this function returns false.
/// The Unicode definition of UTF-8 goes up to 4-byte sequences.
///
/// Adapted from ftp://ftp.unicode.org/Public/PROGRAMS/CVTUTF/ConvertUTF.c
/// Copyright 2001-2004 Unicode, Inc.
private:
static const char* _names[];
static const CharacterMap _charMap;

View File

@@ -101,7 +101,11 @@ int StreamConverterBuf::readFromDevice()
if (_pIstr->gcount() == -n - 1)
{
uc = _inEncoding.convert(_buffer);
if (uc == -1) uc = _defaultChar;
if (uc == -1)
{
uc = _defaultChar;
++_errors;
}
}
else
{

View File

@@ -103,28 +103,55 @@ const TextEncoding::CharacterMap& UTF8Encoding::characterMap() const
}
/// Tells whether the `length` bytes starting at `bytes` form one well-formed
/// UTF-8 sequence.
///
/// `length` is taken as given; this function does not derive it from the
/// lead byte, so the caller is expected to pass the length implied by
/// `*bytes`. Returns false for a null pointer, for a zero length, and for
/// any length outside 1..4.
bool UTF8Encoding::isLegal(const unsigned char *bytes, int length)
{
if (0 == bytes || 0 == length) return false;
unsigned char a;
// Start one past the last byte; each check below steps back one byte
// towards the lead byte.
const unsigned char* srcptr = bytes + length;
switch (length)
{
default: return false;
// The cases of this outer switch fall through on purpose: a 4-byte
// sequence also runs the 3-byte, 2-byte and 1-byte checks.
// Fourth and third bytes must lie in 0x80..0xBF.
case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
// Second byte: only the upper bound is tested here; the lower bound
// depends on the lead byte and is tested in the inner switch, which
// reads the value left in `a`.
case 2: if ((a = (*--srcptr)) > 0xBF) return false;
switch (*bytes)
{
// No fall-through inside this inner switch. Per the UTF-8 definition,
// these lead bytes narrow the range allowed for the second byte.
case 0xE0: if (a < 0xA0) return false; break; // below 0xA0 would be an overlong 3-byte form
case 0xED: if (a > 0x9F) return false; break; // above 0x9F would encode a UTF-16 surrogate
case 0xF0: if (a < 0x90) return false; break; // below 0x90 would be an overlong 4-byte form
case 0xF4: if (a > 0x8F) return false; break; // above 0x8F would exceed U+10FFFF
default: if (a < 0x80) return false; // all other lead bytes: second byte is at least 0x80
}
// Lead byte: 0x80..0xC1 is rejected (values below 0x80 pass).
case 1: if (*bytes >= 0x80 && *bytes < 0xC2) return false;
}
// Lead bytes above 0xF4 are rejected regardless of length.
if (*bytes > 0xF4) return false;
return true;
}
int UTF8Encoding::convert(const unsigned char* bytes) const
{
int n = _charMap[*bytes];
int uc;
switch (n)
{
case -6:
uc = *bytes & 0x01; break;
case -5:
uc = *bytes & 0x03; break;
case -4:
uc = *bytes & 0x07; break;
case -3:
uc = *bytes & 0x0F; break;
case -2:
uc = *bytes & 0x1F; break;
default:
uc = n;
case -6: case -5: case -1: return -1;
case -4: case -3: case -2:
if (!isLegal(bytes, -n)) return -1;
uc = *bytes & ((0x07 << (n + 4)) | 0x03); break;
default: return n;
}
while (n++ < -1)
{
// TODO: check for malformed or overlong sequences
uc <<= 6;
uc |= (*++bytes & 0x3F);
}
@@ -134,6 +161,10 @@ int UTF8Encoding::convert(const unsigned char* bytes) const
int UTF8Encoding::convert(int ch, unsigned char* bytes, int length) const
{
#ifdef _DEBUG
unsigned char* lb = bytes;
#endif
if (ch <= 0x7F)
{
if (bytes && length >= 1)
@@ -147,6 +178,7 @@ int UTF8Encoding::convert(int ch, unsigned char* bytes, int length) const
*bytes++ = (unsigned char) ((ch >> 6) & 0x1F | 0xC0);
*bytes = (unsigned char) ((ch & 0x3F) | 0x80);
}
poco_assert_dbg (isLegal(lb, 2));
return 2;
}
else if (ch <= 0xFFFF)
@@ -157,9 +189,10 @@ int UTF8Encoding::convert(int ch, unsigned char* bytes, int length) const
*bytes++ = (unsigned char) ((ch >> 6) & 0x3F | 0x80);
*bytes = (unsigned char) ((ch & 0x3F) | 0x80);
}
poco_assert_dbg (isLegal(lb, 3));
return 3;
}
else if (ch <= 0x1FFFFF)
else if (ch <= 0x10FFFF)
{
if (bytes && length >= 4)
{
@@ -168,33 +201,9 @@ int UTF8Encoding::convert(int ch, unsigned char* bytes, int length) const
*bytes++ = (unsigned char) ((ch >> 6) & 0x3F | 0x80);
*bytes = (unsigned char) ((ch & 0x3F) | 0x80);
}
poco_assert_dbg (isLegal(lb, 4));
return 4;
}
else if (ch <= 0x3FFFFFF)
{
if (bytes && length >= 5)
{
*bytes++ = (unsigned char) ((ch >> 24) & 0x03 | 0xF8);
*bytes++ = (unsigned char) ((ch >> 18) & 0x3F | 0x80);
*bytes++ = (unsigned char) ((ch >> 12) & 0x3F | 0x80);
*bytes++ = (unsigned char) ((ch >> 6) & 0x3F | 0x80);
*bytes = (unsigned char) ((ch & 0x3F) | 0x80);
}
return 5;
}
else if (ch <= 0x7FFFFFFF)
{
if (bytes && length >= 6)
{
*bytes++ = (unsigned char) ((ch >> 30) & 0x01 | 0xFC);
*bytes++ = (unsigned char) ((ch >> 24) & 0x3F | 0x80);
*bytes++ = (unsigned char) ((ch >> 18) & 0x3F | 0x80);
*bytes++ = (unsigned char) ((ch >> 12) & 0x3F | 0x80);
*bytes++ = (unsigned char) ((ch >> 6) & 0x3F | 0x80);
*bytes = (unsigned char) ((ch & 0x3F) | 0x80);
}
return 6;
}
else return 0;
}

View File

@@ -166,27 +166,13 @@ void TextIteratorTest::testOneUTF8()
assert (*it++ == 0xabcde);
assert (it == end);
// 5 byte sequence
// 5 byte sequence - not supported
n = encoding.convert(0xabcdef, data, sizeof(data));
assert (n == 5);
text.assign((char*) data, n);
it = TextIterator(text, encoding);
end = TextIterator(text);
assert (n == 0);
assert (it != end);
assert (*it++ == 0xabcdef);
assert (it == end);
// 6 byte sequence
// 6 byte sequence - not supported
n = encoding.convert(0xfabcdef, data, sizeof(data));
assert (n == 6);
text.assign((char*) data, n);
it = TextIterator(text, encoding);
end = TextIterator(text);
assert (it != end);
assert (*it++ == 0xfabcdef);
assert (it == end);
assert (n == 0);
}

View File

@@ -76,8 +76,8 @@ void UTF8StringTest::testCompare()
assert (UTF8::icompare(a5, b5) < 0);
std::string a6("\303\274\303\266\303\244"); // "u"a"o
std::string b6("\303\234\303\226\303\204"); // "U"A"O
std::string a6("\303\274\303\266\303\244"); // "u"o"a
std::string b6("\303\234\303\226\303\204"); // "U"O"A
assert (UTF8::icompare(a6, b6) == 0);
}
@@ -93,11 +93,15 @@ void UTF8StringTest::testTransform()
UTF8::toUpperInPlace(s2);
assert (s2 == "ABCDE123");
std::string s3("\303\274\303\266\303\244"); // "u"a"o
std::string s3("\303\274\303\266\303\244"); // "u"o"a
UTF8::toUpperInPlace(s3);
assert (s3 == "\303\234\303\226\303\204"); // "U"A"O
assert (s3 == "\303\234\303\226\303\204"); // "U"O"A
UTF8::toLowerInPlace(s3);
assert (s3 == "\303\274\303\266\303\244"); // "U"A"O
assert (s3 == "\303\274\303\266\303\244"); // "u"o"a
// a mix of invalid sequences
std::string str = "\xC2\xE5\xF0\xF8\xE8\xED\xFB+-++";
assert ("???" == UTF8::toLower(str));
}