- [SF 2513643] Seg fault in Poco::UTF8::toLower on 64-bit Linux

- removed support for 5- and 6-byte UTF-8 sequences
- fixed error counting in StreamConverterBuf::readFromDevice()
- added std::dec to poco_stdout_dbg and poco_stderr_dbg macros
2025-10-15 07:14:46 +02:00 · 2009-04-01 02:33:51 +00:00
parent 7007646ea2
commit d77ef57588
6 changed files with 78 additions and 65 deletions
--- a/Foundation/include/Poco/Bugcheck.h
+++ b/Foundation/include/Poco/Bugcheck.h
@@ -129,7 +129,7 @@ protected:
 #if defined(_DEBUG)
 #	define poco_stdout_dbg(outstr) \
-		std::cout << __FILE__ << '(' << __LINE__ << "):" << outstr << std::endl;
+		std::cout << __FILE__ << '(' << std::dec << __LINE__ << "):" << outstr << std::endl;
 #else
 #	define poco_stdout_dbg(outstr)
 #endif
@@ -137,7 +137,7 @@ protected:
 #if defined(_DEBUG)
 #	define poco_stderr_dbg(outstr) \
-		std::cerr << __FILE__ << '(' << __LINE__ << "):" << outstr << std::endl;
+		std::cerr << __FILE__ << '(' << std::dec << __LINE__ << "):" << outstr << std::endl;
 #else
 #	define poco_stderr_dbg(outstr)
 #endif
--- a/Foundation/include/Poco/UTF8Encoding.h
+++ b/Foundation/include/Poco/UTF8Encoding.h
@@ -59,6 +59,16 @@ public:
 	int convert(const unsigned char* bytes) const;
 	int convert(int ch, unsigned char* bytes, int length) const;
 	static bool isLegal(const unsigned char *bytes, int length);
 		/// Utility routine to tell whether a sequence of bytes is legal UTF-8.
 		/// This must be called with the length pre-determined by the first byte.
 		/// The sequence is illegal right away if there aren't enough bytes 
 		/// available. If presented with a length > 4, this function returns false.
 		/// The Unicode definition of UTF-8 goes up to 4-byte sequences.
 		/// 
 		/// Adapted from ftp://ftp.unicode.org/Public/PROGRAMS/CVTUTF/ConvertUTF.c
 		/// Copyright 2001-2004 Unicode, Inc.
 private:
 	static const char* _names[];
 	static const CharacterMap _charMap;
--- a/Foundation/src/StreamConverter.cpp
+++ b/Foundation/src/StreamConverter.cpp
@@ -101,7 +101,11 @@ int StreamConverterBuf::readFromDevice()
 		if (_pIstr->gcount() == -n - 1)
 		{
 			uc = _inEncoding.convert(_buffer);
-			if (uc == -1) uc = _defaultChar;
+			if (uc == -1)
 			{
 				uc = _defaultChar;
 				++_errors;
 			}
 		}
 		else
 		{
--- a/Foundation/src/UTF8Encoding.cpp
+++ b/Foundation/src/UTF8Encoding.cpp
@@ -103,28 +103,55 @@ const TextEncoding::CharacterMap& UTF8Encoding::characterMap() const
 }
 // Returns true if the `length` bytes starting at `bytes` form a legal UTF-8
 // sequence, false otherwise. A null pointer, a zero length, or any length
 // outside 1..4 is rejected. The function does not check that `length`
 // matches the length implied by the lead byte; the caller is expected to
 // have derived `length` from the first byte.
 bool UTF8Encoding::isLegal(const unsigned char *bytes, int length)
 {
 	if (0 == bytes || 0 == length) return false;
    unsigned char a;
    // Start one past the last byte and validate backwards towards the lead byte.
    const unsigned char* srcptr = bytes + length;
    switch (length)
 	{
 		default: return false;
 		// The cases below intentionally fall through: a 4-byte sequence runs
 		// the checks for cases 4, 3, 2 and 1 in that order.
 		// Bytes 4 and 3 must be continuation bytes (0x80..0xBF).
 		case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
 		case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
 		// Byte 2: the upper bound is checked here; the lower bound depends
 		// on the lead byte and is checked in the nested switch.
 		case 2: if ((a = (*--srcptr)) > 0xBF) return false;
 			switch (*bytes) 
 			{
 				// The `break`s leave only this inner switch; control then
 				// continues into `case 1` of the outer switch.
 				// 0xE0 / 0xF0: a too-small second byte would be an overlong
 				// encoding of a code point that fits in fewer bytes.
 				case 0xE0: if (a < 0xA0) return false; break;
 				// 0xED: second byte above 0x9F would encode U+D800..U+DFFF
 				// (UTF-16 surrogates).
 				case 0xED: if (a > 0x9F) return false; break;
 				case 0xF0: if (a < 0x90) return false; break;
 				// 0xF4: second byte above 0x8F would encode a value beyond
 				// U+10FFFF.
 				case 0xF4: if (a > 0x8F) return false; break;
 				default:   if (a < 0x80) return false;
 			}
 		// Lead byte: 0x80..0xC1 is never a valid first byte (continuation
 		// bytes, and 0xC0/0xC1 which could only start overlong forms).
 		case 1: if (*bytes >= 0x80 && *bytes < 0xC2) return false;
    }
 	// Lead bytes above 0xF4 are rejected for every length.
 	if (*bytes > 0xF4) return false;
 	return true;
 }
 int UTF8Encoding::convert(const unsigned char* bytes) const
 {
 	int n = _charMap[*bytes];
 	int uc;
 	switch (n)
 	{
-	case -6:
+		case -6: case -5: case -1: return -1;
-		uc = *bytes & 0x01; break;
+
-	case -5:
+		case -4: case -3: case -2:
-		uc = *bytes & 0x03; break;
+			if (!isLegal(bytes, -n)) return -1;
-	case -4:
+			uc = *bytes & ((0x07 << (n + 4)) | 0x03); break;
-		uc = *bytes & 0x07; break;
+
-	case -3:
+		default: return n;
 		uc = *bytes & 0x0F; break;
 	case -2:
 		uc = *bytes & 0x1F; break;
 	default:
 		uc = n;
 	}
 	while (n++ < -1) 
 	{	
 		// TODO: check for malformed or overlong sequences
 		uc <<= 6;
 		uc |= (*++bytes & 0x3F);
 	}
@@ -134,6 +161,10 @@ int UTF8Encoding::convert(const unsigned char* bytes) const
 int UTF8Encoding::convert(int ch, unsigned char* bytes, int length) const
 {
 #ifdef _DEBUG
 	unsigned char* lb = bytes;
 #endif
 	if (ch <= 0x7F)
 	{
 		if (bytes && length >= 1)
@@ -147,6 +178,7 @@ int UTF8Encoding::convert(int ch, unsigned char* bytes, int length) const
 			*bytes++ = (unsigned char) ((ch >> 6) & 0x1F | 0xC0);
 			*bytes   = (unsigned char) ((ch & 0x3F) | 0x80);
 		}
 		poco_assert_dbg (isLegal(lb, 2));
 		return 2;
 	}
 	else if (ch <= 0xFFFF)
@@ -157,9 +189,10 @@ int UTF8Encoding::convert(int ch, unsigned char* bytes, int length) const
 			*bytes++ = (unsigned char) ((ch >> 6) & 0x3F | 0x80);
 			*bytes   = (unsigned char) ((ch & 0x3F) | 0x80);
 		}
 		poco_assert_dbg (isLegal(lb, 3));
 		return 3;
 	}
-	else if (ch <= 0x1FFFFF)
+	else if (ch <= 0x10FFFF)
 	{
 		if (bytes && length >= 4)
 		{
@@ -168,33 +201,9 @@ int UTF8Encoding::convert(int ch, unsigned char* bytes, int length) const
 			*bytes++ = (unsigned char) ((ch >> 6) & 0x3F | 0x80);
 			*bytes   = (unsigned char) ((ch & 0x3F) | 0x80);
 		}
 		poco_assert_dbg (isLegal(lb, 4));
 		return 4;
 	}
 	else if (ch <= 0x3FFFFFF)
 	{
 		if (bytes && length >= 5)
 		{
 			*bytes++ = (unsigned char) ((ch >> 24) & 0x03 | 0xF8);
 			*bytes++ = (unsigned char) ((ch >> 18) & 0x3F | 0x80);
 			*bytes++ = (unsigned char) ((ch >> 12) & 0x3F | 0x80);
 			*bytes++ = (unsigned char) ((ch >> 6) & 0x3F | 0x80);
 			*bytes   = (unsigned char) ((ch & 0x3F) | 0x80);
 		}
 		return 5;
 	}
 	else if (ch <= 0x7FFFFFFF)
 	{
 		if (bytes && length >= 6)
 		{
 			*bytes++ = (unsigned char) ((ch >> 30) & 0x01 | 0xFC);
 			*bytes++ = (unsigned char) ((ch >> 24) & 0x3F | 0x80);
 			*bytes++ = (unsigned char) ((ch >> 18) & 0x3F | 0x80);
 			*bytes++ = (unsigned char) ((ch >> 12) & 0x3F | 0x80);
 			*bytes++ = (unsigned char) ((ch >> 6) & 0x3F | 0x80);
 			*bytes   = (unsigned char) ((ch & 0x3F) | 0x80);
 		}
 		return 6;
 	}
 	else return 0;
 }
--- a/Foundation/testsuite/src/TextIteratorTest.cpp
+++ b/Foundation/testsuite/src/TextIteratorTest.cpp
@@ -166,27 +166,13 @@ void TextIteratorTest::testOneUTF8()
 	assert (*it++ == 0xabcde);
 	assert (it == end);
-	// 5 byte sequence
+	// 5 byte sequence - not supported
 	n = encoding.convert(0xabcdef, data, sizeof(data));
-	assert (n == 5);
+	assert (n == 0);
 	text.assign((char*) data, n);
 	it  = TextIterator(text, encoding);
 	end = TextIterator(text);
 	assert (it != end);
 	assert (*it++ == 0xabcdef);
 	assert (it == end);
-	// 6 byte sequence
+	// 6 byte sequence - not supported
 	n = encoding.convert(0xfabcdef, data, sizeof(data));
-	assert (n == 6);
+	assert (n == 0);
 	text.assign((char*) data, n);
 	it  = TextIterator(text, encoding);
 	end = TextIterator(text);
 	assert (it != end);
 	assert (*it++ == 0xfabcdef);
 	assert (it == end);
 }
--- a/Foundation/testsuite/src/UTF8StringTest.cpp
+++ b/Foundation/testsuite/src/UTF8StringTest.cpp
@@ -76,8 +76,8 @@ void UTF8StringTest::testCompare()
 	assert (UTF8::icompare(a5, b5) < 0);
-	std::string a6("\303\274\303\266\303\244"); // "u"a"o
+	std::string a6("\303\274\303\266\303\244"); // "u"o"a
-	std::string b6("\303\234\303\226\303\204"); // "U"A"O
+	std::string b6("\303\234\303\226\303\204"); // "U"O"A
 	assert (UTF8::icompare(a6, b6) == 0);
 }
@@ -93,11 +93,15 @@ void UTF8StringTest::testTransform()
 	UTF8::toUpperInPlace(s2);
 	assert (s2 == "ABCDE123");
-	std::string s3("\303\274\303\266\303\244"); // "u"a"o
+	std::string s3("\303\274\303\266\303\244"); // "u"o"a
 	UTF8::toUpperInPlace(s3);	
-	assert (s3 == "\303\234\303\226\303\204"); // "U"A"O
+	assert (s3 == "\303\234\303\226\303\204"); // "U"O"A
 	UTF8::toLowerInPlace(s3);
-	assert (s3 == "\303\274\303\266\303\244"); // "U"A"O
+	assert (s3 == "\303\274\303\266\303\244"); // "u"o"a
 	// a mix of invalid sequences
 	std::string str = "\xC2\xE5\xF0\xF8\xE8\xED\xFB+-++";
 	assert ("???" == UTF8::toLower(str));
 }