- [SF 2513643] Seg fault in Poco::UTF8::toLower on 64-bit Linux

- removed support for 5- and 6-byte sequences
- fixed error counting in StreamConverterBuf::readFromDevice()
- added std::dec to poco_stdout_dbg and poco_stderr_dbg macros
2025-10-15 07:14:46 +02:00 · 2009-04-01 02:33:51 +00:00
parent 7007646ea2
commit d77ef57588
6 changed files with 78 additions and 65 deletions
--- a/Foundation/include/Poco/Bugcheck.h
+++ b/Foundation/include/Poco/Bugcheck.h
@@ -129,7 +129,7 @@ protected:

 #if defined(_DEBUG)
 #	define poco_stdout_dbg(outstr) \
-		std::cout << __FILE__ << '(' << __LINE__ << "):" << outstr << std::endl;
+		std::cout << __FILE__ << '(' << std::dec << __LINE__ << "):" << outstr << std::endl;
 #else
 #	define poco_stdout_dbg(outstr)
 #endif
@@ -137,7 +137,7 @@ protected:

 #if defined(_DEBUG)
 #	define poco_stderr_dbg(outstr) \
-		std::cerr << __FILE__ << '(' << __LINE__ << "):" << outstr << std::endl;
+		std::cerr << __FILE__ << '(' << std::dec << __LINE__ << "):" << outstr << std::endl;
 #else
 #	define poco_stderr_dbg(outstr)
 #endif
--- a/Foundation/include/Poco/UTF8Encoding.h
+++ b/Foundation/include/Poco/UTF8Encoding.h
@@ -59,6 +59,16 @@ public:
 	int convert(const unsigned char* bytes) const;
 	int convert(int ch, unsigned char* bytes, int length) const;
 	
+	static bool isLegal(const unsigned char *bytes, int length);
+		/// Utility routine to tell whether a sequence of bytes is legal UTF-8.
+		/// This must be called with the length pre-determined by the first byte;
+		/// the caller must ensure that at least length bytes are readable.
+		/// Returns false for a null pointer, a zero length or a length > 4.
+		/// The Unicode definition of UTF-8 goes up to 4-byte sequences.
+		/// 
+		/// Adapted from ftp://ftp.unicode.org/Public/PROGRAMS/CVTUTF/ConvertUTF.c
+		/// Copyright 2001-2004 Unicode, Inc.
+
 private:
 	static const char* _names[];
 	static const CharacterMap _charMap;
--- a/Foundation/src/StreamConverter.cpp
+++ b/Foundation/src/StreamConverter.cpp
@@ -101,7 +101,11 @@ int StreamConverterBuf::readFromDevice()
 		if (_pIstr->gcount() == -n - 1)
 		{
 			uc = _inEncoding.convert(_buffer);
-			if (uc == -1) uc = _defaultChar;
+			if (uc == -1)
+			{
+				uc = _defaultChar;
+				++_errors;
+			}
 		}
 		else
 		{
--- a/Foundation/src/UTF8Encoding.cpp
+++ b/Foundation/src/UTF8Encoding.cpp
@@ -103,28 +103,55 @@ const TextEncoding::CharacterMap& UTF8Encoding::characterMap() const
 }


+bool UTF8Encoding::isLegal(const unsigned char *bytes, int length)
+{
+	if (0 == bytes || 0 == length) return false;
+
+    unsigned char a;
+    const unsigned char* srcptr = bytes + length;
+    switch (length)
+	{
+		default: return false;
+		case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
+		case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
+		case 2: if ((a = (*--srcptr)) > 0xBF) return false;
+
+			switch (*bytes) 
+			{
+				case 0xE0: if (a < 0xA0) return false; break;
+				case 0xED: if (a > 0x9F) return false; break;
+				case 0xF0: if (a < 0x90) return false; break;
+				case 0xF4: if (a > 0x8F) return false; break;
+				default:   if (a < 0x80) return false;
+			}
+			
+		case 1: if (*bytes >= 0x80 && *bytes < 0xC2) return false;
+    }
+
+	if (*bytes > 0xF4) return false;
+
+	return true;
+}
+
+
 int UTF8Encoding::convert(const unsigned char* bytes) const
 {
 	int n = _charMap[*bytes];
 	int uc;
+	
 	switch (n)
 	{
-	case -6:
-		uc = *bytes & 0x01; break;
-	case -5:
-		uc = *bytes & 0x03; break;
-	case -4:
-		uc = *bytes & 0x07; break;
-	case -3:
-		uc = *bytes & 0x0F; break;
-	case -2:
-		uc = *bytes & 0x1F; break;
-	default:
-		uc = n;
+		case -6: case -5: case -1: return -1;
+
+		case -4: case -3: case -2:
+			if (!isLegal(bytes, -n)) return -1;
+			uc = *bytes & ((0x07 << (n + 4)) | 0x03); break;
+
+		default: return n;
 	}
+
 	while (n++ < -1) 
 	{	
-		// TODO: check for malformed or overlong sequences
 		uc <<= 6;
 		uc |= (*++bytes & 0x3F);
 	}
@@ -134,6 +161,10 @@ int UTF8Encoding::convert(const unsigned char* bytes) const

 int UTF8Encoding::convert(int ch, unsigned char* bytes, int length) const
 {
+#ifdef _DEBUG
+	unsigned char* lb = bytes;
+#endif
+
 	if (ch <= 0x7F)
 	{
 		if (bytes && length >= 1)
@@ -147,6 +178,7 @@ int UTF8Encoding::convert(int ch, unsigned char* bytes, int length) const
 			*bytes++ = (unsigned char) ((ch >> 6) & 0x1F | 0xC0);
 			*bytes   = (unsigned char) ((ch & 0x3F) | 0x80);
 		}
+		poco_assert_dbg (isLegal(lb, 2));
 		return 2;
 	}
 	else if (ch <= 0xFFFF)
@@ -157,9 +189,10 @@ int UTF8Encoding::convert(int ch, unsigned char* bytes, int length) const
 			*bytes++ = (unsigned char) ((ch >> 6) & 0x3F | 0x80);
 			*bytes   = (unsigned char) ((ch & 0x3F) | 0x80);
 		}
+		poco_assert_dbg (isLegal(lb, 3));
 		return 3;
 	}
-	else if (ch <= 0x1FFFFF)
+	else if (ch <= 0x10FFFF)
 	{
 		if (bytes && length >= 4)
 		{
@@ -168,33 +201,9 @@ int UTF8Encoding::convert(int ch, unsigned char* bytes, int length) const
 			*bytes++ = (unsigned char) ((ch >> 6) & 0x3F | 0x80);
 			*bytes   = (unsigned char) ((ch & 0x3F) | 0x80);
 		}
+		poco_assert_dbg (isLegal(lb, 4));
 		return 4;
 	}
-	else if (ch <= 0x3FFFFFF)
-	{
-		if (bytes && length >= 5)
-		{
-			*bytes++ = (unsigned char) ((ch >> 24) & 0x03 | 0xF8);
-			*bytes++ = (unsigned char) ((ch >> 18) & 0x3F | 0x80);
-			*bytes++ = (unsigned char) ((ch >> 12) & 0x3F | 0x80);
-			*bytes++ = (unsigned char) ((ch >> 6) & 0x3F | 0x80);
-			*bytes   = (unsigned char) ((ch & 0x3F) | 0x80);
-		}
-		return 5;
-	}
-	else if (ch <= 0x7FFFFFFF)
-	{
-		if (bytes && length >= 6)
-		{
-			*bytes++ = (unsigned char) ((ch >> 30) & 0x01 | 0xFC);
-			*bytes++ = (unsigned char) ((ch >> 24) & 0x3F | 0x80);
-			*bytes++ = (unsigned char) ((ch >> 18) & 0x3F | 0x80);
-			*bytes++ = (unsigned char) ((ch >> 12) & 0x3F | 0x80);
-			*bytes++ = (unsigned char) ((ch >> 6) & 0x3F | 0x80);
-			*bytes   = (unsigned char) ((ch & 0x3F) | 0x80);
-		}
-		return 6;
-	}
 	else return 0;
 }

--- a/Foundation/testsuite/src/TextIteratorTest.cpp
+++ b/Foundation/testsuite/src/TextIteratorTest.cpp
@@ -166,27 +166,13 @@ void TextIteratorTest::testOneUTF8()
 	assert (*it++ == 0xabcde);
 	assert (it == end);
 	
-	// 5 byte sequence
+	// 5 byte sequence - not supported
 	n = encoding.convert(0xabcdef, data, sizeof(data));
-	assert (n == 5);
-	text.assign((char*) data, n);
-	it  = TextIterator(text, encoding);
-	end = TextIterator(text);
+	assert (n == 0);

-	assert (it != end);
-	assert (*it++ == 0xabcdef);
-	assert (it == end);
-
-	// 6 byte sequence
+	// 6 byte sequence - not supported
 	n = encoding.convert(0xfabcdef, data, sizeof(data));
-	assert (n == 6);
-	text.assign((char*) data, n);
-	it  = TextIterator(text, encoding);
-	end = TextIterator(text);
-	
-	assert (it != end);
-	assert (*it++ == 0xfabcdef);
-	assert (it == end);
+	assert (n == 0);
 }


--- a/Foundation/testsuite/src/UTF8StringTest.cpp
+++ b/Foundation/testsuite/src/UTF8StringTest.cpp
@@ -76,8 +76,8 @@ void UTF8StringTest::testCompare()
 	
 	assert (UTF8::icompare(a5, b5) < 0);

-	std::string a6("\303\274\303\266\303\244"); // "u"a"o
-	std::string b6("\303\234\303\226\303\204"); // "U"A"O
+	std::string a6("\303\274\303\266\303\244"); // "u"o"a
+	std::string b6("\303\234\303\226\303\204"); // "U"O"A
 	
 	assert (UTF8::icompare(a6, b6) == 0);
 }
@@ -93,11 +93,15 @@ void UTF8StringTest::testTransform()
 	UTF8::toUpperInPlace(s2);
 	assert (s2 == "ABCDE123");

-	std::string s3("\303\274\303\266\303\244"); // "u"a"o
+	std::string s3("\303\274\303\266\303\244"); // "u"o"a
 	UTF8::toUpperInPlace(s3);	
-	assert (s3 == "\303\234\303\226\303\204"); // "U"A"O
+	assert (s3 == "\303\234\303\226\303\204"); // "U"O"A
 	UTF8::toLowerInPlace(s3);
-	assert (s3 == "\303\274\303\266\303\244"); // "U"A"O
+	assert (s3 == "\303\274\303\266\303\244"); // "u"o"a
+
+	// a mix of invalid sequences
+	std::string str = "\xC2\xE5\xF0\xF8\xE8\xED\xFB+-++";
+	assert ("???" == UTF8::toLower(str));
 }