Reduce the maximum supported character byte sequence length from 6 to 4 bytes, in accordance with UTF-8

2025-03-03 12:58:03 +01:00 · 2018-02-13 21:46:51 +01:00 · 2018-02-13 21:46:51 +01:00 · fab94585bf
commit fab94585bf
parent af09a02a34
3 changed files with 6 additions and 8 deletions
--- a/Foundation/include/Poco/TextEncoding.h
+++ b/Foundation/include/Poco/TextEncoding.h
@@ -48,7 +48,7 @@ public:
 	
 	enum
 	{
-		MAX_SEQUENCE_LENGTH = 6 /// The maximum character byte sequence length supported.
+		MAX_SEQUENCE_LENGTH = 4 /// The maximum character byte sequence length supported.
 	};
 	
 	typedef int CharacterMap[256];
@@ -58,7 +58,7 @@ public:
 		/// If map[b] is -1, then the byte sequence is malformed.
 		/// If map[b] is -n, where n >= 2, then b is the first byte of an n-byte
 		/// sequence that encodes a single Unicode scalar value. Byte sequences up
-		/// to 6 bytes in length are supported.
+		/// to 4 bytes in length are supported.

 	virtual ~TextEncoding();
 		/// Destroys the encoding.
@@ -89,6 +89,7 @@ public:
 		///
 		/// The convert function must return the Unicode scalar value
 		/// represented by this byte sequence or -1 if the byte sequence is malformed.
+		///
 		/// The default implementation returns (int) bytes[0].

 	virtual	int queryConvert(const unsigned char* bytes, int length) const;
@@ -106,6 +107,7 @@ public:
 		/// Then a second call with length == 2 might return -4
 		/// Eventually, the third call with length == 4 should return either a
 		/// Unicode scalar value, or -1 if the byte sequence is malformed.
+		///
 		/// The default implementation returns (int) bytes[0].

 	virtual int sequenceLength(const unsigned char* bytes, int length) const;
--- a/Foundation/src/UTF8Encoding.cpp
+++ b/Foundation/src/UTF8Encoding.cpp
@@ -44,7 +44,7 @@ const TextEncoding::CharacterMap UTF8Encoding::_charMap =
 	/* c0 */	  -2,   -2,   -2,   -2,   -2,   -2,   -2,   -2,   -2,   -2,   -2,   -2,   -2,   -2,   -2,   -2,
 	/* d0 */	  -2,   -2,   -2,   -2,   -2,   -2,   -2,   -2,   -2,   -2,   -2,   -2,   -2,   -2,   -2,   -2,
 	/* e0 */	  -3,   -3,   -3,   -3,   -3,   -3,   -3,   -3,   -3,   -3,   -3,   -3,   -3,   -3,   -3,   -3,
-	/* f0 */	  -4,   -4,   -4,   -4,   -4,   -4,   -4,   -4,   -5,   -5,   -5,   -5,   -6,   -6,   -1,   -1,
+	/* f0 */	  -4,   -4,   -4,   -4,   -4,   -4,   -4,   -4,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
 };


@@ -88,8 +88,6 @@ int UTF8Encoding::convert(const unsigned char* bytes) const
 	
 	switch (n)
 	{
-	case -6:
-	case -5:
 	case -1:
 		return -1;
 	case -4:
@@ -165,8 +163,6 @@ int UTF8Encoding::queryConvert(const unsigned char* bytes, int length) const
 	{
 		switch (n)
 		{
-		case -6:
-		case -5:
 		case -1:
 			return -1;
 		case -4:
--- a/Foundation/testsuite/src/UTF8StringTest.cpp
+++ b/Foundation/testsuite/src/UTF8StringTest.cpp
@@ -79,7 +79,7 @@ void UTF8StringTest::testTransform()

 	// a mix of invalid sequences
 	std::string str = "\xC2\xE5\xF0\xF8\xE8\xED\xFB+-++";
-	assert ("???" == UTF8::toLower(str));
+	assert ("???+-++" == UTF8::toLower(str));
 }