trunk/branch integration: TextEncoding update

2025-02-20 06:17:15 +01:00 · 2011-08-22 18:22:56 +00:00 · 2011-08-22 18:22:56 +00:00 · e066d033a2
commit e066d033a2
parent d35ecf85a2
8 changed files with 391 additions and 80 deletions
--- a/Foundation/src/ASCIIEncoding.cpp
+++ b/Foundation/src/ASCIIEncoding.cpp
@ -104,7 +104,7 @@ const TextEncoding::CharacterMap& ASCIIEncoding::characterMap() const

 int ASCIIEncoding::convert(const unsigned char* bytes) const
 {
-	return *bytes;
+	return _charMap[*bytes];
 }


@ -120,4 +120,19 @@ int ASCIIEncoding::convert(int ch, unsigned char* bytes, int length) const
 }


+int ASCIIEncoding::queryConvert(const unsigned char* bytes, int length) const
+{
+	// Without at least one input byte there is nothing to look up;
+	// that case is reported as -1.
+	if (length < 1)
+		return -1;
+
+	// Otherwise the result is the character map entry for the first byte.
+	return _charMap[*bytes];
+}
+
+
+int ASCIIEncoding::sequenceLength(const unsigned char* bytes, int length) const
+{
+	// Neither argument is inspected: the reported length is always one.
+	const int fixedLength = 1;
+	return fixedLength;
+}
+
+
 } // namespace Poco
--- a/Foundation/src/Latin1Encoding.cpp
+++ b/Foundation/src/Latin1Encoding.cpp
@ -122,4 +122,19 @@ int Latin1Encoding::convert(int ch, unsigned char* bytes, int length) const
 }


+int Latin1Encoding::queryConvert(const unsigned char* bytes, int length) const
+{
+	// No input byte available: report -1.
+	if (length < 1)
+		return -1;
+
+	// The first byte's value is returned as-is.
+	return *bytes;
+}
+
+
+int Latin1Encoding::sequenceLength(const unsigned char* bytes, int length) const
+{
+	// Neither argument is inspected: the reported length is always one.
+	const int fixedLength = 1;
+	return fixedLength;
+}
+
+
 } // namespace Poco
--- a/Foundation/src/Latin9Encoding.cpp
+++ b/Foundation/src/Latin9Encoding.cpp
@ -106,13 +106,13 @@ const TextEncoding::CharacterMap& Latin9Encoding::characterMap() const

 int Latin9Encoding::convert(const unsigned char* bytes) const
 {
-	return *bytes;
+	return _charMap[*bytes];
 }


 int Latin9Encoding::convert(int ch, unsigned char* bytes, int length) const
 {
-	if (ch >= 0 && ch <= 255)
+	if (ch >= 0 && ch <= 255 && _charMap[ch] == ch)
 	{
 		if (bytes && length >= 1)
 			*bytes = ch;
@ -133,4 +133,19 @@ int Latin9Encoding::convert(int ch, unsigned char* bytes, int length) const
 }


+int Latin9Encoding::queryConvert(const unsigned char* bytes, int length) const
+{
+	// Without at least one input byte there is nothing to look up;
+	// that case is reported as -1.
+	if (length < 1)
+		return -1;
+
+	// Otherwise the result is the character map entry for the first byte.
+	return _charMap[*bytes];
+}
+
+
+int Latin9Encoding::sequenceLength(const unsigned char* bytes, int length) const
+{
+	// Neither argument is inspected: the reported length is always one.
+	const int fixedLength = 1;
+	return fixedLength;
+}
+
+
 } // namespace Poco
--- a/Foundation/src/TextEncoding.cpp
+++ b/Foundation/src/TextEncoding.cpp
@ -151,7 +151,7 @@ TextEncoding::~TextEncoding()

 int TextEncoding::convert(const unsigned char* bytes) const
 {
-	return (int) *bytes;
+	return static_cast<int>(*bytes);
 }


@ -161,6 +161,18 @@ int TextEncoding::convert(int ch, unsigned char* bytes, int length) const
 }


+int TextEncoding::queryConvert(const unsigned char* bytes, int length) const
+{
+	// Default implementation: the first byte's value is the result.
+	// As in the single-byte encodings, -1 is reported when no input byte
+	// is available, so the pointer is never dereferenced for an empty range.
+	if (length < 1)
+		return -1;
+
+	return static_cast<int>(*bytes);
+}
+
+
+int TextEncoding::sequenceLength(const unsigned char* bytes, int length) const
+{
+	// Default implementation; neither argument is inspected and the
+	// reported length is always one.
+	const int fixedLength = 1;
+	return fixedLength;
+}
+
+
 TextEncoding& TextEncoding::byName(const std::string& encodingName)
 {
 	TextEncoding* pEncoding = manager().find(encodingName);
@ -209,9 +221,14 @@ TextEncoding& TextEncoding::global()
 }


-TextEncodingManager& TextEncoding::manager()
+namespace
 {
 	static SingletonHolder<TextEncodingManager> sh;
+}
+
+
+TextEncodingManager& TextEncoding::manager()
+{
 	return *sh.get();
 }

--- a/Foundation/src/TextIterator.cpp
+++ b/Foundation/src/TextIterator.cpp
@ -55,6 +55,7 @@ TextIterator::TextIterator(const std::string& str, const TextEncoding& encoding)
 {
 }

+
 TextIterator::TextIterator(const std::string::const_iterator& begin, const std::string::const_iterator& end, const TextEncoding& encoding):
 	_pEncoding(&encoding),
 	_it(begin),
@ -116,22 +117,36 @@ int TextIterator::operator * () const
 {
 	poco_check_ptr (_pEncoding);
 	poco_assert (_it != _end);
+	std::string::const_iterator it = _it;
 	
-	unsigned char c = (unsigned char) *_it;
-	int n = _pEncoding->characterMap()[c];
-	if (n >= -1)
-		return n;
+	unsigned char buffer[TextEncoding::MAX_SEQUENCE_LENGTH];
+	unsigned char* p = buffer;
+
+	if (it != _end)
+		*p++ = *it++;
+	else
+		*p++ = 0;
+
+	int read = 1;
+	int n = _pEncoding->queryConvert(buffer, 1);
+
+	while (-1 > n && (_end - it) >= -n - read)
+	{
+		while (read < -n && it != _end)
+		{ 
+			*p++ = *it++; 
+			read++; 
+		}
+		n = _pEncoding->queryConvert(buffer, read);
+	}
+
+	if (-1 > n)
+	{
+		return -1;
+	}
 	else
 	{
-		poco_assert_dbg (n >= -TextEncoding::MAX_SEQUENCE_LENGTH);
-		unsigned char buffer[TextEncoding::MAX_SEQUENCE_LENGTH];
-		unsigned char* p = buffer;
-		std::string::const_iterator it = _it;
-		while (n < 0 && it != _end) { *p++ = *it++; ++n; }
-		if (n == 0)
-			return _pEncoding->convert(buffer);
-		else
-			return -1;
+		return n;
 	}
 }

@ -141,12 +156,31 @@ TextIterator& TextIterator::operator ++ ()
 	poco_check_ptr (_pEncoding);
 	poco_assert (_it != _end);
 	
-	unsigned char c = (unsigned char) *_it;
-	int n = _pEncoding->characterMap()[c];
-	if (n >= -1)
-		++_it;
+	unsigned char buffer[TextEncoding::MAX_SEQUENCE_LENGTH];
+	unsigned char* p = buffer;
+
+	if (_it != _end)
+		*p++ = *_it++;
 	else
-		while (n < 0 && _it != _end) { ++_it; ++n; }
+		*p++ = 0;
+
+	int read = 1;
+	int n = _pEncoding->sequenceLength(buffer, 1);
+
+	while (-1 > n && (_end - _it) >= -n - read)
+	{
+		while (read < -n && _it != _end)
+		{ 
+			*p++ = *_it++; 
+			read++; 
+		}
+		n = _pEncoding->sequenceLength(buffer, read);
+	}
+	while (read < n && _it != _end)
+	{ 
+		_it++; 
+		read++; 
+	}

 	return *this;
 }
--- a/Foundation/src/UTF16Encoding.cpp
+++ b/Foundation/src/UTF16Encoding.cpp
@ -143,7 +143,36 @@ int UTF16Encoding::convert(const unsigned char* bytes) const
 	unsigned char* p = (unsigned char*) &uc;
 	*p++ = *bytes++;
 	*p++ = *bytes++;
-	return _flipBytes ? ByteOrder::flipBytes(uc) : uc;
+
+	if (_flipBytes)
+	{
+		ByteOrder::flipBytes(uc);
+	}
+
+	if (uc >= 0xd800 && uc < 0xdc00)
+	{
+		UInt16 uc2;
+		p = (unsigned char*) &uc2;
+		*p++ = *bytes++;
+		*p++ = *bytes++;
+
+		if (_flipBytes)
+		{
+			ByteOrder::flipBytes(uc2);
+		}
+		if (uc2 >= 0xdc00 && uc2 < 0xe000)
+		{
+			return ((uc & 0x3ff) << 10) + (uc2 & 0x3ff) + 0x10000;
+		}
+		else
+		{
+			return -1;
+		}
+	}
+	else
+	{
+		return uc;
+	}
 }


@ -184,4 +213,83 @@ int UTF16Encoding::convert(int ch, unsigned char* bytes, int length) const
 }


+int UTF16Encoding::queryConvert(const unsigned char* bytes, int length) const
+{
+	// Result, as produced below:
+	//   >= 0   decoded code point
+	//   -1     malformed sequence
+	//   -2/-4  more input needed; the magnitude is the byte count required
+	int ret = -2;
+
+	if (length >= 2)
+	{
+		UInt16 uc;
+		unsigned char* p = (unsigned char*) &uc;
+		*p++ = *bytes++;
+		*p++ = *bytes++;
+		// flipBytes() hands back the swapped value, so it has to be stored
+		// in the code unit itself (not applied to ret and discarded).
+		if (_flipBytes)
+			uc = ByteOrder::flipBytes(uc);
+		if (uc >= 0xd800 && uc < 0xdc00)
+		{
+			// First unit of a surrogate pair: a second 16-bit unit is required.
+			if (length >= 4)
+			{
+				UInt16 uc2;
+				p = (unsigned char*) &uc2;
+				*p++ = *bytes++;
+				*p++ = *bytes++;
+				if (_flipBytes)
+					uc2 = ByteOrder::flipBytes(uc2);
+				// The second unit (uc2) must lie in [0xdc00, 0xe000).
+				if (uc2 >= 0xdc00 && uc2 < 0xe000)
+				{
+					ret = ((uc & 0x3ff) << 10) + (uc2 & 0x3ff) + 0x10000;
+				}
+				else
+				{
+					ret = -1;	// Malformed sequence
+				}
+			}
+			else
+			{
+				ret = -4;	// surrogate pair, four bytes needed
+			}
+		}
+		else
+		{
+			ret = uc;
+		}
+	}
+
+	return ret;
+}
+
+
+int UTF16Encoding::sequenceLength(const unsigned char* bytes, int length) const
+{
+	// Returns 4 for the start of a surrogate pair, 2 for any other unit,
+	// and -2 when too few bytes were supplied to decide.
+	if (_flipBytes)
+	{
+		// NOTE(review): this branch decides from the first byte alone,
+		// which presumably assumes it is the high-order byte of the
+		// code unit -- confirm for all host byte orders.
+		if (length < 1)
+			return -2;
+
+		const unsigned char lead = *bytes;
+		return (lead >= 0xd8 && lead < 0xdc) ? 4 : 2;
+	}
+
+	if (length < 2)
+		return -2;
+
+	// Assemble the 16-bit unit from the first two bytes, in memory order.
+	UInt16 unit;
+	unsigned char* dst = (unsigned char*) &unit;
+	*dst++ = *bytes++;
+	*dst++ = *bytes++;
+	return (unit >= 0xd800 && unit < 0xdc00) ? 4 : 2;
+}
+
+
 } // namespace Poco
--- a/Foundation/src/UTF8Encoding.cpp
+++ b/Foundation/src/UTF8Encoding.cpp
@ -103,37 +103,6 @@ const TextEncoding::CharacterMap& UTF8Encoding::characterMap() const
 }


-bool UTF8Encoding::isLegal(const unsigned char *bytes, int length)
-{
-	if (0 == bytes || 0 == length) return false;
-
-    unsigned char a;
-    const unsigned char* srcptr = bytes + length;
-    switch (length)
-	{
-		default: return false;
-		case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
-		case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
-		case 2: if ((a = (*--srcptr)) > 0xBF) return false;
-
-			switch (*bytes) 
-			{
-				case 0xE0: if (a < 0xA0) return false; break;
-				case 0xED: if (a > 0x9F) return false; break;
-				case 0xF0: if (a < 0x90) return false; break;
-				case 0xF4: if (a > 0x8F) return false; break;
-				default:   if (a < 0x80) return false;
-			}
-			
-		case 1: if (*bytes >= 0x80 && *bytes < 0xC2) return false;
-    }
-
-	if (*bytes > 0xF4) return false;
-
-	return true;
-}
-
-
 int UTF8Encoding::convert(const unsigned char* bytes) const
 {
 	int n = _charMap[*bytes];
@ -141,13 +110,18 @@ int UTF8Encoding::convert(const unsigned char* bytes) const
 	
 	switch (n)
 	{
-		case -6: case -5: case -1: return -1;
-
-		case -4: case -3: case -2:
-			if (!isLegal(bytes, -n)) return -1;
-			uc = *bytes & ((0x07 << (n + 4)) | 0x03); break;
-
-		default: return n;
+	case -6:
+	case -5:
+	case -1:
+		return -1;
+	case -4: 
+	case -3: 
+	case -2:
+		if (!isLegal(bytes, -n)) return -1;
+		uc = *bytes & ((0x07 << (n + 4)) | 0x03);
+		break;
+	default:
+		return n;
 	}

 	while (n++ < -1) 
@ -161,10 +135,6 @@ int UTF8Encoding::convert(const unsigned char* bytes) const

 int UTF8Encoding::convert(int ch, unsigned char* bytes, int length) const
 {
-#ifdef _DEBUG
-	unsigned char* lb = bytes;
-#endif
-
 	if (ch <= 0x7F)
 	{
 		if (bytes && length >= 1)
@ -175,37 +145,128 @@ int UTF8Encoding::convert(int ch, unsigned char* bytes, int length) const
 	{
 		if (bytes && length >= 2)
 		{
-			*bytes++ = (unsigned char) ((ch >> 6) & 0x1F | 0xC0);
+			*bytes++ = (unsigned char) (((ch >> 6) & 0x1F) | 0xC0);
 			*bytes   = (unsigned char) ((ch & 0x3F) | 0x80);
 		}
-		poco_assert_dbg (isLegal(lb, 2));
 		return 2;
 	}
 	else if (ch <= 0xFFFF)
 	{
 		if (bytes && length >= 3)
 		{
-			*bytes++ = (unsigned char) ((ch >> 12) & 0x0F | 0xE0);
-			*bytes++ = (unsigned char) ((ch >> 6) & 0x3F | 0x80);
+			*bytes++ = (unsigned char) (((ch >> 12) & 0x0F) | 0xE0);
+			*bytes++ = (unsigned char) (((ch >> 6) & 0x3F) | 0x80);
 			*bytes   = (unsigned char) ((ch & 0x3F) | 0x80);
 		}
-		poco_assert_dbg (isLegal(lb, 3));
 		return 3;
 	}
 	else if (ch <= 0x10FFFF)
 	{
 		if (bytes && length >= 4)
 		{
-			*bytes++ = (unsigned char) ((ch >> 18) & 0x07 | 0xF0);
-			*bytes++ = (unsigned char) ((ch >> 12) & 0x3F | 0x80);
-			*bytes++ = (unsigned char) ((ch >> 6) & 0x3F | 0x80);
+			*bytes++ = (unsigned char) (((ch >> 18) & 0x07) | 0xF0);
+			*bytes++ = (unsigned char) (((ch >> 12) & 0x3F) | 0x80);
+			*bytes++ = (unsigned char) (((ch >> 6) & 0x3F) | 0x80);
 			*bytes   = (unsigned char) ((ch & 0x3F) | 0x80);
 		}
-		poco_assert_dbg (isLegal(lb, 4));
 		return 4;
 	}
 	else return 0;
 }


+int UTF8Encoding::queryConvert(const unsigned char* bytes, int length) const
+{
+	const int entry = _charMap[*bytes];
+
+	// A negative entry whose magnitude exceeds the supplied length is
+	// handed back unchanged.
+	if (-entry > length)
+		return entry;
+
+	// Entries -1, -5 and -6 are rejected outright.
+	if (entry == -1 || entry == -5 || entry == -6)
+		return -1;
+
+	// Anything outside -4..-2 is returned as-is.
+	if (entry < -4 || entry > -2)
+		return entry;
+
+	// Entries -2, -3 and -4: validate a sequence of -entry bytes, then decode.
+	if (!isLegal(bytes, -entry)) return -1;
+
+	int codePoint = *bytes & ((0x07 << (entry + 4)) | 0x03);
+
+	// Fold in the low six bits of each of the remaining (-entry - 1) bytes.
+	for (int remaining = -entry - 1; remaining > 0; --remaining)
+	{
+		codePoint <<= 6;
+		codePoint |= (*++bytes & 0x3F);
+	}
+	return codePoint;
+}
+
+
+int UTF8Encoding::sequenceLength(const unsigned char* bytes, int length) const
+{
+	// No input byte available: report -1.
+	if (length < 1)
+		return -1;
+
+	// Non-negative map entries yield a length of one; negative entries
+	// yield their magnitude.
+	const int entry = _charMap[*bytes];
+	return (entry >= 0) ? 1 : -entry;
+}
+
+
+bool UTF8Encoding::isLegal(const unsigned char *bytes, int length)
+{
+	// Checks the `length` bytes starting at `bytes` and returns true only
+	// if every test below passes. A null pointer, a zero length, or a
+	// length outside 1..4 yields false.
+	//
+	// Note: The following is loosely based on the isLegalUTF8 function
+	// from ftp://ftp.unicode.org/Public/PROGRAMS/CVTUTF/ConvertUTF.c
+	// Excuse the ugliness...
+	
+	if (0 == bytes || 0 == length) return false;
+
+    unsigned char a;
+    // srcptr starts one past the last byte and is walked backwards, so the
+    // trailing bytes are examined first and the byte at index 1 last.
+    const unsigned char* srcptr = bytes + length;
+    switch (length)
+	{
+	default:
+		return false;
+		// Everything else falls through when true.
+	case 4:
+		// Byte checked here must be in 0x80..0xBF.
+		if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
+	case 3: 
+		// Byte checked here must be in 0x80..0xBF.
+		if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
+	case 2:
+		// Byte at index 1: upper bound here, lower bound depends on the
+		// first byte (inner switch below).
+		if ((a = (*--srcptr)) > 0xBF) return false;
+		switch (*bytes) 
+		{
+		case 0xE0:
+			if (a < 0xA0) return false; 
+			break;
+		case 0xED:
+			if (a > 0x9F) return false; 
+			break;
+		case 0xF0:
+			if (a < 0x90) return false; 
+			break;
+		case 0xF4:
+			if (a > 0x8F) return false; 
+			break;
+		default:
+			if (a < 0x80) return false;
+		}
+	case 1:
+		// First byte must not be in 0x80..0xC1.
+		if (*bytes >= 0x80 && *bytes < 0xC2) return false;
+    }
+	// Finally, the first byte must not exceed 0xF4.
+	return *bytes <= 0xF4;
+}
+
+
 } // namespace Poco
--- a/Foundation/src/Windows1252Encoding.cpp
+++ b/Foundation/src/Windows1252Encoding.cpp
@ -1,7 +1,7 @@
 //
 // Windows1252Encoding.cpp
 //
-// $Id: //poco/Main/Foundation/src/Windows1252Encoding.cpp#7 $
+// $Id: //poco/1.4/Foundation/src/Windows1252Encoding.cpp#1 $
 //
 // Library: Foundation
 // Package: Text
@ -36,6 +36,7 @@

 #include "Poco/Windows1252Encoding.h"
 #include "Poco/String.h"
+#include <map>


 namespace Poco {
@ -106,19 +107,64 @@ const TextEncoding::CharacterMap& Windows1252Encoding::characterMap() const

 int Windows1252Encoding::convert(const unsigned char* bytes) const
 {
-	return *bytes;
+	return _charMap[*bytes];
 }


 int Windows1252Encoding::convert(int ch, unsigned char* bytes, int length) const
 {
-	if (ch >= 0 && ch <= 255)
+	if (ch >= 0 && ch <= 255 && _charMap[ch] == ch)
 	{
 		if (bytes && length >= 1)
-			*bytes = (unsigned char) ch;
+			*bytes = ch;
 		return 1;
 	}
-	else return 0;
+	else switch (ch)
+	{
+	case 0x20ac: if (bytes && length >= 1) *bytes = 0x80; return 1;
+	case 0x201a: if (bytes && length >= 1) *bytes = 0x82; return 1;
+	case 0x0192: if (bytes && length >= 1) *bytes = 0x83; return 1;
+	case 0x201e: if (bytes && length >= 1) *bytes = 0x84; return 1;
+	case 0x2026: if (bytes && length >= 1) *bytes = 0x85; return 1;
+	case 0x2020: if (bytes && length >= 1) *bytes = 0x86; return 1;
+	case 0x2021: if (bytes && length >= 1) *bytes = 0x87; return 1;
+	case 0x02c6: if (bytes && length >= 1) *bytes = 0x88; return 1;
+	case 0x2030: if (bytes && length >= 1) *bytes = 0x89; return 1;
+	case 0x0160: if (bytes && length >= 1) *bytes = 0x8a; return 1;
+	case 0x2039: if (bytes && length >= 1) *bytes = 0x8b; return 1;
+	case 0x0152: if (bytes && length >= 1) *bytes = 0x8c; return 1;
+	case 0x017d: if (bytes && length >= 1) *bytes = 0x8e; return 1;
+	case 0x2018: if (bytes && length >= 1) *bytes = 0x91; return 1;
+	case 0x2019: if (bytes && length >= 1) *bytes = 0x92; return 1;
+	case 0x201c: if (bytes && length >= 1) *bytes = 0x93; return 1;
+	case 0x201d: if (bytes && length >= 1) *bytes = 0x94; return 1;
+	case 0x2022: if (bytes && length >= 1) *bytes = 0x95; return 1;
+	case 0x2013: if (bytes && length >= 1) *bytes = 0x96; return 1;
+	case 0x2014: if (bytes && length >= 1) *bytes = 0x97; return 1;
+	case 0x02dc: if (bytes && length >= 1) *bytes = 0x98; return 1;
+	case 0x2122: if (bytes && length >= 1) *bytes = 0x99; return 1;
+	case 0x0161: if (bytes && length >= 1) *bytes = 0x9a; return 1;
+	case 0x203a: if (bytes && length >= 1) *bytes = 0x9b; return 1;
+	case 0x0153: if (bytes && length >= 1) *bytes = 0x9c; return 1;
+	case 0x017e: if (bytes && length >= 1) *bytes = 0x9e; return 1;
+	case 0x0178: if (bytes && length >= 1) *bytes = 0x9f; return 1;
+	default: return 0;
+	}
+}
+
+
+int Windows1252Encoding::queryConvert(const unsigned char* bytes, int length) const
+{
+	// Without at least one input byte there is nothing to look up;
+	// that case is reported as -1.
+	if (length < 1)
+		return -1;
+
+	// Otherwise the result is the character map entry for the first byte.
+	return _charMap[*bytes];
+}
+
+
+int Windows1252Encoding::sequenceLength(const unsigned char* bytes, int length) const
+{
+	// Neither argument is inspected: the reported length is always one.
+	const int fixedLength = 1;
+	return fixedLength;
 }