trunk/branch integration: TextEncoding update

This commit is contained in:
Marian Krivos 2011-08-22 18:22:56 +00:00
parent d35ecf85a2
commit e066d033a2
8 changed files with 391 additions and 80 deletions

View File

@ -104,7 +104,7 @@ const TextEncoding::CharacterMap& ASCIIEncoding::characterMap() const
// Decodes the single byte at `bytes` by looking it up in _charMap.
// The previous body returned the raw byte first, which left the table lookup
// unreachable and made this overload disagree with queryConvert(), which
// already answers from _charMap.
//
// bytes: must point to at least one readable byte (it is dereferenced
//        unconditionally).
// Returns the _charMap entry for that byte.
int ASCIIEncoding::convert(const unsigned char* bytes) const
{
	return _charMap[*bytes];
}
@ -120,4 +120,19 @@ int ASCIIEncoding::convert(int ch, unsigned char* bytes, int length) const
}
// Length-checked decode of one byte.
//
// Returns the _charMap entry for the first byte of `bytes`, or -1 when the
// caller reports that no byte is available (length < 1).
int ASCIIEncoding::queryConvert(const unsigned char* bytes, int length) const
{
	if (length < 1)
		return -1;

	return _charMap[*bytes];
}
// Reports how many bytes the character starting at `bytes` occupies.
// The answer is always 1; neither argument is inspected.
int ASCIIEncoding::sequenceLength(const unsigned char* bytes, int length) const
{
	const int oneByte = 1;
	return oneByte;
}
} // namespace Poco

View File

@ -122,4 +122,19 @@ int Latin1Encoding::convert(int ch, unsigned char* bytes, int length) const
}
// Length-checked decode of one byte.
//
// Returns the value of the first byte of `bytes` unchanged, or -1 when
// length < 1 (in which case `bytes` is not dereferenced).
int Latin1Encoding::queryConvert(const unsigned char* bytes, int length) const
{
	return (length >= 1) ? static_cast<int>(*bytes) : -1;
}
// Reports how many bytes the character starting at `bytes` occupies.
// The answer is always 1; neither argument is inspected.
int Latin1Encoding::sequenceLength(const unsigned char* bytes, int length) const
{
	const int oneByte = 1;
	return oneByte;
}
} // namespace Poco

View File

@ -106,13 +106,13 @@ const TextEncoding::CharacterMap& Latin9Encoding::characterMap() const
// Decodes the single byte at `bytes` by looking it up in _charMap.
// The previous body returned the raw byte first, which left the table lookup
// unreachable and made this overload disagree with queryConvert(), which
// already answers from _charMap.
//
// bytes: must point to at least one readable byte (it is dereferenced
//        unconditionally).
// Returns the _charMap entry for that byte.
int Latin9Encoding::convert(const unsigned char* bytes) const
{
	return _charMap[*bytes];
}
int Latin9Encoding::convert(int ch, unsigned char* bytes, int length) const
{
if (ch >= 0 && ch <= 255)
if (ch >= 0 && ch <= 255 && _charMap[ch] == ch)
{
if (bytes && length >= 1)
*bytes = ch;
@ -133,4 +133,19 @@ int Latin9Encoding::convert(int ch, unsigned char* bytes, int length) const
}
// Length-checked decode of one byte.
//
// Returns the _charMap entry for the first byte of `bytes`, or -1 when the
// caller reports that no byte is available (length < 1).
int Latin9Encoding::queryConvert(const unsigned char* bytes, int length) const
{
	if (length < 1)
		return -1;

	return _charMap[*bytes];
}
// Reports how many bytes the character starting at `bytes` occupies.
// The answer is always 1; neither argument is inspected.
int Latin9Encoding::sequenceLength(const unsigned char* bytes, int length) const
{
	const int oneByte = 1;
	return oneByte;
}
} // namespace Poco

View File

@ -151,7 +151,7 @@ TextEncoding::~TextEncoding()
// Default single-byte decode: returns the value of the first byte of `bytes`
// widened to int. `bytes` is dereferenced unconditionally.
// (A duplicated, unreachable return using a C-style cast was removed.)
int TextEncoding::convert(const unsigned char* bytes) const
{
	return static_cast<int>(*bytes);
}
@ -161,6 +161,18 @@ int TextEncoding::convert(int ch, unsigned char* bytes, int length) const
}
// Default length-checked decode of one byte.
//
// Returns the value of the first byte of `bytes` widened to int, or -1 when
// length < 1. The length guard mirrors the single-byte encodings in this
// library (ASCII, Latin-1, Latin-9, Windows-1252) and avoids reading a byte
// the caller has said is not there.
int TextEncoding::queryConvert(const unsigned char* bytes, int length) const
{
	if (length < 1)
		return -1;

	return static_cast<int>(*bytes);
}
// Default implementation: every character is reported as occupying one byte.
// Neither argument is inspected.
int TextEncoding::sequenceLength(const unsigned char* bytes, int length) const
{
	const int oneByte = 1;
	return oneByte;
}
TextEncoding& TextEncoding::byName(const std::string& encodingName)
{
TextEncoding* pEncoding = manager().find(encodingName);
@ -209,9 +221,14 @@ TextEncoding& TextEncoding::global()
}
TextEncodingManager& TextEncoding::manager()
namespace
{
static SingletonHolder<TextEncodingManager> sh;
}
TextEncodingManager& TextEncoding::manager()
{
return *sh.get();
}

View File

@ -55,6 +55,7 @@ TextIterator::TextIterator(const std::string& str, const TextEncoding& encoding)
{
}
TextIterator::TextIterator(const std::string::const_iterator& begin, const std::string::const_iterator& end, const TextEncoding& encoding):
_pEncoding(&encoding),
_it(begin),
@ -116,22 +117,36 @@ int TextIterator::operator * () const
{
poco_check_ptr (_pEncoding);
poco_assert (_it != _end);
std::string::const_iterator it = _it;
unsigned char c = (unsigned char) *_it;
int n = _pEncoding->characterMap()[c];
if (n >= -1)
return n;
unsigned char buffer[TextEncoding::MAX_SEQUENCE_LENGTH];
unsigned char* p = buffer;
if (it != _end)
*p++ = *it++;
else
*p++ = 0;
int read = 1;
int n = _pEncoding->queryConvert(buffer, 1);
while (-1 > n && (_end - it) >= -n - read)
{
while (read < -n && it != _end)
{
*p++ = *it++;
read++;
}
n = _pEncoding->queryConvert(buffer, read);
}
if (-1 > n)
{
return -1;
}
else
{
poco_assert_dbg (n >= -TextEncoding::MAX_SEQUENCE_LENGTH);
unsigned char buffer[TextEncoding::MAX_SEQUENCE_LENGTH];
unsigned char* p = buffer;
std::string::const_iterator it = _it;
while (n < 0 && it != _end) { *p++ = *it++; ++n; }
if (n == 0)
return _pEncoding->convert(buffer);
else
return -1;
return n;
}
}
@ -141,12 +156,31 @@ TextIterator& TextIterator::operator ++ ()
poco_check_ptr (_pEncoding);
poco_assert (_it != _end);
unsigned char c = (unsigned char) *_it;
int n = _pEncoding->characterMap()[c];
if (n >= -1)
++_it;
unsigned char buffer[TextEncoding::MAX_SEQUENCE_LENGTH];
unsigned char* p = buffer;
if (_it != _end)
*p++ = *_it++;
else
while (n < 0 && _it != _end) { ++_it; ++n; }
*p++ = 0;
int read = 1;
int n = _pEncoding->sequenceLength(buffer, 1);
while (-1 > n && (_end - _it) >= -n - read)
{
while (read < -n && _it != _end)
{
*p++ = *_it++;
read++;
}
n = _pEncoding->sequenceLength(buffer, read);
}
while (read < n && _it != _end)
{
_it++;
read++;
}
return *this;
}

View File

@ -143,7 +143,36 @@ int UTF16Encoding::convert(const unsigned char* bytes) const
unsigned char* p = (unsigned char*) &uc;
*p++ = *bytes++;
*p++ = *bytes++;
return _flipBytes ? ByteOrder::flipBytes(uc) : uc;
if (_flipBytes)
{
ByteOrder::flipBytes(uc);
}
if (uc >= 0xd800 && uc < 0xdc00)
{
UInt16 uc2;
p = (unsigned char*) &uc2;
*p++ = *bytes++;
*p++ = *bytes++;
if (_flipBytes)
{
ByteOrder::flipBytes(uc2);
}
if (uc2 >= 0xdc00 && uc2 < 0xe000)
{
return ((uc & 0x3ff) << 10) + (uc2 & 0x3ff) + 0x10000;
}
else
{
return -1;
}
}
else
{
return uc;
}
}
@ -184,4 +213,83 @@ int UTF16Encoding::convert(int ch, unsigned char* bytes, int length) const
}
// Length-checked decode of one UTF-16 character.
//
// bytes:  start of the encoded character.
// length: number of bytes the caller has available at `bytes`.
//
// Returns
//   - the decoded code point when enough bytes are available,
//   - -2 when fewer than two bytes are available (two are needed),
//   - -4 when the first unit is a high surrogate but fewer than four bytes
//     are available (four are needed),
//   - -1 when a high surrogate is not followed by a low surrogate.
//
// Fixes relative to the previous body:
//   - the byte swap was applied to `ret` and its result discarded, so the
//     code units were never actually flipped; it is now applied to `uc` and
//     `uc2` and the result is stored;
//   - the low-surrogate range check compared `uc` (the high surrogate)
//     against 0xe000 instead of `uc2`, so any unit >= 0xdc00 was accepted.
int UTF16Encoding::queryConvert(const unsigned char* bytes, int length) const
{
	int ret = -2;

	if (length >= 2)
	{
		// Assemble the first 16-bit code unit byte by byte, then swap if required.
		UInt16 uc;
		unsigned char* p = reinterpret_cast<unsigned char*>(&uc);
		*p++ = *bytes++;
		*p++ = *bytes++;
		if (_flipBytes)
			uc = ByteOrder::flipBytes(uc);

		if (uc >= 0xd800 && uc < 0xdc00)
		{
			// High surrogate: a second code unit is required.
			if (length >= 4)
			{
				UInt16 uc2;
				p = reinterpret_cast<unsigned char*>(&uc2);
				*p++ = *bytes++;
				*p++ = *bytes++;
				if (_flipBytes)
					uc2 = ByteOrder::flipBytes(uc2);

				if (uc2 >= 0xdc00 && uc2 < 0xe000)
				{
					ret = ((uc & 0x3ff) << 10) + (uc2 & 0x3ff) + 0x10000;
				}
				else
				{
					ret = -1; // Malformed sequence
				}
			}
			else
			{
				ret = -4; // surrogate pair, four bytes needed
			}
		}
		else
		{
			ret = uc;
		}
	}

	return ret;
}
int UTF16Encoding::sequenceLength(const unsigned char* bytes, int length) const
{
int ret = -2;
if (_flipBytes)
{
if (length >= 1)
{
unsigned char c = *bytes;
if (c >= 0xd8 && c < 0xdc)
ret = 4;
else
ret = 2;
}
}
else
{
if (length >= 2)
{
UInt16 uc;
unsigned char* p = (unsigned char*) &uc;
*p++ = *bytes++;
*p++ = *bytes++;
if (uc >= 0xd800 && uc < 0xdc00)
ret = 4;
else
ret = 2;
}
}
return ret;
}
} // namespace Poco

View File

@ -103,37 +103,6 @@ const TextEncoding::CharacterMap& UTF8Encoding::characterMap() const
}
// Checks whether the `length` bytes starting at `bytes` form one legal UTF-8
// sequence. Returns false for a null pointer, a zero length, or any length
// outside 1..4. The switch relies on deliberate case fall-through: it starts
// at the last byte of the sequence and walks backwards towards the lead byte.
bool UTF8Encoding::isLegal(const unsigned char *bytes, int length)
{
if (0 == bytes || 0 == length) return false;
unsigned char a;
const unsigned char* srcptr = bytes + length; // one past the last byte; pre-decremented below
switch (length)
{
default: return false;
// Trailing bytes 4 and 3 must lie in 0x80..0xBF; each case falls through to the next.
case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
// `a` is now the second byte; its lower bound depends on the lead byte.
case 2: if ((a = (*--srcptr)) > 0xBF) return false;
switch (*bytes)
{
case 0xE0: if (a < 0xA0) return false; break;
case 0xED: if (a > 0x9F) return false; break;
case 0xF0: if (a < 0x90) return false; break;
case 0xF4: if (a > 0x8F) return false; break;
default: if (a < 0x80) return false;
}
// Falls through: the lead byte must not lie in 0x80..0xC1.
case 1: if (*bytes >= 0x80 && *bytes < 0xC2) return false;
}
// Lead bytes above 0xF4 are rejected.
if (*bytes > 0xF4) return false;
return true;
}
int UTF8Encoding::convert(const unsigned char* bytes) const
{
int n = _charMap[*bytes];
@ -141,13 +110,18 @@ int UTF8Encoding::convert(const unsigned char* bytes) const
switch (n)
{
case -6: case -5: case -1: return -1;
case -4: case -3: case -2:
if (!isLegal(bytes, -n)) return -1;
uc = *bytes & ((0x07 << (n + 4)) | 0x03); break;
default: return n;
case -6:
case -5:
case -1:
return -1;
case -4:
case -3:
case -2:
if (!isLegal(bytes, -n)) return -1;
uc = *bytes & ((0x07 << (n + 4)) | 0x03);
break;
default:
return n;
}
while (n++ < -1)
@ -161,10 +135,6 @@ int UTF8Encoding::convert(const unsigned char* bytes) const
int UTF8Encoding::convert(int ch, unsigned char* bytes, int length) const
{
#ifdef _DEBUG
unsigned char* lb = bytes;
#endif
if (ch <= 0x7F)
{
if (bytes && length >= 1)
@ -175,37 +145,128 @@ int UTF8Encoding::convert(int ch, unsigned char* bytes, int length) const
{
if (bytes && length >= 2)
{
*bytes++ = (unsigned char) ((ch >> 6) & 0x1F | 0xC0);
*bytes++ = (unsigned char) (((ch >> 6) & 0x1F) | 0xC0);
*bytes = (unsigned char) ((ch & 0x3F) | 0x80);
}
poco_assert_dbg (isLegal(lb, 2));
return 2;
}
else if (ch <= 0xFFFF)
{
if (bytes && length >= 3)
{
*bytes++ = (unsigned char) ((ch >> 12) & 0x0F | 0xE0);
*bytes++ = (unsigned char) ((ch >> 6) & 0x3F | 0x80);
*bytes++ = (unsigned char) (((ch >> 12) & 0x0F) | 0xE0);
*bytes++ = (unsigned char) (((ch >> 6) & 0x3F) | 0x80);
*bytes = (unsigned char) ((ch & 0x3F) | 0x80);
}
poco_assert_dbg (isLegal(lb, 3));
return 3;
}
else if (ch <= 0x10FFFF)
{
if (bytes && length >= 4)
{
*bytes++ = (unsigned char) ((ch >> 18) & 0x07 | 0xF0);
*bytes++ = (unsigned char) ((ch >> 12) & 0x3F | 0x80);
*bytes++ = (unsigned char) ((ch >> 6) & 0x3F | 0x80);
*bytes++ = (unsigned char) (((ch >> 18) & 0x07) | 0xF0);
*bytes++ = (unsigned char) (((ch >> 12) & 0x3F) | 0x80);
*bytes++ = (unsigned char) (((ch >> 6) & 0x3F) | 0x80);
*bytes = (unsigned char) ((ch & 0x3F) | 0x80);
}
poco_assert_dbg (isLegal(lb, 4));
return 4;
}
else return 0;
}
// Length-checked decode of one UTF-8 character.
//
// The _charMap entry for the lead byte drives the result:
//   - if the entry's magnitude (when negative) exceeds `length`, the entry is
//     returned unchanged so the caller can see how many bytes are wanted;
//   - entries -1, -5 and -6 yield -1;
//   - entries -2, -3 and -4 are decoded as 2-, 3- and 4-byte sequences after
//     isLegal() accepts them (-1 otherwise);
//   - any other entry is returned as is.
// `bytes` is dereferenced before `length` is consulted.
int UTF8Encoding::queryConvert(const unsigned char* bytes, int length) const
{
	const int lead = _charMap[*bytes];

	// Not enough input for the sequence announced by the lead byte.
	if (-lead > length)
		return lead;

	switch (lead)
	{
	case -2:
	case -3:
	case -4:
		break; // multi-byte sequence, decoded below
	case -1:
	case -5:
	case -6:
		return -1;
	default:
		return lead;
	}

	const int total = -lead;
	if (!isLegal(bytes, total))
		return -1;

	// Keep the payload bits of the lead byte, then append six bits from
	// each of the remaining bytes.
	int codePoint = *bytes & ((0x07 << (lead + 4)) | 0x03);
	for (int i = 1; i < total; ++i)
		codePoint = (codePoint << 6) | (bytes[i] & 0x3F);

	return codePoint;
}
// Reports how many bytes the UTF-8 character starting at `bytes` occupies,
// based on the _charMap entry for the lead byte: a non-negative entry means
// one byte, a negative entry means its magnitude.
// Returns -1 when length < 1 (in which case `bytes` is not dereferenced).
int UTF8Encoding::sequenceLength(const unsigned char* bytes, int length) const
{
	if (length < 1)
		return -1;

	const int entry = _charMap[*bytes];
	return (entry < 0) ? -entry : 1;
}
// Checks whether the `length` bytes starting at `bytes` form one legal UTF-8
// sequence. Returns false for a null pointer, a zero length, or any length
// outside 1..4.
bool UTF8Encoding::isLegal(const unsigned char *bytes, int length)
{
// Note: The following is loosely based on the isLegalUTF8 function
// from ftp://ftp.unicode.org/Public/PROGRAMS/CVTUTF/ConvertUTF.c
// Excuse the ugliness...
if (0 == bytes || 0 == length) return false;
unsigned char a;
const unsigned char* srcptr = bytes + length; // one past the last byte; pre-decremented below
switch (length)
{
default:
return false;
// Everything else falls through when true.
// Trailing bytes 4 and 3 must lie in 0x80..0xBF.
case 4:
if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
case 3:
if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
// `a` is now the second byte; its lower bound depends on the lead byte.
case 2:
if ((a = (*--srcptr)) > 0xBF) return false;
switch (*bytes)
{
case 0xE0:
if (a < 0xA0) return false;
break;
case 0xED:
if (a > 0x9F) return false;
break;
case 0xF0:
if (a < 0x90) return false;
break;
case 0xF4:
if (a > 0x8F) return false;
break;
default:
if (a < 0x80) return false;
}
// The inner breaks leave only the inner switch; control continues here.
// The lead byte must not lie in 0x80..0xC1.
case 1:
if (*bytes >= 0x80 && *bytes < 0xC2) return false;
}
// Lead bytes above 0xF4 are rejected.
return *bytes <= 0xF4;
}
} // namespace Poco

View File

@ -1,7 +1,7 @@
//
// Windows1252Encoding.cpp
//
// $Id: //poco/Main/Foundation/src/Windows1252Encoding.cpp#7 $
// $Id: //poco/1.4/Foundation/src/Windows1252Encoding.cpp#1 $
//
// Library: Foundation
// Package: Text
@ -36,6 +36,7 @@
#include "Poco/Windows1252Encoding.h"
#include "Poco/String.h"
#include <map>
namespace Poco {
@ -106,19 +107,64 @@ const TextEncoding::CharacterMap& Windows1252Encoding::characterMap() const
// Decodes the single byte at `bytes` by looking it up in _charMap.
// The previous body returned the raw byte first, which left the table lookup
// unreachable and made this overload disagree with queryConvert(), which
// already answers from _charMap.
//
// bytes: must point to at least one readable byte (it is dereferenced
//        unconditionally).
// Returns the _charMap entry for that byte.
int Windows1252Encoding::convert(const unsigned char* bytes) const
{
	return _charMap[*bytes];
}
// Encodes the character `ch` as a single Windows-1252 byte.
//
// ch:     character value to encode.
// bytes:  destination; may be null, in which case nothing is written.
// length: capacity of `bytes`; nothing is written when it is < 1.
//
// Returns 1 when `ch` has a single-byte encoding (whether or not the byte
// could be stored) and 0 when it has none.
//
// A value in 0..255 is stored as is when _charMap maps that byte back to the
// same value; the remaining characters are translated by the switch below.
//
// Cleanup relative to the previous body: leftover lines formed a nested
// if/else and an unguarded `*bytes = ch;` that wrote through `bytes` even
// when it was null or `length` was 0. The destination is now written in
// exactly one guarded place.
int Windows1252Encoding::convert(int ch, unsigned char* bytes, int length) const
{
	unsigned char encoded;

	if (ch >= 0 && ch <= 255 && _charMap[ch] == ch)
	{
		encoded = static_cast<unsigned char>(ch);
	}
	else switch (ch)
	{
	case 0x20ac: encoded = 0x80; break;
	case 0x201a: encoded = 0x82; break;
	case 0x0192: encoded = 0x83; break;
	case 0x201e: encoded = 0x84; break;
	case 0x2026: encoded = 0x85; break;
	case 0x2020: encoded = 0x86; break;
	case 0x2021: encoded = 0x87; break;
	case 0x02c6: encoded = 0x88; break;
	case 0x2030: encoded = 0x89; break;
	case 0x0160: encoded = 0x8a; break;
	case 0x2039: encoded = 0x8b; break;
	case 0x0152: encoded = 0x8c; break;
	case 0x017d: encoded = 0x8e; break;
	case 0x2018: encoded = 0x91; break;
	case 0x2019: encoded = 0x92; break;
	case 0x201c: encoded = 0x93; break;
	case 0x201d: encoded = 0x94; break;
	case 0x2022: encoded = 0x95; break;
	case 0x2013: encoded = 0x96; break;
	case 0x2014: encoded = 0x97; break;
	case 0x02dc: encoded = 0x98; break;
	case 0x2122: encoded = 0x99; break;
	case 0x0161: encoded = 0x9a; break;
	case 0x203a: encoded = 0x9b; break;
	case 0x0153: encoded = 0x9c; break;
	case 0x017e: encoded = 0x9e; break;
	case 0x0178: encoded = 0x9f; break;
	default:
		return 0; // no single-byte representation
	}

	if (bytes && length >= 1)
		*bytes = encoded;
	return 1;
}
// Length-checked decode of one byte.
//
// Returns the _charMap entry for the first byte of `bytes`, or -1 when the
// caller reports that no byte is available (length < 1).
int Windows1252Encoding::queryConvert(const unsigned char* bytes, int length) const
{
	if (length < 1)
		return -1;

	return _charMap[*bytes];
}
// Reports how many bytes the character starting at `bytes` occupies.
// The answer is always 1; neither argument is inspected.
int Windows1252Encoding::sequenceLength(const unsigned char* bytes, int length) const
{
	const int oneByte = 1;
	return oneByte;
}