reduce maximum character sequence length to 4, in accordance with UTF-8

This commit is contained in:
Günter Obiltschnig 2018-02-13 21:46:51 +01:00
parent af09a02a34
commit fab94585bf
3 changed files with 6 additions and 8 deletions

View File

@@ -48,7 +48,7 @@ public:
enum
{
- MAX_SEQUENCE_LENGTH = 6 /// The maximum character byte sequence length supported.
+ MAX_SEQUENCE_LENGTH = 4 /// The maximum character byte sequence length supported.
};
typedef int CharacterMap[256];
@@ -58,7 +58,7 @@ public:
/// If map[b] is -1, then the byte sequence is malformed.
/// If map[b] is -n, where n >= 2, then b is the first byte of an n-byte
/// sequence that encodes a single Unicode scalar value. Byte sequences up
- /// to 6 bytes in length are supported.
+ /// to 4 bytes in length are supported.
virtual ~TextEncoding();
/// Destroys the encoding.
@@ -89,6 +89,7 @@ public:
///
/// The convert function must return the Unicode scalar value
/// represented by this byte sequence or -1 if the byte sequence is malformed.
+ ///
/// The default implementation returns (int) bytes[0].
virtual int queryConvert(const unsigned char* bytes, int length) const;
@@ -106,6 +107,7 @@ public:
/// Then a second call with length == 2 might return -4
/// Eventually, the third call with length == 4 should return either a
/// Unicode scalar value, or -1 if the byte sequence is malformed.
+ ///
/// The default implementation returns (int) bytes[0].
virtual int sequenceLength(const unsigned char* bytes, int length) const;

View File

@@ -44,7 +44,7 @@ const TextEncoding::CharacterMap UTF8Encoding::_charMap =
/* c0 */ -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
/* d0 */ -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
/* e0 */ -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3,
- /* f0 */ -4, -4, -4, -4, -4, -4, -4, -4, -5, -5, -5, -5, -6, -6, -1, -1,
+ /* f0 */ -4, -4, -4, -4, -4, -4, -4, -4, -1, -1, -1, -1, -1, -1, -1, -1,
};
@@ -88,8 +88,6 @@ int UTF8Encoding::convert(const unsigned char* bytes) const
switch (n)
{
- case -6:
- case -5:
case -1:
return -1;
case -4:
@@ -165,8 +163,6 @@ int UTF8Encoding::queryConvert(const unsigned char* bytes, int length) const
{
switch (n)
{
- case -6:
- case -5:
case -1:
return -1;
case -4:

View File

@@ -79,7 +79,7 @@ void UTF8StringTest::testTransform()
// a mix of invalid sequences
std::string str = "\xC2\xE5\xF0\xF8\xE8\xED\xFB+-++";
- assert ("???" == UTF8::toLower(str));
+ assert ("???+-++" == UTF8::toLower(str));
}