Implementing support for reading and writing Unicode escape sequences.

This commit is contained in:
Malay Shah 2009-11-13 04:21:14 +00:00
parent 060c45a38d
commit ee4b4dab54
11 changed files with 105 additions and 5 deletions

View File

@ -115,6 +115,10 @@ namespace Json {
bool decodeString( Token &token ); bool decodeString( Token &token );
bool decodeString( Token &token, std::string &decoded ); bool decodeString( Token &token, std::string &decoded );
bool decodeDouble( Token &token ); bool decodeDouble( Token &token );
// Decodes the \u escape at `current` into `unicode`; when the decoded value
// is in 0xD800..0xDBFF the following \u escape is consumed as well and both
// halves are merged into a single code point.
bool decodeUnicodeCodePoint( Token &token,
Location &current,
Location end,
unsigned int &unicode );
bool decodeUnicodeEscapeSequence( Token &token, bool decodeUnicodeEscapeSequence( Token &token,
Location &current, Location &current,
Location end, Location end,

View File

@ -36,6 +36,42 @@ containsNewLine( Reader::Location begin,
return false; return false;
} }
// Encodes the code point `cp` as a UTF-8 byte sequence.
//
// Returns 1 to 4 bytes for cp <= 0x10FFFF and an empty string for any larger
// value. Values in 0xD800..0xDFFF are not rejected here; they are encoded
// like any other three-byte value.
static std::string codePointToUTF8(unsigned int cp)
{
   std::string result;
   // based on description from http://en.wikipedia.org/wiki/UTF-8
   // Every byte is assembled as an unsigned value first and converted to
   // char exactly once, so no branch relies on an implicit int -> char
   // narrowing.
   if (cp <= 0x7f)
   {
      result.resize(1);
      result[0] = static_cast<char>(cp);
   }
   else if (cp <= 0x7FF)
   {
      result.resize(2);
      result[1] = static_cast<char>(0x80 | (0x3f & cp));
      result[0] = static_cast<char>(0xC0 | (0x1f & (cp >> 6)));
   }
   else if (cp <= 0xFFFF)
   {
      result.resize(3);
      result[2] = static_cast<char>(0x80 | (0x3f & cp));
      result[1] = static_cast<char>(0x80 | (0x3f & (cp >> 6)));
      result[0] = static_cast<char>(0xE0 | (0xf & (cp >> 12)));
   }
   else if (cp <= 0x10FFFF)
   {
      result.resize(4);
      result[3] = static_cast<char>(0x80 | (0x3f & cp));
      result[2] = static_cast<char>(0x80 | (0x3f & (cp >> 6)));
      result[1] = static_cast<char>(0x80 | (0x3f & (cp >> 12)));
      result[0] = static_cast<char>(0xF0 | (0x7 & (cp >> 18)));
   }
   return result;
}
// Class Reader // Class Reader
// ////////////////////////////////////////////////////////////////// // //////////////////////////////////////////////////////////////////
@ -577,10 +613,9 @@ Reader::decodeString( Token &token, std::string &decoded )
case 'u': case 'u':
{ {
unsigned int unicode; unsigned int unicode;
if ( !decodeUnicodeEscapeSequence( token, current, end, unicode ) ) if ( !decodeUnicodeCodePoint( token, current, end, unicode ) )
return false; return false;
// @todo encode unicode as utf8. decoded += codePointToUTF8(unicode);
// @todo remember to alter the writer too.
} }
break; break;
default: default:
@ -595,6 +630,35 @@ Reader::decodeString( Token &token, std::string &decoded )
return true; return true;
} }
// Decodes one \u escape via decodeUnicodeEscapeSequence() into `unicode`.
// When that value lies in 0xD800..0xDBFF, the next six characters must be a
// second \u escape; its low ten bits are merged with the low ten bits of the
// first value to form a code point at or above 0x10000.
//
// Returns false when either escape fails to decode; the two other failure
// paths return whatever addError() returns.
//
// NOTE(review): the second escape is not checked to be in 0xDC00..0xDFFF;
// only its low ten bits are used.
bool
Reader::decodeUnicodeCodePoint( Token &token,
                                Location &current,
                                Location end,
                                unsigned int &unicode )
{
   if ( !decodeUnicodeEscapeSequence( token, current, end, unicode ) )
      return false;

   const bool isLeadingHalf = unicode >= 0xD800 && unicode <= 0xDBFF;
   if ( !isLeadingHalf )
      return true;

   // surrogate pairs
   if ( end - current < 6 )
      return addError( "additional six characters expected to parse unicode surrogate pair.", token, current );

   // Short-circuit evaluation matters: `current` advances by one when the
   // backslash is missing and by two otherwise.
   const bool hasEscapePrefix = *(current++) == '\\' && *(current++) == 'u';
   if ( !hasEscapePrefix )
      return addError( "expecting another \\u token to begin the second half of a unicode surrogate pair", token, current );

   unsigned int trailingHalf;
   if ( !decodeUnicodeEscapeSequence( token, current, end, trailingHalf ) )
      return false;

   unicode = 0x10000 + ((unicode & 0x3FF) << 10) + (trailingHalf & 0x3FF);
   return true;
}
bool bool
Reader::decodeUnicodeEscapeSequence( Token &token, Reader::decodeUnicodeEscapeSequence( Token &token,

View File

@ -4,6 +4,8 @@
#include <stdio.h> #include <stdio.h>
#include <string.h> #include <string.h>
#include <iostream> #include <iostream>
#include <sstream>
#include <iomanip>
#if _MSC_VER >= 1400 // VC++ 8.0 #if _MSC_VER >= 1400 // VC++ 8.0
#pragma warning( disable : 4996 ) // disable warning about strdup being deprecated. #pragma warning( disable : 4996 ) // disable warning about strdup being deprecated.
@ -11,6 +13,20 @@
namespace Json { namespace Json {
// Tells whether `ch` has a value in the range 0x01..0x1F.
// The NUL character (0) and negative values are not reported.
static bool isControlCharacter(char ch)
{
   if ( ch <= 0 )
      return false;
   return ch <= 0x1F;
}
// Returns true when the NUL-terminated string `str` contains at least one
// character for which isControlCharacter() is true. A null pointer yields
// false.
static bool containsControlCharacter( const char* str )
{
   if ( !str )
      return false;
   // Test the pointed-to character, not the pointer itself: the scan has to
   // stop at the terminating NUL instead of running past the end of the
   // buffer.
   for ( ; *str; ++str )
   {
      if ( isControlCharacter( *str ) )
         return true;
   }
   return false;
}
static void uintToString( unsigned int value, static void uintToString( unsigned int value,
char *&current ) char *&current )
{ {
@ -95,7 +111,7 @@ std::string valueToString( bool value )
std::string valueToQuotedString( const char *value ) std::string valueToQuotedString( const char *value )
{ {
// Not sure how to handle unicode... // Not sure how to handle unicode...
if (strpbrk(value, "\"\\\b\f\n\r\t") == NULL) if (strpbrk(value, "\"\\\b\f\n\r\t") == NULL && !containsControlCharacter( value ))
return std::string("\"") + value + "\""; return std::string("\"") + value + "\"";
// We have to walk value and escape any special characters. // We have to walk value and escape any special characters.
// Appending to std::string is not efficient, but this should be rare. // Appending to std::string is not efficient, but this should be rare.
@ -132,8 +148,16 @@ std::string valueToQuotedString( const char *value )
// slash is also legal, so I see no reason to escape it. // slash is also legal, so I see no reason to escape it.
// (I hope I am not misunderstanding something.) // (I hope I am not misunderstanding something.)
default: default:
result += *c; if ( isControlCharacter( *c ) )
{
std::ostringstream oss;
oss << "\\u" << std::hex << std::uppercase << std::setfill('0') << std::setw(4) << static_cast<int>(*c);
result += oss.str();
}
else
result += *c;
} }
break;
} }
result += "\""; result += "\"";
return result; return result;

View File

@ -0,0 +1 @@
.="a"

View File

@ -0,0 +1 @@
"\u0061"

View File

@ -0,0 +1 @@
.="¢"

View File

@ -0,0 +1 @@
"\u00A2"

View File

@ -0,0 +1 @@
.="€"

View File

@ -0,0 +1 @@
"\u20AC"

View File

@ -0,0 +1 @@
.="𝄞"

View File

@ -0,0 +1 @@
"\uD834\uDD1E"