mirror of
https://github.com/open-source-parsers/jsoncpp.git
synced 2024-12-14 02:35:09 +01:00
Implementing support for reading and writing Unicode escape sequences.
This commit is contained in:
parent
060c45a38d
commit
ee4b4dab54
@ -115,6 +115,10 @@ namespace Json {
|
|||||||
bool decodeString( Token &token );
|
bool decodeString( Token &token );
|
||||||
bool decodeString( Token &token, std::string &decoded );
|
bool decodeString( Token &token, std::string &decoded );
|
||||||
bool decodeDouble( Token &token );
|
bool decodeDouble( Token &token );
|
||||||
|
bool decodeUnicodeCodePoint( Token &token,
|
||||||
|
Location &current,
|
||||||
|
Location end,
|
||||||
|
unsigned int &unicode );
|
||||||
bool decodeUnicodeEscapeSequence( Token &token,
|
bool decodeUnicodeEscapeSequence( Token &token,
|
||||||
Location &current,
|
Location &current,
|
||||||
Location end,
|
Location end,
|
||||||
|
@ -36,6 +36,42 @@ containsNewLine( Reader::Location begin,
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Encodes the Unicode code point `cp` as a UTF-8 byte sequence.
// Based on the description from http://en.wikipedia.org/wiki/UTF-8
//
// Returns 1 to 4 bytes for cp <= 0x10FFFF and an empty string for any
// larger value. Values in the surrogate range (0xD800-0xDFFF) are not
// rejected here: they take the three-byte path like any other value
// below 0x10000.
static std::string codePointToUTF8(unsigned int cp)
{
   std::string result;

   // Every byte is assembled as an unsigned value first and converted to
   // char with a single explicit cast, so no int is implicitly narrowed.
   if (cp <= 0x7f)
   {
      // 0xxxxxxx
      result.resize(1);
      result[0] = static_cast<char>(cp);
   }
   else if (cp <= 0x7FF)
   {
      // 110xxxxx 10xxxxxx
      result.resize(2);
      result[1] = static_cast<char>(0x80 | (0x3f & cp));
      result[0] = static_cast<char>(0xC0 | (0x1f & (cp >> 6)));
   }
   else if (cp <= 0xFFFF)
   {
      // 1110xxxx 10xxxxxx 10xxxxxx
      result.resize(3);
      result[2] = static_cast<char>(0x80 | (0x3f & cp));
      result[1] = static_cast<char>(0x80 | (0x3f & (cp >> 6)));
      result[0] = static_cast<char>(0xE0 | (0xf & (cp >> 12)));
   }
   else if (cp <= 0x10FFFF)
   {
      // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
      result.resize(4);
      result[3] = static_cast<char>(0x80 | (0x3f & cp));
      result[2] = static_cast<char>(0x80 | (0x3f & (cp >> 6)));
      result[1] = static_cast<char>(0x80 | (0x3f & (cp >> 12)));
      result[0] = static_cast<char>(0xF0 | (0x7 & (cp >> 18)));
   }

   return result;
}
|
||||||
|
|
||||||
|
|
||||||
// Class Reader
|
// Class Reader
|
||||||
// //////////////////////////////////////////////////////////////////
|
// //////////////////////////////////////////////////////////////////
|
||||||
@ -577,10 +613,9 @@ Reader::decodeString( Token &token, std::string &decoded )
|
|||||||
case 'u':
|
case 'u':
|
||||||
{
|
{
|
||||||
unsigned int unicode;
|
unsigned int unicode;
|
||||||
if ( !decodeUnicodeEscapeSequence( token, current, end, unicode ) )
|
if ( !decodeUnicodeCodePoint( token, current, end, unicode ) )
|
||||||
return false;
|
return false;
|
||||||
// @todo encode unicode as utf8.
|
decoded += codePointToUTF8(unicode);
|
||||||
// @todo remember to alter the writer too.
|
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
default:
|
default:
|
||||||
@ -595,6 +630,35 @@ Reader::decodeString( Token &token, std::string &decoded )
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool
|
||||||
|
Reader::decodeUnicodeCodePoint( Token &token,
|
||||||
|
Location ¤t,
|
||||||
|
Location end,
|
||||||
|
unsigned int &unicode )
|
||||||
|
{
|
||||||
|
|
||||||
|
if ( !decodeUnicodeEscapeSequence( token, current, end, unicode ) )
|
||||||
|
return false;
|
||||||
|
if (unicode >= 0xD800 && unicode <= 0xDBFF)
|
||||||
|
{
|
||||||
|
// surrogate pairs
|
||||||
|
if (end - current < 6)
|
||||||
|
return addError( "additional six characters expected to parse unicode surrogate pair.", token, current );
|
||||||
|
unsigned int surrogatePair;
|
||||||
|
if (*(current++) == '\\' && *(current++)== 'u')
|
||||||
|
{
|
||||||
|
if (decodeUnicodeEscapeSequence( token, current, end, surrogatePair ))
|
||||||
|
{
|
||||||
|
unicode = 0x10000 + ((unicode & 0x3FF) << 10) + (surrogatePair & 0x3FF);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
return addError( "expecting another \\u token to begin the second half of a unicode surrogate pair", token, current );
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
bool
|
bool
|
||||||
Reader::decodeUnicodeEscapeSequence( Token &token,
|
Reader::decodeUnicodeEscapeSequence( Token &token,
|
||||||
|
@ -4,6 +4,8 @@
|
|||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
#include <iostream>
|
#include <iostream>
|
||||||
|
#include <sstream>
|
||||||
|
#include <iomanip>
|
||||||
|
|
||||||
#if _MSC_VER >= 1400 // VC++ 8.0
|
#if _MSC_VER >= 1400 // VC++ 8.0
|
||||||
#pragma warning( disable : 4996 ) // disable warning about strdup being deprecated.
|
#pragma warning( disable : 4996 ) // disable warning about strdup being deprecated.
|
||||||
@ -11,6 +13,20 @@
|
|||||||
|
|
||||||
namespace Json {
|
namespace Json {
|
||||||
|
|
||||||
|
// Returns true when `ch` lies in the range 0x01..0x1F.
// NUL (0x00) is deliberately excluded, as is every char value that is
// negative or greater than 0x1F.
static bool isControlCharacter(char ch)
{
   return ch > 0 && ch <= 0x1F;
}

// Returns true when the NUL-terminated string `str` contains at least one
// character for which isControlCharacter() is true.
// A null `str` is treated as containing none and yields false.
static bool containsControlCharacter( const char* str )
{
   if ( !str )
      return false;

   // The loop must test the character, not the pointer: the pointer never
   // becomes null while scanning, and isControlCharacter() does not report
   // NUL, so testing the pointer would read past the end of the string.
   for ( ; *str; ++str )
   {
      if ( isControlCharacter( *str ) )
         return true;
   }
   return false;
}
|
||||||
static void uintToString( unsigned int value,
|
static void uintToString( unsigned int value,
|
||||||
char *&current )
|
char *&current )
|
||||||
{
|
{
|
||||||
@ -95,7 +111,7 @@ std::string valueToString( bool value )
|
|||||||
std::string valueToQuotedString( const char *value )
|
std::string valueToQuotedString( const char *value )
|
||||||
{
|
{
|
||||||
// Not sure how to handle unicode...
|
// Not sure how to handle unicode...
|
||||||
if (strpbrk(value, "\"\\\b\f\n\r\t") == NULL)
|
if (strpbrk(value, "\"\\\b\f\n\r\t") == NULL && !containsControlCharacter( value ))
|
||||||
return std::string("\"") + value + "\"";
|
return std::string("\"") + value + "\"";
|
||||||
// We have to walk value and escape any special characters.
|
// We have to walk value and escape any special characters.
|
||||||
// Appending to std::string is not efficient, but this should be rare.
|
// Appending to std::string is not efficient, but this should be rare.
|
||||||
@ -132,8 +148,16 @@ std::string valueToQuotedString( const char *value )
|
|||||||
// slash is also legal, so I see no reason to escape it.
|
// slash is also legal, so I see no reason to escape it.
|
||||||
// (I hope I am not misunderstanding something.)
|
// (I hope I am not misunderstanding something.)
|
||||||
default:
|
default:
|
||||||
result += *c;
|
if ( isControlCharacter( *c ) )
|
||||||
|
{
|
||||||
|
std::ostringstream oss;
|
||||||
|
oss << "\\u" << std::hex << std::uppercase << std::setfill('0') << std::setw(4) << static_cast<int>(*c);
|
||||||
|
result += oss.str();
|
||||||
|
}
|
||||||
|
else
|
||||||
|
result += *c;
|
||||||
}
|
}
|
||||||
|
break;
|
||||||
}
|
}
|
||||||
result += "\"";
|
result += "\"";
|
||||||
return result;
|
return result;
|
||||||
|
1
test/test_string_unicode_01.expected
Normal file
1
test/test_string_unicode_01.expected
Normal file
@ -0,0 +1 @@
|
|||||||
|
.="a"
|
1
test/test_string_unicode_01.json
Normal file
1
test/test_string_unicode_01.json
Normal file
@ -0,0 +1 @@
|
|||||||
|
"\u0061"
|
1
test/test_string_unicode_02.expected
Normal file
1
test/test_string_unicode_02.expected
Normal file
@ -0,0 +1 @@
|
|||||||
|
.="¢"
|
1
test/test_string_unicode_02.json
Normal file
1
test/test_string_unicode_02.json
Normal file
@ -0,0 +1 @@
|
|||||||
|
"\u00A2"
|
1
test/test_string_unicode_03.expected
Normal file
1
test/test_string_unicode_03.expected
Normal file
@ -0,0 +1 @@
|
|||||||
|
.="€"
|
1
test/test_string_unicode_03.json
Normal file
1
test/test_string_unicode_03.json
Normal file
@ -0,0 +1 @@
|
|||||||
|
"\u20AC"
|
1
test/test_string_unicode_04.expected
Normal file
1
test/test_string_unicode_04.expected
Normal file
@ -0,0 +1 @@
|
|||||||
|
.="𝄞"
|
1
test/test_string_unicode_04.json
Normal file
1
test/test_string_unicode_04.json
Normal file
@ -0,0 +1 @@
|
|||||||
|
"\uD834\uDD1E"
|
Loading…
Reference in New Issue
Block a user