// Reconstructed from the src/lib_json/json_reader.cpp hunk of the patch
// "Implementing support for reading and writing Unicode escape sequences."

/// Encodes a single Unicode code point as a UTF-8 byte sequence.
///
/// Based on the description from http://en.wikipedia.org/wiki/UTF-8
///
/// \param cp Code point to encode.
/// \return The 1- to 4-byte UTF-8 encoding of \a cp. An empty string is
///         returned when \a cp is greater than 0x10FFFF, because no branch
///         below matches such a value.
static std::string codePointToUTF8(unsigned int cp)
{
   std::string result;

   if (cp <= 0x7f)
   {
      // 0xxxxxxx
      result.resize(1);
      result[0] = static_cast<char>(cp);
   }
   else if (cp <= 0x7FF)
   {
      // 110xxxxx 10xxxxxx
      result.resize(2);
      result[1] = static_cast<char>(0x80 | (0x3f & cp));
      result[0] = static_cast<char>(0xC0 | (0x1f & (cp >> 6)));
   }
   else if (cp <= 0xFFFF)
   {
      // 1110xxxx 10xxxxxx 10xxxxxx
      // The whole byte expression is cast, so the lead/continuation marker
      // bits are narrowed to char explicitly like in the other branches.
      result.resize(3);
      result[2] = static_cast<char>(0x80 | (0x3f & cp));
      result[1] = static_cast<char>(0x80 | (0x3f & (cp >> 6)));
      result[0] = static_cast<char>(0xE0 | (0xf & (cp >> 12)));
   }
   else if (cp <= 0x10FFFF)
   {
      // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
      result.resize(4);
      result[3] = static_cast<char>(0x80 | (0x3f & cp));
      result[2] = static_cast<char>(0x80 | (0x3f & (cp >> 6)));
      result[1] = static_cast<char>(0x80 | (0x3f & (cp >> 12)));
      result[0] = static_cast<char>(0xF0 | (0x7 & (cp >> 18)));
   }

   return result;
}
+ decoded += codePointToUTF8(unicode); } break; default: @@ -595,6 +630,35 @@ Reader::decodeString( Token &token, std::string &decoded ) return true; } +bool +Reader::decodeUnicodeCodePoint( Token &token, + Location ¤t, + Location end, + unsigned int &unicode ) +{ + + if ( !decodeUnicodeEscapeSequence( token, current, end, unicode ) ) + return false; + if (unicode >= 0xD800 && unicode <= 0xDBFF) + { + // surrogate pairs + if (end - current < 6) + return addError( "additional six characters expected to parse unicode surrogate pair.", token, current ); + unsigned int surrogatePair; + if (*(current++) == '\\' && *(current++)== 'u') + { + if (decodeUnicodeEscapeSequence( token, current, end, surrogatePair )) + { + unicode = 0x10000 + ((unicode & 0x3FF) << 10) + (surrogatePair & 0x3FF); + } + else + return false; + } + else + return addError( "expecting another \\u token to begin the second half of a unicode surrogate pair", token, current ); + } + return true; +} bool Reader::decodeUnicodeEscapeSequence( Token &token, diff --git a/src/lib_json/json_writer.cpp b/src/lib_json/json_writer.cpp index 9f2145a..111caac 100644 --- a/src/lib_json/json_writer.cpp +++ b/src/lib_json/json_writer.cpp @@ -4,6 +4,8 @@ #include #include #include +#include +#include #if _MSC_VER >= 1400 // VC++ 8.0 #pragma warning( disable : 4996 ) // disable warning about strdup being deprecated. @@ -11,6 +13,20 @@ namespace Json { +static bool isControlCharacter(char ch) +{ + return ch > 0 && ch <= 0x1F; +} + +static bool containsControlCharacter( const char* str ) +{ + while ( str ) + { + if ( isControlCharacter( *(str++) ) ) + return true; + } + return false; +} static void uintToString( unsigned int value, char *¤t ) { @@ -95,7 +111,7 @@ std::string valueToString( bool value ) std::string valueToQuotedString( const char *value ) { // Not sure how to handle unicode... 
- if (strpbrk(value, "\"\\\b\f\n\r\t") == NULL) + if (strpbrk(value, "\"\\\b\f\n\r\t") == NULL && !containsControlCharacter( value )) return std::string("\"") + value + "\""; // We have to walk value and escape any special characters. // Appending to std::string is not efficient, but this should be rare. @@ -132,8 +148,16 @@ std::string valueToQuotedString( const char *value ) // slash is also legal, so I see no reason to escape it. // (I hope I am not misunderstanding something.) default: - result += *c; + if ( isControlCharacter( *c ) ) + { + std::ostringstream oss; + oss << "\\u" << std::hex << std::uppercase << std::setfill('0') << std::setw(4) << static_cast(*c); + result += oss.str(); + } + else + result += *c; } + break; } result += "\""; return result; diff --git a/test/test_string_unicode_01.expected b/test/test_string_unicode_01.expected new file mode 100644 index 0000000..447f85a --- /dev/null +++ b/test/test_string_unicode_01.expected @@ -0,0 +1 @@ +.="a" diff --git a/test/test_string_unicode_01.json b/test/test_string_unicode_01.json new file mode 100644 index 0000000..024114b --- /dev/null +++ b/test/test_string_unicode_01.json @@ -0,0 +1 @@ +"\u0061" \ No newline at end of file diff --git a/test/test_string_unicode_02.expected b/test/test_string_unicode_02.expected new file mode 100644 index 0000000..c0b3b43 --- /dev/null +++ b/test/test_string_unicode_02.expected @@ -0,0 +1 @@ +.="¢" diff --git a/test/test_string_unicode_02.json b/test/test_string_unicode_02.json new file mode 100644 index 0000000..4961024 --- /dev/null +++ b/test/test_string_unicode_02.json @@ -0,0 +1 @@ +"\u00A2" \ No newline at end of file diff --git a/test/test_string_unicode_03.expected b/test/test_string_unicode_03.expected new file mode 100644 index 0000000..7289743 --- /dev/null +++ b/test/test_string_unicode_03.expected @@ -0,0 +1 @@ +.="€" diff --git a/test/test_string_unicode_03.json b/test/test_string_unicode_03.json new file mode 100644 index 0000000..e7e1a9e --- 
/dev/null +++ b/test/test_string_unicode_03.json @@ -0,0 +1 @@ +"\u20AC" \ No newline at end of file diff --git a/test/test_string_unicode_04.expected b/test/test_string_unicode_04.expected new file mode 100644 index 0000000..868fbc3 --- /dev/null +++ b/test/test_string_unicode_04.expected @@ -0,0 +1 @@ +.="𝄞" diff --git a/test/test_string_unicode_04.json b/test/test_string_unicode_04.json new file mode 100644 index 0000000..dae65c5 --- /dev/null +++ b/test/test_string_unicode_04.json @@ -0,0 +1 @@ +"\uD834\uDD1E" \ No newline at end of file