Implementing support for reading and writing Unicode escape sequences.

2025-10-14 15:05:34 +02:00 · 2009-11-13 04:21:14 +00:00
parent 060c45a38d
commit ee4b4dab54
11 changed files with 105 additions and 5 deletions
--- a/include/json/reader.h
+++ b/include/json/reader.h
@@ -115,6 +115,10 @@ namespace Json {
      bool decodeString( Token &token );
      bool decodeString( Token &token, std::string &decoded );
      bool decodeDouble( Token &token );
      // Decodes one Unicode code point from a \uXXXX escape sequence. When the
      // decoded value is a high surrogate (0xD800..0xDBFF), a second \uXXXX
      // escape is consumed as well and both halves are combined into a single
      // code point at or above 0x10000.
      // 'current' is the read cursor and is advanced as input is consumed;
      // 'end' bounds it. The result is stored in 'unicode'.
      // Returns false when decoding fails.
      bool decodeUnicodeCodePoint( Token &token, 
                                   Location &current, 
                                   Location end, 
                                   unsigned int &unicode );
      bool decodeUnicodeEscapeSequence( Token &token, 
                                        Location &current, 
                                        Location end, 
--- a/src/lib_json/json_reader.cpp
+++ b/src/lib_json/json_reader.cpp
@@ -36,6 +36,42 @@ containsNewLine( Reader::Location begin,
   return false;
 }
 // Encodes a single Unicode code point as a UTF-8 byte sequence.
 //
 // cp: the code point to encode.
 // Returns a string of 1 to 4 bytes, chosen by the magnitude of cp:
 //   cp <= 0x7F      -> 1 byte
 //   cp <= 0x7FF     -> 2 bytes
 //   cp <= 0xFFFF    -> 3 bytes
 //   cp <= 0x10FFFF  -> 4 bytes
 // Values above 0x10FFFF produce an empty string.
 // Layout follows the description at http://en.wikipedia.org/wiki/UTF-8
 static std::string codePointToUTF8(unsigned int cp)
 {
   std::string utf8;
   if (cp <= 0x7f)
   {
      // Plain ASCII: the code point is its own encoding.
      utf8 += static_cast<char>(cp);
   }
   else if (cp <= 0x7FF)
   {
      // 110xxxxx 10xxxxxx
      utf8 += static_cast<char>(0xC0 | (0x1f & (cp >> 6)));
      utf8 += static_cast<char>(0x80 | (0x3f & cp));
   }
   else if (cp <= 0xFFFF)
   {
      // 1110xxxx 10xxxxxx 10xxxxxx
      utf8 += static_cast<char>(0xE0 | (0xf & (cp >> 12)));
      utf8 += static_cast<char>(0x80 | (0x3f & (cp >> 6)));
      utf8 += static_cast<char>(0x80 | (0x3f & cp));
   }
   else if (cp <= 0x10FFFF)
   {
      // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
      utf8 += static_cast<char>(0xF0 | (0x7 & (cp >> 18)));
      utf8 += static_cast<char>(0x80 | (0x3f & (cp >> 12)));
      utf8 += static_cast<char>(0x80 | (0x3f & (cp >> 6)));
      utf8 += static_cast<char>(0x80 | (0x3f & cp));
   }
   return utf8;
 }
 // Class Reader
 // //////////////////////////////////////////////////////////////////
@@ -577,10 +613,9 @@ Reader::decodeString( Token &token, std::string &decoded )
         case 'u':
            {
               unsigned int unicode;
-               if ( !decodeUnicodeEscapeSequence( token, current, end, unicode ) )
+               if ( !decodeUnicodeCodePoint( token, current, end, unicode ) )
                  return false;
-               // @todo encode unicode as utf8.
+               decoded += codePointToUTF8(unicode);
 	       // @todo remember to alter the writer too.
            }
            break;
         default:
@@ -595,6 +630,35 @@ Reader::decodeString( Token &token, std::string &decoded )
   return true;
 }
 // Decodes one Unicode code point from a \uXXXX escape sequence, combining a
 // UTF-16 surrogate pair into a single code point when the first escape is a
 // high surrogate (0xD800..0xDBFF).
 //
 // token:   token being decoded; forwarded to addError() for error reporting.
 // current: read cursor, advanced as input is consumed.
 // end:     limit for the read cursor.
 // unicode: receives the decoded code point.
 // Returns true on success. On failure returns false from the failed
 // decodeUnicodeEscapeSequence() call, or the result of addError().
 bool
 Reader::decodeUnicodeCodePoint( Token &token, 
                                 Location &current, 
                                 Location end, 
                                 unsigned int &unicode )
 {
   if ( !decodeUnicodeEscapeSequence( token, current, end, unicode ) )
      return false;
   // Anything outside the high-surrogate range is already a complete code point.
   if ( unicode < 0xD800 || unicode > 0xDBFF )
      return true;
   // A high surrogate must be followed by a second escape: '\', 'u' and
   // (presumably) four hex digits, hence six more characters.
   if ( end - current < 6 )
      return addError( "additional six characters expected to parse unicode surrogate pair.", token, current );
   if ( *(current++) != '\\' || *(current++) != 'u' )
      return addError( "expecting another \\u token to begin the second half of a unicode surrogate pair", token, current );
   unsigned int surrogatePair;
   if ( !decodeUnicodeEscapeSequence( token, current, end, surrogatePair ) )
      return false;
   // The second half must be a low surrogate; otherwise masking with 0x3FF
   // would silently fabricate an unrelated supplementary code point.
   if ( surrogatePair < 0xDC00 || surrogatePair > 0xDFFF )
      return addError( "expecting a low surrogate (\\uDC00-\\uDFFF) as the second half of a unicode surrogate pair", token, current );
   // Each half contributes 10 payload bits on top of the 0x10000 base.
   unicode = 0x10000 + ((unicode & 0x3FF) << 10) + (surrogatePair & 0x3FF);
   return true;
 }
 bool 
 Reader::decodeUnicodeEscapeSequence( Token &token, 
--- a/src/lib_json/json_writer.cpp
+++ b/src/lib_json/json_writer.cpp
@@ -4,6 +4,8 @@
 #include <stdio.h>
 #include <string.h>
 #include <iostream>
 #include <sstream>
 #include <iomanip>
 #if _MSC_VER >= 1400 // VC++ 8.0
 #pragma warning( disable : 4996 )   // disable warning about strdup being deprecated.
@@ -11,6 +13,20 @@
 namespace Json {
 // Returns true when ch lies in the range 0x01..0x1F.
 // NUL (0) is not reported as a control character by this predicate.
 static bool isControlCharacter(char ch)
 {
   return ch > 0 && ch <= 0x1F;
 }
 // Returns true when the NUL-terminated string str contains at least one
 // character for which isControlCharacter() is true.
 // A null pointer is treated as "no control characters" and yields false.
 static bool containsControlCharacter( const char* str )
 {
   if ( !str )
      return false;
   // Test the character, not the pointer: the scan must stop at the
   // terminating NUL instead of running past the end of the buffer.
   while ( *str )
   {
      if ( isControlCharacter( *(str++) ) )
         return true;
   }
   return false;
 }
 static void uintToString( unsigned int value, 
                          char *&current )
 {
@@ -95,7 +111,7 @@ std::string valueToString( bool value )
 std::string valueToQuotedString( const char *value )
 {
   // Not sure how to handle unicode...
-   if (strpbrk(value, "\"\\\b\f\n\r\t") == NULL)
+   if (strpbrk(value, "\"\\\b\f\n\r\t") == NULL && !containsControlCharacter( value ))
      return std::string("\"") + value + "\"";
   // We have to walk value and escape any special characters.
   // Appending to std::string is not efficient, but this should be rare.
@@ -132,8 +148,16 @@ std::string valueToQuotedString( const char *value )
 	 // slash is also legal, so I see no reason to escape it.
 	 // (I hope I am not misunderstanding something.)
 	 default:
-	    result += *c;
+	    if ( isControlCharacter( *c ) )
       {
          std::ostringstream oss;
          oss << "\\u" << std::hex << std::uppercase << std::setfill('0') << std::setw(4) << static_cast<int>(*c);
          result += oss.str();
       }
       else
         result += *c;
      }
      break;
   }
   result += "\"";
   return result;
--- a/test/test_string_unicode_01.expected
+++ b/test/test_string_unicode_01.expected
@@ -0,0 +1 @@
 .="a"
--- a/test/test_string_unicode_01.json
+++ b/test/test_string_unicode_01.json
@@ -0,0 +1 @@
 "\u0061"
--- a/test/test_string_unicode_02.expected
+++ b/test/test_string_unicode_02.expected
@@ -0,0 +1 @@
 .="¢"
--- a/test/test_string_unicode_02.json
+++ b/test/test_string_unicode_02.json
@@ -0,0 +1 @@
 "\u00A2"
--- a/test/test_string_unicode_03.expected
+++ b/test/test_string_unicode_03.expected
@@ -0,0 +1 @@
 .="€"
--- a/test/test_string_unicode_03.json
+++ b/test/test_string_unicode_03.json
@@ -0,0 +1 @@
 "\u20AC"
--- a/test/test_string_unicode_04.expected
+++ b/test/test_string_unicode_04.expected
@@ -0,0 +1 @@
 .="𝄞"
--- a/test/test_string_unicode_04.json
+++ b/test/test_string_unicode_04.json
@@ -0,0 +1 @@
 "\uD834\uDD1E"