diff --git a/src/lib_json/json_writer.cpp b/src/lib_json/json_writer.cpp index 802f96c..0852b48 100644 --- a/src/lib_json/json_writer.cpp +++ b/src/lib_json/json_writer.cpp @@ -81,24 +81,6 @@ typedef std::unique_ptr StreamWriterPtr; typedef std::auto_ptr StreamWriterPtr; #endif -static bool containsControlCharacter(const char* str) { - while (*str) { - if (isControlCharacter(*(str++))) - return true; - } - return false; -} - -static bool containsControlCharacter0(const char* str, unsigned len) { - char const* end = str + len; - while (end != str) { - if (isControlCharacter(*str) || 0==*str) - return true; - ++str; - } - return false; -} - JSONCPP_STRING valueToString(LargestInt value) { UIntToStringBuffer buffer; char* current = buffer + sizeof(buffer); @@ -176,89 +158,103 @@ JSONCPP_STRING valueToString(double value) { return valueToString(value, false, JSONCPP_STRING valueToString(bool value) { return value ? "true" : "false"; } -JSONCPP_STRING valueToQuotedString(const char* value) { - if (value == NULL) - return ""; - // Not sure how to handle unicode... - if (strpbrk(value, "\"\\\b\f\n\r\t") == NULL && - !containsControlCharacter(value)) - return JSONCPP_STRING("\"") + value + "\""; - // We have to walk value and escape any special characters. - // Appending to JSONCPP_STRING is not efficient, but this should be rare. - // (Note: forward slashes are *not* rare, but I am not escaping them.) - JSONCPP_STRING::size_type maxsize = - strlen(value) * 2 + 3; // allescaped+quotes+NULL - JSONCPP_STRING result; - result.reserve(maxsize); // to avoid lots of mallocs - result += "\""; - for (const char* c = value; *c != 0; ++c) { - switch (*c) { - case '\"': - result += "\\\""; - break; - case '\\': - result += "\\\\"; - break; - case '\b': - result += "\\b"; - break; - case '\f': - result += "\\f"; - break; - case '\n': - result += "\\n"; - break; - case '\r': - result += "\\r"; - break; - case '\t': - result += "\\t"; - break; - // case '/': - // Even though \/ is considered a legal escape in JSON, a bare - // slash is also legal, so I see no reason to escape it. - // (I hope I am not misunderstanding something. - // blep notes: actually escaping \/ may be useful in javascript to avoid (*c); - result += oss.str(); - } else { - result += *c; - } - break; - } - } - result += "\""; - return result; -} - -// https://github.com/upcaste/upcaste/blob/master/src/upcore/src/cstring/strnpbrk.cpp -static char const* strnpbrk(char const* s, char const* accept, size_t n) { - assert((s || !n) && accept); +static bool isAnyCharRequiredQuoting(char const* s, size_t n) { + assert(s || !n); char const* const end = s + n; for (char const* cur = s; cur < end; ++cur) { - int const c = *cur; - for (char const* a = accept; *a; ++a) { - if (*a == c) { - return cur; - } - } + if (*cur == '\\' || *cur == '\"' || *cur < ' ' + || static_cast(*cur) < 0x80) + return true; } - return NULL; + return false; } + +static unsigned int utf8ToCodepoint(const char*& s, const char* e) { + const unsigned int REPLACEMENT_CHARACTER = 0xFFFD; + + unsigned int firstByte = static_cast(*s); + + if (firstByte < 0x80) + return firstByte; + + if (firstByte < 0xE0) { + if (e - s < 2) + return REPLACEMENT_CHARACTER; + + unsigned int calculated = ((firstByte & 0x1F) << 6) + | (static_cast(s[1]) & 0x3F); + s += 1; + // oversized encoded characters are invalid + return calculated < 0x80 ? REPLACEMENT_CHARACTER : calculated; + } + + if (firstByte < 0xF0) { + if (e - s < 3) + return REPLACEMENT_CHARACTER; + + unsigned int calculated = ((firstByte & 0x0F) << 12) + | ((static_cast(s[1]) & 0x3F) << 6) + | (static_cast(s[2]) & 0x3F); + s += 2; + // surrogates aren't valid codepoints itself + // shouldn't be UTF-8 encoded + if (calculated >= 0xD800 && calculated >= 0xDFFF) + return REPLACEMENT_CHARACTER; + // oversized encoded characters are invalid + return calculated < 0x800 ? REPLACEMENT_CHARACTER : calculated; + } + + if (firstByte < 0xF8) { + if (e - s < 4) + return REPLACEMENT_CHARACTER; + + unsigned int calculated = ((firstByte & 0x07) << 24) + | ((static_cast(s[1]) & 0x3F) << 12) + | ((static_cast(s[2]) & 0x3F) << 6) + | (static_cast(s[3]) & 0x3F); + s += 3; + // oversized encoded characters are invalid + return calculated < 0x10000 ? REPLACEMENT_CHARACTER : calculated; + } + + return REPLACEMENT_CHARACTER; +} + +static const char hex2[] = + "000102030405060708090a0b0c0d0e0f" + "101112131415161718191a1b1c1d1e1f" + "202122232425262728292a2b2c2d2e2f" + "303132333435363738393a3b3c3d3e3f" + "404142434445464748494a4b4c4d4e4f" + "505152535455565758595a5b5c5d5e5f" + "606162636465666768696a6b6c6d6e6f" + "707172737475767778797a7b7c7d7e7f" + "808182838485868788898a8b8c8d8e8f" + "909192939495969798999a9b9c9d9e9f" + "a0a1a2a3a4a5a6a7a8a9aaabacadaeaf" + "b0b1b2b3b4b5b6b7b8b9babbbcbdbebf" + "c0c1c2c3c4c5c6c7c8c9cacbcccdcecf" + "d0d1d2d3d4d5d6d7d8d9dadbdcdddedf" + "e0e1e2e3e4e5e6e7e8e9eaebecedeeef" + "f0f1f2f3f4f5f6f7f8f9fafbfcfdfeff"; + +static JSONCPP_STRING toHex16Bit(unsigned int x) { + const unsigned int hi = (x >> 8) & 0xff; + const unsigned int lo = x & 0xff; + JSONCPP_STRING result(4, ' '); + result[0] = hex2[2 * hi]; + result[1] = hex2[2 * hi + 1]; + result[2] = hex2[2 * lo]; + result[3] = hex2[2 * lo + 1]; + return result; +} + static JSONCPP_STRING valueToQuotedStringN(const char* value, unsigned length) { if (value == NULL) return ""; - // Not sure how to handle unicode... - if (strnpbrk(value, "\"\\\b\f\n\r\t", length) == NULL && - !containsControlCharacter0(value, length)) + + if (!isAnyCharRequiredQuoting(value, length)) return JSONCPP_STRING("\"") + value + "\""; // We have to walk value and escape any special characters. // Appending to JSONCPP_STRING is not efficient, but this should be rare. @@ -300,14 +296,24 @@ static JSONCPP_STRING valueToQuotedStringN(const char* value, unsigned length) { // sequence. // Should add a flag to allow this compatibility mode and prevent this // sequence from occurring. - default: - if ((isControlCharacter(*c)) || (*c == 0)) { - JSONCPP_OSTRINGSTREAM oss; - oss << "\\u" << std::hex << std::uppercase << std::setfill('0') - << std::setw(4) << static_cast(*c); - result += oss.str(); - } else { - result += *c; + default: { + unsigned int cp = utf8ToCodepoint(c, end); + // don't escape non-control characters + // (short escape sequence are applied above) + if (cp < 0x80 && cp >= 0x20) + result += static_cast(cp); + else if (cp < 0x10000) { // codepoint is in Basic Multilingual Plane + result += "\\u"; + result += toHex16Bit(cp); + } + else { // codepoint is not in Basic Multilingual Plane + // convert to surrogate pair first + cp -= 0x10000; + result += "\\u"; + result += toHex16Bit((cp >> 10) + 0xD800); + result += "\\u"; + result += toHex16Bit((cp & 0x3FF) + 0xDC00); + } } break; } @@ -316,6 +322,10 @@ static JSONCPP_STRING valueToQuotedStringN(const char* value, unsigned length) { return result; } +JSONCPP_STRING valueToQuotedString(const char* value) { + return valueToQuotedStringN(value, static_cast(strlen(value))); +} + // Class Writer // ////////////////////////////////////////////////////////////////// Writer::~Writer() {}