From a955529e474e819319023a9d456a1c9bdc745f31 Mon Sep 17 00:00:00 2001 From: nicolaswilson <56082512+nicolaswilson@users.noreply.github.com> Date: Thu, 17 Oct 2019 18:47:51 +0100 Subject: [PATCH] Added emitUTF8 setting. (#1045) * Added emitUTF8 setting to emit UTF8 format JSON. * Added a test for emitUTF8, with it in default, on and off states. * Review comments addressed. * Merged master into my branch & resolved conflicts. * Fix clang-format errors. * Fix clang-format errors. * Fixed clang-format errors. * Fixed clang-format errors. --- src/lib_json/json_writer.cpp | 75 ++++++++++++++++++++++-------------- src/test_lib_json/main.cpp | 29 ++++++++++++++ 2 files changed, 75 insertions(+), 29 deletions(-) diff --git a/src/lib_json/json_writer.cpp b/src/lib_json/json_writer.cpp index e16d84f..519ce23 100644 --- a/src/lib_json/json_writer.cpp +++ b/src/lib_json/json_writer.cpp @@ -264,7 +264,8 @@ static String toHex16Bit(unsigned int x) { return result; } -static String valueToQuotedStringN(const char* value, unsigned length) { +static String valueToQuotedStringN(const char* value, unsigned length, + bool emitUTF8 = false) { if (value == nullptr) return ""; @@ -310,21 +311,31 @@ static String valueToQuotedStringN(const char* value, unsigned length) { // Should add a flag to allow this compatibility mode and prevent this // sequence from occurring. 
default: { - unsigned int cp = utf8ToCodepoint(c, end); - // don't escape non-control characters - // (short escape sequence are applied above) - if (cp < 0x80 && cp >= 0x20) - result += static_cast<char>(cp); - else if (cp < 0x10000) { // codepoint is in Basic Multilingual Plane - result += "\\u"; - result += toHex16Bit(cp); - } else { // codepoint is not in Basic Multilingual Plane - // convert to surrogate pair first - cp -= 0x10000; - result += "\\u"; - result += toHex16Bit((cp >> 10) + 0xD800); - result += "\\u"; - result += toHex16Bit((cp & 0x3FF) + 0xDC00); + if (emitUTF8) { + result += *c; + } else { + unsigned int codepoint = utf8ToCodepoint(c, end); + const unsigned int FIRST_NON_CONTROL_CODEPOINT = 0x20; + const unsigned int LAST_NON_CONTROL_CODEPOINT = 0x7F; + const unsigned int FIRST_SURROGATE_PAIR_CODEPOINT = 0x10000; + // don't escape non-control characters + // (short escape sequence are applied above) + if (FIRST_NON_CONTROL_CODEPOINT <= codepoint && + codepoint <= LAST_NON_CONTROL_CODEPOINT) { + result += static_cast<char>(codepoint); + } else if (codepoint < + FIRST_SURROGATE_PAIR_CODEPOINT) { // codepoint is in Basic + // Multilingual Plane + result += "\\u"; + result += toHex16Bit(codepoint); + } else { // codepoint is not in Basic Multilingual Plane + // convert to surrogate pair first + codepoint -= FIRST_SURROGATE_PAIR_CODEPOINT; + result += "\\u"; + result += toHex16Bit((codepoint >> 10) + 0xD800); + result += "\\u"; + result += toHex16Bit((codepoint & 0x3FF) + 0xDC00); + } } } break; } @@ -864,7 +875,8 @@ struct BuiltStyledStreamWriter : public StreamWriter { BuiltStyledStreamWriter(String indentation, CommentStyle::Enum cs, String colonSymbol, String nullSymbol, String endingLineFeedSymbol, bool useSpecialFloats, - unsigned int precision, PrecisionType precisionType); + bool emitUTF8, unsigned int precision, + PrecisionType precisionType); int write(Value const& root, OStream* sout) override; private: @@ -893,19 +905,20 @@ private: bool
addChildValues_ : 1; bool indented_ : 1; bool useSpecialFloats_ : 1; + bool emitUTF8_ : 1; unsigned int precision_; PrecisionType precisionType_; }; BuiltStyledStreamWriter::BuiltStyledStreamWriter( String indentation, CommentStyle::Enum cs, String colonSymbol, String nullSymbol, String endingLineFeedSymbol, bool useSpecialFloats, - unsigned int precision, PrecisionType precisionType) + bool emitUTF8, unsigned int precision, PrecisionType precisionType) : rightMargin_(74), indentation_(std::move(indentation)), cs_(cs), colonSymbol_(std::move(colonSymbol)), nullSymbol_(std::move(nullSymbol)), endingLineFeedSymbol_(std::move(endingLineFeedSymbol)), addChildValues_(false), indented_(false), - useSpecialFloats_(useSpecialFloats), precision_(precision), - precisionType_(precisionType) {} + useSpecialFloats_(useSpecialFloats), emitUTF8_(emitUTF8), + precision_(precision), precisionType_(precisionType) {} int BuiltStyledStreamWriter::write(Value const& root, OStream* sout) { sout_ = sout; addChildValues_ = false; @@ -942,7 +955,8 @@ void BuiltStyledStreamWriter::writeValue(Value const& value) { char const* end; bool ok = value.getString(&str, &end); if (ok) - pushValue(valueToQuotedStringN(str, static_cast<unsigned>(end - str))); + pushValue(valueToQuotedStringN(str, static_cast<unsigned>(end - str), + emitUTF8_)); else pushValue(""); break; @@ -966,7 +980,7 @@ void BuiltStyledStreamWriter::writeValue(Value const& value) { Value const& childValue = value[name]; writeCommentBeforeValue(childValue); writeWithIndent(valueToQuotedStringN( - name.data(), static_cast<unsigned>(name.length()))); + name.data(), static_cast<unsigned>(name.length()), emitUTF8_)); *sout_ << colonSymbol_; writeValue(childValue); if (++it == members.end()) { @@ -1142,12 +1156,13 @@ StreamWriter::Factory::~Factory() = default; StreamWriterBuilder::StreamWriterBuilder() { setDefaults(&settings_); } StreamWriterBuilder::~StreamWriterBuilder() = default; StreamWriter* StreamWriterBuilder::newStreamWriter() const { - String indentation =
settings_["indentation"].asString(); - String cs_str = settings_["commentStyle"].asString(); - String pt_str = settings_["precisionType"].asString(); - bool eyc = settings_["enableYAMLCompatibility"].asBool(); - bool dnp = settings_["dropNullPlaceholders"].asBool(); - bool usf = settings_["useSpecialFloats"].asBool(); + const String indentation = settings_["indentation"].asString(); + const String cs_str = settings_["commentStyle"].asString(); + const String pt_str = settings_["precisionType"].asString(); + const bool eyc = settings_["enableYAMLCompatibility"].asBool(); + const bool dnp = settings_["dropNullPlaceholders"].asBool(); + const bool usf = settings_["useSpecialFloats"].asBool(); + const bool emitUTF8 = settings_["emitUTF8"].asBool(); unsigned int pre = settings_["precision"].asUInt(); CommentStyle::Enum cs = CommentStyle::All; if (cs_str == "All") { @@ -1179,7 +1194,7 @@ StreamWriter* StreamWriterBuilder::newStreamWriter() const { pre = 17; String endingLineFeedSymbol; return new BuiltStyledStreamWriter(indentation, cs, colonSymbol, nullSymbol, - endingLineFeedSymbol, usf, pre, + endingLineFeedSymbol, usf, emitUTF8, pre, precisionType); } static void getValidWriterKeys(std::set<String>* valid_keys) { @@ -1189,6 +1204,7 @@ static void getValidWriterKeys(std::set<String>* valid_keys) { valid_keys->insert("enableYAMLCompatibility"); valid_keys->insert("dropNullPlaceholders"); valid_keys->insert("useSpecialFloats"); + valid_keys->insert("emitUTF8"); valid_keys->insert("precision"); valid_keys->insert("precisionType"); } @@ -1220,6 +1236,7 @@ void StreamWriterBuilder::setDefaults(Json::Value* settings) { (*settings)["enableYAMLCompatibility"] = false; (*settings)["dropNullPlaceholders"] = false; (*settings)["useSpecialFloats"] = false; + (*settings)["emitUTF8"] = false; (*settings)["precision"] = 17; (*settings)["precisionType"] = "significant"; //!
[StreamWriterBuilderDefaults] diff --git a/src/test_lib_json/main.cpp b/src/test_lib_json/main.cpp index f32a11f..326519f 100644 --- a/src/test_lib_json/main.cpp +++ b/src/test_lib_json/main.cpp @@ -2481,6 +2481,35 @@ JSONTEST_FIXTURE_LOCAL(StreamWriterTest, writeZeroes) { } } +JSONTEST_FIXTURE_LOCAL(StreamWriterTest, unicode) { + // Create a Json value containing UTF-8 string with some chars that need + // escape (tab,newline). + Json::Value root; + root["test"] = "\t\n\xF0\x91\xA2\xA1\x3D\xC4\xB3\xF0\x9B\x84\x9B\xEF\xBD\xA7"; + + Json::StreamWriterBuilder b; + + // Default settings - should be unicode escaped. + JSONTEST_ASSERT(Json::writeString(b, root) == + "{\n\t\"test\" : " + "\"\\t\\n\\ud806\\udca1=\\u0133\\ud82c\\udd1b\\uff67\"\n}"); + + b.settings_["emitUTF8"] = true; + + // Should not be unicode escaped. + JSONTEST_ASSERT( + Json::writeString(b, root) == + "{\n\t\"test\" : " + "\"\\t\\n\xF0\x91\xA2\xA1=\xC4\xB3\xF0\x9B\x84\x9B\xEF\xBD\xA7\"\n}"); + + b.settings_["emitUTF8"] = false; + + // Should be unicode escaped. + JSONTEST_ASSERT(Json::writeString(b, root) == + "{\n\t\"test\" : " + "\"\\t\\n\\ud806\\udca1=\\u0133\\ud82c\\udd1b\\uff67\"\n}"); +} + struct ReaderTest : JsonTest::TestCase {}; JSONTEST_FIXTURE_LOCAL(ReaderTest, parseWithNoErrors) {