Added emitUTF8 setting. (#1045)

* Added emitUTF8 setting to emit UTF8 format JSON.

* Added a test for emitUTF8, with it in default, on and off states.

* Review comments addressed.

* Merged master into my branch & resolved conflicts.

* Fix clang-format errors.

* Fix clang-format errors.

* Fixed clang-format errors.

* Fixed clang-format errors.
This commit is contained in:
nicolaswilson 2019-10-17 18:47:51 +01:00 committed by Jordan Bayles
parent f59ac2a1d7
commit a955529e47
2 changed files with 75 additions and 29 deletions

View File

@ -264,7 +264,8 @@ static String toHex16Bit(unsigned int x) {
return result;
}
static String valueToQuotedStringN(const char* value, unsigned length) {
static String valueToQuotedStringN(const char* value, unsigned length,
bool emitUTF8 = false) {
if (value == nullptr)
return "";
@ -310,21 +311,31 @@ static String valueToQuotedStringN(const char* value, unsigned length) {
// Should add a flag to allow this compatibility mode and prevent this
// sequence from occurring.
default: {
unsigned int cp = utf8ToCodepoint(c, end);
// don't escape non-control characters
// (short escape sequence are applied above)
if (cp < 0x80 && cp >= 0x20)
result += static_cast<char>(cp);
else if (cp < 0x10000) { // codepoint is in Basic Multilingual Plane
result += "\\u";
result += toHex16Bit(cp);
} else { // codepoint is not in Basic Multilingual Plane
// convert to surrogate pair first
cp -= 0x10000;
result += "\\u";
result += toHex16Bit((cp >> 10) + 0xD800);
result += "\\u";
result += toHex16Bit((cp & 0x3FF) + 0xDC00);
if (emitUTF8) {
result += *c;
} else {
unsigned int codepoint = utf8ToCodepoint(c, end);
const unsigned int FIRST_NON_CONTROL_CODEPOINT = 0x20;
const unsigned int LAST_NON_CONTROL_CODEPOINT = 0x7F;
const unsigned int FIRST_SURROGATE_PAIR_CODEPOINT = 0x10000;
// don't escape non-control characters
// (short escape sequence are applied above)
if (FIRST_NON_CONTROL_CODEPOINT <= codepoint &&
codepoint <= LAST_NON_CONTROL_CODEPOINT) {
result += static_cast<char>(codepoint);
} else if (codepoint <
FIRST_SURROGATE_PAIR_CODEPOINT) { // codepoint is in Basic
// Multilingual Plane
result += "\\u";
result += toHex16Bit(codepoint);
} else { // codepoint is not in Basic Multilingual Plane
// convert to surrogate pair first
codepoint -= FIRST_SURROGATE_PAIR_CODEPOINT;
result += "\\u";
result += toHex16Bit((codepoint >> 10) + 0xD800);
result += "\\u";
result += toHex16Bit((codepoint & 0x3FF) + 0xDC00);
}
}
} break;
}
@ -864,7 +875,8 @@ struct BuiltStyledStreamWriter : public StreamWriter {
BuiltStyledStreamWriter(String indentation, CommentStyle::Enum cs,
String colonSymbol, String nullSymbol,
String endingLineFeedSymbol, bool useSpecialFloats,
unsigned int precision, PrecisionType precisionType);
bool emitUTF8, unsigned int precision,
PrecisionType precisionType);
int write(Value const& root, OStream* sout) override;
private:
@ -893,19 +905,20 @@ private:
bool addChildValues_ : 1;
bool indented_ : 1;
bool useSpecialFloats_ : 1;
bool emitUTF8_ : 1;
unsigned int precision_;
PrecisionType precisionType_;
};
BuiltStyledStreamWriter::BuiltStyledStreamWriter(
String indentation, CommentStyle::Enum cs, String colonSymbol,
String nullSymbol, String endingLineFeedSymbol, bool useSpecialFloats,
unsigned int precision, PrecisionType precisionType)
bool emitUTF8, unsigned int precision, PrecisionType precisionType)
: rightMargin_(74), indentation_(std::move(indentation)), cs_(cs),
colonSymbol_(std::move(colonSymbol)), nullSymbol_(std::move(nullSymbol)),
endingLineFeedSymbol_(std::move(endingLineFeedSymbol)),
addChildValues_(false), indented_(false),
useSpecialFloats_(useSpecialFloats), precision_(precision),
precisionType_(precisionType) {}
useSpecialFloats_(useSpecialFloats), emitUTF8_(emitUTF8),
precision_(precision), precisionType_(precisionType) {}
int BuiltStyledStreamWriter::write(Value const& root, OStream* sout) {
sout_ = sout;
addChildValues_ = false;
@ -942,7 +955,8 @@ void BuiltStyledStreamWriter::writeValue(Value const& value) {
char const* end;
bool ok = value.getString(&str, &end);
if (ok)
pushValue(valueToQuotedStringN(str, static_cast<unsigned>(end - str)));
pushValue(valueToQuotedStringN(str, static_cast<unsigned>(end - str),
emitUTF8_));
else
pushValue("");
break;
@ -966,7 +980,7 @@ void BuiltStyledStreamWriter::writeValue(Value const& value) {
Value const& childValue = value[name];
writeCommentBeforeValue(childValue);
writeWithIndent(valueToQuotedStringN(
name.data(), static_cast<unsigned>(name.length())));
name.data(), static_cast<unsigned>(name.length()), emitUTF8_));
*sout_ << colonSymbol_;
writeValue(childValue);
if (++it == members.end()) {
@ -1142,12 +1156,13 @@ StreamWriter::Factory::~Factory() = default;
StreamWriterBuilder::StreamWriterBuilder() { setDefaults(&settings_); }
StreamWriterBuilder::~StreamWriterBuilder() = default;
StreamWriter* StreamWriterBuilder::newStreamWriter() const {
String indentation = settings_["indentation"].asString();
String cs_str = settings_["commentStyle"].asString();
String pt_str = settings_["precisionType"].asString();
bool eyc = settings_["enableYAMLCompatibility"].asBool();
bool dnp = settings_["dropNullPlaceholders"].asBool();
bool usf = settings_["useSpecialFloats"].asBool();
const String indentation = settings_["indentation"].asString();
const String cs_str = settings_["commentStyle"].asString();
const String pt_str = settings_["precisionType"].asString();
const bool eyc = settings_["enableYAMLCompatibility"].asBool();
const bool dnp = settings_["dropNullPlaceholders"].asBool();
const bool usf = settings_["useSpecialFloats"].asBool();
const bool emitUTF8 = settings_["emitUTF8"].asBool();
unsigned int pre = settings_["precision"].asUInt();
CommentStyle::Enum cs = CommentStyle::All;
if (cs_str == "All") {
@ -1179,7 +1194,7 @@ StreamWriter* StreamWriterBuilder::newStreamWriter() const {
pre = 17;
String endingLineFeedSymbol;
return new BuiltStyledStreamWriter(indentation, cs, colonSymbol, nullSymbol,
endingLineFeedSymbol, usf, pre,
endingLineFeedSymbol, usf, emitUTF8, pre,
precisionType);
}
static void getValidWriterKeys(std::set<String>* valid_keys) {
@ -1189,6 +1204,7 @@ static void getValidWriterKeys(std::set<String>* valid_keys) {
valid_keys->insert("enableYAMLCompatibility");
valid_keys->insert("dropNullPlaceholders");
valid_keys->insert("useSpecialFloats");
valid_keys->insert("emitUTF8");
valid_keys->insert("precision");
valid_keys->insert("precisionType");
}
@ -1220,6 +1236,7 @@ void StreamWriterBuilder::setDefaults(Json::Value* settings) {
(*settings)["enableYAMLCompatibility"] = false;
(*settings)["dropNullPlaceholders"] = false;
(*settings)["useSpecialFloats"] = false;
(*settings)["emitUTF8"] = false;
(*settings)["precision"] = 17;
(*settings)["precisionType"] = "significant";
//! [StreamWriterBuilderDefaults]

View File

@ -2481,6 +2481,35 @@ JSONTEST_FIXTURE_LOCAL(StreamWriterTest, writeZeroes) {
}
}
JSONTEST_FIXTURE_LOCAL(StreamWriterTest, unicode) {
// Create a Json value containing UTF-8 string with some chars that need
// escape (tab,newline).
Json::Value root;
root["test"] = "\t\n\xF0\x91\xA2\xA1\x3D\xC4\xB3\xF0\x9B\x84\x9B\xEF\xBD\xA7";
Json::StreamWriterBuilder b;
// Default settings - should be unicode escaped.
JSONTEST_ASSERT(Json::writeString(b, root) ==
"{\n\t\"test\" : "
"\"\\t\\n\\ud806\\udca1=\\u0133\\ud82c\\udd1b\\uff67\"\n}");
b.settings_["emitUTF8"] = true;
// Should not be unicode escaped.
JSONTEST_ASSERT(
Json::writeString(b, root) ==
"{\n\t\"test\" : "
"\"\\t\\n\xF0\x91\xA2\xA1=\xC4\xB3\xF0\x9B\x84\x9B\xEF\xBD\xA7\"\n}");
b.settings_["emitUTF8"] = false;
// Should be unicode escaped.
JSONTEST_ASSERT(Json::writeString(b, root) ==
"{\n\t\"test\" : "
"\"\\t\\n\\ud806\\udca1=\\u0133\\ud82c\\udd1b\\uff67\"\n}");
}
struct ReaderTest : JsonTest::TestCase {};
JSONTEST_FIXTURE_LOCAL(ReaderTest, parseWithNoErrors) {