GH #176: Poco::JSON::Stringifier UTF encoding

2025-04-18 15:33:08 +02:00 · 2014-05-27 22:23:10 -05:00 · 2014-05-27 22:23:10 -05:00 · b2eb4fda59
commit b2eb4fda59
parent 1732938168
6 changed files with 105 additions and 86 deletions
--- a/JSON/include/Poco/JSON/Object.h
+++ b/JSON/include/Poco/JSON/Object.h
@ -221,7 +221,7 @@ private:
 		{
 			for(unsigned int i = 0; i < indent; i++) out << ' ';

-			out << '"' << getKey(it) << '"';
+			Stringifier::stringify(getKey(it), out);
 			out << ((indent > 0) ? " : " : ":");

 			Stringifier::stringify(getValue(it), out, indent + step, step);
--- a/JSON/include/Poco/JSON/Parser.h
+++ b/JSON/include/Poco/JSON/Parser.h
@ -602,6 +602,10 @@ private:
 	static const int _stateTransitionTable[NR_STATES][NR_CLASSES];
 	static const int xx = -1;

+	bool isHighSurrogate(unsigned uc);
+	bool isLowSurrogate(unsigned uc);
+	unsigned decodeSurrogatePair(unsigned hi, unsigned lo);
+
 	Handler::Ptr   _pHandler;
 	signed char    _state;
 	signed char    _beforeCommentState;
@ -713,6 +717,24 @@ inline void Parser::growBuffer()
 }


+inline bool Parser::isHighSurrogate(unsigned uc)
+{
+	// A UTF-16 high (leading) surrogate carries 110110 in bits 15..10.
+	const unsigned topSixBits = uc & 0xFC00;
+	return topSixBits == 0xD800;
+}
+
+
+inline bool Parser::isLowSurrogate(unsigned uc)
+{
+	// A UTF-16 low (trailing) surrogate carries 110111 in bits 15..10.
+	const unsigned topSixBits = uc & 0xFC00;
+	return topSixBits == 0xDC00;
+}
+
+
+inline unsigned Parser::decodeSurrogatePair(unsigned hi, unsigned lo)
+{
+	// Each surrogate contributes its low ten bits; together they encode the
+	// code point's offset from 0x10000.
+	const unsigned upperTen = (hi & 0x3FF) << 10;
+	const unsigned lowerTen = lo & 0x3FF;
+	return upperTen + lowerTen + 0x10000;
+}
+
+
 }} // namespace Poco::JSON


--- a/JSON/src/Parser.cpp
+++ b/JSON/src/Parser.cpp
@ -30,17 +30,7 @@ namespace Poco {
 namespace JSON {


-#ifndef IS_HIGH_SURROGATE
-	#define IS_HIGH_SURROGATE(uc) (((uc) & 0xFC00) == 0xD800)
-#endif
-#ifndef IS_LOW_SURROGATE
-	#define IS_LOW_SURROGATE(uc)  (((uc) & 0xFC00) == 0xDC00)
-#endif
-#ifndef DECODE_SURROGATE_PAIR
-	#define DECODE_SURROGATE_PAIR(hi,lo) ((((hi) & 0x3FF) << 10) + ((lo) & 0x3FF) + 0x10000)
-#endif
-#define COUNTOF(x) (sizeof(x)/sizeof(x[0])) 
-static unsigned char utf8_lead_bits[4] = { 0x00, 0xC0, 0xE0, 0xF0 };
+// Marker bits OR-ed into the first byte of a UTF-8 sequence, indexed by the
+// number of trail bytes that follow it (0 to 3).
+static const unsigned char UTF8_LEAD_BITS[4] =
+{
+	0x00, // no trail byte
+	0xC0, // one trail byte
+	0xE0, // two trail bytes
+	0xF0  // three trail bytes
+};


 const int Parser::_asciiClass[] = {
@ -245,6 +235,7 @@ void Parser::addEscapedCharToParseBuffer(CharIntType nextChar)
 	_escaped = 0;
 	// remove the backslash
 	parseBufferPopBackChar();
+
 	switch(nextChar)
 	{
 	case 'b':
@ -304,44 +295,38 @@ Parser::CharIntType Parser::decodeUnicodeChar()
 	int i;
 	unsigned uc = 0;
 	char* p;
-	int trail_bytes;
+	int trailBytes;

 	poco_assert(_parseBuffer.size() >= 6);
 	p = &_parseBuffer[_parseBuffer.size() - 4];

-	for (i = 12; i >= 0; i -= 4, ++p) {
+	for (i = 12; i >= 0; i -= 4, ++p)
+	{
 		unsigned x = *p;

-		if (x >= 'a') {
-			x -= ('a' - 10);
-		} else if (x >= 'A') {
-			x -= ('A' - 10);
-		} else {
-			x &= ~0x30u;
-		}
+		if (x >= 'a')      x -= ('a' - 10);
+		else if (x >= 'A') x -= ('A' - 10);
+		else               x &= ~0x30u;

 		poco_assert(x < 16);
-
 		uc |= x << i;
 	}

-	if ( !_allowNullByte && uc == 0 ) return 0; // Null byte not allowed
+	if ( !_allowNullByte && uc == 0 ) return 0; 

 	// clear UTF-16 char from buffer
 	_parseBuffer.resize(_parseBuffer.size() - 6);

-	// attempt decoding 
 	if (_utf16HighSurrogate)
 	{
-		if (IS_LOW_SURROGATE(uc))
+		if (isLowSurrogate(uc))
 		{
-			uc = DECODE_SURROGATE_PAIR(_utf16HighSurrogate, uc);
-			trail_bytes = 3;
+			uc = decodeSurrogatePair(_utf16HighSurrogate, uc);
+			trailBytes = 3;
 			_utf16HighSurrogate = 0;
 		}
-		else
+		else // high surrogate without a following low surrogate
 		{
-			// high surrogate without a following low surrogate
 			return 0;
 		}
 	}
@ -349,32 +334,32 @@ Parser::CharIntType Parser::decodeUnicodeChar()
 	{
 		if (uc < 0x80)
 		{
-			trail_bytes = 0;
+			trailBytes = 0;
 		}
 		else if (uc < 0x800)
 		{
-			trail_bytes = 1;
+			trailBytes = 1;
 		}
-		else if (IS_HIGH_SURROGATE(uc))
+		else if (isHighSurrogate(uc))
 		{
 			// save the high surrogate and wait for the low surrogate
 			_utf16HighSurrogate = uc;
 			return 1;
 		}
-		else if (IS_LOW_SURROGATE(uc))
+		else if (isLowSurrogate(uc))
 		{
 			// low surrogate without a preceding high surrogate 
 			return 0;
 		}
 		else
 		{
-			trail_bytes = 2;
+			trailBytes = 2;
 		}
 	}

-	_parseBuffer.append((char) ((uc >> (trail_bytes * 6)) | utf8_lead_bits[trail_bytes]));
+	_parseBuffer.append((char) ((uc >> (trailBytes * 6)) | UTF8_LEAD_BITS[trailBytes]));

-	for (i = trail_bytes * 6 - 6; i >= 0; i -= 6)
+	for (i = trailBytes * 6 - 6; i >= 0; i -= 6)
 	{
 		_parseBuffer.append((char) (((uc >> i) & 0x3F) | 0x80));
 	}
--- a/JSON/src/PrintHandler.cpp
+++ b/JSON/src/PrintHandler.cpp
@ -118,7 +118,8 @@ void PrintHandler::key(const std::string& k)
 		comma();
 		_value = false;
 	}
-	_out << _tab << '"' << k << '"';
+	_out << _tab;
+	Stringifier::formatString(k, _out);
 	if (!printFlat()) _out << ' ';
 	_out << ':';
 	if (!printFlat()) _out << ' ';
--- a/JSON/src/Stringifier.cpp
+++ b/JSON/src/Stringifier.cpp
@ -72,44 +72,11 @@ void Stringifier::formatString(const std::string& value, std::ostream& out)
 	out << '"';
 	for (std::string::const_iterator it = value.begin(); it != value.end(); ++it)
 	{
-		if (*it == 0x20 ||
-			*it == 0x21 ||
-			(*it >= 0x23 && *it <= 0x2E) ||
-			(*it >= 0x30 && *it <= 0x5B) ||
-			(*it >= 0x5D && *it <= 0xFF))
-			out << *it;
-		else if (*it == '"')
-			out << "\\\"";
-		else if (*it == '\\')
-			out << "\\\\";
-		else if (*it == '\b')
-			out << "\\b";
-		else if (*it == '\f')
-			out << "\\f";
-		else if (*it == '\n')
-			out << "\\n";
-		else if (*it == '\r')
-			out << "\\r";
-		else if (*it == '\t')
-			out << "\\t";
-		else if ( *it == '\0' )
-			out << "\\u0000";
-		else
-		{
-			const char *hexdigits = "0123456789ABCDEF";
-			unsigned long u = (std::min)(static_cast<unsigned long>(static_cast<unsigned char>(*it)), 0xFFFFul);
-			int d1 = u / 4096; u -= d1 * 4096;
-			int d2 = u / 256; u -= d2 * 256;
-			int d3 = u / 16; u -= d3 * 16;
-			int d4 = u;
-			out << "\\u";
-			out << hexdigits[d1];
-			out << hexdigits[d2];
-			out << hexdigits[d3];
-			out << hexdigits[d4];
-		}
+		// Escape one byte of the string as JSON requires: quote and backslash
+		// get a backslash prefix, control characters (below 0x20) get their
+		// named escape or a \u00XX sequence, and every other byte -- UTF-8
+		// lead and trail bytes (>= 0x80) included -- is copied unchanged.
+		// The byte is widened through unsigned char first; comparing a plain
+		// char against 0x1F treats bytes >= 0x80 as negative where char is
+		// signed and would prefix each of them with a stray backslash.
+		const unsigned char ch = static_cast<unsigned char>(*it);
+		switch (ch)
+		{
+		case '"':  out << "\\\""; break;
+		case '\\': out << "\\\\"; break;
+		case '\b': out << "\\b";  break;
+		case '\f': out << "\\f";  break;
+		case '\n': out << "\\n";  break;
+		case '\r': out << "\\r";  break;
+		case '\t': out << "\\t";  break;
+		default:
+			if (ch < 0x20)
+			{
+				// Remaining control characters (NUL included) have no short
+				// escape; a backslash followed by the raw byte is not valid.
+				static const char HEX_DIGITS[] = "0123456789ABCDEF";
+				out << "\\u00" << HEX_DIGITS[ch >> 4] << HEX_DIGITS[ch & 0x0F];
+			}
+			else out << *it;
+		}
 	}
 	out << '"';
 }

+
 } }  // Namespace Poco::JSON
--- a/JSON/testsuite/src/JSONTest.cpp
+++ b/JSON/testsuite/src/JSONTest.cpp
@ -1224,12 +1224,14 @@ void JSONTest::testPrintHandler()
 void JSONTest::testStringify()
 {
 	Object jObj(false);
-	jObj.set("foo", 0);
-	jObj.set("bar", 0);
+	jObj.set("foo\\", 0);
+	jObj.set("bar/", 0);
 	jObj.set("baz", 0);
+	jObj.set("q\"uote\"d", 0);
 	std::stringstream ss;
 	jObj.stringify(ss);
-	assert(ss.str() == "{\"bar\":0,\"baz\":0,\"foo\":0}");
+
+	assert(ss.str() == "{\"bar/\":0,\"baz\":0,\"foo\\\\\":0,\"q\\\"uote\\\"d\":0}");

 	std::string json = "{ \"Simpsons\" : { \"husband\" : { \"name\" : \"Homer\" , \"age\" : 38 }, \"wife\" : { \"name\" : \"Marge\", \"age\" : 36 }, "
 						"\"children\" : [ \"Bart\", \"Lisa\", \"Maggie\" ], "
@ -1269,6 +1271,7 @@ void JSONTest::testStringify()
 						"\"wife\":{"
 						"\"age\":36,\"name\":\"Marge\""
 						"}}}";
+
 	assert (ostr.str() == str);

 	ostr.str("");
@ -1653,16 +1656,9 @@ void JSONTest::testUnicode()
 	Parser parser;

 	Var result;
-	try
-	{
-		parser.parse(json);
-		result = parser.asVar();
-	}
-	catch(JSONException& jsone)
-	{
-		std::cout << jsone.message() << std::endl;
-		assert(false);
-	}
+	parser.parse(json);
+	result = parser.asVar();
+
 	assert(result.type() == typeid(Object::Ptr));

 	Object::Ptr object = result.extract<Object::Ptr>();
@ -1675,6 +1671,54 @@ void JSONTest::testUnicode()
 	converter.convert(text, original);

 	assert(test.convert<std::string>() == original);
+
+	parser.reset();
+	std::ostringstream os;
+	os << '[' << (char) 0x92 << ']';
+	try
+	{
+		parser.parse(os.str());
+		fail("Invalid Unicode sequence, must fail.");
+	}
+	catch (JSONException&) {}
+
+	parser.reset();
+	os.str("");
+	os << '[' << (char)0xC2 << (char)0x92 << ']';
+	result = parser.parse(os.str());
+	assert(result.type() == typeid(Poco::JSON::Array::Ptr));
+
+	parser.reset();
+	os.str("");
+	os << '[' << (char)0xAC << ']';
+	try
+	{
+		parser.parse(os.str());
+		fail("Invalid Unicode sequence, must fail.");
+	}
+	catch (JSONException&) {}
+
+	parser.reset();
+	os.str("");
+	os << '[' << (char)0xE2 << (char)0x82 << (char)0xAC << ']';
+	result = parser.parse(os.str());
+	assert(result.type() == typeid(Poco::JSON::Array::Ptr));
+
+	parser.reset();
+	os.str("");
+	os << '[' << (char)0xA2 << ']';
+	try
+	{
+		parser.parse(os.str());
+		fail("Invalid Unicode sequence, must fail.");
+	}
+	catch (JSONException&){}
+
+	parser.reset();
+	os.str("");
+	os << '[' << (char)0xF0 << (char)0xA4 << (char)0xAD << (char)0xAD << ']';
+	result = parser.parse(os.str());
+	assert(result.type() == typeid(Poco::JSON::Array::Ptr));
 }