From 5964ae0a803b9b07bde50459365a9fcc9c881118 Mon Sep 17 00:00:00 2001 From: fbraem Date: Fri, 4 Jan 2013 17:09:04 +0100 Subject: [PATCH] JSON unicode fixes and running tests on invalid unicode JSON --- JSON/src/Parser.cpp | 138 +++++++++++++++++++++++++------- JSON/testsuite/src/JSONTest.cpp | 108 ++++++++++++++++++++----- JSON/testsuite/src/JSONTest.h | 3 +- 3 files changed, 199 insertions(+), 50 deletions(-) diff --git a/JSON/src/Parser.cpp b/JSON/src/Parser.cpp index 54fe97239..2a8f7b8d8 100644 --- a/JSON/src/Parser.cpp +++ b/JSON/src/Parser.cpp @@ -38,6 +38,7 @@ #include "Poco/JSON/JSONException.h" #include "Poco/Ascii.h" #include "Poco/Token.h" +#include "Poco/UTF8Encoding.h" #undef min #undef max #include @@ -66,11 +67,11 @@ public: bool start(char c, std::istream& istr) { if ( c == '{' - || c == '}' - || c == ']' - || c == '[' - || c == ',' - || c == ':' ) + || c == '}' + || c == ']' + || c == '[' + || c == ',' + || c == ':' ) { _value = c; return true; @@ -118,8 +119,8 @@ public: void finish(std::istream& istr) { - int c = istr.get(); - while (c != -1) + int c = 0; + while ((c = istr.get()) != -1) { if ( c == 0 ) { @@ -133,6 +134,31 @@ public: if ( c == '"' ) break; + + if(0x80 <= c && c <= 0xFF) + { + int count = utf8_check_first(c); + if (!count) + { + throw JSONException(format("Unable to decode byte 0x%x", (unsigned int) c)); + } + + char buffer[5]; + buffer[0] = c; + for(int i = 1; i < count; ++i) + { + buffer[i] = istr.get(); + } + + if ( !UTF8Encoding::isLegal((unsigned char*) buffer, count) ) + { + throw JSONException("No legal UTF8 found"); + } + buffer[count] = '\0'; + _value += buffer; + + continue; + } if ( c == '\\' ) // Escaped String { @@ -196,8 +222,16 @@ public: { throw JSONException("Invalid unicode"); } - c = unicode; - break; + + Poco::UTF8Encoding utf8encoding; + int length = utf8encoding.convert(unicode, NULL, 0); + std::vector convert(length); + utf8encoding.convert(unicode, &convert[0], length); + for(int i = 0; i < length; ++i) + { + _value += (char) convert[i]; + } + continue; } default: { @@ -206,7 +240,6 @@ public: } } _value += c; - c = istr.get(); } if ( c == -1 ) @@ -241,6 +274,49 @@ public: return value; } + +private: + int utf8_check_first(char byte) + { + unsigned char u = (unsigned char) byte; + + if(u < 0x80) + return 1; + + if (0x80 <= u && u <= 0xBF) + { + /* second, third or fourth byte of a multi-byte + sequence, i.e. a "continuation byte" */ + return 0; + } + else if(u == 0xC0 || u == 0xC1) + { + /* overlong encoding of an ASCII byte */ + return 0; + } + else if(0xC2 <= u && u <= 0xDF) + { + /* 2-byte sequence */ + return 2; + } + else if(0xE0 <= u && u <= 0xEF) + { + /* 3-byte sequence */ + return 3; + } + else if(0xF0 <= u && u <= 0xF4) + { + /* 4-byte sequence */ + return 4; + } + else + { + /* u >= 0xF5 */ + /* Restricted (start of 4-, 5- or 6-byte sequence) or invalid + UTF-8 */ + return 0; + } + } }; @@ -524,7 +600,7 @@ bool Parser::readRow(bool firstCall) token = nextToken(); if ( token->is(Token::SEPARATOR_TOKEN) - && token->asChar() == ':' ) + && token->asChar() == ':' ) { readValue(nextToken()); @@ -576,17 +652,17 @@ void Parser::readValue(const Token* token) if ( _handler != NULL ) { #if defined(POCO_HAVE_INT64) - Int64 value = token->asInteger64(); - // if number is 32-bit, then handle as such + Int64 value = token->asInteger64(); + // if number is 32-bit, then handle as such if ( value > std::numeric_limits::max() - || value < std::numeric_limits::min() ) - { - _handler->value(value); - } - else - { - _handler->value(static_cast(value)); - } + || value < std::numeric_limits::min() ) + { + _handler->value(value); + } + else + { + _handler->value(static_cast(value)); + } #else int value = token->asInteger(); _handle->value(value); @@ -637,17 +713,19 @@ void Parser::readValue(const Token* token) } break; case Token::SEPARATOR_TOKEN: - { - if ( token->asChar() == '{' ) { - readObject(); + if ( token->asChar() == '{' ) + { + readObject(); + } + else if ( token->asChar() == '[' ) + { + readArray(); + } + break; } - else if ( token->asChar() == '[' ) - { - readArray(); - } - break; - } + case Token::INVALID_TOKEN: + throw JSONException(format("Invalid token '%s' found", token->asString())); } } diff --git a/JSON/testsuite/src/JSONTest.cpp b/JSON/testsuite/src/JSONTest.cpp index dbb2521da..f2167e3ee 100644 --- a/JSON/testsuite/src/JSONTest.cpp +++ b/JSON/testsuite/src/JSONTest.cpp @@ -33,6 +33,7 @@ #include "JSONTest.h" #include "CppUnit/TestCaller.h" #include "CppUnit/TestSuite.h" + #include "Poco/JSON/Object.h" #include "Poco/JSON/Parser.h" #include "Poco/JSON/Query.h" @@ -40,13 +41,17 @@ #include "Poco/JSON/Stringifier.h" #include "Poco/JSON/DefaultHandler.h" #include "Poco/JSON/Template.h" + #include "Poco/Path.h" #include "Poco/Environment.h" #include "Poco/File.h" #include "Poco/FileStream.h" #include "Poco/Glob.h" -#include +#include "Poco/UTF8Encoding.h" +#include "Poco/Latin1Encoding.h" +#include "Poco/TextConverter.h" +#include using namespace Poco::JSON; using namespace Poco::Dynamic; @@ -74,22 +79,6 @@ void JSONTest::tearDown() } -void JSONTest::testStringifier() -{ - Object obj; - - Array arr; - Object obj2; - - obj.set("array", arr); - obj.set("obj2", obj2); - - std::ostringstream ostr; - obj.stringify(ostr); - assert (ostr.str() == "{\"array\":[],\"obj2\":{}}"); -} - - void JSONTest::testNullProperty() { std::string json = "{ \"test\" : null }"; @@ -845,6 +834,50 @@ void JSONTest::testInvalidJanssonFiles() } +void JSONTest::testInvalidUnicodeJanssonFiles() +{ + Poco::Path pathPattern(getTestFilesPath("invalid-unicode")); + + std::set paths; + Poco::Glob::glob(pathPattern, paths); + + for(std::set::iterator it = paths.begin(); it != paths.end(); ++it) + { + Poco::Path filePath(*it, "input"); + + if ( filePath.isFile() ) + { + Poco::File inputFile(filePath); + if ( inputFile.exists() ) + { + Poco::FileInputStream fis(filePath.toString()); + std::cout << filePath.toString() << std::endl; + + Parser parser; + Var result; + + try + { + DefaultHandler handler; + parser.setHandler(&handler); + parser.parse(fis); + result = handler.result(); + // We shouldn't get here. + std::cout << "We didn't get an exception. This is the result: " << result.convert() << std::endl; + fail(result.convert()); + } + catch(JSONException&) + { + continue; + } + catch(Poco::SyntaxException&) + { } + } + } + } +} + + void JSONTest::testTemplate() { Template tpl; @@ -858,6 +891,40 @@ void JSONTest::testTemplate() tpl.render(data, std::cout); } +void JSONTest::testUnicode() +{ + const unsigned char supp[] = {0x61, 0xE1, 0xE9, 0x78, 0xED, 0xF3, 0xFA, 0x0}; + std::string text((const char*) supp); + + std::string json = "{ \"test\" : \"a\\u00E1\\u00E9x\\u00ED\\u00F3\\u00FA\" }"; + Parser parser; + + Var result; + try + { + DefaultHandler handler; + parser.setHandler(&handler); + parser.parse(json); + result = handler.result(); + } + catch(JSONException& jsone) + { + std::cout << jsone.message() << std::endl; + assert(false); + } + assert(result.type() == typeid(Object::Ptr)); + + Object::Ptr object = result.extract(); + Var test = object->get("test"); + + Poco::Latin1Encoding latin1; + Poco::UTF8Encoding utf8; + Poco::TextConverter converter(latin1, utf8); + std::string original; + converter.convert(text, original); + + assert(test.convert() == original); +} std::string JSONTest::getTestFilesPath(const std::string& type) { @@ -879,8 +946,10 @@ std::string JSONTest::getTestFilesPath(const std::string& type) if (Poco::File(pathPattern).exists()) validDir += '*'; else + { + std::cout << "Can't find " << validDir << std::endl; throw Poco::NotFoundException("cannot locate directory containing valid JSON test files"); - + } return validDir; } @@ -889,7 +958,6 @@ CppUnit::Test* JSONTest::suite() { CppUnit::TestSuite* pSuite = new CppUnit::TestSuite("JSONTest"); - CppUnit_addTest(pSuite, JSONTest, testStringifier); CppUnit_addTest(pSuite, JSONTest, testNullProperty); CppUnit_addTest(pSuite, JSONTest, testTrueProperty); CppUnit_addTest(pSuite, JSONTest, testFalseProperty); @@ -917,7 +985,9 @@ CppUnit::Test* JSONTest::suite() CppUnit_addTest(pSuite, JSONTest, testQuery); CppUnit_addTest(pSuite, JSONTest, testValidJanssonFiles); CppUnit_addTest(pSuite, JSONTest, testInvalidJanssonFiles); + CppUnit_addTest(pSuite, JSONTest, testInvalidUnicodeJanssonFiles); CppUnit_addTest(pSuite, JSONTest, testTemplate); + CppUnit_addTest(pSuite, JSONTest, testUnicode); return pSuite; } diff --git a/JSON/testsuite/src/JSONTest.h b/JSON/testsuite/src/JSONTest.h index 956181b09..53bd8b750 100644 --- a/JSON/testsuite/src/JSONTest.h +++ b/JSON/testsuite/src/JSONTest.h @@ -46,7 +46,6 @@ public: JSONTest(const std::string& name); ~JSONTest(); - void testStringifier(); void testNullProperty(); void testTrueProperty(); void testFalseProperty(); @@ -76,6 +75,8 @@ public: void testInvalidJanssonFiles(); void testTemplate(); void testItunes(); + void testUnicode(); + void testInvalidUnicodeJanssonFiles(); void setUp(); void tearDown();