poco/JSON/src/Parser.cpp

795 lines
15 KiB
C++
Raw Normal View History

2012-11-11 09:57:01 +01:00
//
// Parser.cpp
//
// $Id$
//
// Library: JSON
// Package: JSON
// Module: Parser
//
// Copyright (c) 2012, Applied Informatics Software Engineering GmbH.
// and Contributors.
//
// Permission is hereby granted, free of charge, to any person or organization
// obtaining a copy of the software and accompanying documentation covered by
// this license (the "Software") to use, reproduce, display, distribute,
// execute, and transmit the Software, and to prepare derivative works of the
// Software, and to permit third-parties to whom the Software is furnished to
// do so, all subject to the following:
//
// The copyright notices in the Software and this entire statement, including
// the above license grant, this restriction and the following disclaimer,
// must be included in all copies of the Software, in whole or in part, and
// all derivative works of the Software, unless such copies or derivative
// works are solely in the form of machine-executable object code generated by
// a source language processor.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
// SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
// FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
// DEALINGS IN THE SOFTWARE.
//
#include "Poco/JSON/Parser.h"
#include "Poco/JSON/JSONException.h"
#include "Poco/Ascii.h"
#include "Poco/Token.h"
#include "Poco/UTF8Encoding.h"
2012-11-11 09:57:01 +01:00
#undef min
#undef max
#include <limits>
namespace Poco {
namespace JSON {
class SeparatorToken: public Token
{
public:
SeparatorToken()
{
}
virtual ~SeparatorToken()
{
}
Class tokenClass() const
{
return Token::SEPARATOR_TOKEN;
}
bool start(char c, std::istream& istr)
{
2013-01-05 04:51:26 +01:00
if (c == '{'
|| c == '}'
|| c == ']'
|| c == '['
|| c == ','
|| c == ':')
2012-11-11 09:57:01 +01:00
{
_value = c;
return true;
}
if ( c == '\'' )
{
throw JSONException("Invalid quote found");
}
else return false;
}
void finish(std::istream& istr)
{
}
};
class StringToken: public Token
{
public:
StringToken()
{
}
virtual ~StringToken()
{
}
Class tokenClass() const
{
return Token::STRING_LITERAL_TOKEN;
}
bool start(char c, std::istream& istr)
{
2013-01-05 04:51:26 +01:00
if (c == '"')
2012-11-11 09:57:01 +01:00
{
_value = ""; // We don't need the quote!
return true;
}
else return false;
}
void finish(std::istream& istr)
{
int c = 0;
while ((c = istr.get()) != -1)
2012-11-11 09:57:01 +01:00
{
2013-01-05 04:51:26 +01:00
if (c == 0)
2012-11-11 09:57:01 +01:00
{
throw JSONException("Null byte not allowed");
}
if ( 0 < c && c <= 0x1F )
{
throw JSONException(format("Control character 0x%x not allowed", (unsigned int) c));
}
2013-01-05 04:51:26 +01:00
if (c == '"')
2012-11-11 09:57:01 +01:00
break;
if(0x80 <= c && c <= 0xFF)
{
int count = utf8_check_first(c);
if (!count)
{
throw JSONException(format("Unable to decode byte 0x%x", (unsigned int) c));
}
char buffer[5];
buffer[0] = c;
for(int i = 1; i < count; ++i)
{
buffer[i] = istr.get();
}
if ( !UTF8Encoding::isLegal((unsigned char*) buffer, count) )
{
throw JSONException("No legal UTF8 found");
}
buffer[count] = '\0';
_value += buffer;
continue;
}
2012-11-11 09:57:01 +01:00
2013-01-05 04:51:26 +01:00
if (c == '\\') // Escaped String
2012-11-11 09:57:01 +01:00
{
c = istr.get();
switch(c)
{
2013-01-05 04:51:26 +01:00
case '"' : c = '"'; break;
case '\\' : c = '\\'; break;
case '/' : c = '/'; break;
case 'b' : c = '\b'; break;
case 'f' : c = '\f'; break;
case 'n' : c = '\n'; break;
case 'r' : c = '\r'; break;
case 't' : c = '\t'; break;
2012-11-11 09:57:01 +01:00
case 'u' : // Unicode
{
Poco::Int32 unicode = decodeUnicode(istr);
if ( unicode == 0 )
{
throw JSONException("\\u0000 is not allowed");
}
if ( unicode >= 0xD800 && unicode <= 0xDBFF )
{
c = istr.get();
if ( c != '\\' )
{
throw JSONException("Invalid unicode surrogate pair");
}
c = istr.get();
if ( c != 'u' )
{
throw JSONException("Invalid unicode surrogate pair");
}
Poco::Int32 surrogatePair = decodeUnicode(istr);
if ( 0xDC00 <= surrogatePair && surrogatePair <= 0xDFFF )
{
unicode = 0x10000 + ((unicode & 0x3FF) << 10) + (surrogatePair & 0x3FF);
}
else
{
throw JSONException("Invalid unicode surrogate pair");
}
}
else if ( 0xDC00 <= unicode && unicode <= 0xDFFF )
{
throw JSONException("Invalid unicode");
}
Poco::UTF8Encoding utf8encoding;
int length = utf8encoding.convert(unicode, NULL, 0);
std::vector<unsigned char> convert(length);
utf8encoding.convert(unicode, &convert[0], length);
for(int i = 0; i < length; ++i)
{
_value += (char) convert[i];
}
continue;
2012-11-11 09:57:01 +01:00
}
default:
{
throw JSONException(format("Invalid escape '%c' character used", (char) c));
}
}
}
_value += c;
2012-11-11 09:57:01 +01:00
}
if ( c == -1 )
{
throw JSONException("Unterminated string found");
}
}
Poco::Int32 decodeUnicode(std::istream& istr)
{
Poco::Int32 value = 0;
for(int i = 0; i < 4; i++)
{
value <<= 4;
int nc = istr.peek();
if ( nc == -1 )
{
throw JSONException("Invalid unicode sequence");
}
istr.get(); // No EOF, so read the character
if (nc >= '0' && nc <= '9')
value += nc - '0';
else if (nc >= 'A' && nc <= 'F')
value += 10 + nc - 'A';
else if (nc >= 'a' && nc <= 'f')
value += 10 + nc - 'a';
else
throw JSONException("Invalid unicode sequence. Hexadecimal digit expected");
}
return value;
}
private:
int utf8_check_first(char byte)
{
unsigned char u = (unsigned char) byte;
if(u < 0x80)
return 1;
if (0x80 <= u && u <= 0xBF)
{
/* second, third or fourth byte of a multi-byte
sequence, i.e. a "continuation byte" */
return 0;
}
else if(u == 0xC0 || u == 0xC1)
{
/* overlong encoding of an ASCII byte */
return 0;
}
else if(0xC2 <= u && u <= 0xDF)
{
/* 2-byte sequence */
return 2;
}
else if(0xE0 <= u && u <= 0xEF)
{
/* 3-byte sequence */
return 3;
}
else if(0xF0 <= u && u <= 0xF4)
{
/* 4-byte sequence */
return 4;
}
else
{
/* u >= 0xF5 */
/* Restricted (start of 4-, 5- or 6-byte sequence) or invalid
UTF-8 */
return 0;
}
}
2012-11-11 09:57:01 +01:00
};
// Token for a run of ASCII letters (the parser later checks whether the
// run spells null, true or false).
class KeywordToken : public Token
{
public:
	KeywordToken()
	{
	}

	virtual ~KeywordToken()
	{
	}

	Class tokenClass() const
	{
		return Token::KEYWORD_TOKEN;
	}

	// A keyword starts at any ASCII letter, which becomes the first
	// character of the token value.
	bool start(char c, std::istream& istr)
	{
		if (!Ascii::isAlpha(c))
			return false;

		_value = c;
		return true;
	}

	// Consumes and appends letters for as long as the next character in the
	// stream is an ASCII letter. The first non-letter is left in the stream.
	void finish(std::istream& istr)
	{
		for (int next = istr.peek(); next != -1 && Ascii::isAlpha(next); next = istr.peek())
		{
			istr.get();
			_value += static_cast<char>(next);
		}
	}
};
// Token for a JSON number. The token class reported by tokenClass() is
// refined while scanning: it starts as INTEGER_LITERAL_TOKEN, becomes
// FLOAT_LITERAL_TOKEN once a fraction is seen and DOUBLE_LITERAL_TOKEN
// once an exponent is seen.
class NumberToken: public Token
{
public:
	NumberToken() : _activeClass(INTEGER_LITERAL_TOKEN)
	{
	}

	virtual ~NumberToken()
	{
	}

	Class tokenClass() const
	{
		return _activeClass;
	}

	// Decides whether c starts a number: a digit, or '-' followed by a
	// digit. For "-0" the zero is consumed here as well.
	// Throws JSONException when a leading zero (or "-0") is directly
	// followed by another digit.
	bool start(char c, std::istream& istr)
	{
		// Every new token starts out as an integer.
		_activeClass = INTEGER_LITERAL_TOKEN;

		if (c == -1)
			return false;

		if (Ascii::isDigit(c))
		{
			// A digit after a leading zero is not allowed.
			if (c == '0' && Ascii::isDigit(istr.peek()))
			{
				throw JSONException("Number can't start with a zero");
			}
			_value = c;
			return true;
		}

		if (c == '-')
		{
			_value = c;
			int nc = istr.peek();
			if (Ascii::isDigit(nc))
			{
				if (nc == '0')
				{
					_value += '0';
					istr.get();
					nc = istr.peek();
					if (Ascii::isDigit(nc)) // A digit after -0 is not allowed
					{
						throw JSONException("Number can't start with a zero");
					}
				}
				return true;
			}
		}

		return false;
	}

	// Consumes the remainder of the number: digits, at most one fraction
	// and at most one exponent (with optional sign). Scanning stops at the
	// first character that cannot belong to the number; that character is
	// left in the stream.
	// Throws JSONException for a malformed fraction or exponent.
	void finish(std::istream& istr)
	{
		int c;
		while ((c = istr.peek()) != -1)
		{
			if (Ascii::isDigit(c))
			{
				_value += static_cast<char>(c);
				istr.get();
				continue;
			}

			switch (c)
			{
			case '.': // fraction
				{
					// A fraction is only valid directly after the integer
					// part. This rejects both a second '.' ("1.2.3") and a
					// '.' following an exponent ("1e5.5").
					if (_activeClass != Token::INTEGER_LITERAL_TOKEN)
					{
						throw JSONException("Invalid float value");
					}
					_activeClass = Token::FLOAT_LITERAL_TOKEN;

					_value += static_cast<char>(c);
					istr.get();

					// After a '.' we need a digit
					c = istr.peek();
					if (!Ascii::isDigit(c))
					{
						throw JSONException("Invalid float value");
					}
					break;
				}
			case 'E':
			case 'e': // exponent
				{
					if (_activeClass == Token::DOUBLE_LITERAL_TOKEN)
					{
						throw JSONException("Invalid double value");
					}
					_activeClass = Token::DOUBLE_LITERAL_TOKEN;

					// Add the e or E
					_value += static_cast<char>(c);
					istr.get();

					// An optional sign may precede the exponent digits
					c = istr.peek();
					if (c == '-' || c == '+')
					{
						_value += static_cast<char>(c);
						istr.get();
						c = istr.peek();
					}

					if (!Ascii::isDigit(c))
					{
						throw JSONException("Invalid double value");
					}
					break;
				}
			default:
				return; // End of number token
			}

			// Reaching this point means c is the (already validated) first
			// digit of the fraction or exponent: consume it.
			istr.get();
			_value += static_cast<char>(c);
		}
	}

private:
	Class _activeClass;
};
// Creates a parser without a handler and registers the token types used to
// scan JSON input.
Parser::Parser():
	_tokenizer(),
	_handler(NULL)
{
	// NOTE(review): registration order is kept exactly as it was; it
	// presumably determines which token type gets the first chance to claim
	// a character, and the tokenizer presumably takes ownership of the
	// allocated tokens -- confirm against StreamTokenizer.
	_tokenizer.addToken(new WhitespaceToken());
	_tokenizer.addToken(new InvalidToken());
	_tokenizer.addToken(new SeparatorToken());
	_tokenizer.addToken(new StringToken());
	_tokenizer.addToken(new NumberToken());
	_tokenizer.addToken(new KeywordToken());
}
// Destroys the parser. No explicit cleanup is performed here.
Parser::~Parser()
{
}
// Fetches the next token from the tokenizer for positions where more input
// is required. Throws JSONException when the end of input is reached.
const Token* Parser::nextToken()
{
	const Token* pToken = _tokenizer.next();
	if (!pToken->is(Token::EOF_TOKEN))
	{
		return pToken;
	}
	throw JSONException("Unexpected EOF found");
}
void Parser::parse(std::istream& in)
{
_tokenizer.attachToStream(in);
const Token* token = nextToken();
2013-01-05 04:51:26 +01:00
if (token->is(Token::SEPARATOR_TOKEN))
2012-11-11 09:57:01 +01:00
{
// This must be a { or a [
2013-01-05 04:51:26 +01:00
if (token->asChar() == '{')
2012-11-11 09:57:01 +01:00
{
readObject();
}
2013-01-05 04:51:26 +01:00
else if (token->asChar() == '[')
2012-11-11 09:57:01 +01:00
{
readArray();
}
else
{
throw JSONException(format("Invalid separator '%c' found. Expecting { or [", token->asChar()));
}
token = _tokenizer.next();
2013-01-05 04:51:26 +01:00
if (! token->is(Token::EOF_TOKEN))
2012-11-11 09:57:01 +01:00
{
throw JSONException(format("EOF expected but found '%s'", token->asString()));
}
}
else
{
throw JSONException(format("Invalid token '%s' found. Expecting { or [", token->asString()));
}
}
void Parser::readObject()
{
2013-01-05 04:51:26 +01:00
if (_handler != NULL)
2012-11-11 09:57:01 +01:00
{
_handler->startObject();
}
if ( readRow(true) ) // First call is special: check for empty object
{
while(readRow());
}
2013-01-05 04:51:26 +01:00
if (_handler != NULL)
2012-11-11 09:57:01 +01:00
{
_handler->endObject();
}
}
bool Parser::readRow(bool firstCall)
{
const Token* token = nextToken();
2013-01-05 04:51:26 +01:00
if (firstCall && token->tokenClass() == Token::SEPARATOR_TOKEN && token->asChar() == '}')
2012-11-11 09:57:01 +01:00
{
return false; // End of object is possible for an empty object
}
2013-01-05 04:51:26 +01:00
if (token->tokenClass() == Token::STRING_LITERAL_TOKEN)
2012-11-11 09:57:01 +01:00
{
std::string propertyName = token->tokenString();
if ( _handler != NULL )
{
_handler->key(propertyName);
}
token = nextToken();
2013-02-21 00:08:44 +01:00
if (token->is(Token::SEPARATOR_TOKEN)
&& token->asChar() == ':')
2012-11-11 09:57:01 +01:00
{
readValue(nextToken());
token = nextToken();
2013-01-05 04:51:26 +01:00
if (token->is(Token::SEPARATOR_TOKEN))
2012-11-11 09:57:01 +01:00
{
2013-01-05 04:51:26 +01:00
if (token->asChar() == ',')
2012-11-11 09:57:01 +01:00
{
_handler->comma();
2012-11-11 09:57:01 +01:00
return true; // Read next row
}
2013-01-05 04:51:26 +01:00
else if (token->asChar() == '}')
2012-11-11 09:57:01 +01:00
{
return false; // End of object
}
else
{
throw JSONException(format("Invalid separator '%c' found. Expecting , or }", token->asChar()));
}
}
else
{
throw JSONException(format("Invalid token '%s' found. Expecting , or }", token->asString()));
}
}
else
{
throw JSONException(format("Invalid token '%s' found. Expecting :", token->asString()));
}
}
else
{
throw JSONException(format("Invalid token '%s' found. Expecting key", token->asString()));
}
}
void Parser::readValue(const Token* token)
{
switch(token->tokenClass())
{
default:
case Token::IDENTIFIER_TOKEN:
case Token::OPERATOR_TOKEN:
case Token::CHAR_LITERAL_TOKEN:
break;
case Token::INTEGER_LITERAL_TOKEN:
2013-01-05 04:51:26 +01:00
if (_handler != NULL)
2012-11-11 09:57:01 +01:00
{
#if defined(POCO_HAVE_INT64)
try
{
Int64 value = token->asInteger64();
// if number is 32-bit, then handle as such
if ( value > std::numeric_limits<int>::max()
|| value < std::numeric_limits<int>::min() )
{
_handler->value(value);
}
else
{
_handler->value(static_cast<int>(value));
}
}
// try to handle error as unsigned in case of overflow
catch ( const SyntaxException& )
{
UInt64 value = token->asUnsignedInteger64();
// if number is 32-bit, then handle as such
if ( value > std::numeric_limits<unsigned>::max() )
{
_handler->value(value);
}
else
{
_handler->value(static_cast<unsigned>(value));
}
}
2012-11-11 09:57:01 +01:00
#else
try
{
int value = token->asInteger();
_handle->value(value);
}
// try to handle error as unsigned in case of overflow
catch ( const SyntaxException& )
{
unsigned value = token->asUnsignedInteger();
_handle->value(value);
}
2012-11-11 09:57:01 +01:00
#endif
}
break;
case Token::KEYWORD_TOKEN:
{
2013-01-05 04:51:26 +01:00
if (token->tokenString().compare("null") == 0)
2012-11-11 09:57:01 +01:00
{
2013-01-05 04:51:26 +01:00
if (_handler != NULL)
2012-11-11 09:57:01 +01:00
{
_handler->null();
}
}
2013-01-05 04:51:26 +01:00
else if (token->tokenString().compare("true") == 0)
2012-11-11 09:57:01 +01:00
{
2013-01-05 04:51:26 +01:00
if (_handler != NULL)
2012-11-11 09:57:01 +01:00
{
_handler->value(true);
}
}
2013-01-05 04:51:26 +01:00
else if (token->tokenString().compare("false") == 0)
2012-11-11 09:57:01 +01:00
{
2013-01-05 04:51:26 +01:00
if (_handler != NULL)
2012-11-11 09:57:01 +01:00
{
_handler->value(false);
}
}
else
{
throw JSONException(format("Invalid keyword '%s' found", token->asString()));
}
break;
}
case Token::FLOAT_LITERAL_TOKEN:
// Fall through
case Token::DOUBLE_LITERAL_TOKEN:
2013-01-05 04:51:26 +01:00
if (_handler != NULL)
2012-11-11 09:57:01 +01:00
{
_handler->value(token->asFloat());
}
break;
case Token::STRING_LITERAL_TOKEN:
2013-01-05 04:51:26 +01:00
if (_handler != NULL)
2012-11-11 09:57:01 +01:00
{
_handler->value(token->tokenString());
}
break;
case Token::SEPARATOR_TOKEN:
{
2013-01-05 04:51:26 +01:00
if (token->asChar() == '{')
{
readObject();
}
2013-01-05 04:51:26 +01:00
else if (token->asChar() == '[')
{
readArray();
}
break;
2012-11-11 09:57:01 +01:00
}
case Token::INVALID_TOKEN:
throw JSONException(format("Invalid token '%s' found", token->asString()));
2012-11-11 09:57:01 +01:00
}
}
void Parser::readArray()
{
2013-01-05 04:51:26 +01:00
if (_handler != NULL)
2012-11-11 09:57:01 +01:00
{
_handler->startArray();
}
2013-01-05 04:51:26 +01:00
if (readElements(true)) // First call is special: check for empty array
2012-11-11 09:57:01 +01:00
{
while(readElements());
}
2013-01-05 04:51:26 +01:00
if (_handler != NULL)
2012-11-11 09:57:01 +01:00
{
_handler->endArray();
}
}
bool Parser::readElements(bool firstCall)
{
const Token* token = nextToken();
2013-01-05 04:51:26 +01:00
if (firstCall && token->is(Token::SEPARATOR_TOKEN) && token->asChar() == ']')
2012-11-11 09:57:01 +01:00
{
// End of array is possible for an empty array
return false;
}
readValue(token);
token = nextToken();
if (token->is(Token::SEPARATOR_TOKEN))
2012-11-11 09:57:01 +01:00
{
2013-01-05 04:51:26 +01:00
if (token->asChar() == ']')
2012-11-11 09:57:01 +01:00
return false; // End of array
2013-01-05 04:51:26 +01:00
if (token->asChar() == ',')
{
_handler->comma();
2012-11-11 09:57:01 +01:00
return true;
}
2012-11-11 09:57:01 +01:00
throw JSONException(format("Invalid separator '%c' found. Expecting , or ]", token->asChar()));
}
throw JSONException(format("Invalid token '%s' found.", token->asString()));
}
} } // namespace Poco::JSON