rabbit/rabbit/Lexer.cpp

539 lines
12 KiB
C++

/**
* @author Alberto DEMICHELIS
* @author Edouard DUPIN
* @copyright 2018, Edouard DUPIN, all right reserved
* @copyright 2003-2017, Alberto DEMICHELIS, all right reserved
* @license MPL-2 (see license file)
*/
#include <rabbit/Lexer.hpp>
#include <rabbit/Table.hpp>
#include <rabbit/String.hpp>
#include <rabbit/sqconfig.hpp>
// Character most recently stored by next(); every scanning routine inspects it through this alias.
#define CUR_CHAR (_currdata)
// Shift the current token into _prevtoken, record t as the current token and return it from the
// enclosing function. Note that t is evaluated twice.
#define RETURN_TOKEN(t) { _prevtoken = _curtoken; _curtoken = t; return t;}
// True when the current character compares less than or equal to RABBIT_EOB, the marker that
// next() stores once the reader reports the end of the input.
#define IS_EOB() (CUR_CHAR <= RABBIT_EOB)
// Fetch the following character through next() and advance the column counter.
#define NEXT() {next();_currentcolumn++;}
// Empty the scratch buffer before a new lexeme is accumulated.
#define INIT_TEMP_STRING() { _longstr.resize(0);}
// Append one character to the scratch buffer.
#define APPEND_CHAR(c) { _longstr.pushBack(c);}
// Append a terminating NUL so the scratch buffer can be read as a C string.
#define TERMINATE_BUFFER() {_longstr.pushBack('\0');}
// Insert the stringized `key` into the keyword table with the token id `id`.
// Expects a variable named `ss` to be visible at the expansion site.
#define ADD_KEYWORD(key,id) _keywords->newSlot( rabbit::String::create(ss, #key) ,int64_t(id))
/**
 * Build a lexer that is not yet bound to any input.
 * The keyword table is only created by init(); it starts out as NULL so that
 * the destructor can tell whether there is anything to release.
 */
rabbit::Lexer::Lexer() :
  _keywords(NULL)
{
}
/**
 * Release the keyword table created by init(), if init() was ever called.
 */
rabbit::Lexer::~Lexer()
{
	if (_keywords != NULL) {
		_keywords->release();
		_keywords = NULL;
	}
}
/**
 * Bind the lexer to an input source, register the language keywords and read
 * the first character of the input.
 * @param ss    Shared state used to create the keyword table and the keyword strings.
 * @param rg    Reader callback; next() calls it with @p up to obtain one character.
 * @param up    Opaque pointer handed back to @p rg on every read.
 * @param efunc Error callback invoked by error().
 * @param ed    Opaque pointer handed back to @p efunc as its first argument.
 */
void rabbit::Lexer::init(rabbit::SharedState *ss, SQLEXREADFUNC rg, rabbit::UserPointer up,compilererrorFunc efunc,void *ed)
{
	_errfunc = efunc;
	_errtarget = ed;
	_sharedstate = ss;
	// Keyword name -> token id. tok2Str() walks this table in the reverse direction.
	_keywords = rabbit::Table::create(ss, 37);
	ADD_KEYWORD(while, TK_WHILE);
	ADD_KEYWORD(do, TK_DO);
	ADD_KEYWORD(if, TK_IF);
	ADD_KEYWORD(else, TK_ELSE);
	ADD_KEYWORD(break, TK_BREAK);
	ADD_KEYWORD(continue, TK_CONTINUE);
	ADD_KEYWORD(return, TK_RETURN);
	ADD_KEYWORD(null, TK_NULL);
	ADD_KEYWORD(function, TK_FUNCTION);
	ADD_KEYWORD(local, TK_LOCAL);
	ADD_KEYWORD(for, TK_FOR);
	ADD_KEYWORD(foreach, TK_FOREACH);
	ADD_KEYWORD(in, TK_IN);
	ADD_KEYWORD(typeof, TK_TYPEOF);
	ADD_KEYWORD(base, TK_BASE);
	ADD_KEYWORD(delete, TK_DELETE);
	ADD_KEYWORD(try, TK_TRY);
	ADD_KEYWORD(catch, TK_CATCH);
	ADD_KEYWORD(throw, TK_THROW);
	ADD_KEYWORD(clone, TK_CLONE);
	ADD_KEYWORD(yield, TK_YIELD);
	ADD_KEYWORD(resume, TK_RESUME);
	ADD_KEYWORD(switch, TK_SWITCH);
	ADD_KEYWORD(case, TK_CASE);
	ADD_KEYWORD(default, TK_DEFAULT);
	ADD_KEYWORD(this, TK_THIS);
	ADD_KEYWORD(class,TK_CLASS);
	ADD_KEYWORD(extends,TK_EXTENDS);
	ADD_KEYWORD(constructor,TK_CONSTRUCTOR);
	ADD_KEYWORD(instanceof,TK_INSTANCEOF);
	ADD_KEYWORD(true,TK_TRUE);
	ADD_KEYWORD(false,TK_FALSE);
	ADD_KEYWORD(static,TK_STATIC);
	ADD_KEYWORD(enum,TK_ENUM);
	ADD_KEYWORD(const,TK_CONST);
	ADD_KEYWORD(__LINE__,TK___LINE__);
	ADD_KEYWORD(__FILE__,TK___FILE__);
	ADD_KEYWORD(rawcall, TK_RAWCALL);
	_readf = rg;
	_up = up;
	_lasttokenline = _currentline = 1;
	_currentcolumn = 0;
	_prevtoken = -1;
	// Lex() copies _curtoken into _prevtoken when it meets a newline, which can
	// happen before any token has been returned; give it a defined value.
	_curtoken = -1;
	_reached_eof = SQFalse;
	// Prime the current character. This calls next() directly, so the column
	// counter is not advanced for this first read.
	next();
}
/**
 * Report a lexing problem through the error callback registered by init().
 * @param err Message text forwarded unchanged to the callback.
 */
void rabbit::Lexer::error(const char *err)
{
	// The callback gets back the opaque target pointer it was registered with.
	_errfunc(_errtarget, err);
}
void rabbit::Lexer::next()
{
int64_t t = _readf(_up);
if(t > UINT8_MAX) error("Invalid character");
if(t != 0) {
_currdata = (Lexchar)t;
return;
}
_currdata = RABBIT_EOB;
_reached_eof = SQTrue;
}
const char *rabbit::Lexer::tok2Str(int64_t tok)
{
rabbit::ObjectPtr itr, key, val;
int64_t nitr;
while((nitr = _keywords->next(false,itr, key, val)) != -1) {
itr = (int64_t)nitr;
if(((int64_t)val.toInteger()) == tok)
return key.getStringValue();
}
return NULL;
}
void rabbit::Lexer::lexBlockComment()
{
bool done = false;
while(!done) {
switch(CUR_CHAR) {
case '*': { NEXT(); if(CUR_CHAR == '/') { done = true; NEXT(); }}; continue;
case '\n': _currentline++; NEXT(); continue;
case RABBIT_EOB: error("missing \"*/\" in comment");
default: NEXT();
}
}
}
/**
 * Skip a single-line comment.
 * The current character (the comment marker) is always consumed; scanning then
 * stops at the next newline, which is left unconsumed, or at the end of input.
 */
void rabbit::Lexer::lexLineComment()
{
	NEXT();
	while (CUR_CHAR != '\n' && !IS_EOB()) {
		NEXT();
	}
}
int64_t rabbit::Lexer::Lex()
{
_lasttokenline = _currentline;
while(CUR_CHAR != RABBIT_EOB) {
switch(CUR_CHAR){
case '\t': case '\r': case ' ': NEXT(); continue;
case '\n':
_currentline++;
_prevtoken=_curtoken;
_curtoken='\n';
NEXT();
_currentcolumn=1;
continue;
case '#': lexLineComment(); continue;
case '/':
NEXT();
switch(CUR_CHAR){
case '*':
NEXT();
lexBlockComment();
continue;
case '/':
lexLineComment();
continue;
case '=':
NEXT();
RETURN_TOKEN(TK_DIVEQ);
continue;
case '>':
NEXT();
RETURN_TOKEN(TK_ATTR_CLOSE);
continue;
default:
RETURN_TOKEN('/');
}
case '=':
NEXT();
if (CUR_CHAR != '='){ RETURN_TOKEN('=') }
else { NEXT(); RETURN_TOKEN(TK_EQ); }
case '<':
NEXT();
switch(CUR_CHAR) {
case '=':
NEXT();
if(CUR_CHAR == '>') {
NEXT();
RETURN_TOKEN(TK_3WAYSCMP);
}
RETURN_TOKEN(TK_LE)
break;
case '-': NEXT(); RETURN_TOKEN(TK_NEWSLOT); break;
case '<': NEXT(); RETURN_TOKEN(TK_SHIFTL); break;
case '/': NEXT(); RETURN_TOKEN(TK_ATTR_OPEN); break;
}
RETURN_TOKEN('<');
case '>':
NEXT();
if (CUR_CHAR == '='){ NEXT(); RETURN_TOKEN(TK_GE);}
else if(CUR_CHAR == '>'){
NEXT();
if(CUR_CHAR == '>'){
NEXT();
RETURN_TOKEN(TK_USHIFTR);
}
RETURN_TOKEN(TK_SHIFTR);
}
else { RETURN_TOKEN('>') }
case '!':
NEXT();
if (CUR_CHAR != '='){ RETURN_TOKEN('!')}
else { NEXT(); RETURN_TOKEN(TK_NE); }
case '@': {
int64_t stype;
NEXT();
if(CUR_CHAR != '"') {
RETURN_TOKEN('@');
}
if((stype=readString('"',true))!=-1) {
RETURN_TOKEN(stype);
}
error("error parsing the string");
}
case '"':
case '\'': {
int64_t stype;
if((stype=readString(CUR_CHAR,false))!=-1){
RETURN_TOKEN(stype);
}
error("error parsing the string");
}
case '{': case '}': case '(': case ')': case '[': case ']':
case ';': case ',': case '?': case '^': case '~':
{int64_t ret = CUR_CHAR;
NEXT(); RETURN_TOKEN(ret); }
case '.':
NEXT();
if (CUR_CHAR != '.'){ RETURN_TOKEN('.') }
NEXT();
if (CUR_CHAR != '.'){ error("invalid token '..'"); }
NEXT();
RETURN_TOKEN(TK_VARPARAMS);
case '&':
NEXT();
if (CUR_CHAR != '&'){ RETURN_TOKEN('&') }
else { NEXT(); RETURN_TOKEN(TK_AND); }
case '|':
NEXT();
if (CUR_CHAR != '|'){ RETURN_TOKEN('|') }
else { NEXT(); RETURN_TOKEN(TK_OR); }
case ':':
NEXT();
if (CUR_CHAR != ':'){ RETURN_TOKEN(':') }
else { NEXT(); RETURN_TOKEN(TK_DOUBLE_COLON); }
case '*':
NEXT();
if (CUR_CHAR == '='){ NEXT(); RETURN_TOKEN(TK_MULEQ);}
else RETURN_TOKEN('*');
case '%':
NEXT();
if (CUR_CHAR == '='){ NEXT(); RETURN_TOKEN(TK_MODEQ);}
else RETURN_TOKEN('%');
case '-':
NEXT();
if (CUR_CHAR == '='){ NEXT(); RETURN_TOKEN(TK_MINUSEQ);}
else if (CUR_CHAR == '-'){ NEXT(); RETURN_TOKEN(TK_MINUSMINUS);}
else RETURN_TOKEN('-');
case '+':
NEXT();
if (CUR_CHAR == '='){ NEXT(); RETURN_TOKEN(TK_PLUSEQ);}
else if (CUR_CHAR == '+'){ NEXT(); RETURN_TOKEN(TK_PLUSPLUS);}
else RETURN_TOKEN('+');
case RABBIT_EOB:
return 0;
default:{
if (isdigit(CUR_CHAR)) {
int64_t ret = readNumber();
RETURN_TOKEN(ret);
}
else if (isalpha(CUR_CHAR) || CUR_CHAR == '_') {
int64_t t = readId();
RETURN_TOKEN(t);
}
else {
int64_t c = CUR_CHAR;
if (iscntrl((int)c)) error("unexpected character(control)");
NEXT();
RETURN_TOKEN(c);
}
RETURN_TOKEN(0);
}
}
}
return 0;
}
int64_t rabbit::Lexer::getIDType(const char *s,int64_t len)
{
rabbit::ObjectPtr t;
if(_keywords->getStr(s,len, t)) {
return int64_t(t.toInteger());
}
return TK_IDENTIFIER;
}
/**
 * Append the UTF-8 encoding of a code point to the scratch buffer.
 * @param ch Code point to encode.
 * @return Number of bytes appended (1 to 4), or 0 when @p ch is 0x110000 or
 *         above, in which case nothing is appended.
 */
int64_t rabbit::Lexer::addUTF8(uint64_t ch)
{
	if (ch >= 0x110000) {
		return 0;
	}
	if (ch >= 0x10000) {
		// Four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
		const char lead = (char)((ch >> 18) | 0xF0);
		const char second = (char)(((ch >> 12) & 0x3F) | 0x80);
		const char third = (char)(((ch >> 6) & 0x3F) | 0x80);
		const char last = (char)((ch & 0x3F) | 0x80);
		APPEND_CHAR(lead);
		APPEND_CHAR(second);
		APPEND_CHAR(third);
		APPEND_CHAR(last);
		return 4;
	}
	if (ch >= 0x800) {
		// Three bytes: 1110xxxx 10xxxxxx 10xxxxxx
		const char lead = (char)((ch >> 12) | 0xE0);
		const char second = (char)(((ch >> 6) & 0x3F) | 0x80);
		const char last = (char)((ch & 0x3F) | 0x80);
		APPEND_CHAR(lead);
		APPEND_CHAR(second);
		APPEND_CHAR(last);
		return 3;
	}
	if (ch >= 0x80) {
		// Two bytes: 110xxxxx 10xxxxxx
		const char lead = (char)((ch >> 6) | 0xC0);
		const char last = (char)((ch & 0x3F) | 0x80);
		APPEND_CHAR(lead);
		APPEND_CHAR(last);
		return 2;
	}
	// Below 0x80 the code point is stored as a single byte.
	APPEND_CHAR((char)ch);
	return 1;
}
int64_t rabbit::Lexer::processStringHexEscape(char *dest, int64_t maxdigits)
{
NEXT();
if (!isxdigit(CUR_CHAR)) error("hexadecimal number expected");
int64_t n = 0;
while (isxdigit(CUR_CHAR) && n < maxdigits) {
dest[n] = CUR_CHAR;
n++;
NEXT();
}
dest[n] = 0;
return n;
}
/**
 * Scan a quoted literal into the scratch buffer.
 * Called with the current character on the opening delimiter.
 * @param ndelim   Delimiter character that closes the literal.
 * @param verbatim When true, backslashes are copied literally, newlines are
 *                 accepted, and a doubled '"' stands for one '"' character.
 * @return TK_STRING_LITERAL (text available through _svalue), TK_INTEGER for a
 *         literal delimited by single quotes (value stored in _nvalue), or -1
 *         when the input ends before the literal is closed.
 */
int64_t rabbit::Lexer::readString(int64_t ndelim,bool verbatim)
{
INIT_TEMP_STRING();
// Step over the opening delimiter.
NEXT();
if(IS_EOB()) return -1;
// Each pass of the outer loop scans up to one closing delimiter; it only
// repeats for the doubled-quote case of verbatim strings.
for(;;) {
while(CUR_CHAR != ndelim) {
int64_t x = CUR_CHAR;
switch (x) {
case RABBIT_EOB:
error("unfinished string");
return -1;
case '\n':
// Reported for non-verbatim literals; the newline is still stored and
// counted if the error callback returns.
if(!verbatim) error("newline in a constant");
APPEND_CHAR(CUR_CHAR); NEXT();
_currentline++;
break;
case '\\':
if(verbatim) {
// Verbatim strings have no escape sequences: keep the backslash itself.
APPEND_CHAR('\\'); NEXT();
}
else {
NEXT();
switch(CUR_CHAR) {
case 'x': {
// \x takes up to two hexadecimal digits and yields a single char.
const int64_t maxdigits = sizeof(char) * 2;
char temp[maxdigits + 1];
processStringHexEscape(temp, maxdigits);
char *stemp;
APPEND_CHAR((char)strtoul(temp, &stemp, 16));
}
break;
case 'U':
case 'u': {
// \u takes up to 4 hexadecimal digits, \U up to 8; the value is stored
// through addUTF8().
const int64_t maxdigits = CUR_CHAR == 'u' ? 4 : 8;
char temp[8 + 1];
processStringHexEscape(temp, maxdigits);
char *stemp;
addUTF8(strtoul(temp, &stemp, 16));
}
break;
case 't': APPEND_CHAR('\t'); NEXT(); break;
case 'a': APPEND_CHAR('\a'); NEXT(); break;
case 'b': APPEND_CHAR('\b'); NEXT(); break;
case 'n': APPEND_CHAR('\n'); NEXT(); break;
case 'r': APPEND_CHAR('\r'); NEXT(); break;
case 'v': APPEND_CHAR('\v'); NEXT(); break;
case 'f': APPEND_CHAR('\f'); NEXT(); break;
case '0': APPEND_CHAR('\0'); NEXT(); break;
case '\\': APPEND_CHAR('\\'); NEXT(); break;
case '"': APPEND_CHAR('"'); NEXT(); break;
case '\'': APPEND_CHAR('\''); NEXT(); break;
default:
// The offending character is not consumed here; if the error callback
// returns, the next pass of the loop handles it as ordinary input.
error("unrecognised escaper char");
break;
}
}
break;
default:
APPEND_CHAR(CUR_CHAR);
NEXT();
}
}
// Step over the closing delimiter.
NEXT();
if(verbatim && CUR_CHAR == '"') { //double quotation
APPEND_CHAR(CUR_CHAR);
NEXT();
}
else {
break;
}
}
TERMINATE_BUFFER();
// Length of the literal without the terminator that was just appended.
int64_t len = _longstr.size()-1;
if(ndelim == '\'') {
// Single quotes denote a character constant: exactly one character is
// expected and its value becomes an integer token.
if(len == 0) error("empty constant");
if(len > 1) error("constant too long");
_nvalue = _longstr[0];
return TK_INTEGER;
}
_svalue = &_longstr[0];
return TK_STRING_LITERAL;
}
/**
 * Convert a string of hexadecimal digits to an unsigned integer.
 * @param s   NUL-terminated digits, without any "0x" prefix; upper and lower
 *            case letters are both accepted.
 * @param res Receives the value; 0 for an empty string. No overflow check is
 *            made, so excess high-order digits are lost.
 *
 * A character that is not a hexadecimal digit trips an assertion in debug
 * builds; when assertions are compiled out, conversion stops at that character
 * and the value accumulated so far is stored.
 */
void LexHexadecimal(const char *s,uint64_t *res)
{
	uint64_t value = 0;
	for (; *s != 0; ++s) {
		// The <ctype.h> classifiers require a value representable as unsigned char.
		const unsigned char c = (unsigned char)*s;
		if (isdigit(c)) {
			value = value * 16 + (c - '0');
		} else if (isxdigit(c)) {
			value = value * 16 + (toupper(c) - 'A' + 10);
		} else {
			assert(0);
			// Without this, a build with assertions disabled would spin forever
			// on the same character.
			break;
		}
	}
	*res = value;
}
/**
 * Convert a string of decimal digits to an unsigned integer.
 * @param s   NUL-terminated text; characters are not validated, each one
 *            contributes (character - '0') to the running value.
 * @param res Receives the value; 0 for an empty string. Arithmetic wraps
 *            around on overflow.
 */
void LexInteger(const char *s,uint64_t *res)
{
	uint64_t total = 0;
	for (const char *cursor = s; *cursor != 0; ++cursor) {
		total = total * 10 + (*cursor - '0');
	}
	*res = total;
}
/**
 * Tell whether a character is an octal digit.
 * @param c Character code to test.
 * @return 1 when @p c lies between '0' and '7' inclusive, 0 otherwise.
 */
int64_t scisodigit(int64_t c)
{
	if (c < '0' || c > '7') {
		return 0;
	}
	return 1;
}
/**
 * Convert a string of octal digits to an unsigned integer.
 * @param s   NUL-terminated digits in the range '0'..'7'.
 * @param res Receives the value; 0 for an empty string. No overflow check is
 *            made.
 *
 * A character outside '0'..'7' trips an assertion in debug builds; when
 * assertions are compiled out, conversion stops at that character and the
 * value accumulated so far is stored.
 */
void LexOctal(const char *s,uint64_t *res)
{
	uint64_t value = 0;
	for (; *s != 0; ++s) {
		const char digit = *s;
		if (digit < '0' || digit > '7') {
			assert(0);
			// Without this, a build with assertions disabled would spin forever
			// on the same character.
			break;
		}
		value = value * 8 + (digit - '0');
	}
	*res = value;
}
/**
 * Tell whether a character introduces the exponent of a floating-point literal.
 * @param c Character code to test.
 * @return 1 for 'e' or 'E', 0 otherwise.
 */
int64_t isexponent(int64_t c)
{
	switch (c) {
		case 'e':
		case 'E':
			return 1;
		default:
			return 0;
	}
}
#define MAX_HEX_DIGITS (sizeof(int64_t)*2)
int64_t rabbit::Lexer::readNumber()
{
#define TINT 1
#define TFLOAT 2
#define THEX 3
#define TSCIENTIFIC 4
#define TOCTAL 5
int64_t type = TINT, firstchar = CUR_CHAR;
char *sTemp;
INIT_TEMP_STRING();
NEXT();
if(firstchar == '0' && (toupper(CUR_CHAR) == 'X' || scisodigit(CUR_CHAR)) ) {
if(scisodigit(CUR_CHAR)) {
type = TOCTAL;
while(scisodigit(CUR_CHAR)) {
APPEND_CHAR(CUR_CHAR);
NEXT();
}
if(isdigit(CUR_CHAR)) error("invalid octal number");
}
else {
NEXT();
type = THEX;
while(isxdigit(CUR_CHAR)) {
APPEND_CHAR(CUR_CHAR);
NEXT();
}
if(_longstr.size() > MAX_HEX_DIGITS) error("too many digits for an Hex number");
}
}
else {
APPEND_CHAR((int)firstchar);
while (CUR_CHAR == '.' || isdigit(CUR_CHAR) || isexponent(CUR_CHAR)) {
if(CUR_CHAR == '.' || isexponent(CUR_CHAR)) type = TFLOAT;
if(isexponent(CUR_CHAR)) {
if(type != TFLOAT) error("invalid numeric format");
type = TSCIENTIFIC;
APPEND_CHAR(CUR_CHAR);
NEXT();
if(CUR_CHAR == '+' || CUR_CHAR == '-'){
APPEND_CHAR(CUR_CHAR);
NEXT();
}
if(!isdigit(CUR_CHAR)) error("exponent expected");
}
APPEND_CHAR(CUR_CHAR);
NEXT();
}
}
TERMINATE_BUFFER();
switch(type) {
case TSCIENTIFIC:
case TFLOAT:
_fvalue = (float_t)strtod(&_longstr[0],&sTemp);
return TK_FLOAT;
case TINT:
LexInteger(&_longstr[0],(uint64_t *)&_nvalue);
return TK_INTEGER;
case THEX:
LexHexadecimal(&_longstr[0],(uint64_t *)&_nvalue);
return TK_INTEGER;
case TOCTAL:
LexOctal(&_longstr[0],(uint64_t *)&_nvalue);
return TK_INTEGER;
}
return 0;
}
/**
 * Scan an identifier or keyword starting at the current character.
 * The word consists of the current character followed by any run of letters,
 * digits and underscores.
 * @return The keyword's token id, or TK_IDENTIFIER. For TK_IDENTIFIER and
 *         TK_CONSTRUCTOR the spelling is made available through _svalue.
 */
int64_t rabbit::Lexer::readId()
{
	INIT_TEMP_STRING();
	// The caller has already checked the first character; take it unconditionally.
	APPEND_CHAR(CUR_CHAR);
	NEXT();
	while (isalnum(CUR_CHAR) || CUR_CHAR == '_') {
		APPEND_CHAR(CUR_CHAR);
		NEXT();
	}
	TERMINATE_BUFFER();
	// The length excludes the terminator that was just appended.
	const int64_t tokenType = getIDType(&_longstr[0], _longstr.size() - 1);
	if (tokenType != TK_IDENTIFIER && tokenType != TK_CONSTRUCTOR) {
		return tokenType;
	}
	_svalue = &_longstr[0];
	return tokenType;
}