[DEV] start parsing C++ files

Edouard DUPIN 2014-12-29 23:06:04 +01:00
parent d92b12b3e1
commit 3b85d09ffa
5 changed files with 361 additions and 20 deletions

View File

@@ -18,20 +18,144 @@ eci::Lexer::~Lexer() {
}
void eci::Lexer::append(int32_t _tokenId, const std::string& _regularExpression) {
m_searchList.insert(std::make_pair(_tokenId, etk::RegExp<std::string>(_regularExpression)));
etk::RegExp<std::string>(_regularExpression).display();
ECI_INFO("CPP lexer add : '" << _regularExpression << "'");
try {
m_searchList.insert(std::make_pair(_tokenId, std::make_shared<eci::Lexer::TypeBase>(_tokenId, _regularExpression)));
} catch (const std::exception& e) {
ECI_ERROR(" create reg exp : '" << _regularExpression << "' : what:" << e.what());
}
}
void eci::Lexer::appendSection(int32_t _tokenId, const std::string& _regularExpressionStart, const std::string& _regularExpressionStop) {
ECI_INFO("CPP lexer add section : '" << _regularExpressionStart << "' .. '" << _regularExpressionStop << "'");
try {
m_searchList.insert(std::make_pair(_tokenId, std::make_shared<eci::Lexer::TypeSection>(_tokenId, _regularExpressionStart, _regularExpressionStop)));
} catch (const std::exception& e) {
ECI_ERROR(" create reg exp : '" << _regularExpressionStart << "' .. '" << _regularExpressionStop << "' : what:" << e.what());
}
}
void eci::Lexer::appendSub(int32_t _tokenIdParrent, int32_t _tokenId, const std::string& _regularExpression) {
ECI_INFO("CPP lexer add sub : [" << _tokenIdParrent << "] '" << _regularExpression << "'");
try {
m_searchList.insert(std::make_pair(_tokenId, std::make_shared<eci::Lexer::TypeSubBase>(_tokenId, _tokenIdParrent, _regularExpression)));
} catch (const std::exception& e) {
ECI_ERROR(" create reg exp : '" << _regularExpression << "' : what:" << e.what());
}
}
void eci::Lexer::appendSubSection(int32_t _tokenIdParrent, int32_t _tokenId, const std::string& _regularExpressionStart, const std::string& _regularExpressionStop) {
ECI_INFO("CPP lexer add section sub : [" << _tokenIdParrent << "] '" << _regularExpressionStart << "' .. '" << _regularExpressionStop << "'");
try {
m_searchList.insert(std::make_pair(_tokenId, std::make_shared<eci::Lexer::TypeSubSection>(_tokenId, _tokenIdParrent, _regularExpressionStart, _regularExpressionStop)));
} catch (const std::exception& e) {
ECI_ERROR(" create reg exp : '" << _regularExpressionStart << "' .. '" << _regularExpressionStop << "' : what:" << e.what());
}
}
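All four registration helpers above follow the same pattern: build a rule object around the regular expression, wrap it in a shared_ptr, and store it in the map keyed by token id, logging and swallowing regular-expression construction failures. A minimal standalone sketch of that pattern, using a hypothetical Rule type and std::regex directly rather than the eci::Lexer::Type hierarchy:
#include <cstdint>
#include <iostream>
#include <map>
#include <memory>
#include <regex>
#include <string>

// Hypothetical stand-in for eci::Lexer::TypeBase: a token id plus a compiled regex.
struct Rule {
    int32_t tokenId;
    std::regex expression;
    Rule(int32_t _id, const std::string& _regex) :
      tokenId(_id),
      expression(_regex, std::regex_constants::optimize | std::regex_constants::ECMAScript) {
    }
};

static std::map<int32_t, std::shared_ptr<Rule>> g_rules;

void addRule(int32_t _tokenId, const std::string& _regex) {
    try {
        // std::regex construction throws std::regex_error on an invalid expression,
        // so a broken rule is only reported instead of aborting the whole lexer setup.
        g_rules.insert(std::make_pair(_tokenId, std::make_shared<Rule>(_tokenId, _regex)));
    } catch (const std::exception& e) {
        std::cerr << "can not create regex '" << _regex << "' : " << e.what() << std::endl;
    }
}

int main() {
    addRule(1, "//.*");   // valid: single-line comment
    addRule(2, "([a-z");  // invalid: unterminated bracket, only logged
    return 0;
}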
eci::LexerResult eci::Lexer::interprete(const std::string& _data) {
eci::LexerResult result;
eci::LexerResult result(_data);
ECI_INFO("Parse : \n" << _data);
for (auto &it : m_searchList) {
ECI_INFO("Parse RegEx : " << it.first << " : " << it.second.getRegExDecorated());
if (it.second.parse(_data, 0, _data.size()) == true) {
ECI_INFO(" match [" << it.second.start() << ".." << it.second.stop() << "] ");
ECI_INFO(" ==> '" << std::string(_data, it.second.start(), it.second.stop()-it.second.start()) << "'");
//ECI_INFO("Parse RegEx : " << it.first << " : " << it.second.getRegExDecorated());
if (it.second == nullptr) {
continue;
}
if (it.second->isSubParse() == true) {
continue;
}
if (result.m_list.size() == 0) {
result.m_list = it.second->parse(_data, 0, _data.size());
} else {
int32_t start = 0;
auto itList(result.m_list.begin());
while (itList != result.m_list.end()) {
if (*itList == nullptr) {
ECI_TODO("remove null shared_ptr");
++itList;
continue;
}
if ((*itList)->getStartPos() == start) {
// nothing to do ..
start = (*itList)->getStopPos();
++itList;
continue;
}
std::vector<std::shared_ptr<eci::LexerNode>> res = it.second->parse(_data, start, (*itList)->getStartPos());
// append it in the buffer:
if (res.size() > 0) {
int32_t currentPos = std::distance(result.m_list.begin(), itList) + res.size() ;
result.m_list.insert(itList, res.begin(), res.end());
itList = result.m_list.begin() + currentPos;
}
start = (*itList)->getStopPos();
++itList;
}
// Do the last element :
if (start < _data.size()) {
std::vector<std::shared_ptr<eci::LexerNode>> res = it.second->parse(_data, start, _data.size());
for (auto &itRes : res) {
result.m_list.push_back(itRes);
}
}
}
}
return result;
}
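interprete() applies each registered rule in turn: the first rule fills the list, and every later rule is only run on the spans left between nodes that earlier rules already claimed, plus the tail after the last node. A simplified sketch of that gap computation, on plain position spans rather than LexerNode objects (hypothetical helper, not part of the eci API):
#include <cstdint>
#include <iostream>
#include <vector>

// A span of already-claimed positions [start..stop).
struct Span {
    int32_t start;
    int32_t stop;
};

// _covered must be sorted by start position and non-overlapping, which is what the
// loop above maintains in result.m_list.
std::vector<Span> findGaps(const std::vector<Span>& _covered, int32_t _dataSize) {
    std::vector<Span> gaps;
    int32_t start = 0;
    for (const auto& it : _covered) {
        if (it.start > start) {
            gaps.push_back(Span{start, it.start});
        }
        start = it.stop;
    }
    // the "last element" case: data remaining after the final claimed node
    if (start < _dataSize) {
        gaps.push_back(Span{start, _dataSize});
    }
    return gaps;
}

int main() {
    // Two nodes already matched by an earlier rule leave three gaps in 40 bytes.
    std::vector<Span> covered = { {5, 10}, {20, 30} };
    for (const auto& gap : findGaps(covered, 40)) {
        std::cout << "gap [" << gap.start << ".." << gap.stop << "]" << std::endl;
    }
    return 0;
}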
std::vector<std::shared_ptr<eci::LexerNode>> eci::Lexer::TypeBase::parse(const std::string& _data, int32_t _start, int32_t _stop) {
std::vector<std::shared_ptr<eci::LexerNode>> result;
ECI_DEBUG("parse : " << getValue());
while (true) {
std::smatch resultMatch;
std::regex_constants::match_flag_type flags = std::regex_constants::match_any;
//APPL_DEBUG("find data at : start=" << _start << " stop=" << _stop << " regex='" << m_regexValue << "'");
if ((int64_t)_stop <= (int64_t)_data.size()) {
char val = _data[_stop];
if ( val != '\n'
&& val != '\r') {
//after last char ==> not end of line ($ would not work))
flags |= std::regex_constants::match_not_eol;
}
if (!( ('a' <= val && val <= 'z')
|| ('A' <= val && val <= 'Z')
|| ('0' <= val && val <= '9')
|| val == '_')) {
flags |= std::regex_constants::match_not_eow;
}
}
if (_start>0) {
flags |= std::regex_constants::match_prev_avail;
}
std::regex_search(_data.begin()+_start, _data.begin()+_stop, resultMatch, regex, flags);
if (resultMatch.size() > 0) {
int32_t start = std::distance(_data.begin(), resultMatch[0].first);
int32_t stop = std::distance(_data.begin(), resultMatch[0].second);
ECI_DEBUG(" find data at : start=" << start << " stop=" << stop << " data='" <<std::string(_data, start, stop-start) << "'" );
_start = stop;
result.push_back(std::make_shared<eci::LexerNode>(m_tockenId, start, stop));
} else {
break;
}
}
return result;
}
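TypeBase::parse() above repeatedly calls std::regex_search on a shrinking window and tunes the match flags: match_not_eol when the character after the window is not a line break, match_not_eow when it is still a word character, and match_prev_avail once the window no longer starts at the beginning of the data. A small self-contained check of the match_prev_avail behaviour, assuming a conforming <regex> implementation:
#include <iostream>
#include <iterator>
#include <regex>
#include <string>

int main() {
    std::string data = "print(); int value = 3;";
    std::regex expression("\\bint\\b");
    // Start the search window inside the word "print": with match_prev_avail the
    // engine may look at the character just before the window, so the "int" inside
    // "print" is correctly rejected as not starting on a word boundary.
    std::string::const_iterator begin = data.cbegin() + 2;
    std::regex_constants::match_flag_type flags = std::regex_constants::match_prev_avail;
    std::smatch match;
    while (std::regex_search(begin, data.cend(), match, expression, flags)) {
        std::cout << "match at offset " << std::distance(data.cbegin(), match[0].first)
                  << " : '" << match[0].str() << "'" << std::endl;
        begin = match[0].second;
    }
    return 0;
}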
std::vector<std::shared_ptr<eci::LexerNode>> eci::Lexer::TypeSection::parse(const std::string& _data, int32_t _start, int32_t _stop) {
std::vector<std::shared_ptr<eci::LexerNode>> result;
ECI_TODO("later 1");
return result;
}
std::vector<std::shared_ptr<eci::LexerNode>> eci::Lexer::TypeSubBase::parse(const std::string& _data, int32_t _start, int32_t _stop) {
std::vector<std::shared_ptr<eci::LexerNode>> result;
ECI_TODO("later 2");
return result;
}
std::vector<std::shared_ptr<eci::LexerNode>> eci::Lexer::TypeSubSection::parse(const std::string& _data, int32_t _start, int32_t _stop) {
std::vector<std::shared_ptr<eci::LexerNode>> result;
ECI_TODO("later 3");
return result;
}
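The three section and sub-parse overloads above are still TODO stubs. Purely as an illustration of the balancing behaviour documented in the header (count nested start and stop matches, emit a node when the counter returns to zero), here is one possible shape, simplified to single-character delimiters such as braces; it is not the project's implementation:
#include <cstdint>
#include <iostream>
#include <string>
#include <vector>

struct SectionNode {
    int32_t start;
    int32_t stop;
};

// Illustrative only: assumes the start/stop delimiters are single characters
// (such as '{' and '}'), which is true for the section rules the parser registers.
std::vector<SectionNode> parseSections(const std::string& _data, char _start, char _stop) {
    std::vector<SectionNode> result;
    int32_t depth = 0;
    int32_t sectionStart = -1;
    for (int32_t pos = 0; pos < (int32_t)_data.size(); ++pos) {
        if (_data[pos] == _start) {
            if (depth == 0) {
                sectionStart = pos;
            }
            ++depth;
        } else if (_data[pos] == _stop && depth > 0) {
            --depth;
            if (depth == 0) {
                // one node per balanced outer section; nested sections stay inside it
                result.push_back(SectionNode{sectionStart, pos + 1});
                sectionStart = -1;
            }
        }
    }
    return result;
}

int main() {
    std::string data = "int main(void) { if (true) { return 0; } }";
    for (const auto& node : parseSections(data, '{', '}')) {
        std::cout << "section [" << node.start << ".." << node.stop << "]" << std::endl;
    }
    return 0;
}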

View File

@@ -11,21 +11,170 @@
#include <etk/types.h>
#include <etk/stdTools.h>
#include <etk/RegExp.h>
#include <regex>
#include <map>
#include <vector>
namespace eci {
using LexerResult = std::vector<std::pair<int32_t, std::string>>;
class LexerNode {
public:
LexerNode(int32_t _tockenId=-1, int32_t _startPos=-1, int32_t _stopPos=-1) :
m_tockenId(_tockenId),
m_startPos(_startPos),
m_stopPos(_stopPos) {
}
virtual ~LexerNode() {};
int32_t m_tockenId;
int32_t m_startPos;
int32_t m_stopPos;
int32_t getStartPos() {
return m_startPos;
}
int32_t getStopPos() {
return m_stopPos;
}
};
class LexerNodeContainer : public LexerNode {
public:
LexerNodeContainer(int32_t _tockenId=-1, int32_t _startPos=-1, int32_t _stopPos=-1) :
LexerNode(_tockenId, _startPos, _stopPos) {
}
virtual ~LexerNodeContainer() {};
std::vector<std::shared_ptr<eci::LexerNode>> m_list;
};
class LexerResult {
private:
std::string m_data;
public:
LexerResult(const std::string& _data="") :
m_data(_data) {
}
~LexerResult() {};
std::vector<std::shared_ptr<eci::LexerNode>> m_list;
};
class Lexer {
private:
std::map<int32_t, etk::RegExp<std::string>> m_searchList;
#define TYPE_UNKNOW (0)
#define TYPE_BASE (1)
#define TYPE_SECTION (2)
#define TYPE_SUB_BASE (3)
#define TYPE_SUB_SECTION (4)
class Type {
protected:
int32_t m_tockenId;
std::string m_regexValue;
public:
Type(int32_t _tockenId) :
m_tockenId(_tockenId) {}
virtual ~Type() {}
virtual int32_t getType() {
return TYPE_UNKNOW;
}
int32_t getTockenId() {
return m_tockenId;
}
virtual std::vector<std::shared_ptr<eci::LexerNode>> parse(const std::string& _data, int32_t _start, int32_t _stop)=0;
std::string getValue() {
return m_regexValue;
};
virtual bool isSubParse() {
return false;
}
};
class TypeBase : public Type {
public:
std::regex regex;
TypeBase(int32_t _tockenId, const std::string& _regex="") :
Type(_tockenId),
regex(_regex, std::regex_constants::optimize | std::regex_constants::ECMAScript) {
m_regexValue = _regex;
}
virtual int32_t getType() {
return TYPE_BASE;
}
std::vector<std::shared_ptr<eci::LexerNode>> parse(const std::string& _data, int32_t _start, int32_t _stop);
};
class TypeSection : public Type {
public:
std::regex regexStart;
std::regex regexStop;
TypeSection(int32_t _tockenId, const std::string& _regexStart="", const std::string& _regexStop="") :
Type(_tockenId),
regexStart(_regexStart, std::regex_constants::optimize | std::regex_constants::ECMAScript),
regexStop(_regexStop, std::regex_constants::optimize | std::regex_constants::ECMAScript) {
m_regexValue = _regexStart + " -> " + _regexStop;
}
virtual int32_t getType() {
return TYPE_SECTION;
}
std::vector<std::shared_ptr<eci::LexerNode>> parse(const std::string& _data, int32_t _start, int32_t _stop);
};
class TypeSubBase : public TypeBase {
public:
int32_t parrent;
TypeSubBase(int32_t _tockenId, int32_t _tokenIdParrent=-1, const std::string& _regex="") :
TypeBase(_tockenId, _regex),
parrent(_tokenIdParrent) {}
virtual int32_t getType() {
return TYPE_SUB_BASE;
}
std::vector<std::shared_ptr<eci::LexerNode>> parse(const std::string& _data, int32_t _start, int32_t _stop);
bool isSubParse() {
return true;
}
};
class TypeSubSection : public TypeSection {
public:
int32_t parrent;
TypeSubSection(int32_t _tockenId, int32_t _tokenIdParrent=-1, const std::string& _regexStart="", const std::string& _regexStop="") :
TypeSection(_tockenId, _regexStart, _regexStop),
parrent(_tokenIdParrent) {}
virtual int32_t getType() {
return TYPE_SUB_SECTION;
}
std::vector<std::shared_ptr<eci::LexerNode>> parse(const std::string& _data, int32_t _start, int32_t _stop);
bool isSubParse() {
return true;
}
};
std::map<int32_t, std::shared_ptr<eci::Lexer::Type>> m_searchList;
public:
Lexer();
~Lexer();
/**
* @brief Append a token recognition rule.
* @param[in] _tokenId Token id value.
* @param[in] _regularExpression Regular expression to recognize.
*/
void append(int32_t _tokenId, const std::string& _regularExpression);
/**
* @brief Append a token recognition rule for a section (a start and a stop expression are recognized, counting nested starts and stops).
* @param[in] _tokenId Token id value.
* @param[in] _regularExpressionStart Regular expression to recognize (start).
* @param[in] _regularExpressionStop Regular expression to recognize (stop).
*/
void appendSection(int32_t _tokenId, const std::string& _regularExpressionStart, const std::string& _regularExpressionStop);
/**
* @brief Append a token recognition rule (sub-parsing inside a parent token).
* @param[in] _tokenIdParrent Parent token id value.
* @param[in] _tokenId Token id value.
* @param[in] _regularExpression Regular expression to recognize.
*/
void appendSub(int32_t _tokenIdParrent, int32_t _tokenId, const std::string& _regularExpression);
/**
* @brief Append a token recognition rule (sub-parsing inside a parent token) for a section (a start and a stop expression are recognized, counting nested starts and stops).
* @param[in] _tokenIdParrent Parent token id value.
* @param[in] _tokenId Token id value.
* @param[in] _regularExpressionStart Regular expression to recognize (start).
* @param[in] _regularExpressionStop Regular expression to recognize (stop).
*/
void appendSubSection(int32_t _tokenIdParrent, int32_t _tokenId, const std::string& _regularExpressionStart, const std::string& _regularExpressionStop);
LexerResult interprete(const std::string& _data);
};
}
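A hedged usage sketch of the API declared in this header, assuming the header is reachable as <eci/Lexer.h>; the token ids and expressions are illustrative, not the ones registered by the real C++ parser:
#include <eci/Lexer.h>   // assumed include path for the header above
#include <iostream>

enum demoToken {
    demoTokenComment,
    demoTokenNumber,
    demoTokenWord,
};

int main() {
    eci::Lexer lexer;
    lexer.append(demoTokenComment, "//.*");
    lexer.append(demoTokenNumber, "\\b[0-9]+\\b");
    lexer.append(demoTokenWord, "\\w+");
    eci::LexerResult result = lexer.interprete("value = 42 // answer");
    for (auto &it : result.m_list) {
        if (it == nullptr) {
            continue;
        }
        std::cout << "token " << it->m_tockenId
                  << " [" << it->getStartPos() << ".." << it->getStopPos() << "]" << std::endl;
    }
    return 0;
}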

View File

@@ -13,17 +13,18 @@
int main(int argc, char** argv) {
etk::log::setLevel(etk::log::logLevelDebug);
etk::log::setLevel(etk::log::logLevelInfo);
//etk::log::setLevel(etk::log::logLevelInfo);
ECI_INFO("Start Application interpeter languages");
if (argc<=1) {
ECI_CRITICAL("need the file to parse");
return -1;
}
eci::ParserCpp tmpParser;
std::string data = "/* plop */ \n int eee = 22; // error value \nint main(void) {\n return 0;\n}\n";//etk::FSNodeReadAllData(argv[1]);
//std::string data = "/* plop */ \n int eee = 22; // error value \nint main(void) {\n return 0;\n}\n";//etk::FSNodeReadAllData(argv[1]);
//std::string data = "alpha /* plop */ test";
//std::string data = "pp \n // qdfqdfsdf \nde";
tmpParser.parse(data);
//tmpParser.parse(data);
tmpParser.parse(etk::FSNodeReadAllData(argv[1]));
return 0;
}

View File

@@ -7,19 +7,81 @@
*/
#include <eci/lang/ParserCpp.h>
#include <eci/debug.h>
enum cppTokenList {
tokenCppMultilineComment,
tokenCppSingleLineComment,
tokenCppString,
tokenCppCommentMultiline,
tokenCppCommentSingleLine,
tokenCppPreProcessor,
tokenCppPreProcessorIf,
tokenCppPreProcessorElse,
tokenCppPreProcessorEndif,
tokenCppPreProcessorIfdef,
tokenCppPreProcessorIfndef,
tokenCppPreProcessorDefine,
tokenCppPreProcessorWarning,
tokenCppPreProcessorError,
tokenCppPreProcessorInclude,
tokenCppPreProcessorImport,
tokenCppPreProcessorSectionPthese,
tokenCppStringDoubleQuote,
tokenCppStringSimpleQuote,
tokenCppSectionBrace,
tokenCppSectionPthese,
tokenCppSectionHook,
tokenCppBranch,
tokenCppSystem,
tokenCppType,
tokenCppVisibility,
tokenCppContener,
tokenCppTypeDef,
tokenCppAuto,
tokenCppNullptr,
tokenCppSystemDefine,
tokenCppNumericValue,
tokenCppBoolean,
tokenCppCondition,
tokenCppAssignation,
tokenCppString,
tokenCppSeparator,
};
eci::ParserCpp::ParserCpp() {
m_lexer.append(tokenCppMultilineComment, "/\\*.*\\*/");
m_lexer.append(tokenCppSingleLineComment, "//.*$");
m_lexer.append(82939, "/\\*.*");
m_lexer.append(tokenCppString, "[a-z]");
m_lexer.append(tokenCppCommentMultiline, "/\\*(.|\\r|\\n)*?(\\*/|\\0)");
m_lexer.append(tokenCppCommentSingleLine, "//.*");
m_lexer.append(tokenCppPreProcessor, "#(.|\\\\[\\\\\\n])*");
m_lexer.appendSub(tokenCppPreProcessor, tokenCppPreProcessorIf, "\\bif\\b");
m_lexer.appendSub(tokenCppPreProcessor, tokenCppPreProcessorElse, "\\belse\\b");
m_lexer.appendSub(tokenCppPreProcessor, tokenCppPreProcessorEndif, "\\bendif\\b");
m_lexer.appendSub(tokenCppPreProcessor, tokenCppPreProcessorIfdef, "\\bifdef\\b");
m_lexer.appendSub(tokenCppPreProcessor, tokenCppPreProcessorIfndef, "\\bifndef\\b");
m_lexer.appendSub(tokenCppPreProcessor, tokenCppPreProcessorDefine, "\\bdefine\\b");
m_lexer.appendSub(tokenCppPreProcessor, tokenCppPreProcessorWarning, "\\bwarning\\b");
m_lexer.appendSub(tokenCppPreProcessor, tokenCppPreProcessorError, "\\berror\\b");
m_lexer.appendSub(tokenCppPreProcessor, tokenCppPreProcessorInclude, "\\binclude\\b");
m_lexer.appendSub(tokenCppPreProcessor, tokenCppPreProcessorImport, "\\bimport\\b"); // specific to interpreted C++
m_lexer.appendSubSection(tokenCppPreProcessor, tokenCppPreProcessorSectionPthese, "\\(", "\\)");
m_lexer.append(tokenCppStringDoubleQuote, "\"(.|\\\\[\\\\\"])*?\"");
m_lexer.append(tokenCppStringSimpleQuote, "'\\?.'");
m_lexer.appendSection(tokenCppSectionBrace, "\\{", "\\}");
m_lexer.appendSection(tokenCppSectionPthese, "\\(", "\\)");
m_lexer.appendSection(tokenCppSectionHook, "\\[", "\\]");
m_lexer.append(tokenCppBranch, "\\b(return|goto|if|else|case|default|break|continue|while|do|for)\\b");
m_lexer.append(tokenCppSystem, "\\b(new|delete|try|catch)\\b");
m_lexer.append(tokenCppType, "\\b(bool|char(16_t|32_t)?|double|float|u?int(8|16|32|64|128)?(_t)?|long|short|signed|size_t|unsigned|void|(I|U)(8|16|32|64|128))\\b");
m_lexer.append(tokenCppVisibility, "\\b(inline|const|virtual|private|public|protected|friend|const|extern|register|static|volatile)\\b");
m_lexer.append(tokenCppContener, "\\b(class|namespace|struct|union|enum)\\b");
m_lexer.append(tokenCppTypeDef, "\\btypedef\\b");
m_lexer.append(tokenCppAuto, "\\bauto\\b");
m_lexer.append(tokenCppNullptr, "\\b(NULL|nullptr)\\b");
m_lexer.append(tokenCppSystemDefine, "\\b__(LINE|DATE|FILE|func|TIME|STDC)__\\b");
m_lexer.append(tokenCppNumericValue, "\\b(((0(x|X)[0-9a-fA-F]*)|(\\d+\\.?\\d*|\\.\\d+)((e|E)(\\+|\\-)?\\d+)?)(L|l|UL|ul|u|U|F|f)?)\\b");
m_lexer.append(tokenCppBoolean, "\\b(true|false)\\b");
m_lexer.append(tokenCppCondition, "==|>=|<=|!=|<|>|&&|\\|\\|");
m_lexer.append(tokenCppAssignation, "(=|\\*|/|-|\\+|&)");
m_lexer.append(tokenCppString, "\\w+");
m_lexer.append(tokenCppSeparator, "(;|,|::|:)");
}
eci::ParserCpp::~ParserCpp() {
@@ -28,6 +90,10 @@ eci::ParserCpp::~ParserCpp() {
bool eci::ParserCpp::parse(const std::string& _data) {
m_result = m_lexer.interprete(_data);
ECI_INFO("find :");
for (auto &it : m_result.m_list) {
ECI_INFO(" start=" << it->getStartPos() << " stop=" << it->getStopPos() << " data='" <<std::string(_data, it->getStartPos(), it->getStopPos()-it->getStartPos()) << "'" );
}
return false;
}
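The constructor above registers one regular expression per token class; the expressions are compiled as ECMAScript std::regex, so they can be sanity-checked in isolation. A standalone check (not part of the project) of the tokenCppNumericValue expression:
#include <iostream>
#include <regex>
#include <string>

int main() {
    // Same expression as the tokenCppNumericValue rule above.
    std::regex numeric("\\b(((0(x|X)[0-9a-fA-F]*)|(\\d+\\.?\\d*|\\.\\d+)((e|E)(\\+|\\-)?\\d+)?)(L|l|UL|ul|u|U|F|f)?)\\b");
    std::string code = "int value = 42; float ratio = 3.14f; unsigned mask = 0x7f;";
    for (auto it = std::sregex_iterator(code.cbegin(), code.cend(), numeric);
         it != std::sregex_iterator(); ++it) {
        std::cout << "numeric literal: '" << it->str() << "'" << std::endl;
    }
    return 0;
}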

View File

@@ -5,6 +5,7 @@ printf("Hello\n"); /* this is a comment */ printf("Hello\n");
printf("Hello\n");
// this is also a comment sayhello();
printf("Hello\n");
printf("Hello\n"); /* this is a second comment */ printf("Hello\n");
void main() {}