From 3b85d09ffa1bf2a3adec42e6b6d92ceda4b3c41a Mon Sep 17 00:00:00 2001 From: Edouard DUPIN Date: Mon, 29 Dec 2014 23:06:04 +0100 Subject: [PATCH] [DEV] start parsing C++ files --- eci/Lexer.cpp | 138 ++++++++++++++++++++++++++++++++++-- eci/Lexer.h | 155 ++++++++++++++++++++++++++++++++++++++++- eci/eci.cpp | 7 +- eci/lang/ParserCpp.cpp | 80 +++++++++++++++++++-- tests/01_comment.c | 1 + 5 files changed, 361 insertions(+), 20 deletions(-) diff --git a/eci/Lexer.cpp b/eci/Lexer.cpp index 7b21495..3bd7bce 100644 --- a/eci/Lexer.cpp +++ b/eci/Lexer.cpp @@ -18,20 +18,144 @@ eci::Lexer::~Lexer() { } void eci::Lexer::append(int32_t _tokenId, const std::string& _regularExpression) { - m_searchList.insert(std::make_pair(_tokenId, etk::RegExp(_regularExpression))); - etk::RegExp(_regularExpression).display(); + ECI_INFO("CPP lexer add : '" << _regularExpression << "'"); + try { + m_searchList.insert(std::make_pair(_tokenId, std::make_shared(_tokenId, _regularExpression))); + } catch (std::exception e){ + ECI_ERROR(" create reg exp : '" << _regularExpression << "' : what:" << e.what()); + } } +void eci::Lexer::appendSection(int32_t _tokenId, const std::string& _regularExpressionStart, const std::string& _regularExpressionStop) { + ECI_INFO("CPP lexer add section : '" << _regularExpressionStart << "' .. '" << _regularExpressionStop << "'"); + try { + m_searchList.insert(std::make_pair(_tokenId, std::make_shared(_tokenId, _regularExpressionStart, _regularExpressionStop))); + } catch (std::exception e){ + ECI_ERROR(" create reg exp : '" << _regularExpressionStart << "' .. '" << _regularExpressionStop << "' : what:" << e.what()); + } +} + +void eci::Lexer::appendSub(int32_t _tokenIdParrent, int32_t _tokenId, const std::string& _regularExpression) { + ECI_INFO("CPP lexer add sub : [" << _tokenIdParrent << "] '" << _regularExpression << "'"); + try { + m_searchList.insert(std::make_pair(_tokenId, std::make_shared(_tokenId, _tokenIdParrent, _regularExpression))); + } catch (std::exception e){ + ECI_ERROR(" create reg exp : '" << _regularExpression << "' : what:" << e.what()); + } +} + +void eci::Lexer::appendSubSection(int32_t _tokenIdParrent, int32_t _tokenId, const std::string& _regularExpressionStart, const std::string& _regularExpressionStop) { + ECI_INFO("CPP lexer add section sub : [" << _tokenIdParrent << "] '" << _regularExpressionStart << "' .. '" << _regularExpressionStop << "'"); + try { + m_searchList.insert(std::make_pair(_tokenId, std::make_shared(_tokenId, _tokenIdParrent, _regularExpressionStart, _regularExpressionStop))); + } catch (std::exception e){ + ECI_ERROR(" create reg exp : '" << _regularExpressionStart << "' .. '" << _regularExpressionStop << "' : what:" << e.what()); + } +} + + eci::LexerResult eci::Lexer::interprete(const std::string& _data) { - eci::LexerResult result; + eci::LexerResult result(_data); ECI_INFO("Parse : \n" << _data); for (auto &it : m_searchList) { - ECI_INFO("Parse RegEx : " << it.first << " : " << it.second.getRegExDecorated()); - if (it.second.parse(_data, 0, _data.size()) == true) { - ECI_INFO(" match [" << it.second.start() << ".." << it.second.stop() << "] "); - ECI_INFO(" ==> '" << std::string(_data, it.second.start(), it.second.stop()-it.second.start()) << "'"); + //ECI_INFO("Parse RegEx : " << it.first << " : " << it.second.getRegExDecorated()); + if (it.second == nullptr) { + continue; + } + if (it.second->isSubParse() == true) { + continue; + } + if (result.m_list.size() == 0) { + result.m_list = it.second->parse(_data, 0, _data.size()); + } else { + int32_t start = 0; + auto itList(result.m_list.begin()); + while (itList != result.m_list.end()) { + if (*itList == nullptr) { + ECI_TODO("remove null shared_ptr"); + ++itList; + continue; + } + if ((*itList)->getStartPos() == start) { + // nothing to do .. + start = (*itList)->getStopPos(); + ++itList; + continue; + } + std::vector> res = it.second->parse(_data, start, (*itList)->getStartPos()); + // append it in the buffer: + if (res.size() > 0) { + int32_t currentPos = std::distance(result.m_list.begin(), itList) + res.size() ; + result.m_list.insert(itList, res.begin(), res.end()); + itList = result.m_list.begin() + currentPos; + } + start = (*itList)->getStopPos(); + ++itList; + } + // Do the last element : + if (start < _data.size()) { + std::vector> res = it.second->parse(_data, start, _data.size()); + for (auto &itRes : res) { + result.m_list.push_back(itRes); + } + } } } return result; } +std::vector> eci::Lexer::TypeBase::parse(const std::string& _data, int32_t _start, int32_t _stop) { + std::vector> result; + ECI_DEBUG("parse : " << getValue()); + while (true) { + std::smatch resultMatch; + std::regex_constants::match_flag_type flags = std::regex_constants::match_any; + //APPL_DEBUG("find data at : start=" << _start << " stop=" << _stop << " regex='" << m_regexValue << "'"); + if ((int64_t)_stop <= (int64_t)_data.size()) { + char val = _data[_stop]; + if ( val != '\n' + && val != '\r') { + //after last char ==> not end of line ($ would not work)) + flags |= std::regex_constants::match_not_eol; + } + if (!( ('a' <= val && val <= 'z') + || ('A' <= val && val <= 'Z') + || ('0' <= val && val <= '9') + || val == '_')) { + flags |= std::regex_constants::match_not_eow; + } + } + if (_start>0) { + flags |= std::regex_constants::match_prev_avail; + } + std::regex_search(_data.begin()+_start, _data.begin()+_stop, resultMatch, regex, flags); + if (resultMatch.size() > 0) { + int32_t start = std::distance(_data.begin(), resultMatch[0].first); + int32_t stop = std::distance(_data.begin(), resultMatch[0].second); + ECI_DEBUG(" find data at : start=" << start << " stop=" << stop << " data='" <(m_tockenId, start, stop)); + } else { + break; + } + } + return result; +} + +std::vector> eci::Lexer::TypeSection::parse(const std::string& _data, int32_t _start, int32_t _stop) { + std::vector> result; + ECI_TODO("later 1"); + return result; +} + +std::vector> eci::Lexer::TypeSubBase::parse(const std::string& _data, int32_t _start, int32_t _stop) { + std::vector> result; + ECI_TODO("later 2"); + return result; +} + +std::vector> eci::Lexer::TypeSubSection::parse(const std::string& _data, int32_t _start, int32_t _stop) { + std::vector> result; + ECI_TODO("later 3"); + return result; +} \ No newline at end of file diff --git a/eci/Lexer.h b/eci/Lexer.h index 6364045..3ea0bb9 100644 --- a/eci/Lexer.h +++ b/eci/Lexer.h @@ -11,21 +11,170 @@ #include #include -#include +#include #include #include namespace eci { - using LexerResult = std::vector>; + class LexerNode { + public: + LexerNode(int32_t _tockenId=-1, int32_t _startPos=-1, int32_t _stopPos=-1) : + m_tockenId(_tockenId), + m_startPos(_startPos), + m_stopPos(_stopPos) { + + } + virtual ~LexerNode() {}; + int32_t m_tockenId; + int32_t m_startPos; + int32_t m_stopPos; + int32_t getStartPos() { + return m_startPos; + } + int32_t getStopPos() { + return m_stopPos; + } + }; + class LexerNodeContainer : public LexerNode { + public: + LexerNodeContainer(int32_t _tockenId=-1, int32_t _startPos=-1, int32_t _stopPos=-1) : + LexerNode(_tockenId, _startPos, _stopPos) { + + } + virtual ~LexerNodeContainer() {}; + std::vector> m_list; + }; + class LexerResult { + private: + std::string m_data; + public: + LexerResult(const std::string& _data="") : + m_data(_data) { + + } + ~LexerResult() {}; + std::vector> m_list; + }; class Lexer { private: - std::map> m_searchList; + #define TYPE_UNKNOW (0) + #define TYPE_BASE (1) + #define TYPE_SECTION (2) + #define TYPE_SUB_BASE (3) + #define TYPE_SUB_SECTION (4) + class Type { + protected: + int32_t m_tockenId; + std::string m_regexValue; + public: + Type(int32_t _tockenId) : + m_tockenId(_tockenId) {} + virtual ~Type() {} + virtual int32_t getType() { + return TYPE_UNKNOW; + } + int32_t getTockenId() { + return m_tockenId; + } + virtual std::vector> parse(const std::string& _data, int32_t _start, int32_t _stop)=0; + std::string getValue() { + return m_regexValue; + }; + virtual bool isSubParse() { + return false; + } + }; + class TypeBase : public Type { + public: + std::regex regex; + TypeBase(int32_t _tockenId, const std::string& _regex="") : + Type(_tockenId), + regex(_regex, std::regex_constants::optimize | std::regex_constants::ECMAScript) { + m_regexValue = _regex; + } + virtual int32_t getType() { + return TYPE_BASE; + } + std::vector> parse(const std::string& _data, int32_t _start, int32_t _stop); + }; + class TypeSection : public Type { + public: + std::regex regexStart; + std::regex regexStop; + TypeSection(int32_t _tockenId, const std::string& _regexStart="", const std::string& _regexStop="") : + Type(_tockenId), + regexStart(_regexStart, std::regex_constants::optimize | std::regex_constants::ECMAScript), + regexStop(_regexStop, std::regex_constants::optimize | std::regex_constants::ECMAScript) { + m_regexValue = _regexStart + " -> " + _regexStop; + } + virtual int32_t getType() { + return TYPE_SECTION; + } + std::vector> parse(const std::string& _data, int32_t _start, int32_t _stop); + }; + class TypeSubBase : public TypeBase { + public: + int32_t parrent; + TypeSubBase(int32_t _tockenId, int32_t _tokenIdParrent=-1, const std::string& _regex="") : + TypeBase(_tockenId, _regex), + parrent(_tokenIdParrent) {} + virtual int32_t getType() { + return TYPE_SUB_BASE; + } + std::vector> parse(const std::string& _data, int32_t _start, int32_t _stop); + bool isSubParse() { + return true; + } + }; + class TypeSubSection : public TypeSection { + public: + int32_t parrent; + TypeSubSection(int32_t _tockenId, int32_t _tokenIdParrent=-1, const std::string& _regexStart="", const std::string& _regexStop="") : + TypeSection(_tockenId, _regexStart, _regexStop), + parrent(_tokenIdParrent) {} + virtual int32_t getType() { + return TYPE_SUB_SECTION; + } + std::vector> parse(const std::string& _data, int32_t _start, int32_t _stop); + bool isSubParse() { + return true; + } + }; + std::map> m_searchList; public: Lexer(); ~Lexer(); + /** + * @brief Append a Token recognition. + * @param[in] _tokenId Tocken id value. + * @param[in] _regularExpression reconise regular expression. + */ void append(int32_t _tokenId, const std::string& _regularExpression); + /** + * @brief Append a Token recognition (section reconise start and stop with counting the number of start and stop). + * @param[in] _tokenId Tocken id value. + * @param[in] _regularExpressionStart reconise regular expression (start). + * @param[in] _regularExpressionStop reconise regular expression (stop). + */ + void appendSection(int32_t _tokenId, const std::string& _regularExpressionStart, const std::string& _regularExpressionStop); + /** + * @brief Append a Token recognition (sub parsing). + * @param[in] _tokenIdParrent parrent Tocken id value. + * @param[in] _tokenId Tocken id value. + * @param[in] _regularExpression reconise regular expression. + */ + void appendSub(int32_t _tokenIdParrent, int32_t _tokenId, const std::string& _regularExpression); + /** + * @brief Append a Token recognition (sub parsing) (section reconise start and stop with counting the number of start and stop). + * @param[in] _tokenIdParrent parrent Tocken id value. + * @param[in] _tokenId Tocken id value. + * @param[in] _regularExpressionStart reconise regular expression (start). + * @param[in] _regularExpressionStop reconise regular expression (stop). + */ + void appendSubSection(int32_t _tokenIdParrent, int32_t _tokenId, const std::string& _regularExpressionStart, const std::string& _regularExpressionStop); + LexerResult interprete(const std::string& _data); }; } diff --git a/eci/eci.cpp b/eci/eci.cpp index a3877b2..066e31c 100644 --- a/eci/eci.cpp +++ b/eci/eci.cpp @@ -13,17 +13,18 @@ int main(int argc, char** argv) { etk::log::setLevel(etk::log::logLevelDebug); - etk::log::setLevel(etk::log::logLevelInfo); + //etk::log::setLevel(etk::log::logLevelInfo); ECI_INFO("Start Application interpeter languages"); if (argc<=1) { ECI_CRITICAL("need the file to parse"); return -1; } eci::ParserCpp tmpParser; - std::string data = "/* plop */ \n int eee = 22; // error value \nint main(void) {\n return 0;\n}\n";//etk::FSNodeReadAllData(argv[1]); + //std::string data = "/* plop */ \n int eee = 22; // error value \nint main(void) {\n return 0;\n}\n";//etk::FSNodeReadAllData(argv[1]); //std::string data = "alpha /* plop */ test"; //std::string data = "pp \n // qdfqdfsdf \nde"; - tmpParser.parse(data); + //tmpParser.parse(data); + tmpParser.parse(etk::FSNodeReadAllData(argv[1])); return 0; } \ No newline at end of file diff --git a/eci/lang/ParserCpp.cpp b/eci/lang/ParserCpp.cpp index 2e71794..ab1d19f 100644 --- a/eci/lang/ParserCpp.cpp +++ b/eci/lang/ParserCpp.cpp @@ -7,19 +7,81 @@ */ #include +#include enum cppTokenList { - tokenCppMultilineComment, - tokenCppSingleLineComment, - tokenCppString, + tokenCppCommentMultiline, + tokenCppCommentSingleLine, + tokenCppPreProcessor, + tokenCppPreProcessorIf, + tokenCppPreProcessorElse, + tokenCppPreProcessorEndif, + tokenCppPreProcessorIfdef, + tokenCppPreProcessorIfndef, + tokenCppPreProcessorDefine, + tokenCppPreProcessorWarning, + tokenCppPreProcessorError, + tokenCppPreProcessorInclude, + tokenCppPreProcessorImport, + tokenCppPreProcessorSectionPthese, + tokenCppStringDoubleQuote, + tokenCppStringSimpleQuote, + tokenCppSectionBrace, + tokenCppSectionPthese, + tokenCppSectionHook, + tokenCppBranch, + tokenCppSystem, + tokenCppType, + tokenCppVisibility, + tokenCppContener, + tokenCppTypeDef, + tokenCppAuto, + tokenCppNullptr, + tokenCppSystemDefine, + tokenCppNumericValue, + tokenCppBoolean, + tokenCppCondition, + tokenCppAssignation, + tokenCppString, + tokenCppSeparator, }; eci::ParserCpp::ParserCpp() { - m_lexer.append(tokenCppMultilineComment, "/\\*.*\\*/"); - m_lexer.append(tokenCppSingleLineComment, "//.*$"); - m_lexer.append(82939, "/\\*.*"); - m_lexer.append(tokenCppString, "[a-z]"); + m_lexer.append(tokenCppCommentMultiline, "/\\*(.|\\r|\\n)*?(\\*/|\\0)"); + m_lexer.append(tokenCppCommentSingleLine, "//.*"); + m_lexer.append(tokenCppPreProcessor, "#(.|\\\\[\\\\\\n])*"); + m_lexer.appendSub(tokenCppPreProcessor, tokenCppPreProcessorIf, "\\bif\\b"); + m_lexer.appendSub(tokenCppPreProcessor, tokenCppPreProcessorElse, "\\belse\\b"); + m_lexer.appendSub(tokenCppPreProcessor, tokenCppPreProcessorEndif, "\\bendif\\b"); + m_lexer.appendSub(tokenCppPreProcessor, tokenCppPreProcessorIfdef, "\\bifdef\\b"); + m_lexer.appendSub(tokenCppPreProcessor, tokenCppPreProcessorIfndef, "\\bifndef\\b"); + m_lexer.appendSub(tokenCppPreProcessor, tokenCppPreProcessorDefine, "\\bdefine\\b"); + m_lexer.appendSub(tokenCppPreProcessor, tokenCppPreProcessorWarning, "\\bwarning\\b"); + m_lexer.appendSub(tokenCppPreProcessor, tokenCppPreProcessorError, "\\berror\\b"); + m_lexer.appendSub(tokenCppPreProcessor, tokenCppPreProcessorInclude, "\\binclude\\b"); + m_lexer.appendSub(tokenCppPreProcessor, tokenCppPreProcessorImport, "\\bimport\\b"); // specific to c++ interpreted + m_lexer.appendSubSection(tokenCppPreProcessor, tokenCppPreProcessorSectionPthese, "\\(", "\\)"); + m_lexer.append(tokenCppStringDoubleQuote, "\"(.|\\\\[\\\\\"])*?\""); + m_lexer.append(tokenCppStringSimpleQuote, "'\\?.'"); + m_lexer.appendSection(tokenCppSectionBrace, "\\{", "\\}"); + m_lexer.appendSection(tokenCppSectionPthese, "\\(", "\\)"); + m_lexer.appendSection(tokenCppSectionHook, "\\[", "\\]"); + m_lexer.append(tokenCppBranch, "\\b(return|goto|if|else|case|default|break|continue|while|do|for)\\b"); + m_lexer.append(tokenCppSystem, "\\b(new|delete|try|catch)\\b"); + m_lexer.append(tokenCppType, "\\b(bool|char(16_t|32_t)?|double|float|u?int(8|16|32|64|128)?(_t)?|long|short|signed|size_t|unsigned|void|(I|U)(8|16|32|64|128))\\b"); + m_lexer.append(tokenCppVisibility, "\\b(inline|const|virtual|private|public|protected|friend|const|extern|register|static|volatile)\\b"); + m_lexer.append(tokenCppContener, "\\b(class|namespace|struct|union|enum)\\b"); + m_lexer.append(tokenCppTypeDef, "\\btypedef\\b"); + m_lexer.append(tokenCppAuto, "\\bauto\\b"); + m_lexer.append(tokenCppNullptr, "\\b(NULL|nullptr)\\b"); + m_lexer.append(tokenCppSystemDefine, "\\b__(LINE|DATA|FILE|func|TIME|STDC)__\\b"); + m_lexer.append(tokenCppNumericValue, "\\b(((0(x|X)[0-9a-fA-F]*)|(\\d+\\.?\\d*|\\.\\d+)((e|E)(\\+|\\-)?\\d+)?)(L|l|UL|ul|u|U|F|f)?)\\b"); + m_lexer.append(tokenCppBoolean, "\\b(true|false)\\b"); + m_lexer.append(tokenCppCondition, "==|>=|<=|!=|<|>|&&|\\|\\|"); + m_lexer.append(tokenCppAssignation, "(=|\\*|/|-|+|&)"); + m_lexer.append(tokenCppString, "\\w+"); + m_lexer.append(tokenCppSeparator, "(;|,|::|:)"); } eci::ParserCpp::~ParserCpp() { @@ -28,6 +90,10 @@ eci::ParserCpp::~ParserCpp() { bool eci::ParserCpp::parse(const std::string& _data) { m_result = m_lexer.interprete(_data); + ECI_INFO("find :"); + for (auto &it : m_result.m_list) { + ECI_INFO(" start=" << it->getStartPos() << " stop=" << it->getStopPos() << " data='" <getStartPos(), it->getStopPos()-it->getStartPos()) << "'" ); + } return false; } diff --git a/tests/01_comment.c b/tests/01_comment.c index ce8d687..406305d 100644 --- a/tests/01_comment.c +++ b/tests/01_comment.c @@ -5,6 +5,7 @@ printf("Hello\n"); /* this is a comment */ printf("Hello\n"); printf("Hello\n"); // this is also a comment sayhello(); printf("Hello\n"); +printf("Hello\n"); /* this is a second comment */ printf("Hello\n"); void main() {}