[DEV] start parsing C++ files

2014-12-29 23:06:04 +01:00 · 2014-12-29 23:06:04 +01:00 · 3b85d09ffa
commit 3b85d09ffa
parent d92b12b3e1
5 changed files with 361 additions and 20 deletions
--- a/eci/Lexer.cpp
+++ b/eci/Lexer.cpp
@ -18,20 +18,144 @@ eci::Lexer::~Lexer() {
 }
 void eci::Lexer::append(int32_t _tokenId, const std::string& _regularExpression) {
-	m_searchList.insert(std::make_pair(_tokenId, etk::RegExp<std::string>(_regularExpression)));
+	ECI_INFO("CPP lexer add : '" << _regularExpression << "'");
-	etk::RegExp<std::string>(_regularExpression).display();
+	try {
 		m_searchList.insert(std::make_pair(_tokenId, std::make_shared<eci::Lexer::TypeBase>(_tokenId, _regularExpression)));
 	} catch (std::exception e){
 		ECI_ERROR(" create reg exp : '" << _regularExpression << "' : what:" << e.what());
 	}
 }
 // Register a section token delimited by a start and a stop expression.
 // Both patterns are compiled by the TypeSection constructor (std::regex throws on an
 // invalid pattern); the error is logged and the token is then simply not registered.
 // std::map::insert leaves an already registered _tokenId untouched.
 void eci::Lexer::appendSection(int32_t _tokenId, const std::string& _regularExpressionStart, const std::string& _regularExpressionStop) {
 	ECI_INFO("CPP lexer add section : '" << _regularExpressionStart << "' .. '" << _regularExpressionStop << "'");
 	try {
 		m_searchList.insert(std::make_pair(_tokenId, std::make_shared<eci::Lexer::TypeSection>(_tokenId, _regularExpressionStart, _regularExpressionStop)));
 	} catch (const std::exception& e) {
 		// Caught by const reference: a by-value catch slices the exception and what() loses the regex error text.
 		ECI_ERROR(" create reg exp : '" << _regularExpressionStart << "' .. '" << _regularExpressionStop << "' : what:" << e.what());
 	}
 }
 // Register a sub-token: a plain expression attached to the parent token _tokenIdParrent.
 // Sub-tokens report isSubParse() == true, so interprete() skips them at top level.
 // An invalid pattern makes the TypeSubBase constructor throw; the error is logged and
 // the token is then not registered. std::map::insert leaves an existing _tokenId untouched.
 void eci::Lexer::appendSub(int32_t _tokenIdParrent, int32_t _tokenId, const std::string& _regularExpression) {
 	ECI_INFO("CPP lexer add sub : [" << _tokenIdParrent << "] '" << _regularExpression << "'");
 	try {
 		m_searchList.insert(std::make_pair(_tokenId, std::make_shared<eci::Lexer::TypeSubBase>(_tokenId, _tokenIdParrent, _regularExpression)));
 	} catch (const std::exception& e) {
 		// Caught by const reference: a by-value catch slices the exception and what() loses the regex error text.
 		ECI_ERROR(" create reg exp : '" << _regularExpression << "' : what:" << e.what());
 	}
 }
 // Register a sub-section token: a start/stop delimited section attached to the parent
 // token _tokenIdParrent. Sub-tokens report isSubParse() == true, so interprete() skips
 // them at top level. An invalid pattern makes the TypeSubSection constructor throw; the
 // error is logged and the token is then not registered. std::map::insert leaves an
 // existing _tokenId untouched.
 void eci::Lexer::appendSubSection(int32_t _tokenIdParrent, int32_t _tokenId, const std::string& _regularExpressionStart, const std::string& _regularExpressionStop) {
 	ECI_INFO("CPP lexer add section sub : [" << _tokenIdParrent << "] '" << _regularExpressionStart << "' .. '" << _regularExpressionStop << "'");
 	try {
 		m_searchList.insert(std::make_pair(_tokenId, std::make_shared<eci::Lexer::TypeSubSection>(_tokenId, _tokenIdParrent, _regularExpressionStart, _regularExpressionStop)));
 	} catch (const std::exception& e) {
 		// Caught by const reference: a by-value catch slices the exception and what() loses the regex error text.
 		ECI_ERROR(" create reg exp : '" << _regularExpressionStart << "' .. '" << _regularExpressionStop << "' : what:" << e.what());
 	}
 }
 /**
  * @brief Split _data into token nodes using every registered top-level matcher.
  *
  * Matchers are visited in m_searchList order (a std::map, so ascending token id).
  * Null entries and sub-parse matchers are skipped. While the result list is still
  * empty a matcher scans the whole text; afterwards a matcher only scans the gaps
  * between the nodes already found, plus the tail after the last node, so earlier
  * matchers take precedence over later ones.
  * @param[in] _data Text to analyse.
  * @return Result object holding a copy of _data and the list of nodes found.
  */
 eci::LexerResult eci::Lexer::interprete(const std::string& _data) {
-	eci::LexerResult result;
+	eci::LexerResult result(_data);
 	ECI_INFO("Parse : \n" << _data);
 	for (auto &it : m_searchList) {
-		ECI_INFO("Parse RegEx : " << it.first << " : " << it.second.getRegExDecorated());
+		//ECI_INFO("Parse RegEx : " << it.first << " : " << it.second.getRegExDecorated());
-		if (it.second.parse(_data, 0, _data.size()) == true) {
+		if (it.second == nullptr) {
-			ECI_INFO("    match [" << it.second.start() << ".." << it.second.stop() << "] ");
+			continue;
-			ECI_INFO("        ==> '" << std::string(_data, it.second.start(), it.second.stop()-it.second.start()) << "'");
+		}
 		// Sub-parse matchers are not applied at top level.
 		if (it.second->isSubParse() == true) {
 			continue;
 		}
 		if (result.m_list.size() == 0) {
 			// Nothing found so far: this matcher scans the complete text.
 			result.m_list = it.second->parse(_data, 0, _data.size());
 		} else {
 			// 'start' is the end of the previous node, i.e. the beginning of the current gap.
 			int32_t start = 0;
 			auto itList(result.m_list.begin());
 			while (itList != result.m_list.end()) {
 				if (*itList == nullptr) {
 					ECI_TODO("remove null shared_ptr");
 					++itList;
 					continue;
 				}
 				if ((*itList)->getStartPos() == start) {
 					// nothing to do ..
 					start = (*itList)->getStopPos();
 					++itList;
 					continue;
 				}
 				// Scan the gap [start, start of the current node) with this matcher.
 				std::vector<std::shared_ptr<eci::LexerNode>> res = it.second->parse(_data, start, (*itList)->getStartPos());
 				// append it in the buffer:
 				if (res.size() > 0) {
 					// insert() may reallocate and invalidate itList: rebuild it from an index
 					// that points at the same node, now located after the inserted ones.
 					int32_t currentPos = std::distance(result.m_list.begin(), itList) + res.size() ;
 					result.m_list.insert(itList, res.begin(), res.end());
 					itList = result.m_list.begin() + currentPos;
 				}
 				start = (*itList)->getStopPos();
 				++itList;
 			}
 			// Do the last element :
 			if (start < _data.size()) {
 				std::vector<std::shared_ptr<eci::LexerNode>> res = it.second->parse(_data, start, _data.size());
 				for (auto &itRes : res) {
 					result.m_list.push_back(itRes);
 				}
 			}
 		}
 	}
 	return result;
 }
 /**
  * @brief Find every match of this token's regular expression inside [_start, _stop) of _data.
  * @param[in] _data Complete text being analysed.
  * @param[in] _start Offset of the first character to scan (clamped to 0).
  * @param[in] _stop Offset one past the last character to scan (clamped to _data.size()).
  * @return One node per non-empty match, in increasing position order; positions are offsets in _data.
  */
 std::vector<std::shared_ptr<eci::LexerNode>> eci::Lexer::TypeBase::parse(const std::string& _data, int32_t _start, int32_t _stop) {
 	std::vector<std::shared_ptr<eci::LexerNode>> result;
 	ECI_DEBUG("parse : " << getValue());
 	const int64_t dataSize = static_cast<int64_t>(_data.size());
 	// Iterators built from out-of-range offsets are undefined behaviour: clamp the window first.
 	if (_start < 0) {
 		_start = 0;
 	}
 	if (static_cast<int64_t>(_stop) > dataSize) {
 		_stop = static_cast<int32_t>(dataSize);
 	}
 	// The flags describing the end of the window only depend on _stop: compute them once.
 	std::regex_constants::match_flag_type endFlags = std::regex_constants::match_any;
 	if (static_cast<int64_t>(_stop) < dataSize) {
 		// The window stops before the end of the data: look at the character that follows it.
 		const char next = _data[_stop];
 		if (    next != '\n'
 		     && next != '\r') {
 			// the window does not stop at an end of line ==> '$' must not match there
 			endFlags |= std::regex_constants::match_not_eol;
 		}
 		const bool nextIsWord =    ('a' <= next && next <= 'z')
 		                        || ('A' <= next && next <= 'Z')
 		                        || ('0' <= next && next <= '9')
 		                        || next == '_';
 		if (nextIsWord == true) {
 			// the word continues after the window ==> '\b' must not match at its end
 			endFlags |= std::regex_constants::match_not_eow;
 		}
 	}
 	// When the window reaches the end of the data, it is a real end of line and end of word: no flag.
 	while (_start < _stop) {
 		std::regex_constants::match_flag_type flags = endFlags;
 		if (_start > 0) {
 			// lets '^' and '\b' inspect the character located just before the window
 			flags |= std::regex_constants::match_prev_avail;
 		}
 		std::smatch resultMatch;
 		if (std::regex_search(_data.begin()+_start, _data.begin()+_stop, resultMatch, regex, flags) == false) {
 			break;
 		}
 		const int32_t start = static_cast<int32_t>(std::distance(_data.begin(), resultMatch[0].first));
 		const int32_t stop = static_cast<int32_t>(std::distance(_data.begin(), resultMatch[0].second));
 		if (start == stop) {
 			// Zero-length match: it produces no token; step over one character so the scan always progresses.
 			_start = stop + 1;
 			continue;
 		}
 		ECI_DEBUG("    find data at : start=" << start << " stop=" << stop << " data='" <<std::string(_data, start, stop-start) << "'" );
 		result.push_back(std::make_shared<eci::LexerNode>(m_tockenId, start, stop));
 		_start = stop;
 	}
 	return result;
 }
 // Section matching is not implemented yet: logs a TODO and reports no node.
 std::vector<std::shared_ptr<eci::LexerNode>> eci::Lexer::TypeSection::parse(const std::string& _data, int32_t _start, int32_t _stop) {
 	ECI_TODO("later 1");
 	return std::vector<std::shared_ptr<eci::LexerNode>>();
 }
 // Sub-token matching is not implemented yet: logs a TODO and reports no node.
 std::vector<std::shared_ptr<eci::LexerNode>> eci::Lexer::TypeSubBase::parse(const std::string& _data, int32_t _start, int32_t _stop) {
 	ECI_TODO("later 2");
 	return std::vector<std::shared_ptr<eci::LexerNode>>();
 }
 // Sub-section matching is not implemented yet: logs a TODO and reports no node.
 std::vector<std::shared_ptr<eci::LexerNode>> eci::Lexer::TypeSubSection::parse(const std::string& _data, int32_t _start, int32_t _stop) {
 	ECI_TODO("later 3");
 	return std::vector<std::shared_ptr<eci::LexerNode>>();
 }
--- a/eci/Lexer.h
+++ b/eci/Lexer.h
@ -11,21 +11,170 @@
 #include <etk/types.h>
 #include <etk/stdTools.h>
-#include <etk/RegExp.h>
+#include <regex>
 #include <map>
 #include <vector>
 namespace eci {
-	using LexerResult = std::vector<std::pair<int32_t, std::string>>;
+	class LexerNode {
 		public:
 			LexerNode(int32_t _tockenId=-1, int32_t _startPos=-1, int32_t _stopPos=-1) :
 			  m_tockenId(_tockenId),
 			  m_startPos(_startPos),
 			  m_stopPos(_stopPos) {
 			}
 			virtual ~LexerNode() {};
 			int32_t m_tockenId;
 			int32_t m_startPos;
 			int32_t m_stopPos;
 			int32_t getStartPos() {
 				return m_startPos;
 			}
 			int32_t getStopPos() {
 				return m_stopPos;
 			}
 	};
 	class LexerNodeContainer : public LexerNode {
 		public:
 			LexerNodeContainer(int32_t _tockenId=-1, int32_t _startPos=-1, int32_t _stopPos=-1) :
 			  LexerNode(_tockenId, _startPos, _stopPos) {
 			}
 			virtual ~LexerNodeContainer() {};
 			std::vector<std::shared_ptr<eci::LexerNode>> m_list;
 	};
 	/**
 	 * @brief Result of a lexer run: a private copy of the analysed text and the list of nodes found.
 	 */
 	class LexerResult {
 		private:
 			std::string m_data; //!< copy of the text given to the constructor
 		public:
 			/**
 			 * @brief Build a result.
 			 * @param[in] _data Text that was analysed (copied).
 			 */
 			LexerResult(const std::string& _data="") :
 			  m_data(_data) {
 			}
 			// No destructor is declared on purpose: a user-declared one suppresses the implicit
 			// move operations, and every assignment from a returned result would deep-copy the
 			// text and the node list.
 			std::vector<std::shared_ptr<eci::LexerNode>> m_list; //!< nodes found
 	};
 	class Lexer {
 		private:
-			std::map<int32_t, etk::RegExp<std::string>> m_searchList;
+			#define TYPE_UNKNOW (0)
 			#define TYPE_BASE (1)
 			#define TYPE_SECTION (2)
 			#define TYPE_SUB_BASE (3)
 			#define TYPE_SUB_SECTION (4)
 			class Type {
 				protected:
 					int32_t m_tockenId;
 					std::string m_regexValue;
 				public:
 					Type(int32_t _tockenId) :
 					  m_tockenId(_tockenId) {}
 					virtual ~Type() {}
 					virtual int32_t getType() {
 						return TYPE_UNKNOW;
 					}
 					int32_t getTockenId() {
 						return m_tockenId;
 					}
 					virtual std::vector<std::shared_ptr<eci::LexerNode>> parse(const std::string& _data, int32_t _start, int32_t _stop)=0;
 					std::string getValue() {
 						return m_regexValue;
 					};
 					virtual bool isSubParse() {
 						return false;
 					}
 			};
 			class TypeBase : public Type {
 				public:
 					std::regex regex;
 					TypeBase(int32_t _tockenId, const std::string& _regex="") :
 					  Type(_tockenId),
 					  regex(_regex, std::regex_constants::optimize | std::regex_constants::ECMAScript) {
 						m_regexValue = _regex;
 					}
 					virtual int32_t getType() {
 						return TYPE_BASE;
 					}
 					std::vector<std::shared_ptr<eci::LexerNode>> parse(const std::string& _data, int32_t _start, int32_t _stop);
 			};
 			class TypeSection : public Type {
 				public:
 					std::regex regexStart;
 					std::regex regexStop;
 					TypeSection(int32_t _tockenId, const std::string& _regexStart="", const std::string& _regexStop="") :
 					  Type(_tockenId),
 					  regexStart(_regexStart, std::regex_constants::optimize | std::regex_constants::ECMAScript),
 					  regexStop(_regexStop, std::regex_constants::optimize | std::regex_constants::ECMAScript) {
 						m_regexValue = _regexStart + " -> " + _regexStop;
 					}
 					virtual int32_t getType() {
 						return TYPE_SECTION;
 					}
 					std::vector<std::shared_ptr<eci::LexerNode>> parse(const std::string& _data, int32_t _start, int32_t _stop);
 			};
 			class TypeSubBase : public TypeBase {
 				public:
 					int32_t parrent;
 					TypeSubBase(int32_t _tockenId, int32_t _tokenIdParrent=-1, const std::string& _regex="") :
 					  TypeBase(_tockenId, _regex),
 					  parrent(_tokenIdParrent) {}
 					virtual int32_t getType() {
 						return TYPE_SUB_BASE;
 					}
 					std::vector<std::shared_ptr<eci::LexerNode>> parse(const std::string& _data, int32_t _start, int32_t _stop);
 					bool isSubParse() {
 						return true;
 					}
 			};
 			class TypeSubSection : public TypeSection {
 				public:
 					int32_t parrent;
 					TypeSubSection(int32_t _tockenId, int32_t _tokenIdParrent=-1, const std::string& _regexStart="", const std::string& _regexStop="") :
 					  TypeSection(_tockenId, _regexStart, _regexStop),
 					  parrent(_tokenIdParrent) {}
 					virtual int32_t getType() {
 						return TYPE_SUB_SECTION;
 					}
 					std::vector<std::shared_ptr<eci::LexerNode>> parse(const std::string& _data, int32_t _start, int32_t _stop);
 					bool isSubParse() {
 						return true;
 					}
 			};
 			std::map<int32_t, std::shared_ptr<eci::Lexer::Type>> m_searchList;
 		public:
 			Lexer();
 			~Lexer();
 			/**
 			 * @brief Append a Token recognition.
 			 * @param[in] _tokenId Tocken id value.
 			 * @param[in] _regularExpression reconise regular expression.
 			 */
 			void append(int32_t _tokenId, const std::string& _regularExpression);
 			/**
 			 * @brief Append a Token recognition (section reconise start and stop with counting the number of start and stop).
 			 * @param[in] _tokenId Tocken id value.
 			 * @param[in] _regularExpressionStart reconise regular expression (start).
 			 * @param[in] _regularExpressionStop reconise regular expression (stop).
 			 */
 			void appendSection(int32_t _tokenId, const std::string& _regularExpressionStart, const std::string& _regularExpressionStop);
 			/**
 			 * @brief Append a Token recognition (sub parsing).
 			 * @param[in] _tokenIdParrent parrent Tocken id value.
 			 * @param[in] _tokenId Tocken id value.
 			 * @param[in] _regularExpression reconise regular expression.
 			 */
 			void appendSub(int32_t _tokenIdParrent, int32_t _tokenId, const std::string& _regularExpression);
 			/**
 			 * @brief Append a Token recognition (sub parsing) (section reconise start and stop with counting the number of start and stop).
 			 * @param[in] _tokenIdParrent parrent Tocken id value.
 			 * @param[in] _tokenId Tocken id value.
 			 * @param[in] _regularExpressionStart reconise regular expression (start).
 			 * @param[in] _regularExpressionStop reconise regular expression (stop).
 			 */
 			void appendSubSection(int32_t _tokenIdParrent, int32_t _tokenId, const std::string& _regularExpressionStart, const std::string& _regularExpressionStop);
 			LexerResult interprete(const std::string& _data);
 	};
 }
--- a/eci/eci.cpp
+++ b/eci/eci.cpp
@ -13,17 +13,18 @@
 /**
  * @brief Entry point: parse the file given as first command line argument with the C++ parser.
  * @param[in] argc Number of command line arguments.
  * @param[in] argv Command line arguments; argv[1] is the file to parse.
  * @return -1 when no file is given, 0 otherwise (the value returned by parse() is not checked).
  */
 int main(int argc, char** argv) {
 	// Debug log level is forced.
 	etk::log::setLevel(etk::log::logLevelDebug);
-	etk::log::setLevel(etk::log::logLevelInfo);
+	//etk::log::setLevel(etk::log::logLevelInfo);
 	ECI_INFO("Start Application interpeter languages");
 	// A file name is mandatory.
 	if (argc<=1) {
 		ECI_CRITICAL("need the file to parse");
 		return -1;
 	}
 	eci::ParserCpp tmpParser;
-	std::string data = "/* plop */ \n int eee = 22; // error value \nint main(void) {\n return 0;\n}\n";//etk::FSNodeReadAllData(argv[1]);
+	//std::string data = "/* plop */ \n int eee = 22; // error value \nint main(void) {\n return 0;\n}\n";//etk::FSNodeReadAllData(argv[1]);
 	//std::string data = "alpha /* plop */ test";
 	//std::string data = "pp \n // qdfqdfsdf \nde";
-	tmpParser.parse(data);
+	//tmpParser.parse(data);
 	// Read the file named by argv[1] and hand its content to the parser.
 	tmpParser.parse(etk::FSNodeReadAllData(argv[1]));
 	return 0;
 }
--- a/eci/lang/ParserCpp.cpp
+++ b/eci/lang/ParserCpp.cpp
@ -7,19 +7,81 @@
 */
 #include <eci/lang/ParserCpp.h>
 #include <eci/debug.h>
 // Token ids used by the C++ parser.
 // The values are implicit (0, 1, 2, ...), so the declaration order is significant:
 // the lexer keeps its matchers in a std::map indexed by token id and applies the
 // top-level ones in that order. Reordering the enumerators changes which matcher
 // gets the first chance on the text.
 enum cppTokenList {
-	tokenCppMultilineComment,
+	tokenCppCommentMultiline,
-	tokenCppSingleLineComment,
+	tokenCppCommentSingleLine,
-	tokenCppString,
+	tokenCppPreProcessor,
 	// sub-tokens registered with tokenCppPreProcessor as parent
 	tokenCppPreProcessorIf,
 	tokenCppPreProcessorElse,
 	tokenCppPreProcessorEndif,
 	tokenCppPreProcessorIfdef,
 	tokenCppPreProcessorIfndef,
 	tokenCppPreProcessorDefine,
 	tokenCppPreProcessorWarning,
 	tokenCppPreProcessorError,
 	tokenCppPreProcessorInclude,
 	tokenCppPreProcessorImport,
 	tokenCppPreProcessorSectionPthese,
 	// string and character literals
 	tokenCppStringDoubleQuote,
 	tokenCppStringSimpleQuote,
 	// sections delimited by {}, () and []
 	tokenCppSectionBrace,
 	tokenCppSectionPthese,
 	tokenCppSectionHook,
 	// keywords, values and operators
 	tokenCppBranch,
 	tokenCppSystem,
 	tokenCppType,
 	tokenCppVisibility,
 	tokenCppContener,
 	tokenCppTypeDef,
 	tokenCppAuto,
 	tokenCppNullptr,
 	tokenCppSystemDefine,
 	tokenCppNumericValue,
 	tokenCppBoolean,
 	tokenCppCondition,
 	tokenCppAssignation,
 	// any remaining word (registered with "\\w+")
 	tokenCppString,
 	tokenCppSeparator,
 };
 // Register every C++ token matcher in the lexer.
 // The lexer indexes its matchers by token id and applies the top-level ones in
 // ascending id order, so the priority comes from the cppTokenList declaration order,
 // not from the order of the calls below. All patterns use the ECMAScript grammar.
 eci::ParserCpp::ParserCpp() {
-	m_lexer.append(tokenCppMultilineComment, "/\\*.*\\*/");
 	m_lexer.append(tokenCppCommentMultiline, "/\\*(.|\\r|\\n)*?(\\*/|\\0)");
-	m_lexer.append(tokenCppSingleLineComment, "//.*$");
 	m_lexer.append(tokenCppCommentSingleLine, "//.*");
-	m_lexer.append(82939, "/\\*.*");
 	// The escaped alternative (backslash + backslash/newline) must be tried BEFORE '.':
 	// otherwise '.' consumes the backslash and the directive stops at the first line continuation.
 	m_lexer.append(tokenCppPreProcessor, "#(\\\\[\\\\\\n]|.)*");
-	m_lexer.append(tokenCppString, "[a-z]");
 	m_lexer.appendSub(tokenCppPreProcessor, tokenCppPreProcessorIf, "\\bif\\b");
 	m_lexer.appendSub(tokenCppPreProcessor, tokenCppPreProcessorElse, "\\belse\\b");
 	m_lexer.appendSub(tokenCppPreProcessor, tokenCppPreProcessorEndif, "\\bendif\\b");
 	m_lexer.appendSub(tokenCppPreProcessor, tokenCppPreProcessorIfdef, "\\bifdef\\b");
 	m_lexer.appendSub(tokenCppPreProcessor, tokenCppPreProcessorIfndef, "\\bifndef\\b");
 	m_lexer.appendSub(tokenCppPreProcessor, tokenCppPreProcessorDefine, "\\bdefine\\b");
 	m_lexer.appendSub(tokenCppPreProcessor, tokenCppPreProcessorWarning, "\\bwarning\\b");
 	m_lexer.appendSub(tokenCppPreProcessor, tokenCppPreProcessorError, "\\berror\\b");
 	m_lexer.appendSub(tokenCppPreProcessor, tokenCppPreProcessorInclude, "\\binclude\\b");
 	m_lexer.appendSub(tokenCppPreProcessor, tokenCppPreProcessorImport, "\\bimport\\b"); // specific to c++ interpreted
 	m_lexer.appendSubSection(tokenCppPreProcessor, tokenCppPreProcessorSectionPthese, "\\(", "\\)");
 	// Escape alternative first: with '.' first, the backslash of \" is consumed alone and
 	// the lazy match stops on the escaped quote.
 	m_lexer.append(tokenCppStringDoubleQuote, "\"(\\\\[\\\\\"]|.)*?\"");
 	// Optional backslash followed by one character ('a', '\n'); the previous pattern
 	// escaped the '?' and therefore only matched the literal sequence '?x'.
 	m_lexer.append(tokenCppStringSimpleQuote, "'\\\\?.'");
 	m_lexer.appendSection(tokenCppSectionBrace, "\\{", "\\}");
 	m_lexer.appendSection(tokenCppSectionPthese, "\\(", "\\)");
 	m_lexer.appendSection(tokenCppSectionHook, "\\[", "\\]");
 	m_lexer.append(tokenCppBranch, "\\b(return|goto|if|else|case|default|break|continue|while|do|for)\\b");
 	m_lexer.append(tokenCppSystem, "\\b(new|delete|try|catch)\\b");
 	m_lexer.append(tokenCppType, "\\b(bool|char(16_t|32_t)?|double|float|u?int(8|16|32|64|128)?(_t)?|long|short|signed|size_t|unsigned|void|(I|U)(8|16|32|64|128))\\b");
 	m_lexer.append(tokenCppVisibility, "\\b(inline|const|virtual|private|public|protected|friend|const|extern|register|static|volatile)\\b");
 	m_lexer.append(tokenCppContener, "\\b(class|namespace|struct|union|enum)\\b");
 	m_lexer.append(tokenCppTypeDef, "\\btypedef\\b");
 	m_lexer.append(tokenCppAuto, "\\bauto\\b");
 	m_lexer.append(tokenCppNullptr, "\\b(NULL|nullptr)\\b");
 	// __DATE__ is the predefined macro (the pattern previously listed __DATA__).
 	m_lexer.append(tokenCppSystemDefine, "\\b__(LINE|DATE|FILE|func|TIME|STDC)__\\b");
 	m_lexer.append(tokenCppNumericValue, "\\b(((0(x|X)[0-9a-fA-F]*)|(\\d+\\.?\\d*|\\.\\d+)((e|E)(\\+|\\-)?\\d+)?)(L|l|UL|ul|u|U|F|f)?)\\b");
 	m_lexer.append(tokenCppBoolean, "\\b(true|false)\\b");
 	m_lexer.append(tokenCppCondition, "==|>=|<=|!=|<|>|&&|\\|\\|");
 	// '+' must be escaped: a bare '+' is a quantifier with nothing to repeat, the pattern
 	// was rejected by std::regex and the token was never registered.
 	m_lexer.append(tokenCppAssignation, "(=|\\*|/|-|\\+|&)");
 	m_lexer.append(tokenCppString, "\\w+");
 	m_lexer.append(tokenCppSeparator, "(;|,|::|:)");
 }
 eci::ParserCpp::~ParserCpp() {
@ -28,6 +90,10 @@ eci::ParserCpp::~ParserCpp() {
 bool eci::ParserCpp::parse(const std::string& _data) {
 	m_result = m_lexer.interprete(_data);
 		ECI_INFO("find :");
 	for (auto &it : m_result.m_list) {
 		ECI_INFO("    start=" << it->getStartPos() << " stop=" << it->getStopPos() << " data='" <<std::string(_data, it->getStartPos(), it->getStopPos()-it->getStartPos()) << "'" );
 	}
 	return false;
 }
--- a/tests/01_comment.c
+++ b/tests/01_comment.c
@ -5,6 +5,7 @@ printf("Hello\n"); /* this is a comment */ printf("Hello\n");
 printf("Hello\n");
 // this is also a comment sayhello();
 printf("Hello\n");
 printf("Hello\n"); /* this is a second comment */ printf("Hello\n");
 void main() {}