From 3b85d09ffa1bf2a3adec42e6b6d92ceda4b3c41a Mon Sep 17 00:00:00 2001
From: Edouard DUPIN <yui.heero@gmail.com>
Date: Mon, 29 Dec 2014 23:06:04 +0100
Subject: [PATCH] [DEV] start parsing C++ files

---
 eci/Lexer.cpp          | 138 ++++++++++++++++++++++++++++++++++--
 eci/Lexer.h            | 155 ++++++++++++++++++++++++++++++++++++++++-
 eci/eci.cpp            |   7 +-
 eci/lang/ParserCpp.cpp |  80 +++++++++++++++++++--
 tests/01_comment.c     |   1 +
 5 files changed, 361 insertions(+), 20 deletions(-)

diff --git a/eci/Lexer.cpp b/eci/Lexer.cpp
index 7b21495..3bd7bce 100644
--- a/eci/Lexer.cpp
+++ b/eci/Lexer.cpp
@@ -18,20 +18,144 @@ eci::Lexer::~Lexer() {
 }
 
 void eci::Lexer::append(int32_t _tokenId, const std::string& _regularExpression) {
-	m_searchList.insert(std::make_pair(_tokenId, etk::RegExp<std::string>(_regularExpression)));
-	etk::RegExp<std::string>(_regularExpression).display();
+	ECI_INFO("CPP lexer add : '" << _regularExpression << "'");
+	try { // building the TypeBase compiles the std::regex, which throws on an invalid pattern
+		m_searchList.insert(std::make_pair(_tokenId, std::make_shared<eci::Lexer::TypeBase>(_tokenId, _regularExpression)));
+	} catch (const std::exception& e){ // by const reference: catching by value slices the exception and loses its real what()
+		ECI_ERROR(" create reg exp : '" << _regularExpression << "' : what:" << e.what()); // the rule is only reported, not registered
+	}
 }
 
+void eci::Lexer::appendSection(int32_t _tokenId, const std::string& _regularExpressionStart, const std::string& _regularExpressionStop) {
+	ECI_INFO("CPP lexer add section : '" << _regularExpressionStart << "' .. '" << _regularExpressionStop << "'");
+	try { // building the TypeSection compiles both std::regex objects, which throw on an invalid pattern
+		m_searchList.insert(std::make_pair(_tokenId, std::make_shared<eci::Lexer::TypeSection>(_tokenId, _regularExpressionStart, _regularExpressionStop)));
+	} catch (const std::exception& e){ // by const reference: catching by value slices the exception and loses its real what()
+		ECI_ERROR(" create reg exp : '" << _regularExpressionStart << "' .. '" << _regularExpressionStop << "' : what:" << e.what()); // the rule is only reported, not registered
+	}
+}
+
+void eci::Lexer::appendSub(int32_t _tokenIdParrent, int32_t _tokenId, const std::string& _regularExpression) {
+	ECI_INFO("CPP lexer add sub : [" << _tokenIdParrent << "] '" << _regularExpression << "'");
+	try { // building the TypeSubBase compiles the std::regex, which throws on an invalid pattern
+		m_searchList.insert(std::make_pair(_tokenId, std::make_shared<eci::Lexer::TypeSubBase>(_tokenId, _tokenIdParrent, _regularExpression)));
+	} catch (const std::exception& e){ // by const reference: catching by value slices the exception and loses its real what()
+		ECI_ERROR(" create reg exp : '" << _regularExpression << "' : what:" << e.what()); // the rule is only reported, not registered
+	}
+}
+
+void eci::Lexer::appendSubSection(int32_t _tokenIdParrent, int32_t _tokenId, const std::string& _regularExpressionStart, const std::string& _regularExpressionStop) {
+	ECI_INFO("CPP lexer add section sub : [" << _tokenIdParrent << "] '" << _regularExpressionStart << "' .. '" << _regularExpressionStop << "'");
+	try { // building the TypeSubSection compiles both std::regex objects, which throw on an invalid pattern
+		m_searchList.insert(std::make_pair(_tokenId, std::make_shared<eci::Lexer::TypeSubSection>(_tokenId, _tokenIdParrent, _regularExpressionStart, _regularExpressionStop)));
+	} catch (const std::exception& e){ // by const reference: catching by value slices the exception and loses its real what()
+		ECI_ERROR(" create reg exp : '" << _regularExpressionStart << "' .. '" << _regularExpressionStop << "' : what:" << e.what()); // the rule is only reported, not registered
+	}
+}
+
+
 eci::LexerResult eci::Lexer::interprete(const std::string& _data) {
-	eci::LexerResult result;
+	eci::LexerResult result(_data); // the result keeps its own copy of the parsed text
 	ECI_INFO("Parse : \n" << _data);
 	for (auto &it : m_searchList) {
-		ECI_INFO("Parse RegEx : " << it.first << " : " << it.second.getRegExDecorated());
-		if (it.second.parse(_data, 0, _data.size()) == true) {
-			ECI_INFO("    match [" << it.second.start() << ".." << it.second.stop() << "] ");
-			ECI_INFO("        ==> '" << std::string(_data, it.second.start(), it.second.stop()-it.second.start()) << "'");
+		//ECI_INFO("Parse RegEx : " << it.first << " : " << it.second.getRegExDecorated());
+		if (it.second == nullptr) { // rules are visited in increasing token id order (std::map); skip empty slots
+			continue;
+		}
+		if (it.second->isSubParse() == true) { // sub rules are not applied in this top-level pass
+			continue;
+		}
+		if (result.m_list.size() == 0) { // nothing found so far: the rule may scan the whole data
+			result.m_list = it.second->parse(_data, 0, _data.size());
+		} else {
+			int32_t start = 0; // begin of the current gap, i.e. stop position of the previous node
+			auto itList(result.m_list.begin());
+			while (itList != result.m_list.end()) {
+				if (*itList == nullptr) {
+					ECI_TODO("remove null shared_ptr");
+					++itList;
+					continue;
+				}
+				if ((*itList)->getStartPos() == start) {
+					// this node starts exactly where the gap starts: there is no gap to parse
+					start = (*itList)->getStopPos();
+					++itList;
+					continue;
+				}
+				std::vector<std::shared_ptr<eci::LexerNode>> res = it.second->parse(_data, start, (*itList)->getStartPos()); // only the gap before this node
+				// insert the new nodes in front of the current node:
+				if (res.size() > 0) {
+					int32_t currentPos = std::distance(result.m_list.begin(), itList) + res.size() ; // index of the current node once res is inserted
+					result.m_list.insert(itList, res.begin(), res.end()); // invalidates itList ...
+					itList = result.m_list.begin() + currentPos; // ... so it is rebuilt from the index
+				}
+				start = (*itList)->getStopPos();
+				++itList;
+			}
+			// Parse the remaining data located after the last node:
+			if (start < _data.size()) {
+				std::vector<std::shared_ptr<eci::LexerNode>> res = it.second->parse(_data, start, _data.size());
+				for (auto &itRes : res) {
+					result.m_list.push_back(itRes);
+				}
+			}
 		}
 	}
 	return result;
 }
 
+std::vector<std::shared_ptr<eci::LexerNode>> eci::Lexer::TypeBase::parse(const std::string& _data, int32_t _start, int32_t _stop) {
+	std::vector<std::shared_ptr<eci::LexerNode>> result; // one node per match found in [_start, _stop), in order of appearance
+	ECI_DEBUG("parse : " << getValue());
+	while (_start <= _stop) { // bounded: never hand an inverted iterator range to regex_search
+		std::smatch resultMatch;
+		std::regex_constants::match_flag_type flags = std::regex_constants::match_any;
+		//APPL_DEBUG("find data at : start=" << _start << " stop=" << _stop << " regex='" << m_regexValue << "'");
+		if ((int64_t)_stop <= (int64_t)_data.size()) {
+			char val = _data[_stop]; // first char after the window ('\0' when the window ends with the data)
+			if (    val != '\n'
+			     && val != '\r') {
+				// the window does not stop on a line end ==> '$' must not match at _stop
+				flags |= std::regex_constants::match_not_eol;
+			}
+			if (    ('a' <= val && val <= 'z')
+			     || ('A' <= val && val <= 'Z')
+			     || ('0' <= val && val <= '9')
+			     || val == '_') {
+				flags |= std::regex_constants::match_not_eow; // the word continues past _stop ==> '\b' must not match there
+			}
+		}
+		if (_start>0) {
+			flags |= std::regex_constants::match_prev_avail; // the char before _start exists and may be inspected by the regex
+		}
+		std::regex_search(_data.begin()+_start, _data.begin()+_stop, resultMatch, regex, flags);
+		if (resultMatch.size() > 0) {
+			int32_t start = std::distance(_data.begin(), resultMatch[0].first);
+			int32_t stop = std::distance(_data.begin(), resultMatch[0].second);
+			ECI_DEBUG("    find data at : start=" << start << " stop=" << stop << " data='" <<std::string(_data, start, stop-start) << "'" );
+			_start = (stop > start) ? stop : stop + 1; // a zero-length match must still advance, else the loop never ends
+			result.push_back(std::make_shared<eci::LexerNode>(m_tockenId, start, stop));
+		} else {
+			break;
+		}
+	}
+	return result;
+}
+
+std::vector<std::shared_ptr<eci::LexerNode>> eci::Lexer::TypeSection::parse(const std::string& _data, int32_t _start, int32_t _stop) {
+	// Section matching is not implemented yet: report the TODO and return no node.
+	ECI_TODO("later 1");
+	return std::vector<std::shared_ptr<eci::LexerNode>>();
+}
+
+std::vector<std::shared_ptr<eci::LexerNode>> eci::Lexer::TypeSubBase::parse(const std::string& _data, int32_t _start, int32_t _stop) {
+	// Sub-token matching is not implemented yet: report the TODO and return no node.
+	ECI_TODO("later 2");
+	return std::vector<std::shared_ptr<eci::LexerNode>>();
+}
+
+std::vector<std::shared_ptr<eci::LexerNode>> eci::Lexer::TypeSubSection::parse(const std::string& _data, int32_t _start, int32_t _stop) {
+	// Sub-section matching is not implemented yet: report the TODO and return no node.
+	ECI_TODO("later 3");
+	return std::vector<std::shared_ptr<eci::LexerNode>>();
+}
\ No newline at end of file
diff --git a/eci/Lexer.h b/eci/Lexer.h
index 6364045..3ea0bb9 100644
--- a/eci/Lexer.h
+++ b/eci/Lexer.h
@@ -11,21 +11,170 @@
 
 #include <etk/types.h>
 #include <etk/stdTools.h>
-#include <etk/RegExp.h>
+#include <regex>
 #include <map>
 #include <vector>
 
 
 namespace eci {
-	using LexerResult = std::vector<std::pair<int32_t, std::string>>;
+	class LexerNode { // One element found by the lexer: a token id and the [start, stop) span it covers in the parsed data
+		public:
+			LexerNode(int32_t _tockenId=-1, int32_t _startPos=-1, int32_t _stopPos=-1) : // every field defaults to -1
+			  m_tockenId(_tockenId),
+			  m_startPos(_startPos),
+			  m_stopPos(_stopPos) {
+				
+			}
+			virtual ~LexerNode() {}; // virtual: LexerNodeContainer derives from this class
+			int32_t m_tockenId; //!< Id of the rule that produced this node
+			int32_t m_startPos; //!< Index of the first char of the node in the parsed data
+			int32_t m_stopPos; //!< Index just after the last char of the node in the parsed data
+			int32_t getStartPos() { // @return m_startPos
+				return m_startPos;
+			}
+			int32_t getStopPos() { // @return m_stopPos
+				return m_stopPos;
+			}
+	};
+	class LexerNodeContainer : public LexerNode { // A LexerNode that additionally owns a list of nodes
+		public:
+			LexerNodeContainer(int32_t _tockenId=-1, int32_t _startPos=-1, int32_t _stopPos=-1) : // same arguments and defaults as LexerNode
+			  LexerNode(_tockenId, _startPos, _stopPos) {
+				
+			}
+			virtual ~LexerNodeContainer() {};
+			std::vector<std::shared_ptr<eci::LexerNode>> m_list; //!< Owned nodes; presumably the sub-nodes found inside this one -- TODO confirm, nothing visible fills it yet
+	};
+	class LexerResult { // Result of Lexer::interprete(): a copy of the parsed text and the list of nodes found in it
+		private:
+			std::string m_data; //!< Copy of the text given to the constructor
+		public:
+			LexerResult(const std::string& _data="") : // _data is copied, the result does not reference the caller's string
+			  m_data(_data) {
+				
+			}
+			~LexerResult() {};
+			std::vector<std::shared_ptr<eci::LexerNode>> m_list; //!< Nodes found by the lexer (public, filled by Lexer::interprete)
+	};
 	
 	class Lexer {
 		private:
-			std::map<int32_t, etk::RegExp<std::string>> m_searchList;
+			#define TYPE_UNKNOW (0) // values returned by Type::getType() and its overrides
+			#define TYPE_BASE (1)
+			#define TYPE_SECTION (2)
+			#define TYPE_SUB_BASE (3)
+			#define TYPE_SUB_SECTION (4)
+			class Type { // Abstract base of every lexer rule: a token id, a printable pattern and a parse() method
+				protected:
+					int32_t m_tockenId; //!< Token id given to the nodes created by this rule
+					std::string m_regexValue; //!< Printable form of the pattern (set by the derived constructors)
+				public:
+					Type(int32_t _tockenId) :
+					  m_tockenId(_tockenId) {}
+					virtual ~Type() {}
+					virtual int32_t getType() { // @return one of the TYPE_xxx values
+						return TYPE_UNKNOW;
+					}
+					int32_t getTockenId() {
+						return m_tockenId;
+					}
+					virtual std::vector<std::shared_ptr<eci::LexerNode>> parse(const std::string& _data, int32_t _start, int32_t _stop)=0; // search the rule in _data between _start and _stop
+					std::string getValue() { // @return the printable pattern
+						return m_regexValue;
+					};
+					virtual bool isSubParse() { // true for rules attached to a parent token (skipped by interprete())
+						return false;
+					}
+			};
+			class TypeBase : public Type { // Rule made of a single regular expression
+				public:
+					std::regex regex; //!< Compiled pattern (ECMAScript grammar, optimize flag)
+					TypeBase(int32_t _tockenId, const std::string& _regex="") : // the std::regex construction throws on an invalid pattern
+					  Type(_tockenId),
+					  regex(_regex, std::regex_constants::optimize | std::regex_constants::ECMAScript) {
+						m_regexValue = _regex;
+					}
+					virtual int32_t getType() {
+						return TYPE_BASE;
+					}
+					std::vector<std::shared_ptr<eci::LexerNode>> parse(const std::string& _data, int32_t _start, int32_t _stop);
+			};
+			class TypeSection : public Type { // Rule made of a start pattern and a stop pattern
+				public:
+					std::regex regexStart; //!< Compiled start pattern
+					std::regex regexStop; //!< Compiled stop pattern
+					TypeSection(int32_t _tockenId, const std::string& _regexStart="", const std::string& _regexStop="") : // the std::regex constructions throw on an invalid pattern
+					  Type(_tockenId),
+					  regexStart(_regexStart, std::regex_constants::optimize | std::regex_constants::ECMAScript),
+					  regexStop(_regexStop, std::regex_constants::optimize | std::regex_constants::ECMAScript) {
+						m_regexValue = _regexStart + " -> " + _regexStop;
+					}
+					virtual int32_t getType() {
+						return TYPE_SECTION;
+					}
+					std::vector<std::shared_ptr<eci::LexerNode>> parse(const std::string& _data, int32_t _start, int32_t _stop);
+			};
+			class TypeSubBase : public TypeBase { // Single-pattern rule attached to a parent token
+				public:
+					int32_t parrent; //!< Token id of the parent rule
+					TypeSubBase(int32_t _tockenId, int32_t _tokenIdParrent=-1, const std::string& _regex="") :
+					  TypeBase(_tockenId, _regex),
+					  parrent(_tokenIdParrent) {}
+					virtual int32_t getType() {
+						return TYPE_SUB_BASE;
+					}
+					std::vector<std::shared_ptr<eci::LexerNode>> parse(const std::string& _data, int32_t _start, int32_t _stop);
+					bool isSubParse() { // overrides Type::isSubParse()
+						return true;
+					}
+			};
+			class TypeSubSection : public TypeSection { // Start/stop rule attached to a parent token
+				public:
+					int32_t parrent; //!< Token id of the parent rule
+					TypeSubSection(int32_t _tockenId, int32_t _tokenIdParrent=-1, const std::string& _regexStart="", const std::string& _regexStop="") :
+					  TypeSection(_tockenId, _regexStart, _regexStop),
+					  parrent(_tokenIdParrent) {}
+					virtual int32_t getType() {
+						return TYPE_SUB_SECTION;
+					}
+					std::vector<std::shared_ptr<eci::LexerNode>> parse(const std::string& _data, int32_t _start, int32_t _stop);
+					bool isSubParse() { // overrides Type::isSubParse()
+						return true;
+					}
+			};
+			std::map<int32_t, std::shared_ptr<eci::Lexer::Type>> m_searchList; //!< Rules indexed by token id (one rule per id, visited in increasing id order)
 		public:
 			Lexer();
 			~Lexer();
+			/**
+			 * @brief Append a token recognition rule.
+			 * @param[in] _tokenId Token id value.
+			 * @param[in] _regularExpression Regular expression that recognises the token.
+			 */
 			void append(int32_t _tokenId, const std::string& _regularExpression);
+			/**
+			 * @brief Append a token recognition rule (section: recognises a start and a stop, counting the number of starts and stops).
+			 * @param[in] _tokenId Token id value.
+			 * @param[in] _regularExpressionStart Regular expression that recognises the start of the section.
+			 * @param[in] _regularExpressionStop Regular expression that recognises the end of the section.
+			 */
+			void appendSection(int32_t _tokenId, const std::string& _regularExpressionStart, const std::string& _regularExpressionStop);
+			/**
+			 * @brief Append a token recognition rule (sub parsing).
+			 * @param[in] _tokenIdParrent Parent token id value.
+			 * @param[in] _tokenId Token id value.
+			 * @param[in] _regularExpression Regular expression that recognises the token.
+			 */
+			void appendSub(int32_t _tokenIdParrent, int32_t _tokenId, const std::string& _regularExpression);
+			/**
+			 * @brief Append a token recognition rule (sub parsing) (section: recognises a start and a stop, counting the number of starts and stops).
+			 * @param[in] _tokenIdParrent Parent token id value.
+			 * @param[in] _tokenId Token id value.
+			 * @param[in] _regularExpressionStart Regular expression that recognises the start of the section.
+			 * @param[in] _regularExpressionStop Regular expression that recognises the end of the section.
+			 */
+			void appendSubSection(int32_t _tokenIdParrent, int32_t _tokenId, const std::string& _regularExpressionStart, const std::string& _regularExpressionStop);
+			
 			LexerResult interprete(const std::string& _data);
 	};
 }
diff --git a/eci/eci.cpp b/eci/eci.cpp
index a3877b2..066e31c 100644
--- a/eci/eci.cpp
+++ b/eci/eci.cpp
@@ -13,17 +13,18 @@
 
 int main(int argc, char** argv) {
 	etk::log::setLevel(etk::log::logLevelDebug);
-	etk::log::setLevel(etk::log::logLevelInfo);
+	//etk::log::setLevel(etk::log::logLevelInfo);
 	ECI_INFO("Start Application interpeter languages");
 	if (argc<=1) {
 		ECI_CRITICAL("need the file to parse");
 		return -1;
 	}
 	eci::ParserCpp tmpParser;
-	std::string data = "/* plop */ \n int eee = 22; // error value \nint main(void) {\n return 0;\n}\n";//etk::FSNodeReadAllData(argv[1]);
+	//std::string data = "/* plop */ \n int eee = 22; // error value \nint main(void) {\n return 0;\n}\n";//etk::FSNodeReadAllData(argv[1]);
 	//std::string data = "alpha /* plop */ test";
 	//std::string data = "pp \n // qdfqdfsdf \nde";
-	tmpParser.parse(data);
+	//tmpParser.parse(data);
+	tmpParser.parse(etk::FSNodeReadAllData(argv[1])); // parse the first command line argument; presumably FSNodeReadAllData returns the content of that file -- TODO confirm
 	
 	return 0;
 }
\ No newline at end of file
diff --git a/eci/lang/ParserCpp.cpp b/eci/lang/ParserCpp.cpp
index 2e71794..ab1d19f 100644
--- a/eci/lang/ParserCpp.cpp
+++ b/eci/lang/ParserCpp.cpp
@@ -7,19 +7,81 @@
  */
 
 #include <eci/lang/ParserCpp.h>
+#include <eci/debug.h>
 
 enum cppTokenList {
-	tokenCppMultilineComment,
-	tokenCppSingleLineComment,
-	tokenCppString,
+	tokenCppCommentMultiline, // enumerator order matters: the lexer keeps its rules in a std::map keyed by these ids and visits them in increasing order
+	tokenCppCommentSingleLine,
+	tokenCppPreProcessor, // whole '#' directive; the tokenCppPreProcessorXxx entries below are registered as its sub tokens
+	tokenCppPreProcessorIf,
+	tokenCppPreProcessorElse,
+	tokenCppPreProcessorEndif,
+	tokenCppPreProcessorIfdef,
+	tokenCppPreProcessorIfndef,
+	tokenCppPreProcessorDefine,
+	tokenCppPreProcessorWarning,
+	tokenCppPreProcessorError,
+	tokenCppPreProcessorInclude,
+	tokenCppPreProcessorImport, // specific to the interpreted C++ dialect
+	tokenCppPreProcessorSectionPthese, // '(' .. ')' section inside a directive
 	
+	tokenCppStringDoubleQuote, // "..." literal
+	tokenCppStringSimpleQuote, // '.' literal
+	tokenCppSectionBrace, // '{' .. '}'
+	tokenCppSectionPthese, // '(' .. ')'
+	tokenCppSectionHook, // '[' .. ']'
+	tokenCppBranch, // return, goto, if, else, case, ...
+	tokenCppSystem, // new, delete, try, catch
+	tokenCppType, // built-in type names
+	tokenCppVisibility, // qualifiers and access specifiers
+	tokenCppContener, // class, namespace, struct, union, enum
+	tokenCppTypeDef,
+	tokenCppAuto,
+	tokenCppNullptr, // NULL or nullptr
+	tokenCppSystemDefine, // predefined __XXX__ names
+	tokenCppNumericValue,
+	tokenCppBoolean, // true or false
+	tokenCppCondition, // comparison and logical operators
+	tokenCppAssignation, // '=' and arithmetic operators
+	tokenCppString, // any remaining word (\w+)
+	tokenCppSeparator, // ';' ',' '::' ':'
 };
 
 eci::ParserCpp::ParserCpp() {
-	m_lexer.append(tokenCppMultilineComment, "/\\*.*\\*/");
-	m_lexer.append(tokenCppSingleLineComment, "//.*$");
-	m_lexer.append(82939, "/\\*.*");
-	m_lexer.append(tokenCppString, "[a-z]");
+	m_lexer.append(tokenCppCommentMultiline, "/\\*(.|\\r|\\n)*?(\\*/|\\0)"); // rules are applied in token id order: comments first
+	m_lexer.append(tokenCppCommentSingleLine, "//.*");
+	m_lexer.append(tokenCppPreProcessor, "#(.|\\\\[\\\\\\n])*"); // a directive may continue after a backslash + newline
+	m_lexer.appendSub(tokenCppPreProcessor, tokenCppPreProcessorIf, "\\bif\\b");
+	m_lexer.appendSub(tokenCppPreProcessor, tokenCppPreProcessorElse, "\\belse\\b");
+	m_lexer.appendSub(tokenCppPreProcessor, tokenCppPreProcessorEndif, "\\bendif\\b");
+	m_lexer.appendSub(tokenCppPreProcessor, tokenCppPreProcessorIfdef, "\\bifdef\\b");
+	m_lexer.appendSub(tokenCppPreProcessor, tokenCppPreProcessorIfndef, "\\bifndef\\b");
+	m_lexer.appendSub(tokenCppPreProcessor, tokenCppPreProcessorDefine, "\\bdefine\\b");
+	m_lexer.appendSub(tokenCppPreProcessor, tokenCppPreProcessorWarning, "\\bwarning\\b");
+	m_lexer.appendSub(tokenCppPreProcessor, tokenCppPreProcessorError, "\\berror\\b");
+	m_lexer.appendSub(tokenCppPreProcessor, tokenCppPreProcessorInclude, "\\binclude\\b");
+	m_lexer.appendSub(tokenCppPreProcessor, tokenCppPreProcessorImport, "\\bimport\\b"); // specific to c++ interpreted
+	m_lexer.appendSubSection(tokenCppPreProcessor, tokenCppPreProcessorSectionPthese, "\\(", "\\)");
+	m_lexer.append(tokenCppStringDoubleQuote, "\"(.|\\\\[\\\\\"])*?\"");
+	m_lexer.append(tokenCppStringSimpleQuote, "'\\\\?.'"); // optional backslash then one char ("\\?" alone matched a literal '?', so 'a' never matched)
+	m_lexer.appendSection(tokenCppSectionBrace, "\\{", "\\}");
+	m_lexer.appendSection(tokenCppSectionPthese, "\\(", "\\)");
+	m_lexer.appendSection(tokenCppSectionHook, "\\[", "\\]");
+	m_lexer.append(tokenCppBranch, "\\b(return|goto|if|else|case|default|break|continue|while|do|for)\\b");
+	m_lexer.append(tokenCppSystem, "\\b(new|delete|try|catch)\\b");
+	m_lexer.append(tokenCppType, "\\b(bool|char(16_t|32_t)?|double|float|u?int(8|16|32|64|128)?(_t)?|long|short|signed|size_t|unsigned|void|(I|U)(8|16|32|64|128))\\b");
+	m_lexer.append(tokenCppVisibility, "\\b(inline|const|virtual|private|public|protected|friend|const|extern|register|static|volatile)\\b");
+	m_lexer.append(tokenCppContener, "\\b(class|namespace|struct|union|enum)\\b");
+	m_lexer.append(tokenCppTypeDef, "\\btypedef\\b");
+	m_lexer.append(tokenCppAuto, "\\bauto\\b");
+	m_lexer.append(tokenCppNullptr, "\\b(NULL|nullptr)\\b");
+	m_lexer.append(tokenCppSystemDefine, "\\b__(LINE|DATE|FILE|func|TIME|STDC)__\\b"); // __DATE__ is the predefined macro (was misspelled DATA)
+	m_lexer.append(tokenCppNumericValue, "\\b(((0(x|X)[0-9a-fA-F]*)|(\\d+\\.?\\d*|\\.\\d+)((e|E)(\\+|\\-)?\\d+)?)(L|l|UL|ul|u|U|F|f)?)\\b");
+	m_lexer.append(tokenCppBoolean, "\\b(true|false)\\b");
+	m_lexer.append(tokenCppCondition, "==|>=|<=|!=|<|>|&&|\\|\\|");
+	m_lexer.append(tokenCppAssignation, "(=|\\*|/|-|\\+|&)"); // '+' must be escaped: a bare '+' makes std::regex throw and the rule was never registered
+	m_lexer.append(tokenCppString, "\\w+");
+	m_lexer.append(tokenCppSeparator, "(;|,|::|:)");
 }
 
 eci::ParserCpp::~ParserCpp() {
@@ -28,6 +90,10 @@ eci::ParserCpp::~ParserCpp() {
 
 bool eci::ParserCpp::parse(const std::string& _data) {
 	m_result = m_lexer.interprete(_data);
+	ECI_INFO("find :");
+	for (auto &node : m_result.m_list) { // dump every node together with the text it covers
+		ECI_INFO("    start=" << node->getStartPos() << " stop=" << node->getStopPos() << " data='" << _data.substr(node->getStartPos(), node->getStopPos()-node->getStartPos()) << "'" );
+	}
 	return false;
 }
 
diff --git a/tests/01_comment.c b/tests/01_comment.c
index ce8d687..406305d 100644
--- a/tests/01_comment.c
+++ b/tests/01_comment.c
@@ -5,6 +5,7 @@ printf("Hello\n"); /* this is a comment */ printf("Hello\n");
 printf("Hello\n");
 // this is also a comment sayhello();
 printf("Hello\n");
+printf("Hello\n"); /* this is a second comment */ printf("Hello\n");
 
 
 void main() {}