From 07470e11b9fd002a7489b9fe019670eb8a9eccfe Mon Sep 17 00:00:00 2001
From: Edouard DUPIN <yui.heero@gmail.com>
Date: Tue, 14 Mar 2017 21:07:15 +0100
Subject: [PATCH] [DEV] add the ^ \w \d \l \s in the [...] element

---
 etk/Buffer.hpp |  3 +-
 etk/RegExp.cpp | 28 ++++++++++++++---
 etk/RegExp.hpp | 82 +++++++++++++++++++++++++++++++++++---------------
 3 files changed, 83 insertions(+), 30 deletions(-)

diff --git a/etk/Buffer.hpp b/etk/Buffer.hpp
index d3ab671..a9f8964 100644
--- a/etk/Buffer.hpp
+++ b/etk/Buffer.hpp
@@ -7,7 +7,8 @@
  */
 #pragma once
 
-#include <etk/os/FSNode.h>
+#include <etk/os/FSNode.hpp>
+#include <etk/debug.hpp>
 
 // minimum gapSize when allocated
 #define GAP_SIZE_MIN		(80)
diff --git a/etk/RegExp.cpp b/etk/RegExp.cpp
index f32f224..37c8865 100644
--- a/etk/RegExp.cpp
+++ b/etk/RegExp.cpp
@@ -223,7 +223,7 @@ int64_t etk::regexp::getLenOfBracket(const std::vector<char32_t>& _data, int64_t
 	pos++;
 	// find size ...
 	while (pos < (int64_t)_data.size() ) {
-		if(_data[pos]==regexpOpcodeBracketOut) {
+		if(_data[pos] == regexpOpcodeBracketOut) {
 			// Find the end of the [...]
 			// just return the size inside
 			int32_t sizeInside = pos - _startPos -1 ;
@@ -232,9 +232,29 @@ int64_t etk::regexp::getLenOfBracket(const std::vector<char32_t>& _data, int64_t
 				return 0;
 			}
 			return sizeInside;
-		} else if(    _data[pos] != regexpOpcodeTo
-		           && _data[pos] > 0xFF ) {
-			TK_ERROR("Error in the [...] not permited element at "<< pos << " '" << (char)_data[pos] << "'");
+		} else if (    _data[pos] == regexpOpcodeStartOfLine
+		            || _data[pos] == regexpOpcodeDigit
+		            || _data[pos] == regexpOpcodeLetter
+		            || _data[pos] == regexpOpcodeSpace
+		            || _data[pos] == regexpOpcodeWord
+		            || _data[pos] == regexpOpcodeTo) {
+			// nothing to do ... it is permited
+		} else if(_data[pos] > 0xFF ) {
+			std::string displayElement;
+			if (_data[pos] == regexpOpcodeStartOfLine) {
+				displayElement = "^";
+			} else if (_data[pos] == regexpOpcodeDigitNot) {
+				displayElement = "\\D";
+			} else if (_data[pos] == regexpOpcodeLetterNot) {
+				displayElement = "\\L";
+			} else if (_data[pos] == regexpOpcodeSpaceNot) {
+				displayElement = "\\S";
+			} else if (_data[pos] == regexpOpcodeWordNot) {
+				displayElement = "\\W";
+			} else {
+				displayElement = (char)_data[pos];
+			}
+			TK_ERROR("Error in the [...] not permited element at "<< pos << " '" << displayElement << "'");
 			return 0;
 		}
 		pos++;
diff --git a/etk/RegExp.hpp b/etk/RegExp.hpp
index 9a04eca..3dc292c 100644
--- a/etk/RegExp.hpp
+++ b/etk/RegExp.hpp
@@ -98,16 +98,15 @@ normal mode :
 	\W                  NOT a "Word" character                  [^a-zA-Z0-9_]
 	\@                  at the start or the end                 not in the parsing of element ==> check if \w is not present   (other regExp will be <> ...)
 	\e                  end-of-file / end-of-data               [\x00] ==> not counted
-	[anjdi] or [a-gt-j] range
+	[anjdi] or [a-gt-j] range: It support the \d \w \s \l elements. If you add at the first element a '^' it will invert the value selected
 	.                   dot                                     [^\x00]
 	$                   End / Start of line of line             ==> ce sera un truc suplémentaire comme le \@
 	@                   Previous
 ==> TODO :
-	^in the []			invertion of the range element
 	Sart of line
 	force regexp to be the shortest.
 
-multiplicity :
+multiplicity:
 	*     ==> {0, 2147483647} (try to have the minimum size)
 	?     ==> {0, 1}
 	+     ==> {1, 2147483647} (try to have the minimum size)
@@ -202,7 +201,7 @@ class FindProperty {
 		void setPositionStop(int64_t _newPos) {
 			m_positionStop = _newPos;
 			if (m_positionStop < m_positionStart) {
-				TK_CRITICAL("set volontary a stop position before end : " << this);
+				TK_CRITICAL("set volontary a stop position before end : " << this << " start=" << m_positionStart << " stop=" << m_positionStop);
 			}
 		}
 		uint32_t getMultiplicity() const {
@@ -638,9 +637,32 @@ template<class CLASS_TYPE> class NodeBracket : public NodeRangeValue<CLASS_TYPE>
 			
 			char32_t lastElement = 0;
 			bool multipleElement = false;
-			//
+			// Parse the elements:
 			for (int32_t kkk=0; kkk<(int64_t)Node<CLASS_TYPE>::m_regExpData.size(); kkk++) {
-				if (    Node<CLASS_TYPE>::m_regExpData[kkk] == regexpOpcodeTo
+				if (    kkk == 0
+				     && Node<CLASS_TYPE>::m_regExpData[kkk] == regexpOpcodeStartOfLine) {
+					// Check if the user request an invertion check:
+					NodeRangeValue<CLASS_TYPE>::setInvertion(true);
+				} else if (Node<CLASS_TYPE>::m_regExpData[kkk] == regexpOpcodeStartOfLine) {
+					TK_ERROR("Unsupported Element '^' inside the [...] not at the first element");
+					return 0;
+				} else if (Node<CLASS_TYPE>::m_regExpData[kkk] == regexpOpcodeDigit) {
+					NodeRangeValue<CLASS_TYPE>::addRange('0', '9');
+				} else if (Node<CLASS_TYPE>::m_regExpData[kkk] == regexpOpcodeLetter) {
+					NodeRangeValue<CLASS_TYPE>::addRange('a', 'z');
+					NodeRangeValue<CLASS_TYPE>::addRange('A', 'Z');
+				} else if (Node<CLASS_TYPE>::m_regExpData[kkk] == regexpOpcodeSpace) {
+					NodeRangeValue<CLASS_TYPE>::addValue(' ');
+					NodeRangeValue<CLASS_TYPE>::addValue('\t');
+					NodeRangeValue<CLASS_TYPE>::addValue('\n');
+					NodeRangeValue<CLASS_TYPE>::addValue('\r');
+					NodeRangeValue<CLASS_TYPE>::addValue('\f');
+					NodeRangeValue<CLASS_TYPE>::addValue('\v');
+				} else if (Node<CLASS_TYPE>::m_regExpData[kkk] == regexpOpcodeWord) {
+					NodeRangeValue<CLASS_TYPE>::addRange('a', 'z');
+					NodeRangeValue<CLASS_TYPE>::addRange('A', 'Z');
+					NodeRangeValue<CLASS_TYPE>::addRange('0', '9');
+				} else if (    Node<CLASS_TYPE>::m_regExpData[kkk] == regexpOpcodeTo
 				     && multipleElement == true) {
 					TK_ERROR("Can not have 2 consecutive - in [...]");
 					return 0;
@@ -938,7 +960,6 @@ template<class CLASS_TYPE> class NodePTheseElem : public Node<CLASS_TYPE> {
 							m_subNode.push_back(tmpNode);
 						}
 						break;
-					
 					default: {
 							elementSize = getLenOfNormal(Node<CLASS_TYPE>::m_regExpData, pos);
 							for (int64_t kkk=pos; kkk<pos+elementSize; kkk++) {
@@ -1303,7 +1324,7 @@ template<class CLASS_TYPE> class NodePThese : public Node<CLASS_TYPE> {
  *     \w                    "Word" character               [a-zA-Z0-9_]
  *     \W                    NOT a "Word" character         [^a-zA-Z0-9_]
  *     \@                    at the start or the end        not in the parsing of element ==> check if \w is not present   (other regExp will be <> ...)
- *     [anjdi] or [a-gt-j]   range
+ *     [anjdi] or [a-gt-j]   range. It support the \d \w \s \l elements. If you add at the first element a '^' it will invert the value selected
  *     .                     dot                            [^\x00-\x08\x0A-\x1F\x7F]
  * ==> TODO :
  *     $                     End / Start of line of line    ==> ce sera un truc suplé comme le \@
@@ -1382,6 +1403,7 @@ template<class CLASS_TYPE> class RegExp {
 		 * @brief Set a new regular expression matching
 		 * @param[in] _exp the new expression to search
 		 */
+		// TODO : Add an error ...
 		void compile(const std::string &_exp) {
 			if (_exp.size() != 0) {
 				TK_REG_DEBUG("normal string parse : '" << _exp << "'");
@@ -1491,22 +1513,20 @@ template<class CLASS_TYPE> class RegExp {
 				return;
 			}
 			// need to check if all () [] and {} is well set ...
-			if (false == checkGoodPosition(tmpExp) ) {
+			if (checkGoodPosition(tmpExp) == false) {
 				return;
 			}
 			
 			//TK_REG_DEBUG("Main element :" << createString(tmpExp) );
-			if (    tmpExp.size()>0
-			     && tmpExp[0] == regexpOpcodeNoChar)
-			{
+			if (    tmpExp.size() > 0
+			     && tmpExp[0] == regexpOpcodeNoChar) {
 				//TK_DEBUG("=> must not begin with char");
 				m_notBeginWithChar = true;
 				// remove element
 				tmpExp.erase(tmpExp.begin());
 			}
-			if (    tmpExp.size()>0
-			     && tmpExp[tmpExp.size()-1] == regexpOpcodeNoChar)
-			{
+			if (    tmpExp.size() > 0
+			     && tmpExp[tmpExp.size()-1] == regexpOpcodeNoChar) {
 				//TK_DEBUG("=> must not end with char");
 				m_notEndWithChar = true;
 				// remove element
@@ -1573,7 +1593,7 @@ template<class CLASS_TYPE> class RegExp {
 				int64_t maxlen = _endPos-iii;
 				TK_REG_DEBUG("----------------------------------------------");
 				TK_REG_DEBUG("parse element : " << iii << " : '" << _SearchIn[iii] << "'");
-				if (true == m_notBeginWithChar) {
+				if (m_notBeginWithChar == true) {
 					if (iii>0) {
 						char32_t tmpVal = _SearchIn[iii-1];
 						if(    (    tmpVal >= 'a'
@@ -1780,7 +1800,8 @@ template<class CLASS_TYPE> class RegExp {
 				return false;
 			}
 			//TK_DEBUG(" ==> Find ELEMENT : ([{");
-			// case dependent : 
+			// case dependent:
+			int32_t localOffset = 0;
 			if (    curentCode == regexpOpcodeBracketIn
 			     || curentCode == regexpOpcodeBracetIn) {
 				while(_pos<(int64_t)_tmpExp.size()) {
@@ -1790,7 +1811,7 @@ template<class CLASS_TYPE> class RegExp {
 						return true;
 					} else {
 						// otherwise, we check the error in the element ...
-						char *find = NULL;
+						char *find = nullptr;
 						switch (_tmpExp[_pos]) {
 							case regexpOpcodePTheseIn:		find = (char*)"(";			break;
 							case regexpOpcodeBracketIn:		find = (char*)"[";			break;
@@ -1803,26 +1824,37 @@ template<class CLASS_TYPE> class RegExp {
 							case regexpOpcodeQuestion:		find = (char*)"?";			break;
 							case regexpOpcodePlus:			find = (char*)"+";			break;
 							case regexpOpcodePipe:			find = (char*)"|";			break;
-							case regexpOpcodeStartOfLine:	find = (char*)"^";			break;
 							case regexpOpcodeEndOfLine:		find = (char*)"$";			break;
-							case regexpOpcodeDigit:			find = (char*)"\\d";		break;
 							case regexpOpcodeDigitNot:		find = (char*)"\\D";		break;
-							case regexpOpcodeLetter:		find = (char*)"\\l";		break;
 							case regexpOpcodeLetterNot:		find = (char*)"\\L";		break;
-							case regexpOpcodeSpace:			find = (char*)"\\s";		break;
 							case regexpOpcodeSpaceNot:		find = (char*)"\\S";		break;
-							case regexpOpcodeWord:			find = (char*)"\\w";		break;
 							case regexpOpcodeWordNot:		find = (char*)"\\W";		break;
 							case regexpOpcodeNoChar:		find = (char*)"\\@";		break;
+							case regexpOpcodeStartOfLine:
+								if (    endCode == regexpOpcodeBracetOut
+								     || localOffset != 0) {
+									find = (char*)"^";			break;
+								}
 							default:													break;
 						}
-						if (NULL != find) {
+						// Specific element forbiden for (...) but not for [...]
+						if (endCode == regexpOpcodeBracetOut) {
+							switch (_tmpExp[_pos]) {
+								case regexpOpcodeDigit:			find = (char*)"\\d";		break;
+								case regexpOpcodeLetter:		find = (char*)"\\l";		break;
+								case regexpOpcodeSpace:			find = (char*)"\\s";		break;
+								case regexpOpcodeWord:			find = (char*)"\\w";		break;
+								default:													break;
+							}
+						}
+						if (find != nullptr) {
 							(void)input;
 							TK_ERROR("can not have : '" << find << "' inside " << input << " element");
 							return false;
 						}
 					}
 					_pos++;
+					localOffset++;
 				}
 			} else {
 				while(_pos< (int64_t)_tmpExp.size()) {
@@ -1839,7 +1871,7 @@ template<class CLASS_TYPE> class RegExp {
 						if(    _tmpExp[_pos] == regexpOpcodePTheseIn
 						    || _tmpExp[_pos] == regexpOpcodeBracketIn
 						    || _tmpExp[_pos] == regexpOpcodeBracetIn ) {
-							if (false==checkGoodPosition(_tmpExp, _pos) ) {
+							if (checkGoodPosition(_tmpExp, _pos) == false ) {
 								return false;
 							}
 						}