[DEV] really better RegEx parser ==> nearly ended
This commit is contained in:
parent
27d1f599e6
commit
89784df428
@ -83,31 +83,31 @@ std::string etk::regexp::createString(const std::vector<char32_t>& _data, int64_
|
||||
std::string output(ETK_BASH_COLOR_NORMAL);
|
||||
for (int64_t iii=_start; iii<(int64_t)_data.size() && iii<_stop ; iii++) {
|
||||
switch(_data[iii]) {
|
||||
case regexpOpcodePTheseIn: output += std::string(ETK_BASH_COLOR_RED) + (char*)"(" + ETK_BASH_COLOR_NORMAL; break;
|
||||
case regexpOpcodePTheseOut: output += std::string(ETK_BASH_COLOR_RED) + (char*)")" + ETK_BASH_COLOR_NORMAL; break;
|
||||
case regexpOpcodeBracketIn: output += std::string(ETK_BASH_COLOR_YELLOW) + (char*)"[" + ETK_BASH_COLOR_NORMAL; break;
|
||||
case regexpOpcodeBracketOut: output += std::string(ETK_BASH_COLOR_YELLOW) + (char*)"]" + ETK_BASH_COLOR_NORMAL; break;
|
||||
case regexpOpcodeTo: output += std::string(ETK_BASH_COLOR_YELLOW) + (char*)"-" + ETK_BASH_COLOR_NORMAL; break;
|
||||
case regexpOpcodeBracetIn: output += std::string(ETK_BASH_COLOR_GREEN) + (char*)"{" + ETK_BASH_COLOR_NORMAL; break;
|
||||
case regexpOpcodeBracetOut: output += std::string(ETK_BASH_COLOR_GREEN) + (char*)"}" + ETK_BASH_COLOR_NORMAL; break;
|
||||
case regexpOpcodeStar: output += std::string(ETK_BASH_COLOR_BLUE) + (char*)"*" + ETK_BASH_COLOR_NORMAL; break;
|
||||
case regexpOpcodeDot: output += std::string(ETK_BASH_COLOR_BLUE) + (char*)"." + ETK_BASH_COLOR_NORMAL; break;
|
||||
case regexpOpcodeQuestion: output += std::string(ETK_BASH_COLOR_BLUE) + (char*)"?" + ETK_BASH_COLOR_NORMAL; break;
|
||||
case regexpOpcodePlus: output += std::string(ETK_BASH_COLOR_BLUE) + (char*)"+" + ETK_BASH_COLOR_NORMAL; break;
|
||||
case regexpOpcodePipe: output += std::string(ETK_BASH_COLOR_BLUE) + (char*)"|" + ETK_BASH_COLOR_NORMAL; break;
|
||||
case regexpOpcodeNoChar: output += std::string(ETK_BASH_COLOR_MAGENTA) + (char*)"@" + ETK_BASH_COLOR_NORMAL; break;
|
||||
case regexpOpcodeStartOfLine: output += std::string(ETK_BASH_COLOR_MAGENTA) + (char*)"^" + ETK_BASH_COLOR_NORMAL; break;
|
||||
case regexpOpcodeEndOfLine: output += std::string(ETK_BASH_COLOR_MAGENTA) + (char*)"$" + ETK_BASH_COLOR_NORMAL; break;
|
||||
case regexpOpcodeDigit: output += std::string(ETK_BASH_COLOR_MAGENTA) + (char*)"\\d" + ETK_BASH_COLOR_NORMAL; break;
|
||||
case regexpOpcodeDigitNot: output += std::string(ETK_BASH_COLOR_MAGENTA) + (char*)"\\D" + ETK_BASH_COLOR_NORMAL; break;
|
||||
case regexpOpcodeLetter: output += std::string(ETK_BASH_COLOR_MAGENTA) + (char*)"\\l" + ETK_BASH_COLOR_NORMAL; break;
|
||||
case regexpOpcodeLetterNot: output += std::string(ETK_BASH_COLOR_MAGENTA) + (char*)"\\L" + ETK_BASH_COLOR_NORMAL; break;
|
||||
case regexpOpcodeSpace: output += std::string(ETK_BASH_COLOR_MAGENTA) + (char*)"\\s" + ETK_BASH_COLOR_NORMAL; break;
|
||||
case regexpOpcodeSpaceNot: output += std::string(ETK_BASH_COLOR_MAGENTA) + (char*)"\\S" + ETK_BASH_COLOR_NORMAL; break;
|
||||
case regexpOpcodeWord: output += std::string(ETK_BASH_COLOR_MAGENTA) + (char*)"\\w" + ETK_BASH_COLOR_NORMAL; break;
|
||||
case regexpOpcodeWordNot: output += std::string(ETK_BASH_COLOR_MAGENTA) + (char*)"\\W" + ETK_BASH_COLOR_NORMAL; break;
|
||||
case '\n': output += std::string(ETK_BASH_COLOR_MAGENTA) + (char*)"\\n" + ETK_BASH_COLOR_NORMAL; break;
|
||||
case '\t': output += std::string(ETK_BASH_COLOR_MAGENTA) + (char*)"\\t" + ETK_BASH_COLOR_NORMAL; break;
|
||||
case regexpOpcodePTheseIn: output += std::string(ETK_BASH_COLOR_RED) + (char*)"(" + ETK_BASH_COLOR_NORMAL; break;
|
||||
case regexpOpcodePTheseOut: output += std::string(ETK_BASH_COLOR_RED) + (char*)")" + ETK_BASH_COLOR_NORMAL; break;
|
||||
case regexpOpcodeBracketIn: output += std::string(ETK_BASH_COLOR_YELLOW) + (char*)"[" + ETK_BASH_COLOR_NORMAL; break;
|
||||
case regexpOpcodeBracketOut: output += std::string(ETK_BASH_COLOR_YELLOW) + (char*)"]" + ETK_BASH_COLOR_NORMAL; break;
|
||||
case regexpOpcodeTo: output += std::string(ETK_BASH_COLOR_YELLOW) + (char*)"-" + ETK_BASH_COLOR_NORMAL; break;
|
||||
case regexpOpcodeBracetIn: output += std::string(ETK_BASH_COLOR_GREEN) + (char*)"{" + ETK_BASH_COLOR_NORMAL; break;
|
||||
case regexpOpcodeBracetOut: output += std::string(ETK_BASH_COLOR_GREEN) + (char*)"}" + ETK_BASH_COLOR_NORMAL; break;
|
||||
case regexpOpcodeStar: output += std::string(ETK_BASH_COLOR_BLUE) + (char*)"*" + ETK_BASH_COLOR_NORMAL; break;
|
||||
case regexpOpcodeDot: output += std::string(ETK_BASH_COLOR_BLUE) + (char*)"." + ETK_BASH_COLOR_NORMAL; break;
|
||||
case regexpOpcodeQuestion: output += std::string(ETK_BASH_COLOR_BLUE) + (char*)"?" + ETK_BASH_COLOR_NORMAL; break;
|
||||
case regexpOpcodePlus: output += std::string(ETK_BASH_COLOR_BLUE) + (char*)"+" + ETK_BASH_COLOR_NORMAL; break;
|
||||
case regexpOpcodePipe: output += std::string(ETK_BASH_COLOR_BLUE) + (char*)"|" + ETK_BASH_COLOR_NORMAL; break;
|
||||
case regexpOpcodeNoChar: output += std::string(ETK_BASH_COLOR_MAGENTA) + (char*)"@" + ETK_BASH_COLOR_NORMAL; break;
|
||||
case regexpOpcodeStartOfLine: output += std::string(ETK_BASH_COLOR_MAGENTA) + (char*)"^" + ETK_BASH_COLOR_NORMAL; break;
|
||||
case regexpOpcodeEndOfLine: output += std::string(ETK_BASH_COLOR_MAGENTA) + (char*)"$" + ETK_BASH_COLOR_NORMAL; break;
|
||||
case regexpOpcodeDigit: output += std::string(ETK_BASH_COLOR_MAGENTA) + (char*)"\\d" + ETK_BASH_COLOR_NORMAL; break;
|
||||
case regexpOpcodeDigitNot: output += std::string(ETK_BASH_COLOR_MAGENTA) + (char*)"\\D" + ETK_BASH_COLOR_NORMAL; break;
|
||||
case regexpOpcodeLetter: output += std::string(ETK_BASH_COLOR_MAGENTA) + (char*)"\\l" + ETK_BASH_COLOR_NORMAL; break;
|
||||
case regexpOpcodeLetterNot: output += std::string(ETK_BASH_COLOR_MAGENTA) + (char*)"\\L" + ETK_BASH_COLOR_NORMAL; break;
|
||||
case regexpOpcodeSpace: output += std::string(ETK_BASH_COLOR_MAGENTA) + (char*)"\\s" + ETK_BASH_COLOR_NORMAL; break;
|
||||
case regexpOpcodeSpaceNot: output += std::string(ETK_BASH_COLOR_MAGENTA) + (char*)"\\S" + ETK_BASH_COLOR_NORMAL; break;
|
||||
case regexpOpcodeWord: output += std::string(ETK_BASH_COLOR_MAGENTA) + (char*)"\\w" + ETK_BASH_COLOR_NORMAL; break;
|
||||
case regexpOpcodeWordNot: output += std::string(ETK_BASH_COLOR_MAGENTA) + (char*)"\\W" + ETK_BASH_COLOR_NORMAL; break;
|
||||
case '\n': output += std::string(ETK_BASH_COLOR_MAGENTA) + (char*)"\\n" + ETK_BASH_COLOR_NORMAL; break;
|
||||
case '\t': output += std::string(ETK_BASH_COLOR_MAGENTA) + (char*)"\\t" + ETK_BASH_COLOR_NORMAL; break;
|
||||
default:
|
||||
char plop[10];
|
||||
int8_t nb = u32char::convertUtf8(_data[iii], plop);
|
||||
|
120
etk/RegExp.h
120
etk/RegExp.h
@ -17,8 +17,8 @@
|
||||
#include <memory>
|
||||
|
||||
#define TK_REG_EXP_DBG_MODE2 TK_HIDDEN
|
||||
#define TK_REG_EXP_DBG_MODE TK_HIDDEN
|
||||
//#define TK_REG_EXP_DBG_MODE TK_VERBOSE
|
||||
//#define TK_REG_EXP_DBG_MODE TK_HIDDEN
|
||||
#define TK_REG_EXP_DBG_MODE TK_VERBOSE
|
||||
//#define TK_REG_EXP_DBG_MODE TK_DEBUG
|
||||
|
||||
//regular colors
|
||||
@ -96,17 +96,18 @@ normal mode :
|
||||
[anjdi] or [a-gt-j] range
|
||||
. dot [^\x00-\x08\x0A-\x1F\x7F]
|
||||
$ End / Start of line of line ==> ce sera un truc suplémentaire comme le \@
|
||||
@ Previous
|
||||
==> TODO :
|
||||
^in the [] invertion of the range element
|
||||
Sart of line
|
||||
force regexp to be the shortest.
|
||||
|
||||
multiplicity :
|
||||
* ==> {0, 2147483647}
|
||||
* ==> {0, 2147483647} (try to have the minimum size)
|
||||
? ==> {0, 1}
|
||||
+ ==> {1, 2147483647}
|
||||
{x} ==> {x, x}
|
||||
{x,y} ==> {x, y}
|
||||
+ ==> {1, 2147483647} (try to have the minimum size)
|
||||
{x} ==> {x, x} (try to have the minimum size)
|
||||
{x,y} ==> {x, y} (try to have the minimum size)
|
||||
*/
|
||||
/**
|
||||
* @brief convertion table of every element in a regular expression.
|
||||
@ -367,6 +368,13 @@ template<class CLASS_TYPE> class NodeValue : public Node<CLASS_TYPE> {
|
||||
_property.setStatus(parseStatusNone);
|
||||
return;
|
||||
}
|
||||
if ( _property.getPositionStop() < 0
|
||||
&& Node<CLASS_TYPE>::m_multipleMin == 0
|
||||
&& _property.getMultiplicity() == 0) {
|
||||
_property.setPositionStop(_property.getPositionStart());
|
||||
_property.setStatus(parseStatusPartial);
|
||||
return;
|
||||
}
|
||||
bool tmpFind = true;
|
||||
int32_t findLen = 0;
|
||||
while( _property.getMultiplicity() < Node<CLASS_TYPE>::m_multipleMax
|
||||
@ -520,7 +528,12 @@ template<class CLASS_TYPE> class NodeRangeValue : public Node<CLASS_TYPE> {
|
||||
}
|
||||
}else {
|
||||
if (_property.getPositionStop() != -1) {
|
||||
_property.setStatus(parseStatusFull);
|
||||
if (_property.getMultiplicity() == 0) {
|
||||
// simple optimisation ==> permit to remove parsing 1 cycle
|
||||
_property.setStatus(parseStatusNone);
|
||||
} else {
|
||||
_property.setStatus(parseStatusFull);
|
||||
}
|
||||
} else if (_property.getMultiplicity() == Node<CLASS_TYPE>::m_multipleMin) {
|
||||
_property.setPositionStop(_property.getPositionStart());
|
||||
_property.setStatus(parseStatusFull);
|
||||
@ -1024,7 +1037,7 @@ template<class CLASS_TYPE> class NodePTheseElem : public Node<CLASS_TYPE> {
|
||||
findPartialNode = true;
|
||||
prop = _property.m_subProperty[jjj];
|
||||
tmpCurrentPos = prop.getPositionStop();
|
||||
_property.m_subProperty.erase(_property.m_subProperty.begin()+iii-1, _property.m_subProperty.end());
|
||||
_property.m_subProperty.erase(_property.m_subProperty.begin()+jjj, _property.m_subProperty.end());
|
||||
iii = jjj;
|
||||
break;
|
||||
}
|
||||
@ -1040,7 +1053,7 @@ template<class CLASS_TYPE> class NodePTheseElem : public Node<CLASS_TYPE> {
|
||||
TK_REG_EXP_DBG_MODE2(" " << levelSpace(Node<CLASS_TYPE>::m_nodeLevel) << " (elem=" << iii << "/" << m_subNode.size() << ") data='" << autoStr(std::string(_data, tmpCurrentPos, _lenMax-tmpCurrentPos)) << "'");
|
||||
m_subNode[iii]->parse(_data, tmpCurrentPos, _lenMax, prop);
|
||||
if (prop.getStatus() == parseStatusNone) {
|
||||
TK_REG_EXP_DBG_MODE(" " << levelSpace(Node<CLASS_TYPE>::m_nodeLevel) << " (elem=" << iii << "/" << m_subNode.size() << ") ===None===");
|
||||
TK_REG_EXP_DBG_MODE(" " << levelSpace(Node<CLASS_TYPE>::m_nodeLevel) << " (elem=" << iii << "/" << m_subNode.size() << ") ===None=== : " << prop);
|
||||
// rewind the list:
|
||||
bool findPartialNode = false;
|
||||
for (int64_t jjj=_property.m_subProperty.size()-1; jjj>=0; --jjj) {
|
||||
@ -1048,7 +1061,7 @@ template<class CLASS_TYPE> class NodePTheseElem : public Node<CLASS_TYPE> {
|
||||
findPartialNode = true;
|
||||
prop = _property.m_subProperty[jjj];
|
||||
tmpCurrentPos = prop.getPositionStop();
|
||||
_property.m_subProperty.erase(_property.m_subProperty.begin()+iii-1, _property.m_subProperty.end());
|
||||
_property.m_subProperty.erase(_property.m_subProperty.begin()+jjj, _property.m_subProperty.end());
|
||||
iii = jjj;
|
||||
TK_REG_EXP_DBG_MODE(" " << levelSpace(Node<CLASS_TYPE>::m_nodeLevel) << " (elem=?/" << m_subNode.size() << ") == rewind at " << iii << "");
|
||||
break;
|
||||
@ -1175,10 +1188,42 @@ template<class CLASS_TYPE> class NodePThese : public Node<CLASS_TYPE> {
|
||||
_property.setStatus(parseStatusNone);
|
||||
return;
|
||||
}
|
||||
if (_property.getMultiplicity() >= Node<CLASS_TYPE>::m_multipleMax) {
|
||||
bool haveSubPartial = false;
|
||||
for (int64_t iii=_property.m_subProperty.size()-1; iii>=0; --iii) {
|
||||
if (_property.m_subProperty[iii].getStatus() == parseStatusPartial) {
|
||||
haveSubPartial = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if ( haveSubPartial == false
|
||||
&& _property.getMultiplicity() >= Node<CLASS_TYPE>::m_multipleMax) {
|
||||
_property.setStatus(parseStatusFull);
|
||||
return;
|
||||
}
|
||||
if (haveSubPartial == true) {
|
||||
TK_CRITICAL(" TODO ...");
|
||||
// TODO : Really hard element ==> the current node might register the previous tree before rejecting parse ...
|
||||
/*
|
||||
for (int64_t jjj=_property.m_subProperty.size()-1; jjj>=0; --jjj) {
|
||||
if (_property.m_subProperty[jjj].getStatus() == parseStatusPartial) {
|
||||
findPartialNode = true;
|
||||
prop = _property.m_subProperty[jjj];
|
||||
tmpCurrentPos = prop.getPositionStop();
|
||||
_property.m_subProperty.erase(_property.m_subProperty.begin()+jjj, _property.m_subProperty.end());
|
||||
iii = jjj;
|
||||
break;
|
||||
}
|
||||
}
|
||||
*/
|
||||
} else {
|
||||
if ( _property.getPositionStop() < 0
|
||||
&& Node<CLASS_TYPE>::m_multipleMin == 0
|
||||
&& _property.getMultiplicity() == 0) {
|
||||
_property.setPositionStop(_property.getPositionStart());
|
||||
_property.setStatus(parseStatusPartial);
|
||||
return;
|
||||
}
|
||||
}
|
||||
_property.setStatus(parseStatusFull);
|
||||
bool tmpFind = true;
|
||||
while ( _property.getMultiplicity() <= Node<CLASS_TYPE>::m_multipleMax
|
||||
@ -1628,36 +1673,35 @@ template<class CLASS_TYPE> class RegExp {
|
||||
}
|
||||
regexp::FindProperty prop;
|
||||
prop.setPositionStart(_startPos);
|
||||
m_exprRootNode.parse(_SearchIn, _startPos, maxlen, prop);
|
||||
if ( prop.getStatus() == regexp::parseStatusFull
|
||||
|| prop.getStatus() == regexp::parseStatusPartial ) {
|
||||
findLen = prop.getFindLen();
|
||||
if ( _escapeChar != 0
|
||||
&& _startPos>0) {
|
||||
if (_escapeChar == (char32_t)_SearchIn[_startPos-1]) {
|
||||
//==> detected escape char ==> try find again ...
|
||||
return false;
|
||||
}
|
||||
}
|
||||
// Check end :
|
||||
if (m_notEndWithChar == true) {
|
||||
if (_startPos+findLen < (int64_t)_SearchIn.size() ) {
|
||||
char32_t tmpVal = _SearchIn[_startPos+findLen];
|
||||
if( ( tmpVal >= 'a'
|
||||
&& tmpVal <= 'z' )
|
||||
|| ( tmpVal >= 'A'
|
||||
&& tmpVal <= 'Z' )
|
||||
|| ( tmpVal >= '0'
|
||||
&& tmpVal <= '9' )
|
||||
|| ( tmpVal == '_' ) ) {
|
||||
// go on the next char ...
|
||||
return false;
|
||||
bool needOneMoreCycle = true;
|
||||
while (needOneMoreCycle == true) {
|
||||
needOneMoreCycle = false;
|
||||
m_exprRootNode.parse(_SearchIn, _startPos, maxlen, prop);
|
||||
if ( prop.getStatus() == regexp::parseStatusFull
|
||||
|| prop.getStatus() == regexp::parseStatusPartial ) {
|
||||
findLen = prop.getFindLen();
|
||||
// Check end :
|
||||
if (m_notEndWithChar == true) {
|
||||
if (_startPos+findLen < (int64_t)_SearchIn.size() ) {
|
||||
char32_t tmpVal = _SearchIn[_startPos+findLen];
|
||||
if( ( tmpVal >= 'a'
|
||||
&& tmpVal <= 'z' )
|
||||
|| ( tmpVal >= 'A'
|
||||
&& tmpVal <= 'Z' )
|
||||
|| ( tmpVal >= '0'
|
||||
&& tmpVal <= '9' )
|
||||
|| ( tmpVal == '_' ) ) {
|
||||
// go on the next char ...
|
||||
needOneMoreCycle = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (needOneMoreCycle == false) {
|
||||
m_areaFind.start = _startPos;
|
||||
m_areaFind.stop = _startPos + findLen;
|
||||
return true;
|
||||
}
|
||||
}
|
||||
m_areaFind.start = _startPos;
|
||||
m_areaFind.stop = _startPos + findLen;
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
};
|
||||
|
Loading…
x
Reference in New Issue
Block a user