[DEV] much better RegEx parser ==> nearly finished

This commit is contained in:
Edouard DUPIN 2014-07-30 22:03:54 +02:00
parent 27d1f599e6
commit 89784df428
2 changed files with 107 additions and 63 deletions

View File

@ -83,31 +83,31 @@ std::string etk::regexp::createString(const std::vector<char32_t>& _data, int64_
std::string output(ETK_BASH_COLOR_NORMAL); std::string output(ETK_BASH_COLOR_NORMAL);
for (int64_t iii=_start; iii<(int64_t)_data.size() && iii<_stop ; iii++) { for (int64_t iii=_start; iii<(int64_t)_data.size() && iii<_stop ; iii++) {
switch(_data[iii]) { switch(_data[iii]) {
case regexpOpcodePTheseIn: output += std::string(ETK_BASH_COLOR_RED) + (char*)"(" + ETK_BASH_COLOR_NORMAL; break; case regexpOpcodePTheseIn: output += std::string(ETK_BASH_COLOR_RED) + (char*)"(" + ETK_BASH_COLOR_NORMAL; break;
case regexpOpcodePTheseOut: output += std::string(ETK_BASH_COLOR_RED) + (char*)")" + ETK_BASH_COLOR_NORMAL; break; case regexpOpcodePTheseOut: output += std::string(ETK_BASH_COLOR_RED) + (char*)")" + ETK_BASH_COLOR_NORMAL; break;
case regexpOpcodeBracketIn: output += std::string(ETK_BASH_COLOR_YELLOW) + (char*)"[" + ETK_BASH_COLOR_NORMAL; break; case regexpOpcodeBracketIn: output += std::string(ETK_BASH_COLOR_YELLOW) + (char*)"[" + ETK_BASH_COLOR_NORMAL; break;
case regexpOpcodeBracketOut: output += std::string(ETK_BASH_COLOR_YELLOW) + (char*)"]" + ETK_BASH_COLOR_NORMAL; break; case regexpOpcodeBracketOut: output += std::string(ETK_BASH_COLOR_YELLOW) + (char*)"]" + ETK_BASH_COLOR_NORMAL; break;
case regexpOpcodeTo: output += std::string(ETK_BASH_COLOR_YELLOW) + (char*)"-" + ETK_BASH_COLOR_NORMAL; break; case regexpOpcodeTo: output += std::string(ETK_BASH_COLOR_YELLOW) + (char*)"-" + ETK_BASH_COLOR_NORMAL; break;
case regexpOpcodeBracetIn: output += std::string(ETK_BASH_COLOR_GREEN) + (char*)"{" + ETK_BASH_COLOR_NORMAL; break; case regexpOpcodeBracetIn: output += std::string(ETK_BASH_COLOR_GREEN) + (char*)"{" + ETK_BASH_COLOR_NORMAL; break;
case regexpOpcodeBracetOut: output += std::string(ETK_BASH_COLOR_GREEN) + (char*)"}" + ETK_BASH_COLOR_NORMAL; break; case regexpOpcodeBracetOut: output += std::string(ETK_BASH_COLOR_GREEN) + (char*)"}" + ETK_BASH_COLOR_NORMAL; break;
case regexpOpcodeStar: output += std::string(ETK_BASH_COLOR_BLUE) + (char*)"*" + ETK_BASH_COLOR_NORMAL; break; case regexpOpcodeStar: output += std::string(ETK_BASH_COLOR_BLUE) + (char*)"*" + ETK_BASH_COLOR_NORMAL; break;
case regexpOpcodeDot: output += std::string(ETK_BASH_COLOR_BLUE) + (char*)"." + ETK_BASH_COLOR_NORMAL; break; case regexpOpcodeDot: output += std::string(ETK_BASH_COLOR_BLUE) + (char*)"." + ETK_BASH_COLOR_NORMAL; break;
case regexpOpcodeQuestion: output += std::string(ETK_BASH_COLOR_BLUE) + (char*)"?" + ETK_BASH_COLOR_NORMAL; break; case regexpOpcodeQuestion: output += std::string(ETK_BASH_COLOR_BLUE) + (char*)"?" + ETK_BASH_COLOR_NORMAL; break;
case regexpOpcodePlus: output += std::string(ETK_BASH_COLOR_BLUE) + (char*)"+" + ETK_BASH_COLOR_NORMAL; break; case regexpOpcodePlus: output += std::string(ETK_BASH_COLOR_BLUE) + (char*)"+" + ETK_BASH_COLOR_NORMAL; break;
case regexpOpcodePipe: output += std::string(ETK_BASH_COLOR_BLUE) + (char*)"|" + ETK_BASH_COLOR_NORMAL; break; case regexpOpcodePipe: output += std::string(ETK_BASH_COLOR_BLUE) + (char*)"|" + ETK_BASH_COLOR_NORMAL; break;
case regexpOpcodeNoChar: output += std::string(ETK_BASH_COLOR_MAGENTA) + (char*)"@" + ETK_BASH_COLOR_NORMAL; break; case regexpOpcodeNoChar: output += std::string(ETK_BASH_COLOR_MAGENTA) + (char*)"@" + ETK_BASH_COLOR_NORMAL; break;
case regexpOpcodeStartOfLine: output += std::string(ETK_BASH_COLOR_MAGENTA) + (char*)"^" + ETK_BASH_COLOR_NORMAL; break; case regexpOpcodeStartOfLine: output += std::string(ETK_BASH_COLOR_MAGENTA) + (char*)"^" + ETK_BASH_COLOR_NORMAL; break;
case regexpOpcodeEndOfLine: output += std::string(ETK_BASH_COLOR_MAGENTA) + (char*)"$" + ETK_BASH_COLOR_NORMAL; break; case regexpOpcodeEndOfLine: output += std::string(ETK_BASH_COLOR_MAGENTA) + (char*)"$" + ETK_BASH_COLOR_NORMAL; break;
case regexpOpcodeDigit: output += std::string(ETK_BASH_COLOR_MAGENTA) + (char*)"\\d" + ETK_BASH_COLOR_NORMAL; break; case regexpOpcodeDigit: output += std::string(ETK_BASH_COLOR_MAGENTA) + (char*)"\\d" + ETK_BASH_COLOR_NORMAL; break;
case regexpOpcodeDigitNot: output += std::string(ETK_BASH_COLOR_MAGENTA) + (char*)"\\D" + ETK_BASH_COLOR_NORMAL; break; case regexpOpcodeDigitNot: output += std::string(ETK_BASH_COLOR_MAGENTA) + (char*)"\\D" + ETK_BASH_COLOR_NORMAL; break;
case regexpOpcodeLetter: output += std::string(ETK_BASH_COLOR_MAGENTA) + (char*)"\\l" + ETK_BASH_COLOR_NORMAL; break; case regexpOpcodeLetter: output += std::string(ETK_BASH_COLOR_MAGENTA) + (char*)"\\l" + ETK_BASH_COLOR_NORMAL; break;
case regexpOpcodeLetterNot: output += std::string(ETK_BASH_COLOR_MAGENTA) + (char*)"\\L" + ETK_BASH_COLOR_NORMAL; break; case regexpOpcodeLetterNot: output += std::string(ETK_BASH_COLOR_MAGENTA) + (char*)"\\L" + ETK_BASH_COLOR_NORMAL; break;
case regexpOpcodeSpace: output += std::string(ETK_BASH_COLOR_MAGENTA) + (char*)"\\s" + ETK_BASH_COLOR_NORMAL; break; case regexpOpcodeSpace: output += std::string(ETK_BASH_COLOR_MAGENTA) + (char*)"\\s" + ETK_BASH_COLOR_NORMAL; break;
case regexpOpcodeSpaceNot: output += std::string(ETK_BASH_COLOR_MAGENTA) + (char*)"\\S" + ETK_BASH_COLOR_NORMAL; break; case regexpOpcodeSpaceNot: output += std::string(ETK_BASH_COLOR_MAGENTA) + (char*)"\\S" + ETK_BASH_COLOR_NORMAL; break;
case regexpOpcodeWord: output += std::string(ETK_BASH_COLOR_MAGENTA) + (char*)"\\w" + ETK_BASH_COLOR_NORMAL; break; case regexpOpcodeWord: output += std::string(ETK_BASH_COLOR_MAGENTA) + (char*)"\\w" + ETK_BASH_COLOR_NORMAL; break;
case regexpOpcodeWordNot: output += std::string(ETK_BASH_COLOR_MAGENTA) + (char*)"\\W" + ETK_BASH_COLOR_NORMAL; break; case regexpOpcodeWordNot: output += std::string(ETK_BASH_COLOR_MAGENTA) + (char*)"\\W" + ETK_BASH_COLOR_NORMAL; break;
case '\n': output += std::string(ETK_BASH_COLOR_MAGENTA) + (char*)"\\n" + ETK_BASH_COLOR_NORMAL; break; case '\n': output += std::string(ETK_BASH_COLOR_MAGENTA) + (char*)"\\n" + ETK_BASH_COLOR_NORMAL; break;
case '\t': output += std::string(ETK_BASH_COLOR_MAGENTA) + (char*)"\\t" + ETK_BASH_COLOR_NORMAL; break; case '\t': output += std::string(ETK_BASH_COLOR_MAGENTA) + (char*)"\\t" + ETK_BASH_COLOR_NORMAL; break;
default: default:
char plop[10]; char plop[10];
int8_t nb = u32char::convertUtf8(_data[iii], plop); int8_t nb = u32char::convertUtf8(_data[iii], plop);

View File

@ -17,8 +17,8 @@
#include <memory> #include <memory>
#define TK_REG_EXP_DBG_MODE2 TK_HIDDEN #define TK_REG_EXP_DBG_MODE2 TK_HIDDEN
#define TK_REG_EXP_DBG_MODE TK_HIDDEN //#define TK_REG_EXP_DBG_MODE TK_HIDDEN
//#define TK_REG_EXP_DBG_MODE TK_VERBOSE #define TK_REG_EXP_DBG_MODE TK_VERBOSE
//#define TK_REG_EXP_DBG_MODE TK_DEBUG //#define TK_REG_EXP_DBG_MODE TK_DEBUG
//regular colors //regular colors
@ -96,17 +96,18 @@ normal mode :
[anjdi] or [a-gt-j] range [anjdi] or [a-gt-j] range
. dot [^\x00-\x08\x0A-\x1F\x7F] . dot [^\x00-\x08\x0A-\x1F\x7F]
$ End / Start of line ==> this will be an additional element, like \@ $ End / Start of line ==> this will be an additional element, like \@
@ Previous
==> TODO : ==> TODO :
^in the [] inversion of the range element ^in the [] inversion of the range element
Start of line Start of line
force regexp to be the shortest. force regexp to be the shortest.
multiplicity : multiplicity :
* ==> {0, 2147483647} * ==> {0, 2147483647} (try to have the minimum size)
? ==> {0, 1} ? ==> {0, 1}
+ ==> {1, 2147483647} + ==> {1, 2147483647} (try to have the minimum size)
{x} ==> {x, x} {x} ==> {x, x} (try to have the minimum size)
{x,y} ==> {x, y} {x,y} ==> {x, y} (try to have the minimum size)
*/ */
/** /**
* @brief conversion table of every element in a regular expression. * @brief conversion table of every element in a regular expression.
@ -367,6 +368,13 @@ template<class CLASS_TYPE> class NodeValue : public Node<CLASS_TYPE> {
_property.setStatus(parseStatusNone); _property.setStatus(parseStatusNone);
return; return;
} }
if ( _property.getPositionStop() < 0
&& Node<CLASS_TYPE>::m_multipleMin == 0
&& _property.getMultiplicity() == 0) {
_property.setPositionStop(_property.getPositionStart());
_property.setStatus(parseStatusPartial);
return;
}
bool tmpFind = true; bool tmpFind = true;
int32_t findLen = 0; int32_t findLen = 0;
while( _property.getMultiplicity() < Node<CLASS_TYPE>::m_multipleMax while( _property.getMultiplicity() < Node<CLASS_TYPE>::m_multipleMax
@ -520,7 +528,12 @@ template<class CLASS_TYPE> class NodeRangeValue : public Node<CLASS_TYPE> {
} }
}else { }else {
if (_property.getPositionStop() != -1) { if (_property.getPositionStop() != -1) {
_property.setStatus(parseStatusFull); if (_property.getMultiplicity() == 0) {
// simple optimisation ==> permit to remove parsing 1 cycle
_property.setStatus(parseStatusNone);
} else {
_property.setStatus(parseStatusFull);
}
} else if (_property.getMultiplicity() == Node<CLASS_TYPE>::m_multipleMin) { } else if (_property.getMultiplicity() == Node<CLASS_TYPE>::m_multipleMin) {
_property.setPositionStop(_property.getPositionStart()); _property.setPositionStop(_property.getPositionStart());
_property.setStatus(parseStatusFull); _property.setStatus(parseStatusFull);
@ -1024,7 +1037,7 @@ template<class CLASS_TYPE> class NodePTheseElem : public Node<CLASS_TYPE> {
findPartialNode = true; findPartialNode = true;
prop = _property.m_subProperty[jjj]; prop = _property.m_subProperty[jjj];
tmpCurrentPos = prop.getPositionStop(); tmpCurrentPos = prop.getPositionStop();
_property.m_subProperty.erase(_property.m_subProperty.begin()+iii-1, _property.m_subProperty.end()); _property.m_subProperty.erase(_property.m_subProperty.begin()+jjj, _property.m_subProperty.end());
iii = jjj; iii = jjj;
break; break;
} }
@ -1040,7 +1053,7 @@ template<class CLASS_TYPE> class NodePTheseElem : public Node<CLASS_TYPE> {
TK_REG_EXP_DBG_MODE2(" " << levelSpace(Node<CLASS_TYPE>::m_nodeLevel) << " (elem=" << iii << "/" << m_subNode.size() << ") data='" << autoStr(std::string(_data, tmpCurrentPos, _lenMax-tmpCurrentPos)) << "'"); TK_REG_EXP_DBG_MODE2(" " << levelSpace(Node<CLASS_TYPE>::m_nodeLevel) << " (elem=" << iii << "/" << m_subNode.size() << ") data='" << autoStr(std::string(_data, tmpCurrentPos, _lenMax-tmpCurrentPos)) << "'");
m_subNode[iii]->parse(_data, tmpCurrentPos, _lenMax, prop); m_subNode[iii]->parse(_data, tmpCurrentPos, _lenMax, prop);
if (prop.getStatus() == parseStatusNone) { if (prop.getStatus() == parseStatusNone) {
TK_REG_EXP_DBG_MODE(" " << levelSpace(Node<CLASS_TYPE>::m_nodeLevel) << " (elem=" << iii << "/" << m_subNode.size() << ") ===None==="); TK_REG_EXP_DBG_MODE(" " << levelSpace(Node<CLASS_TYPE>::m_nodeLevel) << " (elem=" << iii << "/" << m_subNode.size() << ") ===None=== : " << prop);
// rewind the list: // rewind the list:
bool findPartialNode = false; bool findPartialNode = false;
for (int64_t jjj=_property.m_subProperty.size()-1; jjj>=0; --jjj) { for (int64_t jjj=_property.m_subProperty.size()-1; jjj>=0; --jjj) {
@ -1048,7 +1061,7 @@ template<class CLASS_TYPE> class NodePTheseElem : public Node<CLASS_TYPE> {
findPartialNode = true; findPartialNode = true;
prop = _property.m_subProperty[jjj]; prop = _property.m_subProperty[jjj];
tmpCurrentPos = prop.getPositionStop(); tmpCurrentPos = prop.getPositionStop();
_property.m_subProperty.erase(_property.m_subProperty.begin()+iii-1, _property.m_subProperty.end()); _property.m_subProperty.erase(_property.m_subProperty.begin()+jjj, _property.m_subProperty.end());
iii = jjj; iii = jjj;
TK_REG_EXP_DBG_MODE(" " << levelSpace(Node<CLASS_TYPE>::m_nodeLevel) << " (elem=?/" << m_subNode.size() << ") == rewind at " << iii << ""); TK_REG_EXP_DBG_MODE(" " << levelSpace(Node<CLASS_TYPE>::m_nodeLevel) << " (elem=?/" << m_subNode.size() << ") == rewind at " << iii << "");
break; break;
@ -1175,10 +1188,42 @@ template<class CLASS_TYPE> class NodePThese : public Node<CLASS_TYPE> {
_property.setStatus(parseStatusNone); _property.setStatus(parseStatusNone);
return; return;
} }
if (_property.getMultiplicity() >= Node<CLASS_TYPE>::m_multipleMax) { bool haveSubPartial = false;
for (int64_t iii=_property.m_subProperty.size()-1; iii>=0; --iii) {
if (_property.m_subProperty[iii].getStatus() == parseStatusPartial) {
haveSubPartial = true;
break;
}
}
if ( haveSubPartial == false
&& _property.getMultiplicity() >= Node<CLASS_TYPE>::m_multipleMax) {
_property.setStatus(parseStatusFull); _property.setStatus(parseStatusFull);
return; return;
} }
if (haveSubPartial == true) {
TK_CRITICAL(" TODO ...");
// TODO : Really hard element ==> the current node might register the previous tree before rejecting parse ...
/*
for (int64_t jjj=_property.m_subProperty.size()-1; jjj>=0; --jjj) {
if (_property.m_subProperty[jjj].getStatus() == parseStatusPartial) {
findPartialNode = true;
prop = _property.m_subProperty[jjj];
tmpCurrentPos = prop.getPositionStop();
_property.m_subProperty.erase(_property.m_subProperty.begin()+jjj, _property.m_subProperty.end());
iii = jjj;
break;
}
}
*/
} else {
if ( _property.getPositionStop() < 0
&& Node<CLASS_TYPE>::m_multipleMin == 0
&& _property.getMultiplicity() == 0) {
_property.setPositionStop(_property.getPositionStart());
_property.setStatus(parseStatusPartial);
return;
}
}
_property.setStatus(parseStatusFull); _property.setStatus(parseStatusFull);
bool tmpFind = true; bool tmpFind = true;
while ( _property.getMultiplicity() <= Node<CLASS_TYPE>::m_multipleMax while ( _property.getMultiplicity() <= Node<CLASS_TYPE>::m_multipleMax
@ -1628,36 +1673,35 @@ template<class CLASS_TYPE> class RegExp {
} }
regexp::FindProperty prop; regexp::FindProperty prop;
prop.setPositionStart(_startPos); prop.setPositionStart(_startPos);
m_exprRootNode.parse(_SearchIn, _startPos, maxlen, prop); bool needOneMoreCycle = true;
if ( prop.getStatus() == regexp::parseStatusFull while (needOneMoreCycle == true) {
|| prop.getStatus() == regexp::parseStatusPartial ) { needOneMoreCycle = false;
findLen = prop.getFindLen(); m_exprRootNode.parse(_SearchIn, _startPos, maxlen, prop);
if ( _escapeChar != 0 if ( prop.getStatus() == regexp::parseStatusFull
&& _startPos>0) { || prop.getStatus() == regexp::parseStatusPartial ) {
if (_escapeChar == (char32_t)_SearchIn[_startPos-1]) { findLen = prop.getFindLen();
//==> detected escape char ==> try find again ... // Check end :
return false; if (m_notEndWithChar == true) {
} if (_startPos+findLen < (int64_t)_SearchIn.size() ) {
} char32_t tmpVal = _SearchIn[_startPos+findLen];
// Check end : if( ( tmpVal >= 'a'
if (m_notEndWithChar == true) { && tmpVal <= 'z' )
if (_startPos+findLen < (int64_t)_SearchIn.size() ) { || ( tmpVal >= 'A'
char32_t tmpVal = _SearchIn[_startPos+findLen]; && tmpVal <= 'Z' )
if( ( tmpVal >= 'a' || ( tmpVal >= '0'
&& tmpVal <= 'z' ) && tmpVal <= '9' )
|| ( tmpVal >= 'A' || ( tmpVal == '_' ) ) {
&& tmpVal <= 'Z' ) // go on the next char ...
|| ( tmpVal >= '0' needOneMoreCycle = true;
&& tmpVal <= '9' ) }
|| ( tmpVal == '_' ) ) {
// go on the next char ...
return false;
} }
} }
if (needOneMoreCycle == false) {
m_areaFind.start = _startPos;
m_areaFind.stop = _startPos + findLen;
return true;
}
} }
m_areaFind.start = _startPos;
m_areaFind.stop = _startPos + findLen;
return true;
} }
return false; return false;
}; };