[DEV] really better RegEx parser ==> nearly ended

This commit is contained in:
Edouard DUPIN 2014-07-30 22:03:54 +02:00
parent 27d1f599e6
commit 89784df428
2 changed files with 107 additions and 63 deletions

View File

@ -17,8 +17,8 @@
#include <memory> #include <memory>
#define TK_REG_EXP_DBG_MODE2 TK_HIDDEN #define TK_REG_EXP_DBG_MODE2 TK_HIDDEN
#define TK_REG_EXP_DBG_MODE TK_HIDDEN //#define TK_REG_EXP_DBG_MODE TK_HIDDEN
//#define TK_REG_EXP_DBG_MODE TK_VERBOSE #define TK_REG_EXP_DBG_MODE TK_VERBOSE
//#define TK_REG_EXP_DBG_MODE TK_DEBUG //#define TK_REG_EXP_DBG_MODE TK_DEBUG
//regular colors //regular colors
@ -96,17 +96,18 @@ normal mode :
[anjdi] or [a-gt-j] range [anjdi] or [a-gt-j] range
. dot [^\x00-\x08\x0A-\x1F\x7F] . dot [^\x00-\x08\x0A-\x1F\x7F]
$ End / Start of line ==> this will be an additional element, like \@ $ End / Start of line ==> this will be an additional element, like \@
@ Previous
==> TODO : ==> TODO :
^in the [] inversion of the range element ^in the [] inversion of the range element
Start of line Start of line
force regexp to be the shortest. force regexp to be the shortest.
multiplicity : multiplicity :
* ==> {0, 2147483647} * ==> {0, 2147483647} (try to have the minimum size)
? ==> {0, 1} ? ==> {0, 1}
+ ==> {1, 2147483647} + ==> {1, 2147483647} (try to have the minimum size)
{x} ==> {x, x} {x} ==> {x, x} (try to have the minimum size)
{x,y} ==> {x, y} {x,y} ==> {x, y} (try to have the minimum size)
*/ */
/** /**
* @brief conversion table of every element in a regular expression. * @brief conversion table of every element in a regular expression.
@ -367,6 +368,13 @@ template<class CLASS_TYPE> class NodeValue : public Node<CLASS_TYPE> {
_property.setStatus(parseStatusNone); _property.setStatus(parseStatusNone);
return; return;
} }
if ( _property.getPositionStop() < 0
&& Node<CLASS_TYPE>::m_multipleMin == 0
&& _property.getMultiplicity() == 0) {
_property.setPositionStop(_property.getPositionStart());
_property.setStatus(parseStatusPartial);
return;
}
bool tmpFind = true; bool tmpFind = true;
int32_t findLen = 0; int32_t findLen = 0;
while( _property.getMultiplicity() < Node<CLASS_TYPE>::m_multipleMax while( _property.getMultiplicity() < Node<CLASS_TYPE>::m_multipleMax
@ -520,7 +528,12 @@ template<class CLASS_TYPE> class NodeRangeValue : public Node<CLASS_TYPE> {
} }
}else { }else {
if (_property.getPositionStop() != -1) { if (_property.getPositionStop() != -1) {
if (_property.getMultiplicity() == 0) {
// simple optimisation ==> permit to remove parsing 1 cycle
_property.setStatus(parseStatusNone);
} else {
_property.setStatus(parseStatusFull); _property.setStatus(parseStatusFull);
}
} else if (_property.getMultiplicity() == Node<CLASS_TYPE>::m_multipleMin) { } else if (_property.getMultiplicity() == Node<CLASS_TYPE>::m_multipleMin) {
_property.setPositionStop(_property.getPositionStart()); _property.setPositionStop(_property.getPositionStart());
_property.setStatus(parseStatusFull); _property.setStatus(parseStatusFull);
@ -1024,7 +1037,7 @@ template<class CLASS_TYPE> class NodePTheseElem : public Node<CLASS_TYPE> {
findPartialNode = true; findPartialNode = true;
prop = _property.m_subProperty[jjj]; prop = _property.m_subProperty[jjj];
tmpCurrentPos = prop.getPositionStop(); tmpCurrentPos = prop.getPositionStop();
_property.m_subProperty.erase(_property.m_subProperty.begin()+iii-1, _property.m_subProperty.end()); _property.m_subProperty.erase(_property.m_subProperty.begin()+jjj, _property.m_subProperty.end());
iii = jjj; iii = jjj;
break; break;
} }
@ -1040,7 +1053,7 @@ template<class CLASS_TYPE> class NodePTheseElem : public Node<CLASS_TYPE> {
TK_REG_EXP_DBG_MODE2(" " << levelSpace(Node<CLASS_TYPE>::m_nodeLevel) << " (elem=" << iii << "/" << m_subNode.size() << ") data='" << autoStr(std::string(_data, tmpCurrentPos, _lenMax-tmpCurrentPos)) << "'"); TK_REG_EXP_DBG_MODE2(" " << levelSpace(Node<CLASS_TYPE>::m_nodeLevel) << " (elem=" << iii << "/" << m_subNode.size() << ") data='" << autoStr(std::string(_data, tmpCurrentPos, _lenMax-tmpCurrentPos)) << "'");
m_subNode[iii]->parse(_data, tmpCurrentPos, _lenMax, prop); m_subNode[iii]->parse(_data, tmpCurrentPos, _lenMax, prop);
if (prop.getStatus() == parseStatusNone) { if (prop.getStatus() == parseStatusNone) {
TK_REG_EXP_DBG_MODE(" " << levelSpace(Node<CLASS_TYPE>::m_nodeLevel) << " (elem=" << iii << "/" << m_subNode.size() << ") ===None==="); TK_REG_EXP_DBG_MODE(" " << levelSpace(Node<CLASS_TYPE>::m_nodeLevel) << " (elem=" << iii << "/" << m_subNode.size() << ") ===None=== : " << prop);
// rewind the list: // rewind the list:
bool findPartialNode = false; bool findPartialNode = false;
for (int64_t jjj=_property.m_subProperty.size()-1; jjj>=0; --jjj) { for (int64_t jjj=_property.m_subProperty.size()-1; jjj>=0; --jjj) {
@ -1048,7 +1061,7 @@ template<class CLASS_TYPE> class NodePTheseElem : public Node<CLASS_TYPE> {
findPartialNode = true; findPartialNode = true;
prop = _property.m_subProperty[jjj]; prop = _property.m_subProperty[jjj];
tmpCurrentPos = prop.getPositionStop(); tmpCurrentPos = prop.getPositionStop();
_property.m_subProperty.erase(_property.m_subProperty.begin()+iii-1, _property.m_subProperty.end()); _property.m_subProperty.erase(_property.m_subProperty.begin()+jjj, _property.m_subProperty.end());
iii = jjj; iii = jjj;
TK_REG_EXP_DBG_MODE(" " << levelSpace(Node<CLASS_TYPE>::m_nodeLevel) << " (elem=?/" << m_subNode.size() << ") == rewind at " << iii << ""); TK_REG_EXP_DBG_MODE(" " << levelSpace(Node<CLASS_TYPE>::m_nodeLevel) << " (elem=?/" << m_subNode.size() << ") == rewind at " << iii << "");
break; break;
@ -1175,10 +1188,42 @@ template<class CLASS_TYPE> class NodePThese : public Node<CLASS_TYPE> {
_property.setStatus(parseStatusNone); _property.setStatus(parseStatusNone);
return; return;
} }
if (_property.getMultiplicity() >= Node<CLASS_TYPE>::m_multipleMax) { bool haveSubPartial = false;
for (int64_t iii=_property.m_subProperty.size()-1; iii>=0; --iii) {
if (_property.m_subProperty[iii].getStatus() == parseStatusPartial) {
haveSubPartial = true;
break;
}
}
if ( haveSubPartial == false
&& _property.getMultiplicity() >= Node<CLASS_TYPE>::m_multipleMax) {
_property.setStatus(parseStatusFull); _property.setStatus(parseStatusFull);
return; return;
} }
if (haveSubPartial == true) {
TK_CRITICAL(" TODO ...");
// TODO : Really hard element ==> the current node might register the previous tree before rejecting parse ...
/*
for (int64_t jjj=_property.m_subProperty.size()-1; jjj>=0; --jjj) {
if (_property.m_subProperty[jjj].getStatus() == parseStatusPartial) {
findPartialNode = true;
prop = _property.m_subProperty[jjj];
tmpCurrentPos = prop.getPositionStop();
_property.m_subProperty.erase(_property.m_subProperty.begin()+jjj, _property.m_subProperty.end());
iii = jjj;
break;
}
}
*/
} else {
if ( _property.getPositionStop() < 0
&& Node<CLASS_TYPE>::m_multipleMin == 0
&& _property.getMultiplicity() == 0) {
_property.setPositionStop(_property.getPositionStart());
_property.setStatus(parseStatusPartial);
return;
}
}
_property.setStatus(parseStatusFull); _property.setStatus(parseStatusFull);
bool tmpFind = true; bool tmpFind = true;
while ( _property.getMultiplicity() <= Node<CLASS_TYPE>::m_multipleMax while ( _property.getMultiplicity() <= Node<CLASS_TYPE>::m_multipleMax
@ -1628,17 +1673,13 @@ template<class CLASS_TYPE> class RegExp {
} }
regexp::FindProperty prop; regexp::FindProperty prop;
prop.setPositionStart(_startPos); prop.setPositionStart(_startPos);
bool needOneMoreCycle = true;
while (needOneMoreCycle == true) {
needOneMoreCycle = false;
m_exprRootNode.parse(_SearchIn, _startPos, maxlen, prop); m_exprRootNode.parse(_SearchIn, _startPos, maxlen, prop);
if ( prop.getStatus() == regexp::parseStatusFull if ( prop.getStatus() == regexp::parseStatusFull
|| prop.getStatus() == regexp::parseStatusPartial ) { || prop.getStatus() == regexp::parseStatusPartial ) {
findLen = prop.getFindLen(); findLen = prop.getFindLen();
if ( _escapeChar != 0
&& _startPos>0) {
if (_escapeChar == (char32_t)_SearchIn[_startPos-1]) {
//==> detected escape char ==> try find again ...
return false;
}
}
// Check end : // Check end :
if (m_notEndWithChar == true) { if (m_notEndWithChar == true) {
if (_startPos+findLen < (int64_t)_SearchIn.size() ) { if (_startPos+findLen < (int64_t)_SearchIn.size() ) {
@ -1651,14 +1692,17 @@ template<class CLASS_TYPE> class RegExp {
&& tmpVal <= '9' ) && tmpVal <= '9' )
|| ( tmpVal == '_' ) ) { || ( tmpVal == '_' ) ) {
// go on the next char ... // go on the next char ...
return false; needOneMoreCycle = true;
} }
} }
} }
if (needOneMoreCycle == false) {
m_areaFind.start = _startPos; m_areaFind.start = _startPos;
m_areaFind.stop = _startPos + findLen; m_areaFind.stop = _startPos + findLen;
return true; return true;
} }
}
}
return false; return false;
}; };