[DEV] really better RegEx parser ==> nearly ended
This commit is contained in:
parent
27d1f599e6
commit
89784df428
@ -83,31 +83,31 @@ std::string etk::regexp::createString(const std::vector<char32_t>& _data, int64_
|
|||||||
std::string output(ETK_BASH_COLOR_NORMAL);
|
std::string output(ETK_BASH_COLOR_NORMAL);
|
||||||
for (int64_t iii=_start; iii<(int64_t)_data.size() && iii<_stop ; iii++) {
|
for (int64_t iii=_start; iii<(int64_t)_data.size() && iii<_stop ; iii++) {
|
||||||
switch(_data[iii]) {
|
switch(_data[iii]) {
|
||||||
case regexpOpcodePTheseIn: output += std::string(ETK_BASH_COLOR_RED) + (char*)"(" + ETK_BASH_COLOR_NORMAL; break;
|
case regexpOpcodePTheseIn: output += std::string(ETK_BASH_COLOR_RED) + (char*)"(" + ETK_BASH_COLOR_NORMAL; break;
|
||||||
case regexpOpcodePTheseOut: output += std::string(ETK_BASH_COLOR_RED) + (char*)")" + ETK_BASH_COLOR_NORMAL; break;
|
case regexpOpcodePTheseOut: output += std::string(ETK_BASH_COLOR_RED) + (char*)")" + ETK_BASH_COLOR_NORMAL; break;
|
||||||
case regexpOpcodeBracketIn: output += std::string(ETK_BASH_COLOR_YELLOW) + (char*)"[" + ETK_BASH_COLOR_NORMAL; break;
|
case regexpOpcodeBracketIn: output += std::string(ETK_BASH_COLOR_YELLOW) + (char*)"[" + ETK_BASH_COLOR_NORMAL; break;
|
||||||
case regexpOpcodeBracketOut: output += std::string(ETK_BASH_COLOR_YELLOW) + (char*)"]" + ETK_BASH_COLOR_NORMAL; break;
|
case regexpOpcodeBracketOut: output += std::string(ETK_BASH_COLOR_YELLOW) + (char*)"]" + ETK_BASH_COLOR_NORMAL; break;
|
||||||
case regexpOpcodeTo: output += std::string(ETK_BASH_COLOR_YELLOW) + (char*)"-" + ETK_BASH_COLOR_NORMAL; break;
|
case regexpOpcodeTo: output += std::string(ETK_BASH_COLOR_YELLOW) + (char*)"-" + ETK_BASH_COLOR_NORMAL; break;
|
||||||
case regexpOpcodeBracetIn: output += std::string(ETK_BASH_COLOR_GREEN) + (char*)"{" + ETK_BASH_COLOR_NORMAL; break;
|
case regexpOpcodeBracetIn: output += std::string(ETK_BASH_COLOR_GREEN) + (char*)"{" + ETK_BASH_COLOR_NORMAL; break;
|
||||||
case regexpOpcodeBracetOut: output += std::string(ETK_BASH_COLOR_GREEN) + (char*)"}" + ETK_BASH_COLOR_NORMAL; break;
|
case regexpOpcodeBracetOut: output += std::string(ETK_BASH_COLOR_GREEN) + (char*)"}" + ETK_BASH_COLOR_NORMAL; break;
|
||||||
case regexpOpcodeStar: output += std::string(ETK_BASH_COLOR_BLUE) + (char*)"*" + ETK_BASH_COLOR_NORMAL; break;
|
case regexpOpcodeStar: output += std::string(ETK_BASH_COLOR_BLUE) + (char*)"*" + ETK_BASH_COLOR_NORMAL; break;
|
||||||
case regexpOpcodeDot: output += std::string(ETK_BASH_COLOR_BLUE) + (char*)"." + ETK_BASH_COLOR_NORMAL; break;
|
case regexpOpcodeDot: output += std::string(ETK_BASH_COLOR_BLUE) + (char*)"." + ETK_BASH_COLOR_NORMAL; break;
|
||||||
case regexpOpcodeQuestion: output += std::string(ETK_BASH_COLOR_BLUE) + (char*)"?" + ETK_BASH_COLOR_NORMAL; break;
|
case regexpOpcodeQuestion: output += std::string(ETK_BASH_COLOR_BLUE) + (char*)"?" + ETK_BASH_COLOR_NORMAL; break;
|
||||||
case regexpOpcodePlus: output += std::string(ETK_BASH_COLOR_BLUE) + (char*)"+" + ETK_BASH_COLOR_NORMAL; break;
|
case regexpOpcodePlus: output += std::string(ETK_BASH_COLOR_BLUE) + (char*)"+" + ETK_BASH_COLOR_NORMAL; break;
|
||||||
case regexpOpcodePipe: output += std::string(ETK_BASH_COLOR_BLUE) + (char*)"|" + ETK_BASH_COLOR_NORMAL; break;
|
case regexpOpcodePipe: output += std::string(ETK_BASH_COLOR_BLUE) + (char*)"|" + ETK_BASH_COLOR_NORMAL; break;
|
||||||
case regexpOpcodeNoChar: output += std::string(ETK_BASH_COLOR_MAGENTA) + (char*)"@" + ETK_BASH_COLOR_NORMAL; break;
|
case regexpOpcodeNoChar: output += std::string(ETK_BASH_COLOR_MAGENTA) + (char*)"@" + ETK_BASH_COLOR_NORMAL; break;
|
||||||
case regexpOpcodeStartOfLine: output += std::string(ETK_BASH_COLOR_MAGENTA) + (char*)"^" + ETK_BASH_COLOR_NORMAL; break;
|
case regexpOpcodeStartOfLine: output += std::string(ETK_BASH_COLOR_MAGENTA) + (char*)"^" + ETK_BASH_COLOR_NORMAL; break;
|
||||||
case regexpOpcodeEndOfLine: output += std::string(ETK_BASH_COLOR_MAGENTA) + (char*)"$" + ETK_BASH_COLOR_NORMAL; break;
|
case regexpOpcodeEndOfLine: output += std::string(ETK_BASH_COLOR_MAGENTA) + (char*)"$" + ETK_BASH_COLOR_NORMAL; break;
|
||||||
case regexpOpcodeDigit: output += std::string(ETK_BASH_COLOR_MAGENTA) + (char*)"\\d" + ETK_BASH_COLOR_NORMAL; break;
|
case regexpOpcodeDigit: output += std::string(ETK_BASH_COLOR_MAGENTA) + (char*)"\\d" + ETK_BASH_COLOR_NORMAL; break;
|
||||||
case regexpOpcodeDigitNot: output += std::string(ETK_BASH_COLOR_MAGENTA) + (char*)"\\D" + ETK_BASH_COLOR_NORMAL; break;
|
case regexpOpcodeDigitNot: output += std::string(ETK_BASH_COLOR_MAGENTA) + (char*)"\\D" + ETK_BASH_COLOR_NORMAL; break;
|
||||||
case regexpOpcodeLetter: output += std::string(ETK_BASH_COLOR_MAGENTA) + (char*)"\\l" + ETK_BASH_COLOR_NORMAL; break;
|
case regexpOpcodeLetter: output += std::string(ETK_BASH_COLOR_MAGENTA) + (char*)"\\l" + ETK_BASH_COLOR_NORMAL; break;
|
||||||
case regexpOpcodeLetterNot: output += std::string(ETK_BASH_COLOR_MAGENTA) + (char*)"\\L" + ETK_BASH_COLOR_NORMAL; break;
|
case regexpOpcodeLetterNot: output += std::string(ETK_BASH_COLOR_MAGENTA) + (char*)"\\L" + ETK_BASH_COLOR_NORMAL; break;
|
||||||
case regexpOpcodeSpace: output += std::string(ETK_BASH_COLOR_MAGENTA) + (char*)"\\s" + ETK_BASH_COLOR_NORMAL; break;
|
case regexpOpcodeSpace: output += std::string(ETK_BASH_COLOR_MAGENTA) + (char*)"\\s" + ETK_BASH_COLOR_NORMAL; break;
|
||||||
case regexpOpcodeSpaceNot: output += std::string(ETK_BASH_COLOR_MAGENTA) + (char*)"\\S" + ETK_BASH_COLOR_NORMAL; break;
|
case regexpOpcodeSpaceNot: output += std::string(ETK_BASH_COLOR_MAGENTA) + (char*)"\\S" + ETK_BASH_COLOR_NORMAL; break;
|
||||||
case regexpOpcodeWord: output += std::string(ETK_BASH_COLOR_MAGENTA) + (char*)"\\w" + ETK_BASH_COLOR_NORMAL; break;
|
case regexpOpcodeWord: output += std::string(ETK_BASH_COLOR_MAGENTA) + (char*)"\\w" + ETK_BASH_COLOR_NORMAL; break;
|
||||||
case regexpOpcodeWordNot: output += std::string(ETK_BASH_COLOR_MAGENTA) + (char*)"\\W" + ETK_BASH_COLOR_NORMAL; break;
|
case regexpOpcodeWordNot: output += std::string(ETK_BASH_COLOR_MAGENTA) + (char*)"\\W" + ETK_BASH_COLOR_NORMAL; break;
|
||||||
case '\n': output += std::string(ETK_BASH_COLOR_MAGENTA) + (char*)"\\n" + ETK_BASH_COLOR_NORMAL; break;
|
case '\n': output += std::string(ETK_BASH_COLOR_MAGENTA) + (char*)"\\n" + ETK_BASH_COLOR_NORMAL; break;
|
||||||
case '\t': output += std::string(ETK_BASH_COLOR_MAGENTA) + (char*)"\\t" + ETK_BASH_COLOR_NORMAL; break;
|
case '\t': output += std::string(ETK_BASH_COLOR_MAGENTA) + (char*)"\\t" + ETK_BASH_COLOR_NORMAL; break;
|
||||||
default:
|
default:
|
||||||
char plop[10];
|
char plop[10];
|
||||||
int8_t nb = u32char::convertUtf8(_data[iii], plop);
|
int8_t nb = u32char::convertUtf8(_data[iii], plop);
|
||||||
|
120
etk/RegExp.h
120
etk/RegExp.h
@ -17,8 +17,8 @@
|
|||||||
#include <memory>
|
#include <memory>
|
||||||
|
|
||||||
#define TK_REG_EXP_DBG_MODE2 TK_HIDDEN
|
#define TK_REG_EXP_DBG_MODE2 TK_HIDDEN
|
||||||
#define TK_REG_EXP_DBG_MODE TK_HIDDEN
|
//#define TK_REG_EXP_DBG_MODE TK_HIDDEN
|
||||||
//#define TK_REG_EXP_DBG_MODE TK_VERBOSE
|
#define TK_REG_EXP_DBG_MODE TK_VERBOSE
|
||||||
//#define TK_REG_EXP_DBG_MODE TK_DEBUG
|
//#define TK_REG_EXP_DBG_MODE TK_DEBUG
|
||||||
|
|
||||||
//regular colors
|
//regular colors
|
||||||
@ -96,17 +96,18 @@ normal mode :
|
|||||||
[anjdi] or [a-gt-j] range
|
[anjdi] or [a-gt-j] range
|
||||||
. dot [^\x00-\x08\x0A-\x1F\x7F]
|
. dot [^\x00-\x08\x0A-\x1F\x7F]
|
||||||
$ End / Start of line ==> this will be an additional element, like \@
|
$ End / Start of line ==> this will be an additional element, like \@
|
||||||
|
@ Previous
|
||||||
==> TODO :
|
==> TODO :
|
||||||
^in the [] inversion of the range element
|
^in the [] inversion of the range element
|
||||||
Start of line
|
Start of line
|
||||||
force regexp to be the shortest.
|
force regexp to be the shortest.
|
||||||
|
|
||||||
multiplicity :
|
multiplicity :
|
||||||
* ==> {0, 2147483647}
|
* ==> {0, 2147483647} (try to have the minimum size)
|
||||||
? ==> {0, 1}
|
? ==> {0, 1}
|
||||||
+ ==> {1, 2147483647}
|
+ ==> {1, 2147483647} (try to have the minimum size)
|
||||||
{x} ==> {x, x}
|
{x} ==> {x, x} (try to have the minimum size)
|
||||||
{x,y} ==> {x, y}
|
{x,y} ==> {x, y} (try to have the minimum size)
|
||||||
*/
|
*/
|
||||||
/**
|
/**
|
||||||
* @brief conversion table of every element in a regular expression.
|
* @brief conversion table of every element in a regular expression.
|
||||||
@ -367,6 +368,13 @@ template<class CLASS_TYPE> class NodeValue : public Node<CLASS_TYPE> {
|
|||||||
_property.setStatus(parseStatusNone);
|
_property.setStatus(parseStatusNone);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
if ( _property.getPositionStop() < 0
|
||||||
|
&& Node<CLASS_TYPE>::m_multipleMin == 0
|
||||||
|
&& _property.getMultiplicity() == 0) {
|
||||||
|
_property.setPositionStop(_property.getPositionStart());
|
||||||
|
_property.setStatus(parseStatusPartial);
|
||||||
|
return;
|
||||||
|
}
|
||||||
bool tmpFind = true;
|
bool tmpFind = true;
|
||||||
int32_t findLen = 0;
|
int32_t findLen = 0;
|
||||||
while( _property.getMultiplicity() < Node<CLASS_TYPE>::m_multipleMax
|
while( _property.getMultiplicity() < Node<CLASS_TYPE>::m_multipleMax
|
||||||
@ -520,7 +528,12 @@ template<class CLASS_TYPE> class NodeRangeValue : public Node<CLASS_TYPE> {
|
|||||||
}
|
}
|
||||||
}else {
|
}else {
|
||||||
if (_property.getPositionStop() != -1) {
|
if (_property.getPositionStop() != -1) {
|
||||||
_property.setStatus(parseStatusFull);
|
if (_property.getMultiplicity() == 0) {
|
||||||
|
// simple optimisation ==> allows skipping one parsing cycle
|
||||||
|
_property.setStatus(parseStatusNone);
|
||||||
|
} else {
|
||||||
|
_property.setStatus(parseStatusFull);
|
||||||
|
}
|
||||||
} else if (_property.getMultiplicity() == Node<CLASS_TYPE>::m_multipleMin) {
|
} else if (_property.getMultiplicity() == Node<CLASS_TYPE>::m_multipleMin) {
|
||||||
_property.setPositionStop(_property.getPositionStart());
|
_property.setPositionStop(_property.getPositionStart());
|
||||||
_property.setStatus(parseStatusFull);
|
_property.setStatus(parseStatusFull);
|
||||||
@ -1024,7 +1037,7 @@ template<class CLASS_TYPE> class NodePTheseElem : public Node<CLASS_TYPE> {
|
|||||||
findPartialNode = true;
|
findPartialNode = true;
|
||||||
prop = _property.m_subProperty[jjj];
|
prop = _property.m_subProperty[jjj];
|
||||||
tmpCurrentPos = prop.getPositionStop();
|
tmpCurrentPos = prop.getPositionStop();
|
||||||
_property.m_subProperty.erase(_property.m_subProperty.begin()+iii-1, _property.m_subProperty.end());
|
_property.m_subProperty.erase(_property.m_subProperty.begin()+jjj, _property.m_subProperty.end());
|
||||||
iii = jjj;
|
iii = jjj;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
@ -1040,7 +1053,7 @@ template<class CLASS_TYPE> class NodePTheseElem : public Node<CLASS_TYPE> {
|
|||||||
TK_REG_EXP_DBG_MODE2(" " << levelSpace(Node<CLASS_TYPE>::m_nodeLevel) << " (elem=" << iii << "/" << m_subNode.size() << ") data='" << autoStr(std::string(_data, tmpCurrentPos, _lenMax-tmpCurrentPos)) << "'");
|
TK_REG_EXP_DBG_MODE2(" " << levelSpace(Node<CLASS_TYPE>::m_nodeLevel) << " (elem=" << iii << "/" << m_subNode.size() << ") data='" << autoStr(std::string(_data, tmpCurrentPos, _lenMax-tmpCurrentPos)) << "'");
|
||||||
m_subNode[iii]->parse(_data, tmpCurrentPos, _lenMax, prop);
|
m_subNode[iii]->parse(_data, tmpCurrentPos, _lenMax, prop);
|
||||||
if (prop.getStatus() == parseStatusNone) {
|
if (prop.getStatus() == parseStatusNone) {
|
||||||
TK_REG_EXP_DBG_MODE(" " << levelSpace(Node<CLASS_TYPE>::m_nodeLevel) << " (elem=" << iii << "/" << m_subNode.size() << ") ===None===");
|
TK_REG_EXP_DBG_MODE(" " << levelSpace(Node<CLASS_TYPE>::m_nodeLevel) << " (elem=" << iii << "/" << m_subNode.size() << ") ===None=== : " << prop);
|
||||||
// rewind the list:
|
// rewind the list:
|
||||||
bool findPartialNode = false;
|
bool findPartialNode = false;
|
||||||
for (int64_t jjj=_property.m_subProperty.size()-1; jjj>=0; --jjj) {
|
for (int64_t jjj=_property.m_subProperty.size()-1; jjj>=0; --jjj) {
|
||||||
@ -1048,7 +1061,7 @@ template<class CLASS_TYPE> class NodePTheseElem : public Node<CLASS_TYPE> {
|
|||||||
findPartialNode = true;
|
findPartialNode = true;
|
||||||
prop = _property.m_subProperty[jjj];
|
prop = _property.m_subProperty[jjj];
|
||||||
tmpCurrentPos = prop.getPositionStop();
|
tmpCurrentPos = prop.getPositionStop();
|
||||||
_property.m_subProperty.erase(_property.m_subProperty.begin()+iii-1, _property.m_subProperty.end());
|
_property.m_subProperty.erase(_property.m_subProperty.begin()+jjj, _property.m_subProperty.end());
|
||||||
iii = jjj;
|
iii = jjj;
|
||||||
TK_REG_EXP_DBG_MODE(" " << levelSpace(Node<CLASS_TYPE>::m_nodeLevel) << " (elem=?/" << m_subNode.size() << ") == rewind at " << iii << "");
|
TK_REG_EXP_DBG_MODE(" " << levelSpace(Node<CLASS_TYPE>::m_nodeLevel) << " (elem=?/" << m_subNode.size() << ") == rewind at " << iii << "");
|
||||||
break;
|
break;
|
||||||
@ -1175,10 +1188,42 @@ template<class CLASS_TYPE> class NodePThese : public Node<CLASS_TYPE> {
|
|||||||
_property.setStatus(parseStatusNone);
|
_property.setStatus(parseStatusNone);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
if (_property.getMultiplicity() >= Node<CLASS_TYPE>::m_multipleMax) {
|
bool haveSubPartial = false;
|
||||||
|
for (int64_t iii=_property.m_subProperty.size()-1; iii>=0; --iii) {
|
||||||
|
if (_property.m_subProperty[iii].getStatus() == parseStatusPartial) {
|
||||||
|
haveSubPartial = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if ( haveSubPartial == false
|
||||||
|
&& _property.getMultiplicity() >= Node<CLASS_TYPE>::m_multipleMax) {
|
||||||
_property.setStatus(parseStatusFull);
|
_property.setStatus(parseStatusFull);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
if (haveSubPartial == true) {
|
||||||
|
TK_CRITICAL(" TODO ...");
|
||||||
|
// TODO : Really hard element ==> the current node might register the previous tree before rejecting parse ...
|
||||||
|
/*
|
||||||
|
for (int64_t jjj=_property.m_subProperty.size()-1; jjj>=0; --jjj) {
|
||||||
|
if (_property.m_subProperty[jjj].getStatus() == parseStatusPartial) {
|
||||||
|
findPartialNode = true;
|
||||||
|
prop = _property.m_subProperty[jjj];
|
||||||
|
tmpCurrentPos = prop.getPositionStop();
|
||||||
|
_property.m_subProperty.erase(_property.m_subProperty.begin()+jjj, _property.m_subProperty.end());
|
||||||
|
iii = jjj;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
*/
|
||||||
|
} else {
|
||||||
|
if ( _property.getPositionStop() < 0
|
||||||
|
&& Node<CLASS_TYPE>::m_multipleMin == 0
|
||||||
|
&& _property.getMultiplicity() == 0) {
|
||||||
|
_property.setPositionStop(_property.getPositionStart());
|
||||||
|
_property.setStatus(parseStatusPartial);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}
|
||||||
_property.setStatus(parseStatusFull);
|
_property.setStatus(parseStatusFull);
|
||||||
bool tmpFind = true;
|
bool tmpFind = true;
|
||||||
while ( _property.getMultiplicity() <= Node<CLASS_TYPE>::m_multipleMax
|
while ( _property.getMultiplicity() <= Node<CLASS_TYPE>::m_multipleMax
|
||||||
@ -1628,36 +1673,35 @@ template<class CLASS_TYPE> class RegExp {
|
|||||||
}
|
}
|
||||||
regexp::FindProperty prop;
|
regexp::FindProperty prop;
|
||||||
prop.setPositionStart(_startPos);
|
prop.setPositionStart(_startPos);
|
||||||
m_exprRootNode.parse(_SearchIn, _startPos, maxlen, prop);
|
bool needOneMoreCycle = true;
|
||||||
if ( prop.getStatus() == regexp::parseStatusFull
|
while (needOneMoreCycle == true) {
|
||||||
|| prop.getStatus() == regexp::parseStatusPartial ) {
|
needOneMoreCycle = false;
|
||||||
findLen = prop.getFindLen();
|
m_exprRootNode.parse(_SearchIn, _startPos, maxlen, prop);
|
||||||
if ( _escapeChar != 0
|
if ( prop.getStatus() == regexp::parseStatusFull
|
||||||
&& _startPos>0) {
|
|| prop.getStatus() == regexp::parseStatusPartial ) {
|
||||||
if (_escapeChar == (char32_t)_SearchIn[_startPos-1]) {
|
findLen = prop.getFindLen();
|
||||||
//==> detected escape char ==> try find again ...
|
// Check end :
|
||||||
return false;
|
if (m_notEndWithChar == true) {
|
||||||
}
|
if (_startPos+findLen < (int64_t)_SearchIn.size() ) {
|
||||||
}
|
char32_t tmpVal = _SearchIn[_startPos+findLen];
|
||||||
// Check end :
|
if( ( tmpVal >= 'a'
|
||||||
if (m_notEndWithChar == true) {
|
&& tmpVal <= 'z' )
|
||||||
if (_startPos+findLen < (int64_t)_SearchIn.size() ) {
|
|| ( tmpVal >= 'A'
|
||||||
char32_t tmpVal = _SearchIn[_startPos+findLen];
|
&& tmpVal <= 'Z' )
|
||||||
if( ( tmpVal >= 'a'
|
|| ( tmpVal >= '0'
|
||||||
&& tmpVal <= 'z' )
|
&& tmpVal <= '9' )
|
||||||
|| ( tmpVal >= 'A'
|
|| ( tmpVal == '_' ) ) {
|
||||||
&& tmpVal <= 'Z' )
|
// go on the next char ...
|
||||||
|| ( tmpVal >= '0'
|
needOneMoreCycle = true;
|
||||||
&& tmpVal <= '9' )
|
}
|
||||||
|| ( tmpVal == '_' ) ) {
|
|
||||||
// go on the next char ...
|
|
||||||
return false;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
if (needOneMoreCycle == false) {
|
||||||
|
m_areaFind.start = _startPos;
|
||||||
|
m_areaFind.stop = _startPos + findLen;
|
||||||
|
return true;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
m_areaFind.start = _startPos;
|
|
||||||
m_areaFind.stop = _startPos + findLen;
|
|
||||||
return true;
|
|
||||||
}
|
}
|
||||||
return false;
|
return false;
|
||||||
};
|
};
|
||||||
|
Loading…
x
Reference in New Issue
Block a user