[DEV] add the ^ \w \d \l \s in the [...] element
This commit is contained in:
parent
bafe283402
commit
07470e11b9
@ -7,7 +7,8 @@
|
|||||||
*/
|
*/
|
||||||
#pragma once
|
#pragma once
|
||||||
|
|
||||||
#include <etk/os/FSNode.h>
|
#include <etk/os/FSNode.hpp>
|
||||||
|
#include <etk/debug.hpp>
|
||||||
|
|
||||||
// minimum gapSize when allocated
|
// minimum gapSize when allocated
|
||||||
#define GAP_SIZE_MIN (80)
|
#define GAP_SIZE_MIN (80)
|
||||||
|
@ -223,7 +223,7 @@ int64_t etk::regexp::getLenOfBracket(const std::vector<char32_t>& _data, int64_t
|
|||||||
pos++;
|
pos++;
|
||||||
// find size ...
|
// find size ...
|
||||||
while (pos < (int64_t)_data.size() ) {
|
while (pos < (int64_t)_data.size() ) {
|
||||||
if(_data[pos]==regexpOpcodeBracketOut) {
|
if(_data[pos] == regexpOpcodeBracketOut) {
|
||||||
// Find the end of the [...]
|
// Find the end of the [...]
|
||||||
// just return the size inside
|
// just return the size inside
|
||||||
int32_t sizeInside = pos - _startPos -1 ;
|
int32_t sizeInside = pos - _startPos -1 ;
|
||||||
@ -232,9 +232,29 @@ int64_t etk::regexp::getLenOfBracket(const std::vector<char32_t>& _data, int64_t
|
|||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
return sizeInside;
|
return sizeInside;
|
||||||
} else if( _data[pos] != regexpOpcodeTo
|
} else if ( _data[pos] == regexpOpcodeStartOfLine
|
||||||
&& _data[pos] > 0xFF ) {
|
|| _data[pos] == regexpOpcodeDigit
|
||||||
TK_ERROR("Error in the [...] not permited element at "<< pos << " '" << (char)_data[pos] << "'");
|
|| _data[pos] == regexpOpcodeLetter
|
||||||
|
|| _data[pos] == regexpOpcodeSpace
|
||||||
|
|| _data[pos] == regexpOpcodeWord
|
||||||
|
|| _data[pos] == regexpOpcodeTo) {
|
||||||
|
// nothing to do ... it is permited
|
||||||
|
} else if(_data[pos] > 0xFF ) {
|
||||||
|
std::string displayElement;
|
||||||
|
if (_data[pos] == regexpOpcodeStartOfLine) {
|
||||||
|
displayElement = "^";
|
||||||
|
} else if (_data[pos] == regexpOpcodeDigitNot) {
|
||||||
|
displayElement = "\\D";
|
||||||
|
} else if (_data[pos] == regexpOpcodeLetterNot) {
|
||||||
|
displayElement = "\\L";
|
||||||
|
} else if (_data[pos] == regexpOpcodeSpaceNot) {
|
||||||
|
displayElement = "\\S";
|
||||||
|
} else if (_data[pos] == regexpOpcodeWordNot) {
|
||||||
|
displayElement = "\\W";
|
||||||
|
} else {
|
||||||
|
displayElement = (char)_data[pos];
|
||||||
|
}
|
||||||
|
TK_ERROR("Error in the [...] not permited element at "<< pos << " '" << displayElement << "'");
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
pos++;
|
pos++;
|
||||||
|
@ -98,16 +98,15 @@ normal mode :
|
|||||||
\W NOT a "Word" character [^a-zA-Z0-9_]
|
\W NOT a "Word" character [^a-zA-Z0-9_]
|
||||||
\@ at the start or the end not in the parsing of element ==> check if \w is not present (other regExp will be <> ...)
|
\@ at the start or the end not in the parsing of element ==> check if \w is not present (other regExp will be <> ...)
|
||||||
\e end-of-file / end-of-data [\x00] ==> not counted
|
\e end-of-file / end-of-data [\x00] ==> not counted
|
||||||
[anjdi] or [a-gt-j] range
|
[anjdi] or [a-gt-j] range: it supports the \d \w \s \l elements. If the first element is a '^', the selection is inverted
|
||||||
. dot [^\x00]
|
. dot [^\x00]
|
||||||
$ End / Start of line ==> this will be an additional element, like \@
|
$ End / Start of line ==> this will be an additional element, like \@
|
||||||
@ Previous
|
@ Previous
|
||||||
==> TODO :
|
==> TODO :
|
||||||
^in the [] invertion of the range element
|
|
||||||
Start of line
|
Start of line
|
||||||
force regexp to be the shortest.
|
force regexp to be the shortest.
|
||||||
|
|
||||||
multiplicity :
|
multiplicity:
|
||||||
* ==> {0, 2147483647} (try to have the minimum size)
|
* ==> {0, 2147483647} (try to have the minimum size)
|
||||||
? ==> {0, 1}
|
? ==> {0, 1}
|
||||||
+ ==> {1, 2147483647} (try to have the minimum size)
|
+ ==> {1, 2147483647} (try to have the minimum size)
|
||||||
@ -202,7 +201,7 @@ class FindProperty {
|
|||||||
void setPositionStop(int64_t _newPos) {
|
void setPositionStop(int64_t _newPos) {
|
||||||
m_positionStop = _newPos;
|
m_positionStop = _newPos;
|
||||||
if (m_positionStop < m_positionStart) {
|
if (m_positionStop < m_positionStart) {
|
||||||
TK_CRITICAL("set volontary a stop position before end : " << this);
|
TK_CRITICAL("set volontary a stop position before end : " << this << " start=" << m_positionStart << " stop=" << m_positionStop);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
uint32_t getMultiplicity() const {
|
uint32_t getMultiplicity() const {
|
||||||
@ -638,9 +637,32 @@ template<class CLASS_TYPE> class NodeBracket : public NodeRangeValue<CLASS_TYPE>
|
|||||||
|
|
||||||
char32_t lastElement = 0;
|
char32_t lastElement = 0;
|
||||||
bool multipleElement = false;
|
bool multipleElement = false;
|
||||||
//
|
// Parse the elements:
|
||||||
for (int32_t kkk=0; kkk<(int64_t)Node<CLASS_TYPE>::m_regExpData.size(); kkk++) {
|
for (int32_t kkk=0; kkk<(int64_t)Node<CLASS_TYPE>::m_regExpData.size(); kkk++) {
|
||||||
if ( Node<CLASS_TYPE>::m_regExpData[kkk] == regexpOpcodeTo
|
if ( kkk == 0
|
||||||
|
&& Node<CLASS_TYPE>::m_regExpData[kkk] == regexpOpcodeStartOfLine) {
|
||||||
|
// Check if the user request an invertion check:
|
||||||
|
NodeRangeValue<CLASS_TYPE>::setInvertion(true);
|
||||||
|
} else if (Node<CLASS_TYPE>::m_regExpData[kkk] == regexpOpcodeStartOfLine) {
|
||||||
|
TK_ERROR("Unsupported Element '^' inside the [...] not at the first element");
|
||||||
|
return 0;
|
||||||
|
} else if (Node<CLASS_TYPE>::m_regExpData[kkk] == regexpOpcodeDigit) {
|
||||||
|
NodeRangeValue<CLASS_TYPE>::addRange('0', '9');
|
||||||
|
} else if (Node<CLASS_TYPE>::m_regExpData[kkk] == regexpOpcodeLetter) {
|
||||||
|
NodeRangeValue<CLASS_TYPE>::addRange('a', 'z');
|
||||||
|
NodeRangeValue<CLASS_TYPE>::addRange('A', 'Z');
|
||||||
|
} else if (Node<CLASS_TYPE>::m_regExpData[kkk] == regexpOpcodeSpace) {
|
||||||
|
NodeRangeValue<CLASS_TYPE>::addValue(' ');
|
||||||
|
NodeRangeValue<CLASS_TYPE>::addValue('\t');
|
||||||
|
NodeRangeValue<CLASS_TYPE>::addValue('\n');
|
||||||
|
NodeRangeValue<CLASS_TYPE>::addValue('\r');
|
||||||
|
NodeRangeValue<CLASS_TYPE>::addValue('\f');
|
||||||
|
NodeRangeValue<CLASS_TYPE>::addValue('\v');
|
||||||
|
} else if (Node<CLASS_TYPE>::m_regExpData[kkk] == regexpOpcodeWord) {
|
||||||
|
NodeRangeValue<CLASS_TYPE>::addRange('a', 'z');
|
||||||
|
NodeRangeValue<CLASS_TYPE>::addRange('A', 'Z');
|
||||||
|
NodeRangeValue<CLASS_TYPE>::addRange('0', '9');
|
||||||
|
} else if ( Node<CLASS_TYPE>::m_regExpData[kkk] == regexpOpcodeTo
|
||||||
&& multipleElement == true) {
|
&& multipleElement == true) {
|
||||||
TK_ERROR("Can not have 2 consecutive - in [...]");
|
TK_ERROR("Can not have 2 consecutive - in [...]");
|
||||||
return 0;
|
return 0;
|
||||||
@ -938,7 +960,6 @@ template<class CLASS_TYPE> class NodePTheseElem : public Node<CLASS_TYPE> {
|
|||||||
m_subNode.push_back(tmpNode);
|
m_subNode.push_back(tmpNode);
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
|
||||||
default: {
|
default: {
|
||||||
elementSize = getLenOfNormal(Node<CLASS_TYPE>::m_regExpData, pos);
|
elementSize = getLenOfNormal(Node<CLASS_TYPE>::m_regExpData, pos);
|
||||||
for (int64_t kkk=pos; kkk<pos+elementSize; kkk++) {
|
for (int64_t kkk=pos; kkk<pos+elementSize; kkk++) {
|
||||||
@ -1303,7 +1324,7 @@ template<class CLASS_TYPE> class NodePThese : public Node<CLASS_TYPE> {
|
|||||||
* \w "Word" character [a-zA-Z0-9_]
|
* \w "Word" character [a-zA-Z0-9_]
|
||||||
* \W NOT a "Word" character [^a-zA-Z0-9_]
|
* \W NOT a "Word" character [^a-zA-Z0-9_]
|
||||||
* \@ at the start or the end not in the parsing of element ==> check if \w is not present (other regExp will be <> ...)
|
* \@ at the start or the end not in the parsing of element ==> check if \w is not present (other regExp will be <> ...)
|
||||||
* [anjdi] or [a-gt-j] range
|
* [anjdi] or [a-gt-j] range. It supports the \d \w \s \l elements. If the first element is a '^', the selection is inverted
|
||||||
* . dot [^\x00-\x08\x0A-\x1F\x7F]
|
* . dot [^\x00-\x08\x0A-\x1F\x7F]
|
||||||
* ==> TODO :
|
* ==> TODO :
|
||||||
* $ End / Start of line ==> this will be an additional element, like \@
|
* $ End / Start of line ==> this will be an additional element, like \@
|
||||||
@ -1382,6 +1403,7 @@ template<class CLASS_TYPE> class RegExp {
|
|||||||
* @brief Set a new regular expression matching
|
* @brief Set a new regular expression matching
|
||||||
* @param[in] _exp the new expression to search
|
* @param[in] _exp the new expression to search
|
||||||
*/
|
*/
|
||||||
|
// TODO : Add an error ...
|
||||||
void compile(const std::string &_exp) {
|
void compile(const std::string &_exp) {
|
||||||
if (_exp.size() != 0) {
|
if (_exp.size() != 0) {
|
||||||
TK_REG_DEBUG("normal string parse : '" << _exp << "'");
|
TK_REG_DEBUG("normal string parse : '" << _exp << "'");
|
||||||
@ -1491,22 +1513,20 @@ template<class CLASS_TYPE> class RegExp {
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
// need to check if all () [] and {} is well set ...
|
// need to check if all () [] and {} is well set ...
|
||||||
if (false == checkGoodPosition(tmpExp) ) {
|
if (checkGoodPosition(tmpExp) == false) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
//TK_REG_DEBUG("Main element :" << createString(tmpExp) );
|
//TK_REG_DEBUG("Main element :" << createString(tmpExp) );
|
||||||
if ( tmpExp.size()>0
|
if ( tmpExp.size() > 0
|
||||||
&& tmpExp[0] == regexpOpcodeNoChar)
|
&& tmpExp[0] == regexpOpcodeNoChar) {
|
||||||
{
|
|
||||||
//TK_DEBUG("=> must not begin with char");
|
//TK_DEBUG("=> must not begin with char");
|
||||||
m_notBeginWithChar = true;
|
m_notBeginWithChar = true;
|
||||||
// remove element
|
// remove element
|
||||||
tmpExp.erase(tmpExp.begin());
|
tmpExp.erase(tmpExp.begin());
|
||||||
}
|
}
|
||||||
if ( tmpExp.size()>0
|
if ( tmpExp.size() > 0
|
||||||
&& tmpExp[tmpExp.size()-1] == regexpOpcodeNoChar)
|
&& tmpExp[tmpExp.size()-1] == regexpOpcodeNoChar) {
|
||||||
{
|
|
||||||
//TK_DEBUG("=> must not end with char");
|
//TK_DEBUG("=> must not end with char");
|
||||||
m_notEndWithChar = true;
|
m_notEndWithChar = true;
|
||||||
// remove element
|
// remove element
|
||||||
@ -1573,7 +1593,7 @@ template<class CLASS_TYPE> class RegExp {
|
|||||||
int64_t maxlen = _endPos-iii;
|
int64_t maxlen = _endPos-iii;
|
||||||
TK_REG_DEBUG("----------------------------------------------");
|
TK_REG_DEBUG("----------------------------------------------");
|
||||||
TK_REG_DEBUG("parse element : " << iii << " : '" << _SearchIn[iii] << "'");
|
TK_REG_DEBUG("parse element : " << iii << " : '" << _SearchIn[iii] << "'");
|
||||||
if (true == m_notBeginWithChar) {
|
if (m_notBeginWithChar == true) {
|
||||||
if (iii>0) {
|
if (iii>0) {
|
||||||
char32_t tmpVal = _SearchIn[iii-1];
|
char32_t tmpVal = _SearchIn[iii-1];
|
||||||
if( ( tmpVal >= 'a'
|
if( ( tmpVal >= 'a'
|
||||||
@ -1780,7 +1800,8 @@ template<class CLASS_TYPE> class RegExp {
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
//TK_DEBUG(" ==> Find ELEMENT : ([{");
|
//TK_DEBUG(" ==> Find ELEMENT : ([{");
|
||||||
// case dependent :
|
// case dependent:
|
||||||
|
int32_t localOffset = 0;
|
||||||
if ( curentCode == regexpOpcodeBracketIn
|
if ( curentCode == regexpOpcodeBracketIn
|
||||||
|| curentCode == regexpOpcodeBracetIn) {
|
|| curentCode == regexpOpcodeBracetIn) {
|
||||||
while(_pos<(int64_t)_tmpExp.size()) {
|
while(_pos<(int64_t)_tmpExp.size()) {
|
||||||
@ -1790,7 +1811,7 @@ template<class CLASS_TYPE> class RegExp {
|
|||||||
return true;
|
return true;
|
||||||
} else {
|
} else {
|
||||||
// otherwise, we check the error in the element ...
|
// otherwise, we check the error in the element ...
|
||||||
char *find = NULL;
|
char *find = nullptr;
|
||||||
switch (_tmpExp[_pos]) {
|
switch (_tmpExp[_pos]) {
|
||||||
case regexpOpcodePTheseIn: find = (char*)"("; break;
|
case regexpOpcodePTheseIn: find = (char*)"("; break;
|
||||||
case regexpOpcodeBracketIn: find = (char*)"["; break;
|
case regexpOpcodeBracketIn: find = (char*)"["; break;
|
||||||
@ -1803,26 +1824,37 @@ template<class CLASS_TYPE> class RegExp {
|
|||||||
case regexpOpcodeQuestion: find = (char*)"?"; break;
|
case regexpOpcodeQuestion: find = (char*)"?"; break;
|
||||||
case regexpOpcodePlus: find = (char*)"+"; break;
|
case regexpOpcodePlus: find = (char*)"+"; break;
|
||||||
case regexpOpcodePipe: find = (char*)"|"; break;
|
case regexpOpcodePipe: find = (char*)"|"; break;
|
||||||
case regexpOpcodeStartOfLine: find = (char*)"^"; break;
|
|
||||||
case regexpOpcodeEndOfLine: find = (char*)"$"; break;
|
case regexpOpcodeEndOfLine: find = (char*)"$"; break;
|
||||||
case regexpOpcodeDigit: find = (char*)"\\d"; break;
|
|
||||||
case regexpOpcodeDigitNot: find = (char*)"\\D"; break;
|
case regexpOpcodeDigitNot: find = (char*)"\\D"; break;
|
||||||
case regexpOpcodeLetter: find = (char*)"\\l"; break;
|
|
||||||
case regexpOpcodeLetterNot: find = (char*)"\\L"; break;
|
case regexpOpcodeLetterNot: find = (char*)"\\L"; break;
|
||||||
case regexpOpcodeSpace: find = (char*)"\\s"; break;
|
|
||||||
case regexpOpcodeSpaceNot: find = (char*)"\\S"; break;
|
case regexpOpcodeSpaceNot: find = (char*)"\\S"; break;
|
||||||
case regexpOpcodeWord: find = (char*)"\\w"; break;
|
|
||||||
case regexpOpcodeWordNot: find = (char*)"\\W"; break;
|
case regexpOpcodeWordNot: find = (char*)"\\W"; break;
|
||||||
case regexpOpcodeNoChar: find = (char*)"\\@"; break;
|
case regexpOpcodeNoChar: find = (char*)"\\@"; break;
|
||||||
|
case regexpOpcodeStartOfLine:
|
||||||
|
if ( endCode == regexpOpcodeBracetOut
|
||||||
|
|| localOffset != 0) {
|
||||||
|
find = (char*)"^"; break;
|
||||||
|
}
|
||||||
default: break;
|
default: break;
|
||||||
}
|
}
|
||||||
if (NULL != find) {
|
// Specific element forbiden for (...) but not for [...]
|
||||||
|
if (endCode == regexpOpcodeBracetOut) {
|
||||||
|
switch (_tmpExp[_pos]) {
|
||||||
|
case regexpOpcodeDigit: find = (char*)"\\d"; break;
|
||||||
|
case regexpOpcodeLetter: find = (char*)"\\l"; break;
|
||||||
|
case regexpOpcodeSpace: find = (char*)"\\s"; break;
|
||||||
|
case regexpOpcodeWord: find = (char*)"\\w"; break;
|
||||||
|
default: break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (find != nullptr) {
|
||||||
(void)input;
|
(void)input;
|
||||||
TK_ERROR("can not have : '" << find << "' inside " << input << " element");
|
TK_ERROR("can not have : '" << find << "' inside " << input << " element");
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
_pos++;
|
_pos++;
|
||||||
|
localOffset++;
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
while(_pos< (int64_t)_tmpExp.size()) {
|
while(_pos< (int64_t)_tmpExp.size()) {
|
||||||
@ -1839,7 +1871,7 @@ template<class CLASS_TYPE> class RegExp {
|
|||||||
if( _tmpExp[_pos] == regexpOpcodePTheseIn
|
if( _tmpExp[_pos] == regexpOpcodePTheseIn
|
||||||
|| _tmpExp[_pos] == regexpOpcodeBracketIn
|
|| _tmpExp[_pos] == regexpOpcodeBracketIn
|
||||||
|| _tmpExp[_pos] == regexpOpcodeBracetIn ) {
|
|| _tmpExp[_pos] == regexpOpcodeBracetIn ) {
|
||||||
if (false==checkGoodPosition(_tmpExp, _pos) ) {
|
if (checkGoodPosition(_tmpExp, _pos) == false ) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user