[DEV] add the ^ \w \d \l \s in the [...] element

This commit is contained in:
Edouard DUPIN 2017-03-14 21:07:15 +01:00
parent bafe283402
commit 07470e11b9
3 changed files with 83 additions and 30 deletions

View File

@ -7,7 +7,8 @@
*/ */
#pragma once #pragma once
#include <etk/os/FSNode.h> #include <etk/os/FSNode.hpp>
#include <etk/debug.hpp>
// minimum gapSize when allocated // minimum gapSize when allocated
#define GAP_SIZE_MIN (80) #define GAP_SIZE_MIN (80)

View File

@ -223,7 +223,7 @@ int64_t etk::regexp::getLenOfBracket(const std::vector<char32_t>& _data, int64_t
pos++; pos++;
// find size ... // find size ...
while (pos < (int64_t)_data.size() ) { while (pos < (int64_t)_data.size() ) {
if(_data[pos]==regexpOpcodeBracketOut) { if(_data[pos] == regexpOpcodeBracketOut) {
// Find the end of the [...] // Find the end of the [...]
// just return the size inside // just return the size inside
int32_t sizeInside = pos - _startPos -1 ; int32_t sizeInside = pos - _startPos -1 ;
@ -232,9 +232,29 @@ int64_t etk::regexp::getLenOfBracket(const std::vector<char32_t>& _data, int64_t
return 0; return 0;
} }
return sizeInside; return sizeInside;
} else if( _data[pos] != regexpOpcodeTo } else if ( _data[pos] == regexpOpcodeStartOfLine
&& _data[pos] > 0xFF ) { || _data[pos] == regexpOpcodeDigit
TK_ERROR("Error in the [...] not permited element at "<< pos << " '" << (char)_data[pos] << "'"); || _data[pos] == regexpOpcodeLetter
|| _data[pos] == regexpOpcodeSpace
|| _data[pos] == regexpOpcodeWord
|| _data[pos] == regexpOpcodeTo) {
// nothing to do ... it is permitted
} else if(_data[pos] > 0xFF ) {
std::string displayElement;
if (_data[pos] == regexpOpcodeStartOfLine) {
displayElement = "^";
} else if (_data[pos] == regexpOpcodeDigitNot) {
displayElement = "\\D";
} else if (_data[pos] == regexpOpcodeLetterNot) {
displayElement = "\\L";
} else if (_data[pos] == regexpOpcodeSpaceNot) {
displayElement = "\\S";
} else if (_data[pos] == regexpOpcodeWordNot) {
displayElement = "\\W";
} else {
displayElement = (char)_data[pos];
}
TK_ERROR("Error in the [...] not permited element at "<< pos << " '" << displayElement << "'");
return 0; return 0;
} }
pos++; pos++;

View File

@ -98,16 +98,15 @@ normal mode :
\W NOT a "Word" character [^a-zA-Z0-9_] \W NOT a "Word" character [^a-zA-Z0-9_]
\@ at the start or the end not in the parsing of element ==> check if \w is not present (other regExp will be <> ...) \@ at the start or the end not in the parsing of element ==> check if \w is not present (other regExp will be <> ...)
\e end-of-file / end-of-data [\x00] ==> not counted \e end-of-file / end-of-data [\x00] ==> not counted
[anjdi] or [a-gt-j] range [anjdi] or [a-gt-j] range: it supports the \d \w \s \l elements. A '^' as the first element inverts the selection
. dot [^\x00] . dot [^\x00]
$ End / Start of line of line ==> ce sera un truc suplémentaire comme le \@ $ End / Start of line of line ==> ce sera un truc suplémentaire comme le \@
@ Previous @ Previous
==> TODO : ==> TODO :
^in the [] invertion of the range element
Start of line Start of line
force regexp to be the shortest. force regexp to be the shortest.
multiplicity : multiplicity:
* ==> {0, 2147483647} (try to have the minimum size) * ==> {0, 2147483647} (try to have the minimum size)
? ==> {0, 1} ? ==> {0, 1}
+ ==> {1, 2147483647} (try to have the minimum size) + ==> {1, 2147483647} (try to have the minimum size)
@ -202,7 +201,7 @@ class FindProperty {
void setPositionStop(int64_t _newPos) { void setPositionStop(int64_t _newPos) {
m_positionStop = _newPos; m_positionStop = _newPos;
if (m_positionStop < m_positionStart) { if (m_positionStop < m_positionStart) {
TK_CRITICAL("set volontary a stop position before end : " << this); TK_CRITICAL("set volontary a stop position before end : " << this << " start=" << m_positionStart << " stop=" << m_positionStop);
} }
} }
uint32_t getMultiplicity() const { uint32_t getMultiplicity() const {
@ -638,9 +637,32 @@ template<class CLASS_TYPE> class NodeBracket : public NodeRangeValue<CLASS_TYPE>
char32_t lastElement = 0; char32_t lastElement = 0;
bool multipleElement = false; bool multipleElement = false;
// // Parse the elements:
for (int32_t kkk=0; kkk<(int64_t)Node<CLASS_TYPE>::m_regExpData.size(); kkk++) { for (int32_t kkk=0; kkk<(int64_t)Node<CLASS_TYPE>::m_regExpData.size(); kkk++) {
if ( Node<CLASS_TYPE>::m_regExpData[kkk] == regexpOpcodeTo if ( kkk == 0
&& Node<CLASS_TYPE>::m_regExpData[kkk] == regexpOpcodeStartOfLine) {
// Check whether the user requests an inverted match:
NodeRangeValue<CLASS_TYPE>::setInvertion(true);
} else if (Node<CLASS_TYPE>::m_regExpData[kkk] == regexpOpcodeStartOfLine) {
TK_ERROR("Unsupported Element '^' inside the [...] not at the first element");
return 0;
} else if (Node<CLASS_TYPE>::m_regExpData[kkk] == regexpOpcodeDigit) {
NodeRangeValue<CLASS_TYPE>::addRange('0', '9');
} else if (Node<CLASS_TYPE>::m_regExpData[kkk] == regexpOpcodeLetter) {
NodeRangeValue<CLASS_TYPE>::addRange('a', 'z');
NodeRangeValue<CLASS_TYPE>::addRange('A', 'Z');
} else if (Node<CLASS_TYPE>::m_regExpData[kkk] == regexpOpcodeSpace) {
NodeRangeValue<CLASS_TYPE>::addValue(' ');
NodeRangeValue<CLASS_TYPE>::addValue('\t');
NodeRangeValue<CLASS_TYPE>::addValue('\n');
NodeRangeValue<CLASS_TYPE>::addValue('\r');
NodeRangeValue<CLASS_TYPE>::addValue('\f');
NodeRangeValue<CLASS_TYPE>::addValue('\v');
} else if (Node<CLASS_TYPE>::m_regExpData[kkk] == regexpOpcodeWord) {
NodeRangeValue<CLASS_TYPE>::addRange('a', 'z');
NodeRangeValue<CLASS_TYPE>::addRange('A', 'Z');
NodeRangeValue<CLASS_TYPE>::addRange('0', '9');
} else if ( Node<CLASS_TYPE>::m_regExpData[kkk] == regexpOpcodeTo
&& multipleElement == true) { && multipleElement == true) {
TK_ERROR("Can not have 2 consecutive - in [...]"); TK_ERROR("Can not have 2 consecutive - in [...]");
return 0; return 0;
@ -938,7 +960,6 @@ template<class CLASS_TYPE> class NodePTheseElem : public Node<CLASS_TYPE> {
m_subNode.push_back(tmpNode); m_subNode.push_back(tmpNode);
} }
break; break;
default: { default: {
elementSize = getLenOfNormal(Node<CLASS_TYPE>::m_regExpData, pos); elementSize = getLenOfNormal(Node<CLASS_TYPE>::m_regExpData, pos);
for (int64_t kkk=pos; kkk<pos+elementSize; kkk++) { for (int64_t kkk=pos; kkk<pos+elementSize; kkk++) {
@ -1303,7 +1324,7 @@ template<class CLASS_TYPE> class NodePThese : public Node<CLASS_TYPE> {
* \w "Word" character [a-zA-Z0-9_] * \w "Word" character [a-zA-Z0-9_]
* \W NOT a "Word" character [^a-zA-Z0-9_] * \W NOT a "Word" character [^a-zA-Z0-9_]
* \@ at the start or the end not in the parsing of element ==> check if \w is not present (other regExp will be <> ...) * \@ at the start or the end not in the parsing of element ==> check if \w is not present (other regExp will be <> ...)
* [anjdi] or [a-gt-j] range * [anjdi] or [a-gt-j] range. It supports the \d \w \s \l elements. A '^' as the first element inverts the selection
* . dot [^\x00-\x08\x0A-\x1F\x7F] * . dot [^\x00-\x08\x0A-\x1F\x7F]
* ==> TODO : * ==> TODO :
* $ End / Start of line of line ==> ce sera un truc suplé comme le \@ * $ End / Start of line of line ==> ce sera un truc suplé comme le \@
@ -1382,6 +1403,7 @@ template<class CLASS_TYPE> class RegExp {
* @brief Set a new regular expression matching * @brief Set a new regular expression matching
* @param[in] _exp the new expression to search * @param[in] _exp the new expression to search
*/ */
// TODO : Add an error ...
void compile(const std::string &_exp) { void compile(const std::string &_exp) {
if (_exp.size() != 0) { if (_exp.size() != 0) {
TK_REG_DEBUG("normal string parse : '" << _exp << "'"); TK_REG_DEBUG("normal string parse : '" << _exp << "'");
@ -1491,22 +1513,20 @@ template<class CLASS_TYPE> class RegExp {
return; return;
} }
// need to check if all () [] and {} is well set ... // need to check if all () [] and {} is well set ...
if (false == checkGoodPosition(tmpExp) ) { if (checkGoodPosition(tmpExp) == false) {
return; return;
} }
//TK_REG_DEBUG("Main element :" << createString(tmpExp) ); //TK_REG_DEBUG("Main element :" << createString(tmpExp) );
if ( tmpExp.size()>0 if ( tmpExp.size() > 0
&& tmpExp[0] == regexpOpcodeNoChar) && tmpExp[0] == regexpOpcodeNoChar) {
{
//TK_DEBUG("=> must not begin with char"); //TK_DEBUG("=> must not begin with char");
m_notBeginWithChar = true; m_notBeginWithChar = true;
// remove element // remove element
tmpExp.erase(tmpExp.begin()); tmpExp.erase(tmpExp.begin());
} }
if ( tmpExp.size()>0 if ( tmpExp.size() > 0
&& tmpExp[tmpExp.size()-1] == regexpOpcodeNoChar) && tmpExp[tmpExp.size()-1] == regexpOpcodeNoChar) {
{
//TK_DEBUG("=> must not end with char"); //TK_DEBUG("=> must not end with char");
m_notEndWithChar = true; m_notEndWithChar = true;
// remove element // remove element
@ -1573,7 +1593,7 @@ template<class CLASS_TYPE> class RegExp {
int64_t maxlen = _endPos-iii; int64_t maxlen = _endPos-iii;
TK_REG_DEBUG("----------------------------------------------"); TK_REG_DEBUG("----------------------------------------------");
TK_REG_DEBUG("parse element : " << iii << " : '" << _SearchIn[iii] << "'"); TK_REG_DEBUG("parse element : " << iii << " : '" << _SearchIn[iii] << "'");
if (true == m_notBeginWithChar) { if (m_notBeginWithChar == true) {
if (iii>0) { if (iii>0) {
char32_t tmpVal = _SearchIn[iii-1]; char32_t tmpVal = _SearchIn[iii-1];
if( ( tmpVal >= 'a' if( ( tmpVal >= 'a'
@ -1780,7 +1800,8 @@ template<class CLASS_TYPE> class RegExp {
return false; return false;
} }
//TK_DEBUG(" ==> Find ELEMENT : ([{"); //TK_DEBUG(" ==> Find ELEMENT : ([{");
// case dependent : // case dependent:
int32_t localOffset = 0;
if ( curentCode == regexpOpcodeBracketIn if ( curentCode == regexpOpcodeBracketIn
|| curentCode == regexpOpcodeBracetIn) { || curentCode == regexpOpcodeBracetIn) {
while(_pos<(int64_t)_tmpExp.size()) { while(_pos<(int64_t)_tmpExp.size()) {
@ -1790,7 +1811,7 @@ template<class CLASS_TYPE> class RegExp {
return true; return true;
} else { } else {
// otherwise, we check the error in the element ... // otherwise, we check the error in the element ...
char *find = NULL; char *find = nullptr;
switch (_tmpExp[_pos]) { switch (_tmpExp[_pos]) {
case regexpOpcodePTheseIn: find = (char*)"("; break; case regexpOpcodePTheseIn: find = (char*)"("; break;
case regexpOpcodeBracketIn: find = (char*)"["; break; case regexpOpcodeBracketIn: find = (char*)"["; break;
@ -1803,26 +1824,37 @@ template<class CLASS_TYPE> class RegExp {
case regexpOpcodeQuestion: find = (char*)"?"; break; case regexpOpcodeQuestion: find = (char*)"?"; break;
case regexpOpcodePlus: find = (char*)"+"; break; case regexpOpcodePlus: find = (char*)"+"; break;
case regexpOpcodePipe: find = (char*)"|"; break; case regexpOpcodePipe: find = (char*)"|"; break;
case regexpOpcodeStartOfLine: find = (char*)"^"; break;
case regexpOpcodeEndOfLine: find = (char*)"$"; break; case regexpOpcodeEndOfLine: find = (char*)"$"; break;
case regexpOpcodeDigit: find = (char*)"\\d"; break;
case regexpOpcodeDigitNot: find = (char*)"\\D"; break; case regexpOpcodeDigitNot: find = (char*)"\\D"; break;
case regexpOpcodeLetter: find = (char*)"\\l"; break;
case regexpOpcodeLetterNot: find = (char*)"\\L"; break; case regexpOpcodeLetterNot: find = (char*)"\\L"; break;
case regexpOpcodeSpace: find = (char*)"\\s"; break;
case regexpOpcodeSpaceNot: find = (char*)"\\S"; break; case regexpOpcodeSpaceNot: find = (char*)"\\S"; break;
case regexpOpcodeWord: find = (char*)"\\w"; break;
case regexpOpcodeWordNot: find = (char*)"\\W"; break; case regexpOpcodeWordNot: find = (char*)"\\W"; break;
case regexpOpcodeNoChar: find = (char*)"\\@"; break; case regexpOpcodeNoChar: find = (char*)"\\@"; break;
case regexpOpcodeStartOfLine:
if ( endCode == regexpOpcodeBracetOut
|| localOffset != 0) {
find = (char*)"^"; break;
}
default: break; default: break;
} }
if (NULL != find) { // Specific element forbiden for (...) but not for [...]
if (endCode == regexpOpcodeBracetOut) {
switch (_tmpExp[_pos]) {
case regexpOpcodeDigit: find = (char*)"\\d"; break;
case regexpOpcodeLetter: find = (char*)"\\l"; break;
case regexpOpcodeSpace: find = (char*)"\\s"; break;
case regexpOpcodeWord: find = (char*)"\\w"; break;
default: break;
}
}
if (find != nullptr) {
(void)input; (void)input;
TK_ERROR("can not have : '" << find << "' inside " << input << " element"); TK_ERROR("can not have : '" << find << "' inside " << input << " element");
return false; return false;
} }
} }
_pos++; _pos++;
localOffset++;
} }
} else { } else {
while(_pos< (int64_t)_tmpExp.size()) { while(_pos< (int64_t)_tmpExp.size()) {
@ -1839,7 +1871,7 @@ template<class CLASS_TYPE> class RegExp {
if( _tmpExp[_pos] == regexpOpcodePTheseIn if( _tmpExp[_pos] == regexpOpcodePTheseIn
|| _tmpExp[_pos] == regexpOpcodeBracketIn || _tmpExp[_pos] == regexpOpcodeBracketIn
|| _tmpExp[_pos] == regexpOpcodeBracetIn ) { || _tmpExp[_pos] == regexpOpcodeBracetIn ) {
if (false==checkGoodPosition(_tmpExp, _pos) ) { if (checkGoodPosition(_tmpExp, _pos) == false ) {
return false; return false;
} }
} }