From 07470e11b9fd002a7489b9fe019670eb8a9eccfe Mon Sep 17 00:00:00 2001 From: Edouard DUPIN Date: Tue, 14 Mar 2017 21:07:15 +0100 Subject: [PATCH] [DEV] add the ^ \w \d \l \s in the [...] element --- etk/Buffer.hpp | 3 +- etk/RegExp.cpp | 28 ++++++++++++++--- etk/RegExp.hpp | 82 +++++++++++++++++++++++++++++++++++--------------- 3 files changed, 83 insertions(+), 30 deletions(-) diff --git a/etk/Buffer.hpp b/etk/Buffer.hpp index d3ab671..a9f8964 100644 --- a/etk/Buffer.hpp +++ b/etk/Buffer.hpp @@ -7,7 +7,8 @@ */ #pragma once -#include +#include +#include // minimum gapSize when allocated #define GAP_SIZE_MIN (80) diff --git a/etk/RegExp.cpp b/etk/RegExp.cpp index f32f224..37c8865 100644 --- a/etk/RegExp.cpp +++ b/etk/RegExp.cpp @@ -223,7 +223,7 @@ int64_t etk::regexp::getLenOfBracket(const std::vector& _data, int64_t pos++; // find size ... while (pos < (int64_t)_data.size() ) { - if(_data[pos]==regexpOpcodeBracketOut) { + if(_data[pos] == regexpOpcodeBracketOut) { // Find the end of the [...] // just return the size inside int32_t sizeInside = pos - _startPos -1 ; @@ -232,9 +232,29 @@ int64_t etk::regexp::getLenOfBracket(const std::vector& _data, int64_t return 0; } return sizeInside; - } else if( _data[pos] != regexpOpcodeTo - && _data[pos] > 0xFF ) { - TK_ERROR("Error in the [...] not permited element at "<< pos << " '" << (char)_data[pos] << "'"); + } else if ( _data[pos] == regexpOpcodeStartOfLine + || _data[pos] == regexpOpcodeDigit + || _data[pos] == regexpOpcodeLetter + || _data[pos] == regexpOpcodeSpace + || _data[pos] == regexpOpcodeWord + || _data[pos] == regexpOpcodeTo) { + // nothing to do ... it is permited + } else if(_data[pos] > 0xFF ) { + std::string displayElement; + if (_data[pos] == regexpOpcodeStartOfLine) { + displayElement = "^"; + } else if (_data[pos] == regexpOpcodeDigitNot) { + displayElement = "\\D"; + } else if (_data[pos] == regexpOpcodeLetterNot) { + displayElement = "\\L"; + } else if (_data[pos] == regexpOpcodeSpaceNot) { + displayElement = "\\S"; + } else if (_data[pos] == regexpOpcodeWordNot) { + displayElement = "\\W"; + } else { + displayElement = (char)_data[pos]; + } + TK_ERROR("Error in the [...] not permited element at "<< pos << " '" << displayElement << "'"); return 0; } pos++; diff --git a/etk/RegExp.hpp b/etk/RegExp.hpp index 9a04eca..3dc292c 100644 --- a/etk/RegExp.hpp +++ b/etk/RegExp.hpp @@ -98,16 +98,15 @@ normal mode : \W NOT a "Word" character [^a-zA-Z0-9_] \@ at the start or the end not in the parsing of element ==> check if \w is not present (other regExp will be <> ...) \e end-of-file / end-of-data [\x00] ==> not counted - [anjdi] or [a-gt-j] range + [anjdi] or [a-gt-j] range: It support the \d \w \s \l elements. If you add at the first element a '^' it will invert the value selected . dot [^\x00] $ End / Start of line of line ==> ce sera un truc suplémentaire comme le \@ @ Previous ==> TODO : - ^in the [] invertion of the range element Sart of line force regexp to be the shortest. -multiplicity : +multiplicity: * ==> {0, 2147483647} (try to have the minimum size) ? ==> {0, 1} + ==> {1, 2147483647} (try to have the minimum size) @@ -202,7 +201,7 @@ class FindProperty { void setPositionStop(int64_t _newPos) { m_positionStop = _newPos; if (m_positionStop < m_positionStart) { - TK_CRITICAL("set volontary a stop position before end : " << this); + TK_CRITICAL("set volontary a stop position before end : " << this << " start=" << m_positionStart << " stop=" << m_positionStop); } } uint32_t getMultiplicity() const { @@ -638,9 +637,32 @@ template class NodeBracket : public NodeRangeValue char32_t lastElement = 0; bool multipleElement = false; - // + // Parse the elements: for (int32_t kkk=0; kkk<(int64_t)Node::m_regExpData.size(); kkk++) { - if ( Node::m_regExpData[kkk] == regexpOpcodeTo + if ( kkk == 0 + && Node::m_regExpData[kkk] == regexpOpcodeStartOfLine) { + // Check if the user request an invertion check: + NodeRangeValue::setInvertion(true); + } else if (Node::m_regExpData[kkk] == regexpOpcodeStartOfLine) { + TK_ERROR("Unsupported Element '^' inside the [...] not at the first element"); + return 0; + } else if (Node::m_regExpData[kkk] == regexpOpcodeDigit) { + NodeRangeValue::addRange('0', '9'); + } else if (Node::m_regExpData[kkk] == regexpOpcodeLetter) { + NodeRangeValue::addRange('a', 'z'); + NodeRangeValue::addRange('A', 'Z'); + } else if (Node::m_regExpData[kkk] == regexpOpcodeSpace) { + NodeRangeValue::addValue(' '); + NodeRangeValue::addValue('\t'); + NodeRangeValue::addValue('\n'); + NodeRangeValue::addValue('\r'); + NodeRangeValue::addValue('\f'); + NodeRangeValue::addValue('\v'); + } else if (Node::m_regExpData[kkk] == regexpOpcodeWord) { + NodeRangeValue::addRange('a', 'z'); + NodeRangeValue::addRange('A', 'Z'); + NodeRangeValue::addRange('0', '9'); + } else if ( Node::m_regExpData[kkk] == regexpOpcodeTo && multipleElement == true) { TK_ERROR("Can not have 2 consecutive - in [...]"); return 0; @@ -938,7 +960,6 @@ template class NodePTheseElem : public Node { m_subNode.push_back(tmpNode); } break; - default: { elementSize = getLenOfNormal(Node::m_regExpData, pos); for (int64_t kkk=pos; kkk class NodePThese : public Node { * \w "Word" character [a-zA-Z0-9_] * \W NOT a "Word" character [^a-zA-Z0-9_] * \@ at the start or the end not in the parsing of element ==> check if \w is not present (other regExp will be <> ...) - * [anjdi] or [a-gt-j] range + * [anjdi] or [a-gt-j] range. It support the \d \w \s \l elements. If you add at the first element a '^' it will invert the value selected * . dot [^\x00-\x08\x0A-\x1F\x7F] * ==> TODO : * $ End / Start of line of line ==> ce sera un truc suplé comme le \@ @@ -1382,6 +1403,7 @@ template class RegExp { * @brief Set a new regular expression matching * @param[in] _exp the new expression to search */ + // TODO : Add an error ... void compile(const std::string &_exp) { if (_exp.size() != 0) { TK_REG_DEBUG("normal string parse : '" << _exp << "'"); @@ -1491,22 +1513,20 @@ template class RegExp { return; } // need to check if all () [] and {} is well set ... - if (false == checkGoodPosition(tmpExp) ) { + if (checkGoodPosition(tmpExp) == false) { return; } //TK_REG_DEBUG("Main element :" << createString(tmpExp) ); - if ( tmpExp.size()>0 - && tmpExp[0] == regexpOpcodeNoChar) - { + if ( tmpExp.size() > 0 + && tmpExp[0] == regexpOpcodeNoChar) { //TK_DEBUG("=> must not begin with char"); m_notBeginWithChar = true; // remove element tmpExp.erase(tmpExp.begin()); } - if ( tmpExp.size()>0 - && tmpExp[tmpExp.size()-1] == regexpOpcodeNoChar) - { + if ( tmpExp.size() > 0 + && tmpExp[tmpExp.size()-1] == regexpOpcodeNoChar) { //TK_DEBUG("=> must not end with char"); m_notEndWithChar = true; // remove element @@ -1573,7 +1593,7 @@ template class RegExp { int64_t maxlen = _endPos-iii; TK_REG_DEBUG("----------------------------------------------"); TK_REG_DEBUG("parse element : " << iii << " : '" << _SearchIn[iii] << "'"); - if (true == m_notBeginWithChar) { + if (m_notBeginWithChar == true) { if (iii>0) { char32_t tmpVal = _SearchIn[iii-1]; if( ( tmpVal >= 'a' @@ -1780,7 +1800,8 @@ template class RegExp { return false; } //TK_DEBUG(" ==> Find ELEMENT : ([{"); - // case dependent : + // case dependent: + int32_t localOffset = 0; if ( curentCode == regexpOpcodeBracketIn || curentCode == regexpOpcodeBracetIn) { while(_pos<(int64_t)_tmpExp.size()) { @@ -1790,7 +1811,7 @@ template class RegExp { return true; } else { // otherwise, we check the error in the element ... - char *find = NULL; + char *find = nullptr; switch (_tmpExp[_pos]) { case regexpOpcodePTheseIn: find = (char*)"("; break; case regexpOpcodeBracketIn: find = (char*)"["; break; @@ -1803,26 +1824,37 @@ template class RegExp { case regexpOpcodeQuestion: find = (char*)"?"; break; case regexpOpcodePlus: find = (char*)"+"; break; case regexpOpcodePipe: find = (char*)"|"; break; - case regexpOpcodeStartOfLine: find = (char*)"^"; break; case regexpOpcodeEndOfLine: find = (char*)"$"; break; - case regexpOpcodeDigit: find = (char*)"\\d"; break; case regexpOpcodeDigitNot: find = (char*)"\\D"; break; - case regexpOpcodeLetter: find = (char*)"\\l"; break; case regexpOpcodeLetterNot: find = (char*)"\\L"; break; - case regexpOpcodeSpace: find = (char*)"\\s"; break; case regexpOpcodeSpaceNot: find = (char*)"\\S"; break; - case regexpOpcodeWord: find = (char*)"\\w"; break; case regexpOpcodeWordNot: find = (char*)"\\W"; break; case regexpOpcodeNoChar: find = (char*)"\\@"; break; + case regexpOpcodeStartOfLine: + if ( endCode == regexpOpcodeBracetOut + || localOffset != 0) { + find = (char*)"^"; break; + } default: break; } - if (NULL != find) { + // Specific element forbiden for (...) but not for [...] + if (endCode == regexpOpcodeBracetOut) { + switch (_tmpExp[_pos]) { + case regexpOpcodeDigit: find = (char*)"\\d"; break; + case regexpOpcodeLetter: find = (char*)"\\l"; break; + case regexpOpcodeSpace: find = (char*)"\\s"; break; + case regexpOpcodeWord: find = (char*)"\\w"; break; + default: break; + } + } + if (find != nullptr) { (void)input; TK_ERROR("can not have : '" << find << "' inside " << input << " element"); return false; } } _pos++; + localOffset++; } } else { while(_pos< (int64_t)_tmpExp.size()) { @@ -1839,7 +1871,7 @@ template class RegExp { if( _tmpExp[_pos] == regexpOpcodePTheseIn || _tmpExp[_pos] == regexpOpcodeBracketIn || _tmpExp[_pos] == regexpOpcodeBracetIn ) { - if (false==checkGoodPosition(_tmpExp, _pos) ) { + if (checkGoodPosition(_tmpExp, _pos) == false ) { return false; } }