373 lines
13 KiB
C++
373 lines
13 KiB
C++
/**
|
|
* @author Edouard DUPIN
|
|
*
|
|
* @copyright 2011, Edouard DUPIN, all right reserved
|
|
*
|
|
* @license BSD v3 (see license file)
|
|
*/
|
|
|
|
|
|
#include <etk/RegExp.h>
|
|
|
|
|
|
|
|
const struct etk::convertionTable etk::regexp::constConvertionTable[] = {
|
|
// haveBackSlash, inputValue, newValue
|
|
{ false , '(' , 0 , etk::regexpOpcodePTheseIn},
|
|
{ true , '(' , '(' , etk::regexpOpcodeError},
|
|
{ false , ')' , 0 , etk::regexpOpcodePTheseOut},
|
|
{ true , ')' , ')' , etk::regexpOpcodeError},
|
|
{ false , '[' , 0 , etk::regexpOpcodeBracketIn},
|
|
{ true , '[' , '[' , etk::regexpOpcodeError},
|
|
{ false , ']' , 0 , etk::regexpOpcodeBracketOut},
|
|
{ true , ']' , ']' , etk::regexpOpcodeError},
|
|
{ false , '{' , 0 , etk::regexpOpcodeBracetIn},
|
|
{ true , '{' , '{' , etk::regexpOpcodeError},
|
|
{ false , '}' , 0 , etk::regexpOpcodeBracetOut},
|
|
{ true , '}' , '}' , etk::regexpOpcodeError},
|
|
{ false , '-' , 0 , etk::regexpOpcodeTo},
|
|
{ true , '-' , '-' , etk::regexpOpcodeError},
|
|
{ false , '*' , 0 , etk::regexpOpcodeStar},
|
|
{ true , '*' , '*' , etk::regexpOpcodeError},
|
|
{ false , '.' , 0 , etk::regexpOpcodeDot},
|
|
{ true , '.' , '.' , etk::regexpOpcodeError},
|
|
{ false , '?' , 0 , etk::regexpOpcodeQuestion},
|
|
{ true , '?' , '?' , etk::regexpOpcodeError},
|
|
{ false , '+' , 0 , etk::regexpOpcodePlus},
|
|
{ true , '+' , '+' , etk::regexpOpcodeError},
|
|
{ false , '|' , 0 , etk::regexpOpcodePipe},
|
|
{ true , '|' , '|' , etk::regexpOpcodeError},
|
|
{ false , '^' , 0 , etk::regexpOpcodeStartOfLine},
|
|
{ true , '^' , '^' , etk::regexpOpcodeError},
|
|
{ false , '$' , 0 , etk::regexpOpcodeEndOfLine},
|
|
{ true , '$' , '$' , etk::regexpOpcodeError},
|
|
{ true , 'd' , 0 , etk::regexpOpcodeDigit},
|
|
{ true , 'D' , 0 , etk::regexpOpcodeDigitNot},
|
|
{ true , 'l' , 0 , etk::regexpOpcodeLetter},
|
|
{ true , 'L' , 0 , etk::regexpOpcodeLetterNot},
|
|
{ true , 's' , 0 , etk::regexpOpcodeSpace},
|
|
{ true , 'S' , 0 , etk::regexpOpcodeSpaceNot},
|
|
{ true , 'w' , 0 , etk::regexpOpcodeWord},
|
|
{ true , 'W' , 0 , etk::regexpOpcodeWordNot},
|
|
{ true , 'a' , '\a', etk::regexpOpcodeError},
|
|
{ true , 'b' , '\b', etk::regexpOpcodeError},
|
|
{ true , 'e' , 0x1B, etk::regexpOpcodeError}, // Escape character <Esc>
|
|
{ true , 'f' , '\f', etk::regexpOpcodeError},
|
|
{ true , 'n' , '\n', etk::regexpOpcodeError},
|
|
{ true , 'r' , '\r', etk::regexpOpcodeError},
|
|
{ true , 't' , '\t', etk::regexpOpcodeError},
|
|
{ true , 'v' , '\v', etk::regexpOpcodeError},
|
|
{ true , '\\' , '\\', etk::regexpOpcodeError},
|
|
{ true , '&' , '&' , etk::regexpOpcodeError},
|
|
{ true , '0' , '\0', etk::regexpOpcodeError},
|
|
{ true , '@' , 0 , etk::regexpOpcodeNoChar},
|
|
};
|
|
// Number of entries of the conversion table (total byte size divided by the size of one entry).
const int64_t etk::regexp::constConvertionTableSize =
    sizeof(etk::regexp::constConvertionTable) / sizeof(etk::regexp::constConvertionTable[0]);
|
|
|
|
|
|
std::string etk::regexp::createString(const std::vector<char32_t>& _data, int64_t _start, int64_t _stop) {
|
|
std::string output(ETK_BASH_COLOR_NORMAL);
|
|
for (int64_t iii=_start; iii<(int64_t)_data.size() && iii<_stop ; iii++) {
|
|
switch(_data[iii]) {
|
|
case regexpOpcodePTheseIn: output += std::string(ETK_BASH_COLOR_RED) + (char*)"(" + ETK_BASH_COLOR_NORMAL; break;
|
|
case regexpOpcodePTheseOut: output += std::string(ETK_BASH_COLOR_RED) + (char*)")" + ETK_BASH_COLOR_NORMAL; break;
|
|
case regexpOpcodeBracketIn: output += std::string(ETK_BASH_COLOR_YELLOW) + (char*)"[" + ETK_BASH_COLOR_NORMAL; break;
|
|
case regexpOpcodeBracketOut: output += std::string(ETK_BASH_COLOR_YELLOW) + (char*)"]" + ETK_BASH_COLOR_NORMAL; break;
|
|
case regexpOpcodeTo: output += std::string(ETK_BASH_COLOR_YELLOW) + (char*)"-" + ETK_BASH_COLOR_NORMAL; break;
|
|
case regexpOpcodeBracetIn: output += std::string(ETK_BASH_COLOR_GREEN) + (char*)"{" + ETK_BASH_COLOR_NORMAL; break;
|
|
case regexpOpcodeBracetOut: output += std::string(ETK_BASH_COLOR_GREEN) + (char*)"}" + ETK_BASH_COLOR_NORMAL; break;
|
|
case regexpOpcodeStar: output += std::string(ETK_BASH_COLOR_BLUE) + (char*)"*" + ETK_BASH_COLOR_NORMAL; break;
|
|
case regexpOpcodeDot: output += std::string(ETK_BASH_COLOR_BLUE) + (char*)"." + ETK_BASH_COLOR_NORMAL; break;
|
|
case regexpOpcodeQuestion: output += std::string(ETK_BASH_COLOR_BLUE) + (char*)"?" + ETK_BASH_COLOR_NORMAL; break;
|
|
case regexpOpcodePlus: output += std::string(ETK_BASH_COLOR_BLUE) + (char*)"+" + ETK_BASH_COLOR_NORMAL; break;
|
|
case regexpOpcodePipe: output += std::string(ETK_BASH_COLOR_BLUE) + (char*)"|" + ETK_BASH_COLOR_NORMAL; break;
|
|
case regexpOpcodeNoChar: output += std::string(ETK_BASH_COLOR_MAGENTA) + (char*)"@" + ETK_BASH_COLOR_NORMAL; break;
|
|
case regexpOpcodeStartOfLine: output += std::string(ETK_BASH_COLOR_MAGENTA) + (char*)"^" + ETK_BASH_COLOR_NORMAL; break;
|
|
case regexpOpcodeEndOfLine: output += std::string(ETK_BASH_COLOR_MAGENTA) + (char*)"$" + ETK_BASH_COLOR_NORMAL; break;
|
|
case regexpOpcodeDigit: output += std::string(ETK_BASH_COLOR_MAGENTA) + (char*)"\\d" + ETK_BASH_COLOR_NORMAL; break;
|
|
case regexpOpcodeDigitNot: output += std::string(ETK_BASH_COLOR_MAGENTA) + (char*)"\\D" + ETK_BASH_COLOR_NORMAL; break;
|
|
case regexpOpcodeLetter: output += std::string(ETK_BASH_COLOR_MAGENTA) + (char*)"\\l" + ETK_BASH_COLOR_NORMAL; break;
|
|
case regexpOpcodeLetterNot: output += std::string(ETK_BASH_COLOR_MAGENTA) + (char*)"\\L" + ETK_BASH_COLOR_NORMAL; break;
|
|
case regexpOpcodeSpace: output += std::string(ETK_BASH_COLOR_MAGENTA) + (char*)"\\s" + ETK_BASH_COLOR_NORMAL; break;
|
|
case regexpOpcodeSpaceNot: output += std::string(ETK_BASH_COLOR_MAGENTA) + (char*)"\\S" + ETK_BASH_COLOR_NORMAL; break;
|
|
case regexpOpcodeWord: output += std::string(ETK_BASH_COLOR_MAGENTA) + (char*)"\\w" + ETK_BASH_COLOR_NORMAL; break;
|
|
case regexpOpcodeWordNot: output += std::string(ETK_BASH_COLOR_MAGENTA) + (char*)"\\W" + ETK_BASH_COLOR_NORMAL; break;
|
|
case '\n': output += std::string(ETK_BASH_COLOR_MAGENTA) + (char*)"\\n" + ETK_BASH_COLOR_NORMAL; break;
|
|
case '\t': output += std::string(ETK_BASH_COLOR_MAGENTA) + (char*)"\\t" + ETK_BASH_COLOR_NORMAL; break;
|
|
default:
|
|
char plop[10];
|
|
int8_t nb = u32char::convertUtf8(_data[iii], plop);
|
|
plop[nb] = '\0';
|
|
output += plop;
|
|
break;
|
|
}
|
|
}
|
|
return output;
|
|
}
|
|
|
|
/**
 * @brief Get an indentation string made of two spaces per level.
 * @param[in] _level Requested indentation level; levels above 30 give the same result as level 30.
 * @return Pointer on a null-terminated string of 2*min(_level, 30) spaces. The storage is a
 *         buffer shared by every call: it must not be freed nor modified by the caller.
 */
char* etk::regexp::levelSpace(uint32_t _level) {
	static const uint32_t maxLevel = 30;
	// The padding is built from its length instead of being typed as a literal, so the offset
	// computed below can never run past the end of the buffer. It also avoids binding a string
	// literal to a non-const char*.
	static std::string padding(2*maxLevel, ' ');
	char* const base = &padding[0];
	if (_level > maxLevel) {
		return base;
	}
	// Skip the spaces that are not needed: what remains up to the terminator is 2*_level spaces.
	return base + 2*(maxLevel - _level);
}
|
|
|
|
|
|
int64_t etk::regexp::getLenOfPTheseElem(const std::vector<char32_t>& _data, int64_t _startPos) {
|
|
if (_startPos>=(int64_t)_data.size()){
|
|
return 0;
|
|
}
|
|
int64_t pos = _startPos;
|
|
int32_t nbOpen = 0;
|
|
// special case of the (...) or | ==> we search '|' or ')'
|
|
if( _data[pos] == regexpOpcodePTheseOut
|
|
|| _data[pos] == regexpOpcodePipe) {
|
|
return 0;
|
|
}
|
|
// find size ...
|
|
while (pos < (int64_t)_data.size() ) {
|
|
if(_data[pos] == regexpOpcodePTheseIn) {
|
|
// find a sub section :
|
|
nbOpen++;
|
|
} else if(0 < nbOpen) {
|
|
if (_data[pos] == regexpOpcodePTheseOut) {
|
|
nbOpen--;
|
|
if (0 > nbOpen) {
|
|
TK_ERROR("Error in the (...) find element at "<< pos);
|
|
return -1;
|
|
}
|
|
}
|
|
} else if( _data[pos] == regexpOpcodePTheseOut
|
|
|| _data[pos] == regexpOpcodePipe) {
|
|
// Find the end of the (...)
|
|
// just return the size inside
|
|
int32_t sizeInside = pos - _startPos;
|
|
if (0 >= sizeInside) {
|
|
TK_ERROR("Error in the (...) no data at "<< pos-1);
|
|
return 0;
|
|
}
|
|
return sizeInside;
|
|
}
|
|
pos++;
|
|
}
|
|
return pos - _startPos;
|
|
}
|
|
|
|
int64_t etk::regexp::getLenOfPThese(const std::vector<char32_t>& _data, int64_t _startPos) {
|
|
int64_t pos = _startPos;
|
|
int32_t nbOpen = 0;
|
|
// special case of the (...) or | ==> we search '|' or ')'
|
|
if(_data[pos]==regexpOpcodePTheseOut) {
|
|
return 0;
|
|
}
|
|
if(_data[pos]!=regexpOpcodePTheseIn) {
|
|
TK_ERROR(" find error in PThese");
|
|
return 0;
|
|
}
|
|
pos++;
|
|
// find size ...
|
|
while (pos < (int64_t)_data.size() ) {
|
|
if(_data[pos]==regexpOpcodePTheseIn) {
|
|
// find a sub section :
|
|
nbOpen++;
|
|
} else if(0 < nbOpen) {
|
|
if (_data[pos]==regexpOpcodePTheseOut) {
|
|
nbOpen--;
|
|
if (0 > nbOpen) {
|
|
TK_ERROR("Error in the (...) find element at "<< pos);
|
|
return 0;
|
|
}
|
|
}
|
|
} else if(_data[pos]==regexpOpcodePTheseOut) {
|
|
// Find the end of the (...)
|
|
// just return the size inside
|
|
int32_t sizeInside = pos - _startPos-1;
|
|
if (0 >= sizeInside) {
|
|
TK_ERROR("Error in the (...) no data at "<< pos-1);
|
|
return 0;
|
|
}
|
|
return sizeInside;
|
|
}
|
|
pos++;
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
|
|
int64_t etk::regexp::getLenOfBracket(const std::vector<char32_t>& _data, int64_t _startPos) {
|
|
int64_t pos = _startPos;
|
|
// special case of the (...) or | ==> we search '|' or ')'
|
|
if(_data[pos]==regexpOpcodeBracketOut) {
|
|
return 0;
|
|
}
|
|
if(_data[pos] != regexpOpcodeBracketIn) {
|
|
TK_ERROR("find no {...");
|
|
return 0;
|
|
}
|
|
pos++;
|
|
// find size ...
|
|
while (pos < (int64_t)_data.size() ) {
|
|
if(_data[pos]==regexpOpcodeBracketOut) {
|
|
// Find the end of the [...]
|
|
// just return the size inside
|
|
int32_t sizeInside = pos - _startPos -1 ;
|
|
if (0 >= sizeInside) {
|
|
TK_ERROR("Error in the [...] no data at "<< pos-1);
|
|
return 0;
|
|
}
|
|
return sizeInside;
|
|
} else if( _data[pos] != regexpOpcodeTo
|
|
&& _data[pos] > 0xFF ) {
|
|
TK_ERROR("Error in the [...] not permited element at "<< pos << " '" << (char)_data[pos] << "'");
|
|
return 0;
|
|
}
|
|
pos++;
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
|
|
int64_t etk::regexp::getLenOfBrace(const std::vector<char32_t>& _data, int64_t _startPos) {
|
|
int32_t pos = _startPos;
|
|
// special case of the (...) or | ==> we search '|' or ')'
|
|
if(_data[pos]==regexpOpcodeBracetOut) {
|
|
return 0;
|
|
}
|
|
if(_data[pos]!=regexpOpcodeBracetIn) {
|
|
TK_ERROR(" did not find brace IN { ");
|
|
return 0;
|
|
}
|
|
pos++;
|
|
// find size ...
|
|
while (pos < (int64_t)_data.size() ) {
|
|
if(_data[pos]==regexpOpcodeBracetOut) {
|
|
// Find the end of the [...]
|
|
// just return the size inside
|
|
int32_t sizeInside = pos - _startPos -1 ;
|
|
if (0 >= sizeInside) {
|
|
TK_ERROR("Error in the {...} no data at "<< pos-1);
|
|
return 0;
|
|
}
|
|
return sizeInside;
|
|
} else if( _data[pos] != ','
|
|
&& ( _data[pos] < '0'
|
|
|| _data[pos] > '9') ) {
|
|
TK_ERROR("Error in the {...} not permited element at "<< pos << " '" << _data[pos] << "'");
|
|
return 0;
|
|
}
|
|
pos++;
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
|
|
int64_t etk::regexp::getLenOfNormal(const std::vector<char32_t>& _data, int64_t _startPos) {
|
|
int64_t pos = _startPos;
|
|
// find size ...
|
|
while (pos < (int64_t)_data.size() ) {
|
|
switch(_data[pos]) {
|
|
case regexpOpcodePTheseIn:
|
|
case regexpOpcodePTheseOut:
|
|
case regexpOpcodeBracketIn:
|
|
case regexpOpcodeBracketOut:
|
|
case regexpOpcodeBracetIn:
|
|
case regexpOpcodeBracetOut:
|
|
case regexpOpcodeTo:
|
|
case regexpOpcodeStar:
|
|
case regexpOpcodeDot:
|
|
case regexpOpcodeQuestion:
|
|
case regexpOpcodePlus:
|
|
case regexpOpcodePipe:
|
|
case regexpOpcodeStartOfLine:
|
|
case regexpOpcodeEndOfLine:
|
|
case regexpOpcodeDigit:
|
|
case regexpOpcodeDigitNot:
|
|
case regexpOpcodeLetter:
|
|
case regexpOpcodeLetterNot:
|
|
case regexpOpcodeSpace:
|
|
case regexpOpcodeSpaceNot:
|
|
case regexpOpcodeWord:
|
|
case regexpOpcodeWordNot:
|
|
{
|
|
// just return the size inside
|
|
int32_t sizeInside = pos - _startPos;
|
|
if (0 >= sizeInside) {
|
|
TK_ERROR("Error in the normal data : no data ...");
|
|
}
|
|
return sizeInside;
|
|
}
|
|
break;
|
|
default :
|
|
// nothing to do ...
|
|
break;
|
|
}
|
|
pos++;
|
|
}
|
|
if ((int64_t)pos - (int64_t)_startPos < 0) {
|
|
return 0;
|
|
}
|
|
return pos - _startPos ;
|
|
}
|
|
|
|
|
|
bool etk::regexp::parseBrace(const std::vector<char32_t>& _data, uint32_t& _min, uint32_t& _max)
|
|
{
|
|
//TK_INFO("parse {...} in "; DisplayElem(data); );
|
|
int64_t k=0;
|
|
|
|
int32_t firstElement = 0;
|
|
int32_t SecondElement = 0;
|
|
|
|
while(k < (int64_t)_data.size()) {
|
|
if (_data[k] == ',') {
|
|
k++;
|
|
break;
|
|
} if (_data[k] == '}' ) {
|
|
SecondElement = firstElement;
|
|
goto allIsSet;
|
|
} else if(u32char::isInteger(_data[k]) == true) {
|
|
firstElement *= 10;
|
|
firstElement += u32char::toInt(_data[k]);
|
|
} else {
|
|
TK_ERROR("Can not parse this element " << (char)_data[k] << " at pos " << k);
|
|
return false;
|
|
}
|
|
k++;
|
|
}
|
|
if (k == (int64_t)_data.size()) {
|
|
SecondElement = firstElement;
|
|
}
|
|
while(k < (int64_t)_data.size()) {
|
|
if (_data[k] == ',') {
|
|
TK_ERROR("Can not find a second , in {} at pos " << k);
|
|
return false;
|
|
} if (_data[k] == '}') {
|
|
goto allIsSet;
|
|
} else if (true == u32char::isInteger(_data[k])) {
|
|
SecondElement *= 10;
|
|
SecondElement += u32char::toInt(_data[k]);
|
|
} else {
|
|
TK_ERROR("Can not parse this element " << _data[k] << " at pos " << k);
|
|
return false;
|
|
}
|
|
k++;
|
|
}
|
|
|
|
allIsSet:
|
|
if (SecondElement == 0 && firstElement != 0) {
|
|
_min = 0;
|
|
_max = firstElement;
|
|
} else {
|
|
_min = firstElement;
|
|
_max = SecondElement;
|
|
}
|
|
if (_min > _max) {
|
|
TK_ERROR("Minimum=" << _min << " can not be < maximum=" << _max );
|
|
return false;
|
|
}
|
|
return true;
|
|
}
|
|
|
|
|