etk/etk/RegEx.cpp

487 lines
16 KiB
C++

/**
* @author Edouard DUPIN
*
* @copyright 2011, Edouard DUPIN, all right reserved
*
* @license MPL v2.0 (see license file)
*/
#include <etk/RegEx.hpp>
// Conversion table used to translate one pattern character into either a plain
// character or a regex opcode. Each row holds four fields, in this order:
//   haveBackSlash : presumably "the input character was preceded by a '\\'" -- TODO confirm against the lookup code
//   inputValue    : character found in the pattern
//   newValue      : replacement character (0 on every row that carries a real opcode)
//   opcode        : etk::regexOpcode* value; rows that carry a replacement character use etk::regexOpcodeError
// NOTE(review): the code that scans this table is not in this file section, so the
// row order may matter (for example if the first matching row wins).
const struct etk::conversionTable etk::regex::constConversionTable[] = {
// haveBackSlash, inputValue, newValue, opcode
{ false , '(' , 0 , etk::regexOpcodePTheseIn},
{ true , '(' , '(' , etk::regexOpcodeError},
{ false , ')' , 0 , etk::regexOpcodePTheseOut},
{ true , ')' , ')' , etk::regexOpcodeError},
{ false , '[' , 0 , etk::regexOpcodeBracketIn},
{ true , '[' , '[' , etk::regexOpcodeError},
{ false , ']' , 0 , etk::regexOpcodeBracketOut},
{ true , ']' , ']' , etk::regexOpcodeError},
{ false , '{' , 0 , etk::regexOpcodeBraceIn},
{ true , '{' , '{' , etk::regexOpcodeError},
{ false , '}' , 0 , etk::regexOpcodeBraceOut},
{ true , '}' , '}' , etk::regexOpcodeError},
{ false , '-' , 0 , etk::regexOpcodeTo},
{ true , '-' , '-' , etk::regexOpcodeError},
{ false , '*' , 0 , etk::regexOpcodeStar},
{ true , '*' , '*' , etk::regexOpcodeError},
{ false , '.' , 0 , etk::regexOpcodeDot},
{ true , '.' , '.' , etk::regexOpcodeError},
// NOTE(review): a second row with haveBackSlash=true and inputValue='e' exists further
// down (the <Esc> row); which of the two is effective depends on the lookup code -- confirm.
{ true , 'e' , 0 , etk::regexOpcodeEOF},
{ false , 'e' , 'e' , etk::regexOpcodeError},
{ false , '?' , 0 , etk::regexOpcodeQuestion},
{ true , '?' , '?' , etk::regexOpcodeError},
{ false , '+' , 0 , etk::regexOpcodePlus},
{ true , '+' , '+' , etk::regexOpcodeError},
{ false , '|' , 0 , etk::regexOpcodePipe},
{ true , '|' , '|' , etk::regexOpcodeError},
{ false , '^' , 0 , etk::regexOpcodeStartOfLine},
{ true , '^' , '^' , etk::regexOpcodeError},
{ false , '$' , 0 , etk::regexOpcodeEndOfLine},
{ true , '$' , '$' , etk::regexOpcodeError},
// Character-class shortcuts: only listed with haveBackSlash=true.
{ true , 'd' , 0 , etk::regexOpcodeDigit},
{ true , 'D' , 0 , etk::regexOpcodeDigitNot},
{ true , 'l' , 0 , etk::regexOpcodeLetter},
{ true , 'L' , 0 , etk::regexOpcodeLetterNot},
{ true , 's' , 0 , etk::regexOpcodeSpace},
{ true , 'S' , 0 , etk::regexOpcodeSpaceNot},
{ true , 'w' , 0 , etk::regexOpcodeWord},
{ true , 'W' , 0 , etk::regexOpcodeWordNot},
// Control-character escapes: newValue is the control character itself.
{ true , 'a' , '\a', etk::regexOpcodeError},
{ true , 'b' , '\b', etk::regexOpcodeError},
{ true , 'e' , 0x1B, etk::regexOpcodeError}, // Escape character <Esc> (same key as the regexOpcodeEOF row above)
{ true , 'f' , '\f', etk::regexOpcodeError},
{ true , 'n' , '\n', etk::regexOpcodeError},
{ true , 'r' , '\r', etk::regexOpcodeError},
{ true , 't' , '\t', etk::regexOpcodeError},
{ true , 'v' , '\v', etk::regexOpcodeError},
{ true , '\\' , '\\', etk::regexOpcodeError},
{ true , '&' , '&' , etk::regexOpcodeError},
{ true , '0' , '\0', etk::regexOpcodeError},
{ true , '@' , 0 , etk::regexOpcodeNoChar},
};
// Number of rows in constConversionTable (array byte size divided by the size of one row).
const int64_t etk::regex::constConversionTableSize = sizeof(etk::regex::constConversionTable) / sizeof(struct etk::conversionTable) ;
// Printable names of the parse status values. The stream operator of
// etk::regex::parseStatus indexes this array with the numeric value of the
// status, so the order of the entries must follow the enum declaration.
static const char* const parseStatusTable[] = {
	"parseStatusUnknown",
	"parseStatusNone",
	"parseStatusPartial",
	"parseStatusFull"
};
/**
 * @brief Write the textual name of a parse status into a stream.
 * @param[in,out] _os Stream that receives the text.
 * @param[in] _obj Status to display.
 * @return The stream given in parameter (to chain the calls).
 * @note A value that has no entry in parseStatusTable is written as
 *       "parseStatus(<numeric value>)" instead of reading outside of the table.
 */
std::ostream& etk::regex::operator <<(std::ostream& _os, enum etk::regex::parseStatus _obj) {
	const int64_t nbName = static_cast<int64_t>(sizeof(parseStatusTable) / sizeof(parseStatusTable[0]));
	const int64_t index = static_cast<int64_t>(_obj);
	if (    index < 0
	     || index >= nbName) {
		// unknown value (corrupted or newer enum): never index out of the table
		_os << "parseStatus(" << index << ")";
		return _os;
	}
	_os << parseStatusTable[index];
	return _os;
}
/**
 * @brief Write a one-line description of a FindProperty into a stream.
 * The output has the form: property([<start>,<stop>]*<multiplicity> <status>)
 * @param[in,out] _os Stream that receives the text.
 * @param[in] _obj Property to display.
 * @return The stream given in parameter (to chain the calls).
 */
std::ostream& etk::regex::operator <<(std::ostream& _os, const etk::regex::FindProperty& _obj) {
	// position range
	_os << "property([";
	_os << _obj.getPositionStart();
	_os << ",";
	_os << _obj.getPositionStop();
	// multiplicity and parsing status
	_os << "]*";
	_os << _obj.getMultiplicity();
	_os << " ";
	_os << _obj.getStatus();
	_os << ")";
	return _os;
}
/**
 * @brief Build a displayable string from a range of regex elements.
 * Every regex opcode is written with its textual form, surrounded by an
 * ETK_BASH_COLOR_* sequence and ETK_BASH_COLOR_NORMAL; '\n' and '\t' are
 * written escaped; any other element is converted with u32char::convertUtf8.
 * @param[in] _data Regex elements (characters and opcodes).
 * @param[in] _start First index to display (a negative value is handled as 0).
 * @param[in] _stop Index after the last element to display (limited to the size of _data).
 * @return The generated string (it always starts with ETK_BASH_COLOR_NORMAL).
 */
etk::String etk::regex::createString(const std::vector<char32_t>& _data, int64_t _start, int64_t _stop) {
	etk::String output(ETK_BASH_COLOR_NORMAL);
	// Append _text between the requested color and the "normal" color.
	auto appendColored = [&output](const etk::String& _color, const char* _text) {
		output += _color + _text + ETK_BASH_COLOR_NORMAL;
	};
	const int64_t dataSize = static_cast<int64_t>(_data.size());
	const int64_t last = (_stop < dataSize) ? _stop : dataSize;
	// A negative start would index before the beginning of the vector.
	const int64_t first = (_start < 0) ? 0 : _start;
	for (int64_t iii = first; iii < last; ++iii) {
		switch(_data[iii]) {
			case regexOpcodePTheseIn:     appendColored(etk::String(ETK_BASH_COLOR_RED), "("); break;
			case regexOpcodePTheseOut:    appendColored(etk::String(ETK_BASH_COLOR_RED), ")"); break;
			case regexOpcodeBracketIn:    appendColored(etk::String(ETK_BASH_COLOR_YELLOW), "["); break;
			case regexOpcodeBracketOut:   appendColored(etk::String(ETK_BASH_COLOR_YELLOW), "]"); break;
			case regexOpcodeTo:           appendColored(etk::String(ETK_BASH_COLOR_YELLOW), "-"); break;
			case regexOpcodeBraceIn:      appendColored(etk::String(ETK_BASH_COLOR_GREEN), "{"); break;
			case regexOpcodeBraceOut:     appendColored(etk::String(ETK_BASH_COLOR_GREEN), "}"); break;
			case regexOpcodeStar:         appendColored(etk::String(ETK_BASH_COLOR_BLUE), "*"); break;
			case regexOpcodeDot:          appendColored(etk::String(ETK_BASH_COLOR_BLUE), "."); break;
			case regexOpcodeQuestion:     appendColored(etk::String(ETK_BASH_COLOR_BLUE), "?"); break;
			case regexOpcodePlus:         appendColored(etk::String(ETK_BASH_COLOR_BLUE), "+"); break;
			case regexOpcodePipe:         appendColored(etk::String(ETK_BASH_COLOR_BLUE), "|"); break;
			case regexOpcodeNoChar:       appendColored(etk::String(ETK_BASH_COLOR_MAGENTA), "@"); break;
			case regexOpcodeStartOfLine:  appendColored(etk::String(ETK_BASH_COLOR_MAGENTA), "^"); break;
			case regexOpcodeEndOfLine:    appendColored(etk::String(ETK_BASH_COLOR_MAGENTA), "$"); break;
			case regexOpcodeDigit:        appendColored(etk::String(ETK_BASH_COLOR_MAGENTA), "\\d"); break;
			case regexOpcodeDigitNot:     appendColored(etk::String(ETK_BASH_COLOR_MAGENTA), "\\D"); break;
			case regexOpcodeLetter:       appendColored(etk::String(ETK_BASH_COLOR_MAGENTA), "\\l"); break;
			case regexOpcodeLetterNot:    appendColored(etk::String(ETK_BASH_COLOR_MAGENTA), "\\L"); break;
			case regexOpcodeSpace:        appendColored(etk::String(ETK_BASH_COLOR_MAGENTA), "\\s"); break;
			case regexOpcodeSpaceNot:     appendColored(etk::String(ETK_BASH_COLOR_MAGENTA), "\\S"); break;
			case regexOpcodeWord:         appendColored(etk::String(ETK_BASH_COLOR_MAGENTA), "\\w"); break;
			case regexOpcodeWordNot:      appendColored(etk::String(ETK_BASH_COLOR_MAGENTA), "\\W"); break;
			case regexOpcodeEOF:          appendColored(etk::String(ETK_BASH_COLOR_MAGENTA), "\\e"); break;
			case '\n':                    appendColored(etk::String(ETK_BASH_COLOR_MAGENTA), "\\n"); break;
			case '\t':                    appendColored(etk::String(ETK_BASH_COLOR_MAGENTA), "\\t"); break;
			default: {
				// plain character: written without color
				char plop[10];
				int8_t nb = u32char::convertUtf8(_data[iii], plop);
				plop[nb] = '\0';
				output += plop;
				break;
			}
		}
	}
	return output;
}
/**
 * @brief Get an indentation string made of 2 spaces per level.
 * @param[in] _level Indentation level. Levels above 30 get the maximum
 *                   indentation (60 spaces).
 * @return Pointer inside a static, zero-terminated buffer of spaces. The buffer
 *         is shared between all the calls: it must be neither modified nor freed.
 */
char* etk::regex::levelSpace(uint32_t _level) {
	// The padding is built explicitly instead of relying on the exact length of
	// a string literal: the returned pointer is always inside the buffer, and no
	// literal is bound to a non-const char* (ill-formed since C++11).
	struct Padding {
		enum { length = 60 }; // 30 levels * 2 spaces
		char data[length + 1];
		Padding() {
			for (int32_t iii = 0; iii < length; ++iii) {
				data[iii] = ' ';
			}
			data[length] = '\0';
		}
	};
	static Padding padding;
	if (_level>30) {
		return padding.data;
	}
	// skip the unneeded spaces: (length - 2*_level) is in [0..length]
	return padding.data + Padding::length - 2*_level;
}
/**
 * @brief Get the length of one alternative of a (...|...) group.
 * The count starts at _startPos and stops on the first ')' or '|' that is not
 * inside a nested (...) group, or at the end of the data.
 * @param[in] _data Regex elements (characters and opcodes).
 * @param[in] _startPos Index of the first element of the alternative.
 * @return Number of elements of the alternative; 0 when _startPos is out of the
 *         data or when the first element is already a ')' or a '|'.
 */
int64_t etk::regex::getLenOfPTheseElement(const std::vector<char32_t>& _data, int64_t _startPos) {
	const int64_t dataSize = static_cast<int64_t>(_data.size());
	// a negative start must be rejected too: it would index before the vector
	if (    _startPos < 0
	     || _startPos >= dataSize) {
		return 0;
	}
	// special case: the alternative is empty
	if (    _data[_startPos] == regexOpcodePTheseOut
	     || _data[_startPos] == regexOpcodePipe) {
		return 0;
	}
	int32_t nbOpen = 0; // number of nested '(' that are not closed yet
	for (int64_t pos = _startPos; pos < dataSize; ++pos) {
		if (_data[pos] == regexOpcodePTheseIn) {
			// enter in a sub section
			nbOpen++;
			continue;
		}
		if (nbOpen > 0) {
			// inside a sub section: only its closing ')' is interesting
			if (_data[pos] == regexOpcodePTheseOut) {
				nbOpen--;
			}
			continue;
		}
		if (    _data[pos] == regexOpcodePTheseOut
		     || _data[pos] == regexOpcodePipe) {
			// end of the alternative (pos > _startPos here: the first element
			// has been checked before the loop)
			return pos - _startPos;
		}
	}
	// no terminator: the alternative goes up to the end of the data
	return dataSize - _startPos;
}
/**
 * @brief Get the number of elements contained between a '(' and its matching ')'.
 * @param[in] _data Regex elements (characters and opcodes).
 * @param[in] _startPos Index of the opening '('.
 * @return Number of elements inside the group (nested groups included);
 *         0 when _startPos is out of the data, when the element at _startPos is
 *         a ')' or is not a '(', when the group is empty, or when no matching
 *         ')' is found.
 */
int64_t etk::regex::getLenOfPThese(const std::vector<char32_t>& _data, int64_t _startPos) {
	const int64_t dataSize = static_cast<int64_t>(_data.size());
	// same guard as getLenOfPTheseElement: never read outside of the data
	if (    _startPos < 0
	     || _startPos >= dataSize) {
		return 0;
	}
	if (_data[_startPos] == regexOpcodePTheseOut) {
		return 0;
	}
	if (_data[_startPos] != regexOpcodePTheseIn) {
		TK_ERROR(" find error in PThese");
		return 0;
	}
	int32_t nbOpen = 0; // number of nested '(' that are not closed yet
	for (int64_t pos = _startPos + 1; pos < dataSize; ++pos) {
		if (_data[pos] == regexOpcodePTheseIn) {
			// enter in a sub section
			nbOpen++;
		} else if (nbOpen > 0) {
			// inside a sub section: only its closing ')' is interesting
			if (_data[pos] == regexOpcodePTheseOut) {
				nbOpen--;
			}
		} else if (_data[pos] == regexOpcodePTheseOut) {
			// matching ')' found: return the size inside the (...)
			const int64_t sizeInside = pos - _startPos - 1;
			if (sizeInside <= 0) {
				TK_ERROR("Error in the (...) no data at "<< pos-1);
				return 0;
			}
			return sizeInside;
		}
	}
	// the group is never closed
	return 0;
}
/**
 * @brief Get the number of elements contained between a '[' and the next ']'.
 * Inside the brackets, the only accepted opcodes are: start of line, digit,
 * letter, space, word and "to" ('-'); every value <= 0xFF is accepted.
 * @param[in] _data Regex elements (characters and opcodes).
 * @param[in] _startPos Index of the opening '['.
 * @return Number of elements inside the brackets; 0 when _startPos is out of
 *         the data, when the element at _startPos is a ']' or is not a '[',
 *         when the brackets are empty, when a non permitted element is found
 *         or when no ']' is found.
 */
int64_t etk::regex::getLenOfBracket(const std::vector<char32_t>& _data, int64_t _startPos) {
	const int64_t dataSize = static_cast<int64_t>(_data.size());
	// never read outside of the data
	if (    _startPos < 0
	     || _startPos >= dataSize) {
		return 0;
	}
	if (_data[_startPos] == regexOpcodeBracketOut) {
		return 0;
	}
	if (_data[_startPos] != regexOpcodeBracketIn) {
		TK_ERROR("find no [...");
		return 0;
	}
	for (int64_t pos = _startPos + 1; pos < dataSize; ++pos) {
		if (_data[pos] == regexOpcodeBracketOut) {
			// end of the [...]: return the size inside
			const int64_t sizeInside = pos - _startPos - 1;
			if (sizeInside <= 0) {
				TK_ERROR("Error in the [...] no data at "<< pos-1);
				return 0;
			}
			return sizeInside;
		}
		if (    _data[pos] == regexOpcodeStartOfLine
		     || _data[pos] == regexOpcodeDigit
		     || _data[pos] == regexOpcodeLetter
		     || _data[pos] == regexOpcodeSpace
		     || _data[pos] == regexOpcodeWord
		     || _data[pos] == regexOpcodeTo) {
			// nothing to do ... it is permitted
			continue;
		}
		if (_data[pos] > 0xFF) {
			// any other value over 0xFF is refused: build a readable name for the log
			etk::String displayElement;
			if (_data[pos] == regexOpcodeDigitNot) {
				displayElement = "\\D";
			} else if (_data[pos] == regexOpcodeLetterNot) {
				displayElement = "\\L";
			} else if (_data[pos] == regexOpcodeSpaceNot) {
				displayElement = "\\S";
			} else if (_data[pos] == regexOpcodeWordNot) {
				displayElement = "\\W";
			} else {
				displayElement = static_cast<char>(_data[pos]);
			}
			TK_ERROR("Error in the [...] not permitted element at "<< pos << " '" << displayElement << "'");
			return 0;
		}
	}
	// the brackets are never closed
	return 0;
}
int64_t etk::regex::getLenOfBrace(const std::vector<char32_t>& _data, int64_t _startPos) {
int32_t pos = _startPos;
// special case of the (...) or | ==> we search '|' or ')'
if(_data[pos]==regexOpcodeBraceOut) {
return 0;
}
if(_data[pos]!=regexOpcodeBraceIn) {
TK_ERROR(" did not find brace IN { ");
return 0;
}
pos++;
// find size ...
while (pos < (int64_t)_data.size() ) {
if(_data[pos]==regexOpcodeBraceOut) {
// Find the end of the [...]
// just return the size inside
int32_t sizeInside = pos - _startPos -1 ;
if (0 >= sizeInside) {
TK_ERROR("Error in the {...} no data at "<< pos-1);
return 0;
}
return sizeInside;
} else if( _data[pos] != ','
&& ( _data[pos] < '0'
|| _data[pos] > '9') ) {
TK_ERROR("Error in the {...} not permitted element at "<< pos << " '" << _data[pos] << "'");
return 0;
}
pos++;
}
return 0;
}
/**
 * @brief Get the length of a run of elements that contains none of the
 *        opcodes listed in the switch below.
 * @param[in] _data Regex elements (characters and opcodes).
 * @param[in] _startPos Index of the first element of the run.
 * @return Number of elements before the first listed opcode, or up to the end
 *         of the data when none is found. When the element at _startPos is
 *         already one of these opcodes, an error is logged and 0 is returned.
 */
int64_t etk::regex::getLenOfNormal(const std::vector<char32_t>& _data, int64_t _startPos) {
	int64_t cursor = _startPos;
	for ( ; cursor < (int64_t)_data.size(); ++cursor) {
		bool stopHere = true;
		switch(_data[cursor]) {
			case regexOpcodePTheseIn:
			case regexOpcodePTheseOut:
			case regexOpcodeBracketIn:
			case regexOpcodeBracketOut:
			case regexOpcodeBraceIn:
			case regexOpcodeBraceOut:
			case regexOpcodeTo:
			case regexOpcodeStar:
			case regexOpcodeDot:
			case regexOpcodeQuestion:
			case regexOpcodePlus:
			case regexOpcodePipe:
			case regexOpcodeStartOfLine:
			case regexOpcodeEndOfLine:
			case regexOpcodeDigit:
			case regexOpcodeDigitNot:
			case regexOpcodeLetter:
			case regexOpcodeLetterNot:
			case regexOpcodeSpace:
			case regexOpcodeSpaceNot:
			case regexOpcodeWord:
			case regexOpcodeWordNot:
				// one of the opcodes that end the run
				break;
			default:
				// any other element belongs to the run
				stopHere = false;
				break;
		}
		if (stopHere == false) {
			continue;
		}
		// the run stops just before this opcode
		int32_t sizeInside = cursor - _startPos;
		if (sizeInside <= 0) {
			TK_ERROR("Error in the normal data : no data ...");
		}
		return sizeInside;
	}
	// end of the data reached without meeting one of the opcodes
	if ((int64_t)cursor - (int64_t)_startPos < 0) {
		return 0;
	}
	return cursor - _startPos;
}
/**
 * @brief Parse the content of a {...} multiplicity: "N" or "N,M".
 * Results, as implemented:
 *   - "N"   ==> _min = N, _max = N
 *   - "N,M" ==> _min = N, _max = M
 *   - when the second value is 0 and the first one is not, the range is
 *     [0..first value]
 * The parsing stops on a '}' when one is present in the data.
 * NOTE(review): "N," (nothing after the ',') gives _min = _max = N -- confirm this is intended.
 * @param[in] _data Elements to parse (digits, ',' and optionally a final '}').
 * @param[out] _min Minimum multiplicity.
 * @param[out] _max Maximum multiplicity.
 * @return true when the data is parsed and _min <= _max; false on an unexpected
 *         element, on a second ',' or when _min > _max (in this last case _min
 *         and _max are already written).
 */
bool etk::regex::parseBrace(const std::vector<char32_t>& _data, uint32_t& _min, uint32_t& _max) {
	const int64_t dataSize = static_cast<int64_t>(_data.size());
	int64_t k = 0;
	int32_t firstElement = 0;
	int32_t secondElement = 0;
	bool closed = false; // true when the '}' is found while reading the first value
	// read the first value: up to the ',' or the '}'
	while (k < dataSize) {
		if (_data[k] == ',') {
			k++;
			break;
		}
		if (_data[k] == '}') {
			secondElement = firstElement;
			closed = true;
			break;
		}
		if (u32char::isInteger(_data[k]) == true) {
			firstElement *= 10;
			firstElement += u32char::toInt(_data[k]);
		} else {
			TK_ERROR("Can not parse this element " << static_cast<char>(_data[k]) << " at pos " << k);
			return false;
		}
		k++;
	}
	if (closed == false) {
		if (k == dataSize) {
			// nothing left to read: the second value is the same as the first one
			secondElement = firstElement;
		}
		// read the second value: up to the '}' or the end of the data
		while (k < dataSize) {
			if (_data[k] == ',') {
				TK_ERROR("Can not find a second , in {} at pos " << k);
				return false;
			}
			if (_data[k] == '}') {
				break;
			}
			if (u32char::isInteger(_data[k]) == true) {
				secondElement *= 10;
				secondElement += u32char::toInt(_data[k]);
			} else {
				// displayed as a char, like in the first loop
				TK_ERROR("Can not parse this element " << static_cast<char>(_data[k]) << " at pos " << k);
				return false;
			}
			k++;
		}
	}
	if (    secondElement == 0
	     && firstElement != 0) {
		_min = 0;
		_max = firstElement;
	} else {
		_min = firstElement;
		_max = secondElement;
	}
	if (_min > _max) {
		TK_ERROR("Minimum=" << _min << " can not be > maximum=" << _max );
		return false;
	}
	return true;
}
/**
 * @brief Convert a string in a form that can be displayed on one line.
 * '\n', '\t', '\r' and '\0' are written escaped ("\\n", ...), the other
 * control characters (value under 0x20) are written as their decimal value,
 * every other character is copied.
 * @param[in] _data String to convert.
 * @return The displayable string.
 */
etk::String etk::regex::autoStr(const etk::String& _data) {
	etk::String out;
	for (const auto& it : _data) {
		switch (it) {
			case '\n':
				out += "\\n";
				break;
			case '\t':
				out += "\\t";
				break;
			case '\r':
				out += "\\r";
				break;
			case '\0':
				out += "\\0";
				break;
			case ' ':
				out += " ";
				break;
			default:
				// Compare as unsigned: with a signed char, every byte >= 0x80
				// (non-ASCII text) is negative and was written as a negative number.
				if (static_cast<unsigned char>(it) < 0x20) {
					out += std::to_string((int32_t)it);
				} else {
					out += it;
				}
				break;
		}
	}
	return out;
}
/**
 * @brief Convert one character in a form that can be displayed on one line.
 * '\n', '\t', '\r' and '\0' are written escaped ("\\n", ...), the other
 * control characters (value under 0x20) are written as their decimal value,
 * every other character is copied.
 * @param[in] _data Character to convert.
 * @return The displayable string.
 */
etk::String etk::regex::autoStr(char _data) {
	etk::String out;
	switch (_data) {
		case '\n':
			out += "\\n";
			break;
		case '\t':
			out += "\\t";
			break;
		case '\r':
			out += "\\r";
			break;
		case '\0':
			out += "\\0";
			break;
		case ' ':
			out += " ";
			break;
		default:
			// Compare as unsigned: with a signed char, every byte >= 0x80
			// (non-ASCII text) is negative and was written as a negative number.
			if (static_cast<unsigned char>(_data) < 0x20) {
				out += std::to_string((int32_t)_data);
			} else {
				out += _data;
			}
			break;
	}
	return out;
}
/**
 * @brief Create a marker line: _pos spaces followed by a '^'.
 * @param[in] _pos Number of spaces written before the '^' (none when the value is <= 0).
 * @return The generated string.
 */
etk::String etk::regex::strTick(int32_t _pos) {
	etk::String marker;
	int32_t remaining = _pos;
	while (remaining > 0) {
		marker += " ";
		--remaining;
	}
	marker += "^";
	return marker;
}
// String conversion specialisations for etk::RegEx (etk::String and std::u32string variants).
namespace etk {
	// ---- RegEx ==> etk::String: return the value of getRegEx() ----
	template<>
	etk::String to_string<etk::RegEx<etk::String>>(const etk::RegEx<etk::String>& _val) {
		return _val.getRegEx();
	}
	template<>
	etk::String to_string<etk::RegEx<std::u32string>>(const etk::RegEx<std::u32string>& _val) {
		return _val.getRegEx();
	}
	
	// ---- RegEx ==> std::u32string: return the value of getURegEx() ----
	template<>
	std::u32string to_u32string<etk::RegEx<etk::String>>(const etk::RegEx<etk::String>& _val) {
		return _val.getURegEx();
	}
	template<>
	std::u32string to_u32string<etk::RegEx<std::u32string>>(const etk::RegEx<std::u32string>& _val) {
		return _val.getURegEx();
	}
	
	// ---- std::u32string ==> RegEx: give the value to compile() ----
	// NOTE(review): these functions always return true; the outcome of
	// compile() (if it reports one) is not inspected here -- confirm this is intended.
	template<>
	bool from_string<etk::RegEx<etk::String>>(etk::RegEx<etk::String>& _variableRet, const std::u32string& _value) {
		_variableRet.compile(_value);
		return true;
	}
	template<>
	bool from_string<etk::RegEx<std::u32string>>(etk::RegEx<std::u32string>& _variableRet, const std::u32string& _value) {
		_variableRet.compile(_value);
		return true;
	}
	
	// ---- etk::String ==> RegEx: give the value to compile() ----
	template<>
	bool from_string<etk::RegEx<etk::String>>(etk::RegEx<etk::String>& _variableRet, const etk::String& _value) {
		_variableRet.compile(_value);
		return true;
	}
	template<>
	bool from_string<etk::RegEx<std::u32string>>(etk::RegEx<std::u32string>& _variableRet, const etk::String& _value) {
		_variableRet.compile(_value);
		return true;
	}
} // namespace etk