Unescape the backslash character (`\\`) in the UTF8::unescape method

This commit is contained in:
Kacper Piwiński 2019-12-16 22:43:40 +01:00
parent 9c197e0ed1
commit 29de485a46
3 changed files with 54 additions and 28 deletions

View File

@ -50,3 +50,4 @@ Jeff Adams
Martin Osborne
Björn Schramke
Francis Andre
Kacper Piwiński

View File

@ -32,11 +32,11 @@ namespace
int UTF8::icompare(const std::string& str, std::string::size_type pos, std::string::size_type n, std::string::const_iterator it2, std::string::const_iterator end2)
{
{
std::string::size_type sz = str.size();
if (pos > sz) pos = sz;
if (pos + n > sz) n = sz - pos;
TextIterator uit1(str.begin() + pos, str.begin() + pos + n, utf8);
TextIterator uit1(str.begin() + pos, str.begin() + pos + n, utf8);
TextIterator uend1(str.begin() + pos + n);
TextIterator uit2(it2, end2, utf8);
TextIterator uend2(end2);
@ -50,7 +50,7 @@ int UTF8::icompare(const std::string& str, std::string::size_type pos, std::stri
return 1;
++uit1; ++uit2;
}
if (uit1 == uend1)
return uit2 == uend2 ? 0 : -1;
else
@ -162,9 +162,9 @@ std::string& UTF8::toLowerInPlace(std::string& str)
void UTF8::removeBOM(std::string& str)
{
if (str.size() >= 3
&& static_cast<unsigned char>(str[0]) == 0xEF
&& static_cast<unsigned char>(str[1]) == 0xBB
if (str.size() >= 3
&& static_cast<unsigned char>(str[0]) == 0xEF
&& static_cast<unsigned char>(str[1]) == 0xBB
&& static_cast<unsigned char>(str[2]) == 0xBF)
{
str.erase(0, 3);
@ -264,42 +264,74 @@ std::string UTF8::unescape(const std::string::const_iterator& begin, const std::
//Invalid sequence!
}
if (*it == 'n')
switch (*it)
{
case 'U':
{
char digs[9];
std::memset(digs, 0, 9);
unsigned int dno = 0;
it++;
while (it != end && Ascii::isHexDigit(*it) && dno < 8)
{
digs[dno++] = *it++;
}
if (dno > 0)
{
ch = std::strtol(digs, NULL, 16);
}
break;
}
case '\\':
{
ch = '\\';
it++;
break;
}
case 'n':
{
ch = '\n';
it++;
break;
}
else if (*it == 't')
case 't':
{
ch = '\t';
it++;
break;
}
else if (*it == 'r')
case 'r':
{
ch = '\r';
it++;
break;
}
else if (*it == 'b')
case 'b':
{
ch = '\b';
it++;
break;
}
else if (*it == 'f')
case 'f':
{
ch = '\f';
it++;
break;
}
else if (*it == 'v')
case 'v':
{
ch = '\v';
it++;
break;
}
else if (*it == 'a')
case 'a':
{
ch = '\a';
it++;
break;
}
else if (*it == 'u')
case 'u':
{
char digs[5];
std::memset(digs, 0, 5);
@ -345,23 +377,14 @@ std::string UTF8::unescape(const std::string::const_iterator& begin, const std::
}
}
}
break;
}
else if (*it == 'U')
default:
{
char digs[9];
std::memset(digs, 0, 9);
unsigned int dno = 0;
it++;
while (it != end && Ascii::isHexDigit(*it) && dno < 8)
{
digs[dno++] = *it++;
}
if (dno > 0)
{
ch = std::strtol(digs, NULL, 16);
}
//Invalid sequence!
break;
}
}//end switch
}
unsigned char utf8[4];

View File

@ -96,9 +96,11 @@ void UTF8StringTest::testUnescape()
{
std::string s1("A \\t, a \\u000B, and an \\u0007 walk into a |, and the barman says \\u0402");
std::string s2("A \\t, a \\v, and an \\a walk into a |, and the barman says \\u0402");
std::string s3("\\\\");
assertTrue (UTF8::unescape(s1) == "A \t, a \v, and an \a walk into a |, and the barman says \xD0\x82");
assertTrue (UTF8::unescape(s2) == "A \t, a \v, and an \a walk into a |, and the barman says \xD0\x82");
assertTrue (UTF8::unescape(s3) == "\\");
}