From 7086d933248d7e2d8a00d1fd5dea5a2994d02879 Mon Sep 17 00:00:00 2001 From: fbraem Date: Mon, 7 Dec 2015 17:31:49 +0100 Subject: [PATCH] Add escape/unescape functions --- Foundation/include/Poco/UTF8String.h | 16 +- Foundation/src/UTF8String.cpp | 211 ++++++++++++++++++++++++++- 2 files changed, 220 insertions(+), 7 deletions(-) diff --git a/Foundation/include/Poco/UTF8String.h b/Foundation/include/Poco/UTF8String.h index d719b609e..596b79b8c 100644 --- a/Foundation/include/Poco/UTF8String.h +++ b/Foundation/include/Poco/UTF8String.h @@ -54,10 +54,24 @@ struct Foundation_API UTF8 static std::string& toUpperInPlace(std::string& str); static std::string toLower(const std::string& str); static std::string& toLowerInPlace(std::string& str); - + static void removeBOM(std::string& str); /// Remove the UTF-8 Byte Order Mark sequence (0xEF, 0xBB, 0xBF) /// from the beginning of the string, if it's there. + + static std::string escape(const std::string& s); + /// Escapes a string. Special characters like tab, backslash, ... are + /// escaped. Unicode characters are escaped to \uxxxx. + + static std::string escape(const std::string::const_iterator& begin, const std::string::const_iterator& end); + /// Escapes a string. Special characters like tab, backslash, ... are + /// escaped. Unicode characters are escaped to \uxxxx. + + static std::string unescape(const std::string& s); + /// Creates an UTF8 string from a string that contains escaped characters. + + static std::string unescape(const std::string::const_iterator& begin, const std::string::const_iterator& end); + /// Creates an UTF8 string from a string that contains escaped characters. }; diff --git a/Foundation/src/UTF8String.cpp b/Foundation/src/UTF8String.cpp index b7c6eb877..56d959f23 100644 --- a/Foundation/src/UTF8String.cpp +++ b/Foundation/src/UTF8String.cpp @@ -19,6 +19,8 @@ #include "Poco/TextIterator.h" #include "Poco/TextConverter.h" #include "Poco/UTF8Encoding.h" +#include "Poco/NumberFormatter.h" +#include "Poco/Ascii.h" #include @@ -32,11 +34,11 @@ namespace int UTF8::icompare(const std::string& str, std::string::size_type pos, std::string::size_type n, std::string::const_iterator it2, std::string::const_iterator end2) -{ +{ std::string::size_type sz = str.size(); if (pos > sz) pos = sz; if (pos + n > sz) n = sz - pos; - TextIterator uit1(str.begin() + pos, str.begin() + pos + n, utf8); + TextIterator uit1(str.begin() + pos, str.begin() + pos + n, utf8); TextIterator uend1(str.begin() + pos + n); TextIterator uit2(it2, end2, utf8); TextIterator uend2(end2); @@ -50,7 +52,7 @@ int UTF8::icompare(const std::string& str, std::string::size_type pos, std::stri return 1; ++uit1; ++uit2; } - + if (uit1 == uend1) return uit2 == uend2 ? 0 : -1; else @@ -162,14 +164,211 @@ std::string& UTF8::toLowerInPlace(std::string& str) void UTF8::removeBOM(std::string& str) { - if (str.size() >= 3 - && static_cast(str[0]) == 0xEF - && static_cast(str[1]) == 0xBB + if (str.size() >= 3 + && static_cast(str[0]) == 0xEF + && static_cast(str[1]) == 0xBB && static_cast(str[2]) == 0xBF) { str.erase(0, 3); } } +std::string UTF8::escape(const std::string &s) +{ + return escape(s.begin(), s.end()); +} + +std::string UTF8::escape(const std::string::const_iterator& begin, const std::string::const_iterator& end) +{ + static Poco::UInt32 offsetsFromUTF8[6] = { + 0x00000000UL, 0x00003080UL, 0x000E2080UL, + 0x03C82080UL, 0xFA082080UL, 0x82082080UL + }; + + std::string result; + + std::string::const_iterator it = begin; + + while(it != end) + { + Poco::UInt32 ch = 0; + unsigned int sz = 0; + + do + { + ch <<= 6; + ch += (unsigned char)*it++; + sz++; + } + while (it != end && (*it & 0xC0) == 0x80 && sz < 6); + ch -= offsetsFromUTF8[sz-1]; + + if (ch == '\n') result += "\\n"; + else if (ch == '\t') result += "\\t"; + else if (ch == '\r') result += "\\r"; + else if (ch == '\b') result += "\\b"; + else if (ch == '\f') result += "\\f"; + else if (ch == '\v') result += "\\v"; + else if (ch == '\a') result += "\\a"; + else if (ch == '\\') result += "\\\\"; + else if (ch == '\"') result += "\\\""; + else if (ch == '/') result += "\\/"; + else if (ch == '\0') result += "\\u0000"; + else if (ch < 32 || ch == 0x7f) + { + result += "\\u"; + NumberFormatter::appendHex(result, (unsigned short) ch, 4); + } + else if (ch > 0xFFFF) + { + ch -= 0x10000; + result += "\\u"; + NumberFormatter::appendHex(result, (unsigned short) (( ch >> 10 ) & 0x03ff ) + 0xd800, 4); + result += "\\u"; + NumberFormatter::appendHex(result, (unsigned short) (ch & 0x03ff ) + 0xdc00, 4); + } + else if (ch >= 0x80 && ch <= 0xFFFF) + { + result += "\\u"; + NumberFormatter::appendHex(result, (unsigned short) ch, 4); + } + else + { + result += (char) ch; + } + } + return result; +} + +std::string UTF8::unescape(const std::string &s) +{ + return unescape(s.begin(), s.end()); +} + +std::string UTF8::unescape(const std::string::const_iterator& begin, const std::string::const_iterator& end) +{ + std::string result; + + std::string::const_iterator it = begin; + + while (it != end) + { + Poco::UInt32 ch = (Poco::UInt32) *it++; + + if (ch == '\\') + { + if ( it == end ) + { + //Invalid sequence! + } + + if (*it == 'n') + { + ch = '\n'; + it++; + } + else if (*it == 't') + { + ch = '\t'; + it++; + } + else if (*it == 'r') + { + ch = '\r'; + it++; + } + else if (*it == 'b') + { + ch = '\b'; + it++; + } + else if (*it == 'f') + { + ch = '\f'; + it++; + } + else if (*it == 'v') + { + ch = '\v'; + it++; + } + else if (*it == 'a') + { + ch = '\a'; + it++; + } + else if (*it == 'u') + { + char digs[5]; + memset(digs, 0, 5); + unsigned int dno = 0; + + it++; + + while (it != end && Ascii::isHexDigit(*it) && dno < 4) digs[dno++] = *it++; + if (dno > 0) + { + ch = strtol(digs, NULL, 16); + } + + if( ch >= 0xD800 && ch <= 0xDBFF ) + { + if ( it == end || *it != '\\' ) + { + //Invalid sequence! + } + else + { + it++; + if ( it == end || *it != 'u' ) + { + //Invalid sequence! + } + else + { + it++; + } + } + + // UTF-16 surrogate pair. Go fetch other half + memset(digs, 0, 5); + dno = 0; + while (it != end && Ascii::isHexDigit(*it) && dno < 4) digs[dno++] = *it++; + if (dno > 0) + { + Poco::UInt32 temp = strtol(digs, NULL, 16); + if( temp >= 0xDC00 && temp <= 0xDFFF ) + { + ch = ( ( ( ch - 0xD800 ) << 10 ) | ( temp - 0xDC00 ) ) + 0x10000; + } + } + } + } + else if (*it == 'U') + { + char digs[9]; + memset(digs, 0, 9); + unsigned int dno = 0; + + it++; + while (it != end && Ascii::isHexDigit(*it) && dno < 8) + { + digs[dno++] = *it++; + } + if (dno > 0) + { + ch = strtol(digs, NULL, 16); + } + } + } + + unsigned char utf8[4]; + UTF8Encoding encoding; + int sz = encoding.convert(ch, utf8, 4); + result.append((char*) utf8, sz); + } + + return result; +} } // namespace Poco