Add escape/unescape functions

2025-10-27 11:06:50 +01:00 · 2015-12-07 17:31:49 +01:00
parent 2266fc19be
commit 7086d93324
2 changed files with 220 additions and 7 deletions
--- a/Foundation/include/Poco/UTF8String.h
+++ b/Foundation/include/Poco/UTF8String.h
@@ -58,6 +58,20 @@ struct Foundation_API UTF8
 	static void removeBOM(std::string& str);
 		/// Remove the UTF-8 Byte Order Mark sequence (0xEF, 0xBB, 0xBF)
 		/// from the beginning of the string, if it's there.
 	static std::string escape(const std::string& s);
 		/// Escapes the UTF-8 encoded string s and returns the result.
 		///
 		/// Newline, tab, carriage return, backspace, form feed, vertical tab
 		/// and bell are written as \n, \t, \r, \b, \f, \v and \a. Backslash,
 		/// double quote and forward slash are prefixed with a backslash.
 		/// Other characters below 0x20, the character 0x7F and all characters
 		/// at or above 0x80 are written as \uXXXX; characters above 0xFFFF
 		/// are written as a UTF-16 surrogate pair (two \uXXXX sequences).
 	static std::string escape(const std::string::const_iterator& begin, const std::string::const_iterator& end);
 		/// Escapes the UTF-8 encoded characters in the range [begin, end)
 		/// and returns the result.
 		///
 		/// Newline, tab, carriage return, backspace, form feed, vertical tab
 		/// and bell are written as \n, \t, \r, \b, \f, \v and \a. Backslash,
 		/// double quote and forward slash are prefixed with a backslash.
 		/// Other characters below 0x20, the character 0x7F and all characters
 		/// at or above 0x80 are written as \uXXXX; characters above 0xFFFF
 		/// are written as a UTF-16 surrogate pair (two \uXXXX sequences).
 	static std::string unescape(const std::string& s);
 		/// Creates an UTF-8 string from the string s, which may contain
 		/// escaped characters.
 		///
 		/// The escape sequences \n, \t, \r, \b, \f, \v, \a, \uXXXX (up to
 		/// four hex digits; a high surrogate followed by a \uXXXX low
 		/// surrogate is combined into a single character) and \UXXXXXXXX
 		/// (up to eight hex digits) are replaced by the characters they
 		/// denote, encoded as UTF-8.
 	static std::string unescape(const std::string::const_iterator& begin, const std::string::const_iterator& end);
 		/// Creates an UTF-8 string from the characters in the range
 		/// [begin, end), which may contain escaped characters.
 		///
 		/// The escape sequences \n, \t, \r, \b, \f, \v, \a, \uXXXX (up to
 		/// four hex digits; a high surrogate followed by a \uXXXX low
 		/// surrogate is combined into a single character) and \UXXXXXXXX
 		/// (up to eight hex digits) are replaced by the characters they
 		/// denote, encoded as UTF-8.
 };
--- a/Foundation/src/UTF8String.cpp
+++ b/Foundation/src/UTF8String.cpp
@@ -19,6 +19,8 @@
 #include "Poco/TextIterator.h"
 #include "Poco/TextConverter.h"
 #include "Poco/UTF8Encoding.h"
 #include "Poco/NumberFormatter.h"
 #include "Poco/Ascii.h"
 #include <algorithm>
@@ -171,5 +173,202 @@ void UTF8::removeBOM(std::string& str)
 	}
 }
 std::string UTF8::escape(const std::string &s)
 {
 	return escape(s.begin(), s.end());
 }
 std::string UTF8::escape(const std::string::const_iterator& begin, const std::string::const_iterator& end)
 {
 	static Poco::UInt32 offsetsFromUTF8[6] = {
 		0x00000000UL, 0x00003080UL, 0x000E2080UL,
 		0x03C82080UL, 0xFA082080UL, 0x82082080UL
 	};
 	std::string result;
 	std::string::const_iterator it = begin;
 	while(it != end)
 	{
 		Poco::UInt32 ch = 0;
 		unsigned int sz = 0;
 		do
 		{
 			ch <<= 6;
 			ch += (unsigned char)*it++;
 			sz++;
 		}
 		while (it != end && (*it & 0xC0) == 0x80 && sz < 6);
 		ch -= offsetsFromUTF8[sz-1];
 		if (ch == '\n') result += "\\n";
 		else if (ch == '\t') result += "\\t";
 		else if (ch == '\r') result += "\\r";
 		else if (ch == '\b') result += "\\b";
 		else if (ch == '\f') result += "\\f";
 		else if (ch == '\v') result += "\\v";
 		else if (ch == '\a') result += "\\a";
 		else if (ch == '\\') result +=  "\\\\";
 		else if (ch == '\"') result +=  "\\\"";
 		else if (ch == '/') result +=  "\\/";
 		else if (ch == '\0') result += "\\u0000";
 		else if (ch < 32 || ch == 0x7f)
 		{
 			result += "\\u";
 			NumberFormatter::appendHex(result, (unsigned short) ch, 4);
 		}
 		else if (ch > 0xFFFF)
 		{
 			ch -= 0x10000;
 			result += "\\u";
 			NumberFormatter::appendHex(result, (unsigned short) (( ch >> 10 ) & 0x03ff ) + 0xd800, 4);
 			result += "\\u";
 			NumberFormatter::appendHex(result, (unsigned short) (ch & 0x03ff ) + 0xdc00, 4);
 		}
 		else if (ch >= 0x80 && ch <= 0xFFFF)
 		{
 			result += "\\u";
 			NumberFormatter::appendHex(result, (unsigned short) ch, 4);
 		}
 		else
 		{
 			result += (char) ch;
 		}
 	}
 	return result;
 }
 std::string UTF8::unescape(const std::string &s)
 {
 	return unescape(s.begin(), s.end());
 }
 std::string UTF8::unescape(const std::string::const_iterator& begin, const std::string::const_iterator& end)
 {
 	std::string result;
 	std::string::const_iterator it = begin;
 	while (it != end)
 	{
 		Poco::UInt32 ch = (Poco::UInt32) *it++;
 		if (ch == '\\')
 		{
 			if ( it == end )
 			{
 				//Invalid sequence!
 			}
 			if (*it == 'n')
 			{
 				ch = '\n';
 				it++;
 			}
 			else if (*it == 't')
 			{
 				ch = '\t';
 				it++;
 			}
 			else if (*it == 'r')
 			{
 				ch = '\r';
 				it++;
 			}
 			else if (*it == 'b')
 			{
 				ch = '\b';
 				it++;
 			}
 			else if (*it == 'f')
 			{
 				ch = '\f';
 				it++;
 			}
 			else if (*it == 'v')
 			{
 				ch = '\v';
 				it++;
 			}
 			else if (*it == 'a')
 			{
 				ch = '\a';
 				it++;
 			}
 			else if (*it == 'u')
 			{
 				char digs[5];
 				memset(digs, 0, 5);
 				unsigned int dno = 0;
 				it++;
 				while (it != end && Ascii::isHexDigit(*it) && dno < 4) digs[dno++] = *it++;
 				if (dno > 0)
 				{
 					ch = strtol(digs, NULL, 16);
 				}
 				if( ch >= 0xD800 && ch <= 0xDBFF )
 				{
 					if ( it == end || *it != '\\' )
 					{
 						//Invalid sequence!
 					}
 					else
 					{
 						it++;
 						if ( it == end || *it != 'u' )
 						{
 							//Invalid sequence!
 						}
 						else
 						{
 							it++;
 						}
 					}
 					// UTF-16 surrogate pair. Go fetch other half
 					memset(digs, 0, 5);
 					dno = 0;
 					while (it != end && Ascii::isHexDigit(*it) && dno < 4) digs[dno++] = *it++;
 					if (dno > 0)
 					{
 						Poco::UInt32 temp = strtol(digs, NULL, 16);
 						if( temp >= 0xDC00 && temp <= 0xDFFF )
 						{
 							ch = ( ( ( ch - 0xD800 ) << 10 ) | ( temp - 0xDC00 ) ) + 0x10000;
 						}
 					}
 				}
 			}
 			else if (*it == 'U')
 			{
 				char digs[9];
 				memset(digs, 0, 9);
 				unsigned int dno = 0;
 				it++;
 				while (it != end && Ascii::isHexDigit(*it) && dno < 8)
 				{
 					digs[dno++] = *it++;
 				}
 				if (dno > 0)
 				{
 					ch = strtol(digs, NULL, 16);
 				}
 			}
 		}
 		unsigned char utf8[4];
 		UTF8Encoding encoding;
 		int sz = encoding.convert(ch, utf8, 4);
 		result.append((char*) utf8, sz);
 	}
 	return result;
 }
 } // namespace Poco