mirror of
https://github.com/pocoproject/poco.git
synced 2025-12-10 18:14:58 +01:00
* chore(PCRE): properly detect library type on newer macOS * chore(ZLIB): move source files to own zlib directory and update CMake files. * chore(PCRE): move source files to own pcre2 directory and update CMake files. * chore(UTF8PROC): move source files to own utf8proc directory and update CMake files. * chore(ZLIB): remove header files * chore(PDJSON): move source files to own pdjson directory and update CMake files. * chore(SQLite3): move source files to own sqlite3 directory and update CMake files. * chore(UNBUNDLED): Correct includes. * chore(expat): move source files to own expat directory and update CMake files. * chore(wepoll): move source files to own wepoll directory and update CMake files. * chore(7zip): move source files to own 7zip directory and update CMake files. * chore(CMake): fix compile and link flags for dependent static libraries * chore(CMake): set PCRE2_STATIC when building PCRE2. * chore(SQLite3): Set SQLITE_THREADSAFE for unbundled build, add warnings. * chore(CMake): Modifications to build and link properly static target libraries (using OBJECT library type and link using BUILD_LOCAL_INTERFACE) * chore(CMake): fix order of includes in main CMakeLists.txt. * chore(CI): Build mysql tests with cmake. * chore(CI): Build mongodb, redis, sqlite no parser tests with cmake. * chore(CI): Build odbc tests with cmake. * chore(CI): Build more ations with cmake, other fixes. * chore(CI): Fixes for macOS * chore(CMake): extract hpdf and png files to own directories in dependencies * fix(CMake): include dependencies after all module dependencies are resolved. * fix(CMake): Improve dependency handling of dependencies to compile them only when necessary. * fix(CMake): PDF: move t4.h to proper directory, modify include. * fix(CMake): Fixes to link properly on all platforms. * fix(CMAKE): Wrong ENABLE for SQLITE * enh(PDF): Remove dependencies on hpdf headers from Poco::PDF interface and make usage of hpdf only internal. 
* enh(CI): Convert more jobs to use cmake. * enh(CI): Convert macOS sanitizer jobs to use cmake. * enh(mkrelease): Copy dependencies when creating release package. * eng(CMake): Add missing POCO_SO option to enable/disable small object optimization. * enh(CI): Run linux sanitizer with cmake, various fixes and improvements. * fix(CMake): bundled build: ZLIB::ZLIB is already linked with Foundation, no need to link again to Poco::Zip * fix(CI): vptr undefined sanitizer causes foundation tests to fail when linking, disable it * chore(tests): Minor code improvements. * fix(AsyncNotificationCenter): fix a data race with member _listsEmpty by making it atomic. * eng(CI): Add a few more time sensitive tests to cppignore.lnx * chore(Thread): Code updates. * eng(CI): Add a few more time sensitive tests to cppignore.lnx * fix(AsyncNotificationCenter): must join threads to avoid data race in dtor. * chore(CI): Pass TSAN_OPTIONS to jobs where necessary * chore(CI): run rests without sudo, compile with parallelism * chore(CI): Use POCO_MINIMAL_BUILD to simplify CMake configure lines. * chore(CI): Add 32-bit Windows VS build * chore(CMake): Printout cmake generator platform. * chore(CMake): linux-gcc-make-armv7l -> linux-gcc-cmake-armv7l * chore(ci): windows-2025-msvc-cmake-32bit -> windows-2025-msvc-cmake-Win32 * chore(CI): Convert all remaining jobs to CMake. * chore(make): Prevent building with make. * chore(CodeQL): exclude all external code from CodeQL checks. * chore(macOS): Set min support version to 13.3 to properly support C++20 standard.
466 lines
10 KiB
C++
466 lines
10 KiB
C++
//
|
|
// UTF8String.cpp
|
|
//
|
|
// Library: Foundation
|
|
// Package: Text
|
|
// Module: UTF8String
|
|
//
|
|
// Copyright (c) 2007, Applied Informatics Software Engineering GmbH.
|
|
// and Contributors.
|
|
//
|
|
// SPDX-License-Identifier: BSL-1.0
|
|
//
|
|
|
|
|
|
#include "Poco/UTF8String.h"
|
|
#include "Poco/UTFString.h"
|
|
#include "Poco/Unicode.h"
|
|
#include "Poco/TextIterator.h"
|
|
#include "Poco/TextConverter.h"
|
|
#include "Poco/UTF8Encoding.h"
|
|
#include "Poco/NumberFormatter.h"
|
|
#include "Poco/Ascii.h"
|
|
#include "Poco/Buffer.h"
|
|
#include "Poco/Exception.h"
|
|
#include <algorithm>
|
|
#include <iterator>
|
|
#include <utf8proc.h>
|
|
|
|
|
|
// Explicit instantiations of the UTF-16 / UTF-32 string types.
// They are only emitted on non-Windows platforms, and only for the
// string flavours that have been enabled at configuration time.
#ifndef POCO_OS_FAMILY_WINDOWS

#ifdef POCO_USE_STRING16
template class std::basic_string<Poco::UTF16Char, Poco::UTF16CharTraits>;
#endif // POCO_USE_STRING16

#ifdef POCO_USE_STRING32
template class std::basic_string<Poco::UTF32Char, Poco::UTF32CharTraits>;
#endif // POCO_USE_STRING32

#endif // POCO_OS_FAMILY_WINDOWS
|
|
|
|
namespace Poco {
|
|
|
|
|
|
namespace
|
|
{
|
|
static UTF8Encoding utf8;
|
|
}
|
|
|
|
|
|
int UTF8::icompare(const std::string& str, std::string::size_type pos, std::string::size_type n, std::string::const_iterator it2, std::string::const_iterator end2)
|
|
{
|
|
std::string::size_type sz = str.size();
|
|
if (pos > sz) pos = sz;
|
|
if (pos + n > sz) n = sz - pos;
|
|
TextIterator uit1(str.begin() + pos, str.begin() + pos + n, utf8);
|
|
TextIterator uend1(str.begin() + pos + n);
|
|
TextIterator uit2(it2, end2, utf8);
|
|
TextIterator uend2(end2);
|
|
while (uit1 != uend1 && uit2 != uend2)
|
|
{
|
|
int c1 = Unicode::toLower(*uit1);
|
|
int c2 = Unicode::toLower(*uit2);
|
|
if (c1 < c2)
|
|
return -1;
|
|
else if (c1 > c2)
|
|
return 1;
|
|
++uit1; ++uit2;
|
|
}
|
|
|
|
if (uit1 == uend1)
|
|
return uit2 == uend2 ? 0 : -1;
|
|
else
|
|
return 1;
|
|
}
|
|
|
|
|
|
int UTF8::icompare(const std::string& str1, const std::string& str2)
|
|
{
|
|
return icompare(str1, 0, str1.size(), str2.begin(), str2.end());
|
|
}
|
|
|
|
|
|
int UTF8::icompare(const std::string& str1, std::string::size_type n1, const std::string& str2, std::string::size_type n2)
|
|
{
|
|
if (n2 > str2.size()) n2 = str2.size();
|
|
return icompare(str1, 0, n1, str2.begin(), str2.begin() + n2);
|
|
}
|
|
|
|
|
|
int UTF8::icompare(const std::string& str1, std::string::size_type n, const std::string& str2)
|
|
{
|
|
if (n > str2.size()) n = str2.size();
|
|
return icompare(str1, 0, n, str2.begin(), str2.begin() + n);
|
|
}
|
|
|
|
|
|
int UTF8::icompare(const std::string& str1, std::string::size_type pos, std::string::size_type n, const std::string& str2)
|
|
{
|
|
return icompare(str1, pos, n, str2.begin(), str2.end());
|
|
}
|
|
|
|
|
|
int UTF8::icompare(const std::string& str1, std::string::size_type pos1, std::string::size_type n1, const std::string& str2, std::string::size_type pos2, std::string::size_type n2)
|
|
{
|
|
std::string::size_type sz2 = str2.size();
|
|
if (pos2 > sz2) pos2 = sz2;
|
|
if (pos2 + n2 > sz2) n2 = sz2 - pos2;
|
|
return icompare(str1, pos1, n1, str2.begin() + pos2, str2.begin() + pos2 + n2);
|
|
}
|
|
|
|
|
|
int UTF8::icompare(const std::string& str1, std::string::size_type pos1, std::string::size_type n, const std::string& str2, std::string::size_type pos2)
|
|
{
|
|
std::string::size_type sz2 = str2.size();
|
|
if (pos2 > sz2) pos2 = sz2;
|
|
if (pos2 + n > sz2) n = sz2 - pos2;
|
|
return icompare(str1, pos1, n, str2.begin() + pos2, str2.begin() + pos2 + n);
|
|
}
|
|
|
|
|
|
int UTF8::icompare(const std::string& str, std::string::size_type pos, std::string::size_type n, const std::string::value_type* ptr)
|
|
{
|
|
poco_check_ptr (ptr);
|
|
std::string str2(ptr); // TODO: optimize
|
|
return icompare(str, pos, n, str2.begin(), str2.end());
|
|
}
|
|
|
|
|
|
int UTF8::icompare(const std::string& str, std::string::size_type pos, const std::string::value_type* ptr)
|
|
{
|
|
return icompare(str, pos, str.size() - pos, ptr);
|
|
}
|
|
|
|
|
|
int UTF8::icompare(const std::string& str, const std::string::value_type* ptr)
|
|
{
|
|
return icompare(str, 0, str.size(), ptr);
|
|
}
|
|
|
|
|
|
std::string UTF8::toUpper(const std::string& str)
|
|
{
|
|
std::string result;
|
|
TextConverter converter(utf8, utf8);
|
|
converter.convert(str, result, Unicode::toUpper);
|
|
return result;
|
|
}
|
|
|
|
|
|
std::string& UTF8::toUpperInPlace(std::string& str)
|
|
{
|
|
std::string result;
|
|
TextConverter converter(utf8, utf8);
|
|
converter.convert(str, result, Unicode::toUpper);
|
|
std::swap(str, result);
|
|
return str;
|
|
}
|
|
|
|
|
|
std::string UTF8::toLower(const std::string& str)
|
|
{
|
|
std::string result;
|
|
TextConverter converter(utf8, utf8);
|
|
converter.convert(str, result, Unicode::toLower);
|
|
return result;
|
|
}
|
|
|
|
|
|
std::string& UTF8::toLowerInPlace(std::string& str)
|
|
{
|
|
std::string result;
|
|
TextConverter converter(utf8, utf8);
|
|
converter.convert(str, result, Unicode::toLower);
|
|
std::swap(str, result);
|
|
return str;
|
|
}
|
|
|
|
|
|
// Removes a leading UTF-8 byte order mark (the byte sequence
// 0xEF 0xBB 0xBF) from str, if present. Strings that are shorter than
// three bytes or do not start with that sequence are left untouched.
void UTF8::removeBOM(std::string& str)
{
	static const unsigned char BOM[] = {0xEF, 0xBB, 0xBF};
	const std::string::size_type bomSize = sizeof(BOM);

	if (str.size() < bomSize) return;

	for (std::string::size_type i = 0; i < bomSize; ++i)
	{
		if (static_cast<unsigned char>(str[i]) != BOM[i]) return;
	}

	str.erase(0, bomSize);
}
|
|
|
|
|
|
std::string UTF8::escape(const std::string &s, bool strictJSON)
|
|
{
|
|
return escape(s.begin(), s.end(), strictJSON);
|
|
}
|
|
|
|
|
|
std::string UTF8::escape(const std::string::const_iterator& begin, const std::string::const_iterator& end, bool strictJSON, bool lowerCaseHex)
|
|
{
|
|
static Poco::UInt32 offsetsFromUTF8[6] = {
|
|
0x00000000UL, 0x00003080UL, 0x000E2080UL,
|
|
0x03C82080UL, 0xFA082080UL, 0x82082080UL
|
|
};
|
|
|
|
std::string result;
|
|
|
|
std::string::const_iterator it = begin;
|
|
|
|
while(it != end)
|
|
{
|
|
Poco::UInt32 ch = 0;
|
|
unsigned int sz = 0;
|
|
|
|
do
|
|
{
|
|
ch <<= 6;
|
|
ch += (unsigned char)*it++;
|
|
sz++;
|
|
}
|
|
while (it != end && (*it & 0xC0) == 0x80 && sz < 6);
|
|
ch -= offsetsFromUTF8[sz-1];
|
|
|
|
if (ch == '\n') result += "\\n";
|
|
else if (ch == '\t') result += "\\t";
|
|
else if (ch == '\r') result += "\\r";
|
|
else if (ch == '\b') result += "\\b";
|
|
else if (ch == '\f') result += "\\f";
|
|
else if (ch == '\v') result += (strictJSON ? (lowerCaseHex ? "\\u000b" : "\\u000B") : "\\v");
|
|
else if (ch == '\a') result += (strictJSON ? "\\u0007" : "\\a");
|
|
else if (ch == '\\') result += "\\\\";
|
|
else if (ch == '\"') result += "\\\"";
|
|
else if (ch == '/') result += "\\/";
|
|
else if (ch == '\0') result += "\\u0000";
|
|
else if (ch < 32 || ch == 0x7f)
|
|
{
|
|
result += "\\u";
|
|
NumberFormatter::appendHex(result, (unsigned short) ch, 4, lowerCaseHex);
|
|
}
|
|
else if (ch > 0xFFFF)
|
|
{
|
|
ch -= 0x10000;
|
|
result += "\\u";
|
|
NumberFormatter::appendHex(result, (unsigned short) (( ch >> 10 ) & 0x03ff ) + 0xd800, 4, lowerCaseHex);
|
|
result += "\\u";
|
|
NumberFormatter::appendHex(result, (unsigned short) (ch & 0x03ff ) + 0xdc00, 4, lowerCaseHex);
|
|
}
|
|
else if (ch >= 0x80 && ch <= 0xFFFF)
|
|
{
|
|
result += "\\u";
|
|
NumberFormatter::appendHex(result, (unsigned short) ch, 4, lowerCaseHex);
|
|
}
|
|
else
|
|
{
|
|
result += (char) ch;
|
|
}
|
|
}
|
|
return result;
|
|
}
|
|
|
|
|
|
std::string UTF8::unescape(const std::string &s)
|
|
{
|
|
return unescape(s.begin(), s.end());
|
|
}
|
|
|
|
|
|
std::string UTF8::unescape(const std::string::const_iterator& begin, const std::string::const_iterator& end)
|
|
{
|
|
std::string result;
|
|
|
|
std::string::const_iterator it = begin;
|
|
|
|
while (it != end)
|
|
{
|
|
Poco::UInt32 ch = (Poco::UInt32) *it++;
|
|
|
|
if (ch == '\\')
|
|
{
|
|
if ( it == end )
|
|
{
|
|
//Invalid sequence!
|
|
}
|
|
|
|
switch (*it)
|
|
{
|
|
case 'U':
|
|
{
|
|
char digs[9];
|
|
std::memset(digs, 0, 9);
|
|
unsigned int dno = 0;
|
|
|
|
it++;
|
|
while (it != end && Ascii::isHexDigit(*it) && dno < 8)
|
|
{
|
|
digs[dno++] = *it++;
|
|
}
|
|
if (dno > 0)
|
|
{
|
|
ch = std::strtol(digs, NULL, 16);
|
|
}
|
|
break;
|
|
}
|
|
case '\\':
|
|
{
|
|
ch = '\\';
|
|
it++;
|
|
break;
|
|
}
|
|
case 'n':
|
|
{
|
|
ch = '\n';
|
|
it++;
|
|
break;
|
|
}
|
|
case 't':
|
|
{
|
|
ch = '\t';
|
|
it++;
|
|
break;
|
|
}
|
|
case 'r':
|
|
{
|
|
ch = '\r';
|
|
it++;
|
|
break;
|
|
}
|
|
case 'b':
|
|
{
|
|
ch = '\b';
|
|
it++;
|
|
break;
|
|
}
|
|
case 'f':
|
|
{
|
|
ch = '\f';
|
|
it++;
|
|
break;
|
|
}
|
|
case 'v':
|
|
{
|
|
ch = '\v';
|
|
it++;
|
|
break;
|
|
}
|
|
case 'a':
|
|
{
|
|
ch = '\a';
|
|
it++;
|
|
break;
|
|
}
|
|
case 'u':
|
|
{
|
|
char digs[5];
|
|
std::memset(digs, 0, 5);
|
|
unsigned int dno = 0;
|
|
|
|
it++;
|
|
|
|
while (it != end && Ascii::isHexDigit(*it) && dno < 4) digs[dno++] = *it++;
|
|
if (dno > 0)
|
|
{
|
|
ch = std::strtol(digs, NULL, 16);
|
|
}
|
|
|
|
if( ch >= 0xD800 && ch <= 0xDBFF )
|
|
{
|
|
if ( it == end || *it != '\\' )
|
|
{
|
|
//Invalid sequence!
|
|
}
|
|
else
|
|
{
|
|
it++;
|
|
if ( it == end || *it != 'u' )
|
|
{
|
|
//Invalid sequence!
|
|
}
|
|
else
|
|
{
|
|
it++;
|
|
}
|
|
}
|
|
|
|
// UTF-16 surrogate pair. Go fetch other half
|
|
std::memset(digs, 0, 5);
|
|
dno = 0;
|
|
while (it != end && Ascii::isHexDigit(*it) && dno < 4) digs[dno++] = *it++;
|
|
if (dno > 0)
|
|
{
|
|
Poco::UInt32 temp = std::strtol(digs, NULL, 16);
|
|
if( temp >= 0xDC00 && temp <= 0xDFFF )
|
|
{
|
|
ch = ( ( ( ch - 0xD800 ) << 10 ) | ( temp - 0xDC00 ) ) + 0x10000;
|
|
}
|
|
}
|
|
}
|
|
break;
|
|
}
|
|
default:
|
|
{
|
|
//Invalid sequence!
|
|
break;
|
|
}
|
|
}//end switch
|
|
}
|
|
|
|
unsigned char utf8[4];
|
|
UTF8Encoding encoding;
|
|
int sz = encoding.convert(ch, utf8, 4);
|
|
result.append((char*) utf8, sz);
|
|
}
|
|
|
|
return result;
|
|
}
|
|
|
|
|
|
namespace
|
|
{
|
|
std::string doNormalize(const char* str, std::size_t size, utf8proc_option_t options)
|
|
{
|
|
utf8proc_ssize_t n = utf8proc_decompose_custom(reinterpret_cast<const utf8proc_uint8_t*>(str), size, NULL, 0, options, NULL, NULL);
|
|
if (n < 0) throw Poco::RuntimeException("Normalization decompose failed"s, utf8proc_errmsg(n));
|
|
|
|
Poco::Buffer<utf8proc_int32_t> buffer(n + 1); // utf8proc_reencode() needs space for terminating NUL
|
|
n = utf8proc_decompose_custom(reinterpret_cast<const utf8proc_uint8_t*>(str), size, buffer.begin(), n, options, NULL, NULL);
|
|
if (n < 0) throw Poco::RuntimeException("Normalization decompose failed"s, utf8proc_errmsg(n));
|
|
|
|
n = utf8proc_reencode(buffer.begin(), n, options);
|
|
if (n < 0) throw Poco::RuntimeException("Normalization reeencode failed"s, utf8proc_errmsg(n));
|
|
|
|
return std::string(reinterpret_cast<char*>(buffer.begin()), n);
|
|
}
|
|
|
|
// Maps a UTF8::NormalizationForm to the corresponding combination of
// utf8proc option flags. Returns 0 for a value that is not one of the
// four known forms.
int formToOptions(UTF8::NormalizationForm form)
{
	if (form == UTF8::NORMALIZATION_FORM_D)
		return UTF8PROC_STABLE | UTF8PROC_DECOMPOSE;

	if (form == UTF8::NORMALIZATION_FORM_C)
		return UTF8PROC_STABLE | UTF8PROC_COMPOSE;

	if (form == UTF8::NORMALIZATION_FORM_KD)
		return UTF8PROC_STABLE | UTF8PROC_DECOMPOSE | UTF8PROC_COMPAT;

	if (form == UTF8::NORMALIZATION_FORM_KC)
		return UTF8PROC_STABLE | UTF8PROC_COMPOSE | UTF8PROC_COMPAT;

	return 0;
}
|
|
}
|
|
|
|
|
|
std::string UTF8::normalize(const std::string& s, NormalizationForm form)
|
|
{
|
|
return doNormalize(s.data(), s.size(), static_cast<utf8proc_option_t>(formToOptions(form)));
|
|
}
|
|
|
|
|
|
std::string UTF8::normalize(const std::string::const_iterator& begin, const std::string::const_iterator& end, NormalizationForm form)
|
|
{
|
|
return doNormalize(&*begin, static_cast<std::size_t>(std::distance(begin, end)), static_cast<utf8proc_option_t>(formToOptions(form)));
|
|
}
|
|
|
|
|
|
} // namespace Poco
|