mirror of
				https://github.com/pocoproject/poco.git
				synced 2025-10-25 18:22:59 +02:00 
			
		
		
		
	
		
			
				
	
	
		
			328 lines
		
	
	
		
			6.1 KiB
		
	
	
	
		
			C++
		
	
	
	
	
	
			
		
		
	
	
			328 lines
		
	
	
		
			6.1 KiB
		
	
	
	
		
			C++
		
	
	
	
	
	
| //
 | |
| // Unicode.h
 | |
| //
 | |
| // Library: Foundation
 | |
| // Package: Text
 | |
| // Module:  Unicode
 | |
| //
 | |
| // Definition of the Unicode class.
 | |
| //
 | |
| // Copyright (c) 2007, Applied Informatics Software Engineering GmbH.
 | |
| // and Contributors.
 | |
| //
 | |
| // SPDX-License-Identifier:	BSL-1.0
 | |
| //
 | |
| 
 | |
| 
 | |
| #ifndef Foundation_Unicode_INCLUDED
 | |
| #define Foundation_Unicode_INCLUDED
 | |
| 
 | |
| 
 | |
| #include "Poco/Foundation.h"
 | |
| 
 | |
| 
 | |
| namespace Poco {
 | |
| 
 | |
| 
 | |
| class Foundation_API Unicode
 | |
| 	/// This class contains enumerations and static
 | |
| 	/// utility functions for dealing with Unicode characters
 | |
| 	/// and their properties.
 | |
| 	///
 | |
| 	/// For more information on Unicode, see <http://www.unicode.org>.
 | |
| 	///
 | |
| 	/// The implementation is based on the Unicode support
 | |
| 	/// functions in PCRE.
 | |
| {
 | |
| public:
 | |
| 	// Implementation note: the following definitions must be kept
 | |
| 	// in sync with those from ucp.h (PCRE).
 | |
| 	enum CharacterCategory
 | |
| 		/// Unicode character categories.
 | |
| 	{
 | |
| 		UCP_OTHER,
 | |
| 		UCP_LETTER,
 | |
| 		UCP_MARK,
 | |
| 		UCP_NUMBER,
 | |
| 		UCP_PUNCTUATION,
 | |
| 		UCP_SYMBOL,
 | |
| 		UCP_SEPARATOR
 | |
| 	};
 | |
| 
 | |
| 	enum CharacterType
 | |
| 		/// Unicode character types.
 | |
| 	{
 | |
| 		UCP_CONTROL,
 | |
| 		UCP_FORMAT,
 | |
| 		UCP_UNASSIGNED,
 | |
| 		UCP_PRIVATE_USE,
 | |
| 		UCP_SURROGATE,
 | |
| 		UCP_LOWER_CASE_LETTER,
 | |
| 		UCP_MODIFIER_LETTER,
 | |
| 		UCP_OTHER_LETTER,
 | |
| 		UCP_TITLE_CASE_LETTER,
 | |
| 		UCP_UPPER_CASE_LETTER,
 | |
| 		UCP_SPACING_MARK,
 | |
| 		UCP_ENCLOSING_MARK,
 | |
| 		UCP_NON_SPACING_MARK,
 | |
| 		UCP_DECIMAL_NUMBER,
 | |
| 		UCP_LETTER_NUMBER,
 | |
| 		UCP_OTHER_NUMBER,
 | |
| 		UCP_CONNECTOR_PUNCTUATION,
 | |
| 		UCP_DASH_PUNCTUATION,
 | |
| 		UCP_CLOSE_PUNCTUATION,
 | |
| 		UCP_FINAL_PUNCTUATION,
 | |
| 		UCP_INITIAL_PUNCTUATION,
 | |
| 		UCP_OTHER_PUNCTUATION,
 | |
| 		UCP_OPEN_PUNCTUATION,
 | |
| 		UCP_CURRENCY_SYMBOL,
 | |
| 		UCP_MODIFIER_SYMBOL,
 | |
| 		UCP_MATHEMATICAL_SYMBOL,
 | |
| 		UCP_OTHER_SYMBOL,
 | |
| 		UCP_LINE_SEPARATOR,
 | |
| 		UCP_PARAGRAPH_SEPARATOR,
 | |
| 		UCP_SPACE_SEPARATOR
 | |
| 	};
 | |
| 	
 | |
| 	enum Script
 | |
| 		/// Unicode 7.0 script identifiers.
 | |
| 	{
 | |
| 		UCP_ARABIC,
 | |
| 		UCP_ARMENIAN,
 | |
| 		UCP_BENGALI,
 | |
| 		UCP_BOPOMOFO,
 | |
| 		UCP_BRAILLE,
 | |
| 		UCP_BUGINESE,
 | |
| 		UCP_BUHID,
 | |
| 		UCP_CANADIAN_ABORIGINAL,
 | |
| 		UCP_CHEROKEE,
 | |
| 		UCP_COMMON,
 | |
| 		UCP_COPTIC,
 | |
| 		UCP_CYPRIOT,
 | |
| 		UCP_CYRILLIC,
 | |
| 		UCP_DESERET,
 | |
| 		UCP_DEVANAGARI,
 | |
| 		UCP_ETHIOPIC,
 | |
| 		UCP_GEORGIAN,
 | |
| 		UCP_GLAGOLITIC,
 | |
| 		UCP_GOTHIC,
 | |
| 		UCP_GREEK,
 | |
| 		UCP_GUJARATI,
 | |
| 		UCP_GURMUKHI,
 | |
| 		UCP_HAN,
 | |
| 		UCP_HANGUL,
 | |
| 		UCP_HANUNOO,
 | |
| 		UCP_HEBREW,
 | |
| 		UCP_HIRAGANA,
 | |
| 		UCP_INHERITED,
 | |
| 		UCP_KANNADA,
 | |
| 		UCP_KATAKANA,
 | |
| 		UCP_KHAROSHTHI,
 | |
| 		UCP_KHMER,
 | |
| 		UCP_LAO,
 | |
| 		UCP_LATIN,
 | |
| 		UCP_LIMBU,
 | |
| 		UCP_LINEAR_B,
 | |
| 		UCP_MALAYALAM,
 | |
| 		UCP_MONGOLIAN,
 | |
| 		UCP_MYANMAR,
 | |
| 		UCP_NEW_TAI_LUE,
 | |
| 		UCP_OGHAM,
 | |
| 		UCP_OLD_ITALIC,
 | |
| 		UCP_OLD_PERSIAN,
 | |
| 		UCP_ORIYA,
 | |
| 		UCP_OSMANYA,
 | |
| 		UCP_RUNIC,
 | |
| 		UCP_SHAVIAN,
 | |
| 		UCP_SINHALA,
 | |
| 		UCP_SYLOTI_NAGRI,
 | |
| 		UCP_SYRIAC,
 | |
| 		UCP_TAGALOG,
 | |
| 		UCP_TAGBANWA,
 | |
| 		UCP_TAI_LE,
 | |
| 		UCP_TAMIL,
 | |
| 		UCP_TELUGU,
 | |
| 		UCP_THAANA,
 | |
| 		UCP_THAI,
 | |
| 		UCP_TIBETAN,
 | |
| 		UCP_TIFINAGH,
 | |
| 		UCP_UGARITIC,
 | |
| 		UCP_YI,
 | |
| 		// Unicode 5.0
 | |
| 		UCP_BALINESE,
 | |
| 		UCP_CUNEIFORM,
 | |
| 		UCP_NKO,
 | |
| 		UCP_PHAGS_PA,
 | |
| 		UCP_PHOENICIAN,
 | |
| 		// Unicode 5.1
 | |
| 		UCP_CARIAN,
 | |
| 		UCP_CHAM,
 | |
| 		UCP_KAYAH_LI,
 | |
| 		UCP_LEPCHA,
 | |
| 		UCP_LYCIAN,
 | |
| 		UCP_LYDIAN,
 | |
| 		UCP_OL_CHIKI,
 | |
| 		UCP_REJANG,
 | |
| 		UCP_SAURASHTRA,
 | |
| 		UCP_SUNDANESE,
 | |
| 		UCP_VAI,
 | |
| 		// Unicode 5.2
 | |
| 		UCP_AVESTAN,
 | |
| 		UCP_BAMUM,
 | |
| 		UCP_EGYPTIAN_HIEROGLYPHS,
 | |
| 		UCP_IMPERIAL_ARAMAIC,
 | |
| 		UCP_INSCRIPTIONAL_PAHLAVI,
 | |
| 		UCP_INSCRIPTIONAL_PARTHIAN,
 | |
| 		UCP_JAVANESE,
 | |
| 		UCP_KAITHI,
 | |
| 		UCP_LISU,
 | |
| 		UCP_MEETEI_MAYEK,
 | |
| 		UCP_OLD_SOUTH_ARABIAN,
 | |
| 		UCP_OLD_TURKIC,
 | |
| 		UCP_SAMARITAN,
 | |
| 		UCP_TAI_THAM,
 | |
| 		UCP_TAI_VIET,
 | |
| 		// Unicode 6.0
 | |
| 		UCP_BATAK,
 | |
| 		UCP_BRAHMI,
 | |
| 		UCP_MANDAIC,
 | |
| 		// Unicode 6.1
 | |
| 		UCP_CHAKMA,
 | |
| 		UCP_MEROITIC_CURSIVE,
 | |
| 		UCP_MEROITIC_HIEROGLYPHS,
 | |
| 		UCP_MIAO,
 | |
| 		UCP_SHARADA,
 | |
| 		UCP_SORA_SOMPENG,
 | |
| 		UCP_TAKRI,
 | |
| 		// Unicode 7.0
 | |
| 		UCP_BASSA_VAH,
 | |
| 		UCP_CAUCASIAN_ALBANIAN,
 | |
| 		UCP_DUPLOYAN,
 | |
| 		UCP_ELBASAN,
 | |
| 		UCP_GRANTHA,
 | |
| 		UCP_KHOJKI,
 | |
| 		UCP_KHUDAWADI,
 | |
| 		UCP_LINEAR_A,
 | |
| 		UCP_MAHAJANI,
 | |
| 		UCP_MANICHAEAN,
 | |
| 		UCP_MENDE_KIKAKUI,
 | |
| 		UCP_MODI,
 | |
| 		UCP_MRO,
 | |
| 		UCP_NABATAEAN,
 | |
| 		UCP_OLD_NORTH_ARABIAN,
 | |
| 		UCP_OLD_PERMIC,
 | |
| 		UCP_PAHAWH_HMONG,
 | |
| 		UCP_PALMYRENE,
 | |
| 		UCP_PSALTER_PAHLAVI,
 | |
| 		UCP_PAU_CIN_HAU,
 | |
| 		UCP_SIDDHAM,
 | |
| 		UCP_TIRHUTA,
 | |
| 		UCP_WARANG_CITI
 | |
| 	};
 | |
| 	
 | |
| 	enum
 | |
| 	{
 | |
| 		UCP_MAX_CODEPOINT = 0x10FFFF
 | |
| 	};
 | |
| 	
 | |
| 	struct CharacterProperties
 | |
| 		/// This structure holds the character properties
 | |
| 		/// of an Unicode character.
 | |
| 	{
 | |
| 		CharacterCategory category;
 | |
| 		CharacterType     type;
 | |
| 		Script            script;
 | |
| 	};
 | |
| 
 | |
| 	static void properties(int ch, CharacterProperties& props);
 | |
| 		/// Return the Unicode character properties for the
 | |
| 		/// character with the given Unicode value.
 | |
| 		
 | |
| 	static bool isSpace(int ch);
 | |
| 		/// Returns true iff the given character is a separator.
 | |
| 		
 | |
| 	static bool isDigit(int ch);
 | |
| 		/// Returns true iff the given character is a numeric character.
 | |
| 		
 | |
| 	static bool isPunct(int ch);
 | |
| 		/// Returns true iff the given character is a punctuation character.
 | |
| 		
 | |
| 	static bool isAlpha(int ch);
 | |
| 		/// Returns true iff the given character is a letter.	
 | |
| 		
 | |
| 	static bool isLower(int ch);
 | |
| 		/// Returns true iff the given character is a lowercase
 | |
| 		/// character.
 | |
| 		
 | |
| 	static bool isUpper(int ch);
 | |
| 		/// Returns true iff the given character is an uppercase
 | |
| 		/// character.
 | |
| 		
 | |
| 	static int toLower(int ch);
 | |
| 		/// If the given character is an uppercase character,
 | |
| 		/// return its lowercase counterpart, otherwise return
 | |
| 		/// the character.
 | |
| 
 | |
| 	static int toUpper(int ch);
 | |
| 		/// If the given character is a lowercase character,
 | |
| 		/// return its uppercase counterpart, otherwise return
 | |
| 		/// the character.
 | |
| };
 | |
| 
 | |
| 
 | |
| //
 | |
| // inlines
 | |
| //
 | |
| inline bool Unicode::isSpace(int ch)
 | |
| {
 | |
| 	CharacterProperties props;
 | |
| 	properties(ch, props);
 | |
| 	return props.category == UCP_SEPARATOR;
 | |
| }
 | |
| 
 | |
| 
 | |
| inline bool Unicode::isDigit(int ch)
 | |
| {
 | |
| 	CharacterProperties props;
 | |
| 	properties(ch, props);
 | |
| 	return props.category == UCP_NUMBER;
 | |
| }
 | |
| 
 | |
| 
 | |
| inline bool Unicode::isPunct(int ch)
 | |
| {
 | |
| 	CharacterProperties props;
 | |
| 	properties(ch, props);
 | |
| 	return props.category == UCP_PUNCTUATION;
 | |
| }
 | |
| 
 | |
| 
 | |
| inline bool Unicode::isAlpha(int ch)
 | |
| {
 | |
| 	CharacterProperties props;
 | |
| 	properties(ch, props);
 | |
| 	return props.category == UCP_LETTER;
 | |
| }
 | |
| 
 | |
| 
 | |
| inline bool Unicode::isLower(int ch)
 | |
| {
 | |
| 	CharacterProperties props;
 | |
| 	properties(ch, props);
 | |
| 	return props.category == UCP_LETTER && props.type == UCP_LOWER_CASE_LETTER;
 | |
| }
 | |
| 
 | |
| 	
 | |
| inline bool Unicode::isUpper(int ch)
 | |
| {
 | |
| 	CharacterProperties props;
 | |
| 	properties(ch, props);
 | |
| 	return props.category == UCP_LETTER && props.type == UCP_UPPER_CASE_LETTER;
 | |
| }
 | |
| 
 | |
| 
 | |
| } // namespace Poco
 | |
| 
 | |
| 
 | |
| #endif // Foundation_Unicode_INCLUDED
 | 
