ClickHouse/base/poco/Foundation/include/Poco/Unicode.h

//
// Unicode.h
//
// Library: Foundation
// Package: Text
// Module:  Unicode
//
// Definition of the Unicode class.
//
// Copyright (c) 2007, Applied Informatics Software Engineering GmbH.
// and Contributors.
//
// SPDX-License-Identifier:	BSL-1.0
//


#ifndef Foundation_Unicode_INCLUDED
#define Foundation_Unicode_INCLUDED


#include "Poco/Foundation.h"


namespace Poco {


class Foundation_API Unicode
	/// This class contains enumerations and static
	/// utility functions for dealing with Unicode characters
	/// and their properties.
	///
	/// For more information on Unicode, see <http://www.unicode.org>.
	///
	/// The implementation is based on the Unicode support
	/// functions in PCRE.
{
public:
	// Implementation note: the following definitions must be kept
	// in sync with those from ucp.h (PCRE).
	enum CharacterCategory
		/// Unicode character categories.
	{
		UCP_OTHER,
		UCP_LETTER,
		UCP_MARK,
		UCP_NUMBER,
		UCP_PUNCTUATION,
		UCP_SYMBOL,
		UCP_SEPARATOR
	};

	enum CharacterType
		/// Unicode character types.
	{
		UCP_CONTROL,
		UCP_FORMAT,
		UCP_UNASSIGNED,
		UCP_PRIVATE_USE,
		UCP_SURROGATE,
		UCP_LOWER_CASE_LETTER,
		UCP_MODIFIER_LETTER,
		UCP_OTHER_LETTER,
		UCP_TITLE_CASE_LETTER,
		UCP_UPPER_CASE_LETTER,
		UCP_SPACING_MARK,
		UCP_ENCLOSING_MARK,
		UCP_NON_SPACING_MARK,
		UCP_DECIMAL_NUMBER,
		UCP_LETTER_NUMBER,
		UCP_OTHER_NUMBER,
		UCP_CONNECTOR_PUNCTUATION,
		UCP_DASH_PUNCTUATION,
		UCP_CLOSE_PUNCTUATION,
		UCP_FINAL_PUNCTUATION,
		UCP_INITIAL_PUNCTUATION,
		UCP_OTHER_PUNCTUATION,
		UCP_OPEN_PUNCTUATION,
		UCP_CURRENCY_SYMBOL,
		UCP_MODIFIER_SYMBOL,
		UCP_MATHEMATICAL_SYMBOL,
		UCP_OTHER_SYMBOL,
		UCP_LINE_SEPARATOR,
		UCP_PARAGRAPH_SEPARATOR,
		UCP_SPACE_SEPARATOR
	};

	enum Script
		/// Unicode 7.0 script identifiers.
	{
		UCP_ARABIC,
		UCP_ARMENIAN,
		UCP_BENGALI,
		UCP_BOPOMOFO,
		UCP_BRAILLE,
		UCP_BUGINESE,
		UCP_BUHID,
		UCP_CANADIAN_ABORIGINAL,
		UCP_CHEROKEE,
		UCP_COMMON,
		UCP_COPTIC,
		UCP_CYPRIOT,
		UCP_CYRILLIC,
		UCP_DESERET,
		UCP_DEVANAGARI,
		UCP_ETHIOPIC,
		UCP_GEORGIAN,
		UCP_GLAGOLITIC,
		UCP_GOTHIC,
		UCP_GREEK,
		UCP_GUJARATI,
		UCP_GURMUKHI,
		UCP_HAN,
		UCP_HANGUL,
		UCP_HANUNOO,
		UCP_HEBREW,
		UCP_HIRAGANA,
		UCP_INHERITED,
		UCP_KANNADA,
		UCP_KATAKANA,
		UCP_KHAROSHTHI,
		UCP_KHMER,
		UCP_LAO,
		UCP_LATIN,
		UCP_LIMBU,
		UCP_LINEAR_B,
		UCP_MALAYALAM,
		UCP_MONGOLIAN,
		UCP_MYANMAR,
		UCP_NEW_TAI_LUE,
		UCP_OGHAM,
		UCP_OLD_ITALIC,
		UCP_OLD_PERSIAN,
		UCP_ORIYA,
		UCP_OSMANYA,
		UCP_RUNIC,
		UCP_SHAVIAN,
		UCP_SINHALA,
		UCP_SYLOTI_NAGRI,
		UCP_SYRIAC,
		UCP_TAGALOG,
		UCP_TAGBANWA,
		UCP_TAI_LE,
		UCP_TAMIL,
		UCP_TELUGU,
		UCP_THAANA,
		UCP_THAI,
		UCP_TIBETAN,
		UCP_TIFINAGH,
		UCP_UGARITIC,
		UCP_YI,
		// Unicode 5.0
		UCP_BALINESE,
		UCP_CUNEIFORM,
		UCP_NKO,
		UCP_PHAGS_PA,
		UCP_PHOENICIAN,
		// Unicode 5.1
		UCP_CARIAN,
		UCP_CHAM,
		UCP_KAYAH_LI,
		UCP_LEPCHA,
		UCP_LYCIAN,
		UCP_LYDIAN,
		UCP_OL_CHIKI,
		UCP_REJANG,
		UCP_SAURASHTRA,
		UCP_SUNDANESE,
		UCP_VAI,
		// Unicode 5.2
		UCP_AVESTAN,
		UCP_BAMUM,
		UCP_EGYPTIAN_HIEROGLYPHS,
		UCP_IMPERIAL_ARAMAIC,
		UCP_INSCRIPTIONAL_PAHLAVI,
		UCP_INSCRIPTIONAL_PARTHIAN,
		UCP_JAVANESE,
		UCP_KAITHI,
		UCP_LISU,
		UCP_MEETEI_MAYEK,
		UCP_OLD_SOUTH_ARABIAN,
		UCP_OLD_TURKIC,
		UCP_SAMARITAN,
		UCP_TAI_THAM,
		UCP_TAI_VIET,
		// Unicode 6.0
		UCP_BATAK,
		UCP_BRAHMI,
		UCP_MANDAIC,
		// Unicode 6.1
		UCP_CHAKMA,
		UCP_MEROITIC_CURSIVE,
		UCP_MEROITIC_HIEROGLYPHS,
		UCP_MIAO,
		UCP_SHARADA,
		UCP_SORA_SOMPENG,
		UCP_TAKRI,
		// Unicode 7.0
		UCP_BASSA_VAH,
		UCP_CAUCASIAN_ALBANIAN,
		UCP_DUPLOYAN,
		UCP_ELBASAN,
		UCP_GRANTHA,
		UCP_KHOJKI,
		UCP_KHUDAWADI,
		UCP_LINEAR_A,
		UCP_MAHAJANI,
		UCP_MANICHAEAN,
		UCP_MENDE_KIKAKUI,
		UCP_MODI,
		UCP_MRO,
		UCP_NABATAEAN,
		UCP_OLD_NORTH_ARABIAN,
		UCP_OLD_PERMIC,
		UCP_PAHAWH_HMONG,
		UCP_PALMYRENE,
		UCP_PSALTER_PAHLAVI,
		UCP_PAU_CIN_HAU,
		UCP_SIDDHAM,
		UCP_TIRHUTA,
		UCP_WARANG_CITI
	};

	enum
	{
		UCP_MAX_CODEPOINT = 0x10FFFF
	};

	struct CharacterProperties
		/// This structure holds the character properties
		/// of an Unicode character.
	{
		CharacterCategory category;
		CharacterType     type;
		Script            script;
	};

	static void properties(int ch, CharacterProperties& props);
		/// Return the Unicode character properties for the
		/// character with the given Unicode value.

	static bool isSpace(int ch);
		/// Returns true iff the given character is a separator.

	static bool isDigit(int ch);
		/// Returns true iff the given character is a numeric character.

	static bool isPunct(int ch);
		/// Returns true iff the given character is a punctuation character.

	static bool isAlpha(int ch);
		/// Returns true iff the given character is a letter.

	static bool isLower(int ch);
		/// Returns true iff the given character is a lowercase
		/// character.

	static bool isUpper(int ch);
		/// Returns true iff the given character is an uppercase
		/// character.

	static int toLower(int ch);
		/// If the given character is an uppercase character,
		/// return its lowercase counterpart, otherwise return
		/// the character.

	static int toUpper(int ch);
		/// If the given character is a lowercase character,
		/// return its uppercase counterpart, otherwise return
		/// the character.
};


//
// inlines
//
inline bool Unicode::isSpace(int ch)
{
	CharacterProperties props;
	properties(ch, props);
	return props.category == UCP_SEPARATOR;
}


inline bool Unicode::isDigit(int ch)
{
	CharacterProperties props;
	properties(ch, props);
	return props.category == UCP_NUMBER;
}


inline bool Unicode::isPunct(int ch)
{
	CharacterProperties props;
	properties(ch, props);
	return props.category == UCP_PUNCTUATION;
}


inline bool Unicode::isAlpha(int ch)
{
	CharacterProperties props;
	properties(ch, props);
	return props.category == UCP_LETTER;
}


inline bool Unicode::isLower(int ch)
{
	CharacterProperties props;
	properties(ch, props);
	return props.category == UCP_LETTER && props.type == UCP_LOWER_CASE_LETTER;
}


inline bool Unicode::isUpper(int ch)
{
	CharacterProperties props;
	properties(ch, props);
	return props.category == UCP_LETTER && props.type == UCP_UPPER_CASE_LETTER;
}


} // namespace Poco


#endif // Foundation_Unicode_INCLUDED