ClickHouse/contrib/libpoco/Foundation/include/Poco/Unicode.h

//
// Unicode.h
//
// $Id: //poco/1.4/Foundation/include/Poco/Unicode.h#2 $
//
// Library: Foundation
// Package: Text
// Module:  Unicode
//
// Definition of the Unicode class.
//
// Copyright (c) 2007, Applied Informatics Software Engineering GmbH.
// and Contributors.
//
// SPDX-License-Identifier:	BSL-1.0
//


#ifndef Foundation_Unicode_INCLUDED
#define Foundation_Unicode_INCLUDED


#include "Poco/Foundation.h"


namespace Poco {


class Foundation_API Unicode
	/// This class contains enumerations and static
	/// utility functions for dealing with Unicode characters
	/// and their properties.
	///
	/// For more information on Unicode, see <http://www.unicode.org>.
	///
	/// The implementation is based on the Unicode support
	/// functions in PCRE.
{
public:
	// Implementation note: the following definitions must be kept
	// in sync with those from ucp.h (PCRE).
	enum CharacterCategory
		/// Unicode 5.0 character categories.
	{
		UCP_OTHER,
		UCP_LETTER,
		UCP_MARK,
		UCP_NUMBER,
		UCP_PUNCTUATION,
		UCP_SYMBOL,
		UCP_SEPARATOR
	};

	enum CharacterType
		/// Unicode 5.0 character types.
	{
		UCP_CONTROL,
		UCP_FORMAT,
		UCP_UNASSIGNED,
		UCP_PRIVATE_USE,
		UCP_SURROGATE,
		UCP_LOWER_CASE_LETTER,
		UCP_MODIFIER_LETTER,
		UCP_OTHER_LETTER,
		UCP_TITLE_CASE_LETTER,
		UCP_UPPER_CASE_LETTER,
		UCP_SPACING_MARK,
		UCP_ENCLOSING_MARK,
		UCP_NON_SPACING_MARK,
		UCP_DECIMAL_NUMBER,
		UCP_LETTER_NUMBER,
		UCP_OTHER_NUMBER,
		UCP_CONNECTOR_PUNCTUATION,
		UCP_DASH_PUNCTUATION,
		UCP_CLOSE_PUNCTUATION,
		UCP_FINAL_PUNCTUATION,
		UCP_INITIAL_PUNCTUATION,
		UCP_OTHER_PUNCTUATION,
		UCP_OPEN_PUNCTUATION,
		UCP_CURRENCY_SYMBOL,
		UCP_MODIFIER_SYMBOL,
		UCP_MATHEMATICAL_SYMBOL,
		UCP_OTHER_SYMBOL,
		UCP_LINE_SEPARATOR,
		UCP_PARAGRAPH_SEPARATOR,
		UCP_SPACE_SEPARATOR
	};
	
	enum Script
		/// Unicode 5.0 scripts.
	{
		UCP_ARABIC,
		UCP_ARMENIAN,
		UCP_BENGALI,
		UCP_BOPOMOFO,
		UCP_BRAILLE,
		UCP_BUGINESE,
		UCP_BUHID,
		UCP_CANADIAN_ABORIGINAL,
		UCP_CHEROKEE,
		UCP_COMMON,
		UCP_COPTIC,
		UCP_CYPRIOT,
		UCP_CYRILLIC,
		UCP_DESERET,
		UCP_DEVANAGARI,
		UCP_ETHIOPIC,
		UCP_GEORGIAN,
		UCP_GLAGOLITIC,
		UCP_GOTHIC,
		UCP_GREEK,
		UCP_GUJARATI,
		UCP_GURMUKHI,
		UCP_HAN,
		UCP_HANGUL,
		UCP_HANUNOO,
		UCP_HEBREW,
		UCP_HIRAGANA,
		UCP_INHERITED,
		UCP_KANNADA,
		UCP_KATAKANA,
		UCP_KHAROSHTHI,
		UCP_KHMER,
		UCP_LAO,
		UCP_LATIN,
		UCP_LIMBU,
		UCP_LINEAR_B,
		UCP_MALAYALAM,
		UCP_MONGOLIAN,
		UCP_MYANMAR,
		UCP_NEW_TAI_LUE,
		UCP_OGHAM,
		UCP_OLD_ITALIC,
		UCP_OLD_PERSIAN,
		UCP_ORIYA,
		UCP_OSMANYA,
		UCP_RUNIC,
		UCP_SHAVIAN,
		UCP_SINHALA,
		UCP_SYLOTI_NAGRI,
		UCP_SYRIAC,
		UCP_TAGALOG,
		UCP_TAGBANWA,
		UCP_TAI_LE,
		UCP_TAMIL,
		UCP_TELUGU,
		UCP_THAANA,
		UCP_THAI,
		UCP_TIBETAN,
		UCP_TIFINAGH,
		UCP_UGARITIC,
		UCP_YI,
		UCP_BALINESE,
		UCP_CUNEIFORM,
		UCP_NKO,
		UCP_PHAGS_PA,
		UCP_PHOENICIAN,
		UCP_CARIAN,
		UCP_CHAM,
		UCP_KAYAH_LI,
		UCP_LEPCHA,
		UCP_LYCIAN,
		UCP_LYDIAN,
		UCP_OL_CHIKI,
		UCP_REJANG,
		UCP_SAURASHTRA,
		UCP_SUNDANESE,
		UCP_VAI
	};
	
	enum
	{
		UCP_MAX_CODEPOINT = 0x10FFFF
	};
	
	struct CharacterProperties
		/// This structure holds the character properties
		/// of an Unicode character.
	{
		CharacterCategory category;
		CharacterType     type;
		Script            script;
	};

	static void properties(int ch, CharacterProperties& props);
		/// Return the Unicode character properties for the
		/// character with the given Unicode value.
		
	static bool isSpace(int ch);
		/// Returns true iff the given character is a separator.
		
	static bool isDigit(int ch);
		/// Returns true iff the given character is a numeric character.
		
	static bool isPunct(int ch);
		/// Returns true iff the given character is a punctuation character.
		
	static bool isAlpha(int ch);
		/// Returns true iff the given character is a letter.	
		
	static bool isLower(int ch);
		/// Returns true iff the given character is a lowercase
		/// character.
		
	static bool isUpper(int ch);
		/// Returns true iff the given character is an uppercase
		/// character.
		
	static int toLower(int ch);
		/// If the given character is an uppercase character,
		/// return its lowercase counterpart, otherwise return
		/// the character.

	static int toUpper(int ch);
		/// If the given character is a lowercase character,
		/// return its uppercase counterpart, otherwise return
		/// the character.
};


//
// inlines
//
inline bool Unicode::isSpace(int ch)
{
	CharacterProperties props;
	properties(ch, props);
	return props.category == UCP_SEPARATOR;
}


inline bool Unicode::isDigit(int ch)
{
	CharacterProperties props;
	properties(ch, props);
	return props.category == UCP_NUMBER;
}


inline bool Unicode::isPunct(int ch)
{
	CharacterProperties props;
	properties(ch, props);
	return props.category == UCP_PUNCTUATION;
}


inline bool Unicode::isAlpha(int ch)
{
	CharacterProperties props;
	properties(ch, props);
	return props.category == UCP_LETTER;
}


inline bool Unicode::isLower(int ch)
{
	CharacterProperties props;
	properties(ch, props);
	return props.category == UCP_LETTER && props.type == UCP_LOWER_CASE_LETTER;
}

	
inline bool Unicode::isUpper(int ch)
{
	CharacterProperties props;
	properties(ch, props);
	return props.category == UCP_LETTER && props.type == UCP_UPPER_CASE_LETTER;
}


} // namespace Poco


#endif // Foundation_Unicode_INCLUDED
Added Poco library to tree [#METR-19593]. 2016-01-06 22:53:39 +00:00			`//`
			`// Unicode.h`
			`//`
			`// $Id: //poco/1.4/Foundation/include/Poco/Unicode.h#2 $`
			`//`
			`// Library: Foundation`
			`// Package: Text`
			`// Module: Unicode`
			`//`
			`// Definition of the Unicode class.`
			`//`
			`// Copyright (c) 2007, Applied Informatics Software Engineering GmbH.`
			`// and Contributors.`
			`//`
			`// SPDX-License-Identifier: BSL-1.0`
			`//`


			`#ifndef Foundation_Unicode_INCLUDED`
			`#define Foundation_Unicode_INCLUDED`


			`#include "Poco/Foundation.h"`


			`namespace Poco {`


			`class Foundation_API Unicode`
			`/// This class contains enumerations and static`
			`/// utility functions for dealing with Unicode characters`
			`/// and their properties.`
			`///`
			`/// For more information on Unicode, see <http://www.unicode.org>.`
			`///`
			`/// The implementation is based on the Unicode support`
			`/// functions in PCRE.`
			`{`
			`public:`
			`// Implementation note: the following definitions must be kept`
			`// in sync with those from ucp.h (PCRE).`
			`enum CharacterCategory`
			`/// Unicode 5.0 character categories.`
			`{`
			`UCP_OTHER,`
			`UCP_LETTER,`
			`UCP_MARK,`
			`UCP_NUMBER,`
			`UCP_PUNCTUATION,`
			`UCP_SYMBOL,`
			`UCP_SEPARATOR`
			`};`

			`enum CharacterType`
			`/// Unicode 5.0 character types.`
			`{`
			`UCP_CONTROL,`
			`UCP_FORMAT,`
			`UCP_UNASSIGNED,`
			`UCP_PRIVATE_USE,`
			`UCP_SURROGATE,`
			`UCP_LOWER_CASE_LETTER,`
			`UCP_MODIFIER_LETTER,`
			`UCP_OTHER_LETTER,`
			`UCP_TITLE_CASE_LETTER,`
			`UCP_UPPER_CASE_LETTER,`
			`UCP_SPACING_MARK,`
			`UCP_ENCLOSING_MARK,`
			`UCP_NON_SPACING_MARK,`
			`UCP_DECIMAL_NUMBER,`
			`UCP_LETTER_NUMBER,`
			`UCP_OTHER_NUMBER,`
			`UCP_CONNECTOR_PUNCTUATION,`
			`UCP_DASH_PUNCTUATION,`
			`UCP_CLOSE_PUNCTUATION,`
			`UCP_FINAL_PUNCTUATION,`
			`UCP_INITIAL_PUNCTUATION,`
			`UCP_OTHER_PUNCTUATION,`
			`UCP_OPEN_PUNCTUATION,`
			`UCP_CURRENCY_SYMBOL,`
			`UCP_MODIFIER_SYMBOL,`
			`UCP_MATHEMATICAL_SYMBOL,`
			`UCP_OTHER_SYMBOL,`
			`UCP_LINE_SEPARATOR,`
			`UCP_PARAGRAPH_SEPARATOR,`
			`UCP_SPACE_SEPARATOR`
			`};`

			`enum Script`
			`/// Unicode 5.0 scripts.`
			`{`
			`UCP_ARABIC,`
			`UCP_ARMENIAN,`
			`UCP_BENGALI,`
			`UCP_BOPOMOFO,`
			`UCP_BRAILLE,`
			`UCP_BUGINESE,`
			`UCP_BUHID,`
			`UCP_CANADIAN_ABORIGINAL,`
			`UCP_CHEROKEE,`
			`UCP_COMMON,`
			`UCP_COPTIC,`
			`UCP_CYPRIOT,`
			`UCP_CYRILLIC,`
			`UCP_DESERET,`
			`UCP_DEVANAGARI,`
			`UCP_ETHIOPIC,`
			`UCP_GEORGIAN,`
			`UCP_GLAGOLITIC,`
			`UCP_GOTHIC,`
			`UCP_GREEK,`
			`UCP_GUJARATI,`
			`UCP_GURMUKHI,`
			`UCP_HAN,`
			`UCP_HANGUL,`
			`UCP_HANUNOO,`
			`UCP_HEBREW,`
			`UCP_HIRAGANA,`
			`UCP_INHERITED,`
			`UCP_KANNADA,`
			`UCP_KATAKANA,`
			`UCP_KHAROSHTHI,`
			`UCP_KHMER,`
			`UCP_LAO,`
			`UCP_LATIN,`
			`UCP_LIMBU,`
			`UCP_LINEAR_B,`
			`UCP_MALAYALAM,`
			`UCP_MONGOLIAN,`
			`UCP_MYANMAR,`
			`UCP_NEW_TAI_LUE,`
			`UCP_OGHAM,`
			`UCP_OLD_ITALIC,`
			`UCP_OLD_PERSIAN,`
			`UCP_ORIYA,`
			`UCP_OSMANYA,`
			`UCP_RUNIC,`
			`UCP_SHAVIAN,`
			`UCP_SINHALA,`
			`UCP_SYLOTI_NAGRI,`
			`UCP_SYRIAC,`
			`UCP_TAGALOG,`
			`UCP_TAGBANWA,`
			`UCP_TAI_LE,`
			`UCP_TAMIL,`
			`UCP_TELUGU,`
			`UCP_THAANA,`
			`UCP_THAI,`
			`UCP_TIBETAN,`
			`UCP_TIFINAGH,`
			`UCP_UGARITIC,`
			`UCP_YI,`
			`UCP_BALINESE,`
			`UCP_CUNEIFORM,`
			`UCP_NKO,`
			`UCP_PHAGS_PA,`
			`UCP_PHOENICIAN,`
			`UCP_CARIAN,`
			`UCP_CHAM,`
			`UCP_KAYAH_LI,`
			`UCP_LEPCHA,`
			`UCP_LYCIAN,`
			`UCP_LYDIAN,`
			`UCP_OL_CHIKI,`
			`UCP_REJANG,`
			`UCP_SAURASHTRA,`
			`UCP_SUNDANESE,`
			`UCP_VAI`
			`};`

			`enum`
			`{`
			`UCP_MAX_CODEPOINT = 0x10FFFF`
			`};`

			`struct CharacterProperties`
			`/// This structure holds the character properties`
			`/// of an Unicode character.`
			`{`
			`CharacterCategory category;`
			`CharacterType type;`
			`Script script;`
			`};`

			`static void properties(int ch, CharacterProperties& props);`
			`/// Return the Unicode character properties for the`
			`/// character with the given Unicode value.`

			`static bool isSpace(int ch);`
			`/// Returns true iff the given character is a separator.`

			`static bool isDigit(int ch);`
			`/// Returns true iff the given character is a numeric character.`

			`static bool isPunct(int ch);`
			`/// Returns true iff the given character is a punctuation character.`

			`static bool isAlpha(int ch);`
			`/// Returns true iff the given character is a letter.`

			`static bool isLower(int ch);`
			`/// Returns true iff the given character is a lowercase`
			`/// character.`

			`static bool isUpper(int ch);`
			`/// Returns true iff the given character is an uppercase`
			`/// character.`

			`static int toLower(int ch);`
			`/// If the given character is an uppercase character,`
			`/// return its lowercase counterpart, otherwise return`
			`/// the character.`

			`static int toUpper(int ch);`
			`/// If the given character is a lowercase character,`
			`/// return its uppercase counterpart, otherwise return`
			`/// the character.`
			`};`


			`//`
			`// inlines`
			`//`
			`inline bool Unicode::isSpace(int ch)`
			`{`
			`CharacterProperties props;`
			`properties(ch, props);`
			`return props.category == UCP_SEPARATOR;`
			`}`


			`inline bool Unicode::isDigit(int ch)`
			`{`
			`CharacterProperties props;`
			`properties(ch, props);`
			`return props.category == UCP_NUMBER;`
			`}`


			`inline bool Unicode::isPunct(int ch)`
			`{`
			`CharacterProperties props;`
			`properties(ch, props);`
			`return props.category == UCP_PUNCTUATION;`
			`}`


			`inline bool Unicode::isAlpha(int ch)`
			`{`
			`CharacterProperties props;`
			`properties(ch, props);`
			`return props.category == UCP_LETTER;`
			`}`


			`inline bool Unicode::isLower(int ch)`
			`{`
			`CharacterProperties props;`
			`properties(ch, props);`
			`return props.category == UCP_LETTER && props.type == UCP_LOWER_CASE_LETTER;`
			`}`


			`inline bool Unicode::isUpper(int ch)`
			`{`
			`CharacterProperties props;`
			`properties(ch, props);`
			`return props.category == UCP_LETTER && props.type == UCP_UPPER_CASE_LETTER;`
			`}`


			`} // namespace Poco`


			`#endif // Foundation_Unicode_INCLUDED`