ClickHouse/base/poco/Foundation/include/Poco/Unicode.h

//
// Unicode.h
//
// Library: Foundation
// Package: Text
// Module:  Unicode
//
// Definition of the Unicode class.
//
// Copyright (c) 2007, Applied Informatics Software Engineering GmbH.
// and Contributors.
//
// SPDX-License-Identifier:	BSL-1.0
//


#ifndef Foundation_Unicode_INCLUDED
#define Foundation_Unicode_INCLUDED


#include "Poco/Foundation.h"


namespace Poco
{


class Foundation_API Unicode
/// This class contains enumerations and static
/// utility functions for dealing with Unicode characters
/// and their properties.
///
/// For more information on Unicode, see <http://www.unicode.org>.
///
/// The implementation is based on the Unicode support
/// functions in PCRE.
///
/// All member functions are static and take the character
/// as an int holding its Unicode value.
{
public:
    // Implementation note: the following definitions must be kept
    // in sync with those from ucp.h (PCRE). The enumerators carry no
    // explicit values, so their numeric values are determined solely
    // by declaration order: do not reorder, insert or remove entries
    // without making the matching change on the PCRE side.
    enum CharacterCategory
    /// Unicode character categories (the coarse classification
    /// stored in CharacterProperties::category).
    {
        UCP_OTHER,
        UCP_LETTER,
        UCP_MARK,
        UCP_NUMBER,
        UCP_PUNCTUATION,
        UCP_SYMBOL,
        UCP_SEPARATOR
    };

    enum CharacterType
    /// Unicode character types (the fine-grained classification
    /// stored in CharacterProperties::type).
    ///
    /// NOTE(review): the names appear to correspond one-to-one to the
    /// Unicode General_Category values; see ucp.h for the
    /// authoritative mapping.
    {
        UCP_CONTROL,
        UCP_FORMAT,
        UCP_UNASSIGNED,
        UCP_PRIVATE_USE,
        UCP_SURROGATE,
        UCP_LOWER_CASE_LETTER,
        UCP_MODIFIER_LETTER,
        UCP_OTHER_LETTER,
        UCP_TITLE_CASE_LETTER,
        UCP_UPPER_CASE_LETTER,
        UCP_SPACING_MARK,
        UCP_ENCLOSING_MARK,
        UCP_NON_SPACING_MARK,
        UCP_DECIMAL_NUMBER,
        UCP_LETTER_NUMBER,
        UCP_OTHER_NUMBER,
        UCP_CONNECTOR_PUNCTUATION,
        UCP_DASH_PUNCTUATION,
        UCP_CLOSE_PUNCTUATION,
        UCP_FINAL_PUNCTUATION,
        UCP_INITIAL_PUNCTUATION,
        UCP_OTHER_PUNCTUATION,
        UCP_OPEN_PUNCTUATION,
        UCP_CURRENCY_SYMBOL,
        UCP_MODIFIER_SYMBOL,
        UCP_MATHEMATICAL_SYMBOL,
        UCP_OTHER_SYMBOL,
        UCP_LINE_SEPARATOR,
        UCP_PARAGRAPH_SEPARATOR,
        UCP_SPACE_SEPARATOR
    };

    enum Script
    /// Unicode 7.0 script identifiers.
    ///
    /// The first group of enumerators is unlabelled; scripts added
    /// afterwards are appended in groups, each introduced by a comment
    /// naming the Unicode version the group is labelled with.
    {
        UCP_ARABIC,
        UCP_ARMENIAN,
        UCP_BENGALI,
        UCP_BOPOMOFO,
        UCP_BRAILLE,
        UCP_BUGINESE,
        UCP_BUHID,
        UCP_CANADIAN_ABORIGINAL,
        UCP_CHEROKEE,
        UCP_COMMON,
        UCP_COPTIC,
        UCP_CYPRIOT,
        UCP_CYRILLIC,
        UCP_DESERET,
        UCP_DEVANAGARI,
        UCP_ETHIOPIC,
        UCP_GEORGIAN,
        UCP_GLAGOLITIC,
        UCP_GOTHIC,
        UCP_GREEK,
        UCP_GUJARATI,
        UCP_GURMUKHI,
        UCP_HAN,
        UCP_HANGUL,
        UCP_HANUNOO,
        UCP_HEBREW,
        UCP_HIRAGANA,
        UCP_INHERITED,
        UCP_KANNADA,
        UCP_KATAKANA,
        UCP_KHAROSHTHI,
        UCP_KHMER,
        UCP_LAO,
        UCP_LATIN,
        UCP_LIMBU,
        UCP_LINEAR_B,
        UCP_MALAYALAM,
        UCP_MONGOLIAN,
        UCP_MYANMAR,
        UCP_NEW_TAI_LUE,
        UCP_OGHAM,
        UCP_OLD_ITALIC,
        UCP_OLD_PERSIAN,
        UCP_ORIYA,
        UCP_OSMANYA,
        UCP_RUNIC,
        UCP_SHAVIAN,
        UCP_SINHALA,
        UCP_SYLOTI_NAGRI,
        UCP_SYRIAC,
        UCP_TAGALOG,
        UCP_TAGBANWA,
        UCP_TAI_LE,
        UCP_TAMIL,
        UCP_TELUGU,
        UCP_THAANA,
        UCP_THAI,
        UCP_TIBETAN,
        UCP_TIFINAGH,
        UCP_UGARITIC,
        UCP_YI,
        // Unicode 5.0
        UCP_BALINESE,
        UCP_CUNEIFORM,
        UCP_NKO,
        UCP_PHAGS_PA,
        UCP_PHOENICIAN,
        // Unicode 5.1
        UCP_CARIAN,
        UCP_CHAM,
        UCP_KAYAH_LI,
        UCP_LEPCHA,
        UCP_LYCIAN,
        UCP_LYDIAN,
        UCP_OL_CHIKI,
        UCP_REJANG,
        UCP_SAURASHTRA,
        UCP_SUNDANESE,
        UCP_VAI,
        // Unicode 5.2
        UCP_AVESTAN,
        UCP_BAMUM,
        UCP_EGYPTIAN_HIEROGLYPHS,
        UCP_IMPERIAL_ARAMAIC,
        UCP_INSCRIPTIONAL_PAHLAVI,
        UCP_INSCRIPTIONAL_PARTHIAN,
        UCP_JAVANESE,
        UCP_KAITHI,
        UCP_LISU,
        UCP_MEETEI_MAYEK,
        UCP_OLD_SOUTH_ARABIAN,
        UCP_OLD_TURKIC,
        UCP_SAMARITAN,
        UCP_TAI_THAM,
        UCP_TAI_VIET,
        // Unicode 6.0
        UCP_BATAK,
        UCP_BRAHMI,
        UCP_MANDAIC,
        // Unicode 6.1
        UCP_CHAKMA,
        UCP_MEROITIC_CURSIVE,
        UCP_MEROITIC_HIEROGLYPHS,
        UCP_MIAO,
        UCP_SHARADA,
        UCP_SORA_SOMPENG,
        UCP_TAKRI,
        // Unicode 7.0
        UCP_BASSA_VAH,
        UCP_CAUCASIAN_ALBANIAN,
        UCP_DUPLOYAN,
        UCP_ELBASAN,
        UCP_GRANTHA,
        UCP_KHOJKI,
        UCP_KHUDAWADI,
        UCP_LINEAR_A,
        UCP_MAHAJANI,
        UCP_MANICHAEAN,
        UCP_MENDE_KIKAKUI,
        UCP_MODI,
        UCP_MRO,
        UCP_NABATAEAN,
        UCP_OLD_NORTH_ARABIAN,
        UCP_OLD_PERMIC,
        UCP_PAHAWH_HMONG,
        UCP_PALMYRENE,
        UCP_PSALTER_PAHLAVI,
        UCP_PAU_CIN_HAU,
        UCP_SIDDHAM,
        UCP_TIRHUTA,
        UCP_WARANG_CITI
    };

    enum
    {
        // Largest code point value (0x10FFFF).
        UCP_MAX_CODEPOINT = 0x10FFFF
    };

    struct CharacterProperties
    /// This structure holds the character properties
    /// of a Unicode character, as filled in by properties().
    {
        CharacterCategory category; // coarse category (letter, number, ...)
        CharacterType type; // fine-grained type within the category
        Script script; // script identifier
    };

    static void properties(int ch, CharacterProperties & props);
    /// Stores the Unicode character properties for the
    /// character with the given Unicode value in props.

    static bool isSpace(int ch);
    /// Returns true iff the given character is a separator
    /// (its category is UCP_SEPARATOR).

    static bool isDigit(int ch);
    /// Returns true iff the given character is a numeric character
    /// (its category is UCP_NUMBER).

    static bool isPunct(int ch);
    /// Returns true iff the given character is a punctuation character
    /// (its category is UCP_PUNCTUATION).

    static bool isAlpha(int ch);
    /// Returns true iff the given character is a letter
    /// (its category is UCP_LETTER).

    static bool isLower(int ch);
    /// Returns true iff the given character is a lowercase
    /// character (category UCP_LETTER and type UCP_LOWER_CASE_LETTER).

    static bool isUpper(int ch);
    /// Returns true iff the given character is an uppercase
    /// character (category UCP_LETTER and type UCP_UPPER_CASE_LETTER).

    static int toLower(int ch);
    /// If the given character is an uppercase character,
    /// return its lowercase counterpart, otherwise return
    /// the character.

    static int toUpper(int ch);
    /// If the given character is a lowercase character,
    /// return its uppercase counterpart, otherwise return
    /// the character.
};


//
// inlines
//
inline bool Unicode::isSpace(int ch)
{
    CharacterProperties props;
    properties(ch, props);
    return props.category == UCP_SEPARATOR;
}


inline bool Unicode::isDigit(int ch)
{
    CharacterProperties props;
    properties(ch, props);
    return props.category == UCP_NUMBER;
}


inline bool Unicode::isPunct(int ch)
{
    CharacterProperties props;
    properties(ch, props);
    return props.category == UCP_PUNCTUATION;
}


inline bool Unicode::isAlpha(int ch)
{
    CharacterProperties props;
    properties(ch, props);
    return props.category == UCP_LETTER;
}


inline bool Unicode::isLower(int ch)
{
    CharacterProperties props;
    properties(ch, props);
    return props.category == UCP_LETTER && props.type == UCP_LOWER_CASE_LETTER;
}


inline bool Unicode::isUpper(int ch)
{
    CharacterProperties props;
    properties(ch, props);
    return props.category == UCP_LETTER && props.type == UCP_UPPER_CASE_LETTER;
}


} // namespace Poco


#endif // Foundation_Unicode_INCLUDED