/* Mirror of https://github.com/ClickHouse/ClickHouse.git
   (synced 2024-12-17 20:02:05 +00:00; 229 lines, 5.2 KiB, C++). */
/*************************************************
*         Unicode Property Table handler         *
*************************************************/

#ifndef _UCP_H
# define _UCP_H

/* This file contains definitions of the property values that are returned by
the UCD access macros. New values that are added for new releases of Unicode
should always be at the end of each enum, for backwards compatibility.

IMPORTANT: Note also that the specific numeric values of the enums have to be
the same as the values that are generated by the maint/MultiStage2.py script,
where the equivalent property descriptive names are listed in vectors.

ALSO: The specific values of the first two enums are assumed for the table
called catposstab in pcre_compile.c. */

/* These are the general character categories. The enumerators carry no
explicit initializers, so they number upwards from 0 in the order written
(ucp_C == 0 ... ucp_Z == 6). That order must not change: the numeric values
have to agree with externally generated tables, so any new value belongs at
the end. */

enum {
  ucp_C,     /* Other */
  ucp_L,     /* Letter */
  ucp_M,     /* Mark */
  ucp_N,     /* Number */
  ucp_P,     /* Punctuation */
  ucp_S,     /* Symbol */
  ucp_Z      /* Separator */
};

/* These are the particular character categories. The enumerators carry no
explicit initializers, so they number upwards from 0 in the order written
(ucp_Cc == 0 ... ucp_Zs == 29). That order must not change: the numeric values
have to agree with externally generated tables, so any new value belongs at
the end. */

enum {
  ucp_Cc,    /* Control */
  ucp_Cf,    /* Format */
  ucp_Cn,    /* Unassigned */
  ucp_Co,    /* Private use */
  ucp_Cs,    /* Surrogate */
  ucp_Ll,    /* Lower case letter */
  ucp_Lm,    /* Modifier letter */
  ucp_Lo,    /* Other letter */
  ucp_Lt,    /* Title case letter */
  ucp_Lu,    /* Upper case letter */
  ucp_Mc,    /* Spacing mark */
  ucp_Me,    /* Enclosing mark */
  ucp_Mn,    /* Non-spacing mark */
  ucp_Nd,    /* Decimal number */
  ucp_Nl,    /* Letter number */
  ucp_No,    /* Other number */
  ucp_Pc,    /* Connector punctuation */
  ucp_Pd,    /* Dash punctuation */
  ucp_Pe,    /* Close punctuation */
  ucp_Pf,    /* Final punctuation */
  ucp_Pi,    /* Initial punctuation */
  ucp_Po,    /* Other punctuation */
  ucp_Ps,    /* Open punctuation */
  ucp_Sc,    /* Currency symbol */
  ucp_Sk,    /* Modifier symbol */
  ucp_Sm,    /* Mathematical symbol */
  ucp_So,    /* Other symbol */
  ucp_Zl,    /* Line separator */
  ucp_Zp,    /* Paragraph separator */
  ucp_Zs     /* Space separator */
};

/* These are grapheme break properties. Note that the code for processing them
assumes that the values are less than 16. If more values are added that take
the number to 16 or more, the code will have to be rewritten. The enumerators
carry no explicit initializers; the number in each comment is the value the
enumerator receives from its position. */

enum {
  ucp_gbCR,                /*  0 */
  ucp_gbLF,                /*  1 */
  ucp_gbControl,           /*  2 */
  ucp_gbExtend,            /*  3 */
  ucp_gbPrepend,           /*  4 */
  ucp_gbSpacingMark,       /*  5 */
  ucp_gbL,                 /*  6 Hangul syllable type L */
  ucp_gbV,                 /*  7 Hangul syllable type V */
  ucp_gbT,                 /*  8 Hangul syllable type T */
  ucp_gbLV,                /*  9 Hangul syllable type LV */
  ucp_gbLVT,               /* 10 Hangul syllable type LVT */
  ucp_gbRegionalIndicator, /* 11 */
  ucp_gbOther              /* 12 */
};

/* These are the script identifications. The enumerators carry no explicit
initializers, so they number upwards from 0 in the order written
(ucp_Arabic == 0 ... ucp_Warang_Citi == 124). The first group is in
alphabetical order; scripts added for later Unicode releases are appended
after it, grouped by release, so that existing values never change. Any new
script must likewise be added at the end. */

enum {
  ucp_Arabic,
  ucp_Armenian,
  ucp_Bengali,
  ucp_Bopomofo,
  ucp_Braille,
  ucp_Buginese,
  ucp_Buhid,
  ucp_Canadian_Aboriginal,
  ucp_Cherokee,
  ucp_Common,
  ucp_Coptic,
  ucp_Cypriot,
  ucp_Cyrillic,
  ucp_Deseret,
  ucp_Devanagari,
  ucp_Ethiopic,
  ucp_Georgian,
  ucp_Glagolitic,
  ucp_Gothic,
  ucp_Greek,
  ucp_Gujarati,
  ucp_Gurmukhi,
  ucp_Han,
  ucp_Hangul,
  ucp_Hanunoo,
  ucp_Hebrew,
  ucp_Hiragana,
  ucp_Inherited,
  ucp_Kannada,
  ucp_Katakana,
  ucp_Kharoshthi,
  ucp_Khmer,
  ucp_Lao,
  ucp_Latin,
  ucp_Limbu,
  ucp_Linear_B,
  ucp_Malayalam,
  ucp_Mongolian,
  ucp_Myanmar,
  ucp_New_Tai_Lue,
  ucp_Ogham,
  ucp_Old_Italic,
  ucp_Old_Persian,
  ucp_Oriya,
  ucp_Osmanya,
  ucp_Runic,
  ucp_Shavian,
  ucp_Sinhala,
  ucp_Syloti_Nagri,
  ucp_Syriac,
  ucp_Tagalog,
  ucp_Tagbanwa,
  ucp_Tai_Le,
  ucp_Tamil,
  ucp_Telugu,
  ucp_Thaana,
  ucp_Thai,
  ucp_Tibetan,
  ucp_Tifinagh,
  ucp_Ugaritic,
  ucp_Yi,
  /* New for Unicode 5.0: */
  ucp_Balinese,
  ucp_Cuneiform,
  ucp_Nko,
  ucp_Phags_Pa,
  ucp_Phoenician,
  /* New for Unicode 5.1: */
  ucp_Carian,
  ucp_Cham,
  ucp_Kayah_Li,
  ucp_Lepcha,
  ucp_Lycian,
  ucp_Lydian,
  ucp_Ol_Chiki,
  ucp_Rejang,
  ucp_Saurashtra,
  ucp_Sundanese,
  ucp_Vai,
  /* New for Unicode 5.2: */
  ucp_Avestan,
  ucp_Bamum,
  ucp_Egyptian_Hieroglyphs,
  ucp_Imperial_Aramaic,
  ucp_Inscriptional_Pahlavi,
  ucp_Inscriptional_Parthian,
  ucp_Javanese,
  ucp_Kaithi,
  ucp_Lisu,
  ucp_Meetei_Mayek,
  ucp_Old_South_Arabian,
  ucp_Old_Turkic,
  ucp_Samaritan,
  ucp_Tai_Tham,
  ucp_Tai_Viet,
  /* New for Unicode 6.0.0: */
  ucp_Batak,
  ucp_Brahmi,
  ucp_Mandaic,
  /* New for Unicode 6.1.0: */
  ucp_Chakma,
  ucp_Meroitic_Cursive,
  ucp_Meroitic_Hieroglyphs,
  ucp_Miao,
  ucp_Sharada,
  ucp_Sora_Sompeng,
  ucp_Takri,
  /* New for Unicode 7.0.0: */
  ucp_Bassa_Vah,
  ucp_Caucasian_Albanian,
  ucp_Duployan,
  ucp_Elbasan,
  ucp_Grantha,
  ucp_Khojki,
  ucp_Khudawadi,
  ucp_Linear_A,
  ucp_Mahajani,
  ucp_Manichaean,
  ucp_Mende_Kikakui,
  ucp_Modi,
  ucp_Mro,
  ucp_Nabataean,
  ucp_Old_North_Arabian,
  ucp_Old_Permic,
  ucp_Pahawh_Hmong,
  ucp_Palmyrene,
  ucp_Psalter_Pahlavi,
  ucp_Pau_Cin_Hau,
  ucp_Siddham,
  ucp_Tirhuta,
  ucp_Warang_Citi
};

#endif

/* End of ucp.h */