mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-11-24 08:32:02 +00:00
Fix collation locales
This commit is contained in:
parent
852e891499
commit
772bb0b70b
@ -4,6 +4,9 @@
|
||||
|
||||
#if USE_ICU
|
||||
#include <unicode/ucol.h>
|
||||
#include <unicode/unistr.h>
|
||||
#include <unicode/locid.h>
|
||||
#include <unicode/ucnv.h>
|
||||
#else
|
||||
#ifdef __clang__
|
||||
#pragma clang diagnostic ignored "-Wunused-private-field"
|
||||
@ -14,6 +17,7 @@
|
||||
#include <Common/Exception.h>
|
||||
#include <IO/WriteHelpers.h>
|
||||
#include <Poco/String.h>
|
||||
#include <algorithm>
|
||||
|
||||
|
||||
namespace DB
|
||||
@ -26,16 +30,75 @@ namespace DB
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
std::unique_ptr<AvailableCollationLocales> AvailableCollationLocales::instance_impl;
|
||||
std::once_flag AvailableCollationLocales::init_flag;
|
||||
|
||||
void AvailableCollationLocales::init()
|
||||
{
|
||||
instance_impl = std::make_unique<AvailableCollationLocales>();
|
||||
#if USE_ICU
|
||||
size_t available_locales_count = ucol_countAvailable();
|
||||
for (size_t i = 0; i < available_locales_count; ++i)
|
||||
{
|
||||
std::string locale_name = ucol_getAvailable(i);
|
||||
UChar lang_buffer[128]; /// 128 is enough for language name
|
||||
char normal_buf[128];
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
/// All names will be in English language
|
||||
size_t lang_length = uloc_getDisplayLanguage(locale_name.c_str(), "en", lang_buffer, 128, &status);
|
||||
if (U_FAILURE(status))
|
||||
instance_impl->available_collation_locales.push_back(LocaleAndLanguage{locale_name, "unknown"});
|
||||
else
|
||||
{
|
||||
u_UCharsToChars(lang_buffer, normal_buf, lang_length);
|
||||
LocaleAndLanguage result{locale_name, std::string(normal_buf, lang_length)};
|
||||
instance_impl->available_collation_locales.push_back(result);
|
||||
}
|
||||
}
|
||||
|
||||
auto comparator = [] (const LocaleAndLanguage & f, const LocaleAndLanguage & s) { return f.locale_name < s.locale_name; };
|
||||
std::sort(instance_impl->available_collation_locales.begin(), instance_impl->available_collation_locales.end(), comparator);
|
||||
#endif
|
||||
}
|
||||
|
||||
/// Returns the shared instance.
/// init() is run through std::call_once, so the instance is built on the first call only.
AvailableCollationLocales & AvailableCollationLocales::instance()
{
    std::call_once(init_flag, [] { init(); });

    AvailableCollationLocales & shared = *instance_impl;
    return shared;
}
|
||||
|
||||
const std::vector<AvailableCollationLocales::LocaleAndLanguage> & AvailableCollationLocales::getAvailableCollations() const
|
||||
{
|
||||
return available_collation_locales;
|
||||
}
|
||||
|
||||
bool AvailableCollationLocales::isCollationSupported(const std::string & s) const
|
||||
{
|
||||
std::string lower = Poco::toLower(s);
|
||||
for (const auto & locale_and_lang : available_collation_locales)
|
||||
{
|
||||
if (lower == Poco::toLower(locale_and_lang.locale_name))
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
Collator::Collator(const std::string & locale_) : locale(Poco::toLower(locale_))
|
||||
{
|
||||
#if USE_ICU
|
||||
/// We check it here, because ucol_open will fall back to the default locale for
|
||||
/// almost all random names.
|
||||
if (!AvailableCollationLocales::instance().isCollationSupported(locale))
|
||||
throw DB::Exception("Unsupported collation locale: " + locale, DB::ErrorCodes::UNSUPPORTED_COLLATION_LOCALE);
|
||||
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
|
||||
collator = ucol_open(locale.c_str(), &status);
|
||||
if (status != U_ZERO_ERROR)
|
||||
if (U_FAILURE(status))
|
||||
{
|
||||
ucol_close(collator);
|
||||
throw DB::Exception("Unsupported collation locale: " + locale, DB::ErrorCodes::UNSUPPORTED_COLLATION_LOCALE);
|
||||
throw DB::Exception("Failed to open locale: " + locale + " with error: " + u_errorName(status), DB::ErrorCodes::UNSUPPORTED_COLLATION_LOCALE);
|
||||
}
|
||||
#else
|
||||
throw DB::Exception("Collations support is disabled, because ClickHouse was built without ICU library", DB::ErrorCodes::SUPPORT_IS_DISABLED);
|
||||
@ -60,8 +123,8 @@ int Collator::compare(const char * str1, size_t length1, const char * str2, size
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
UCollationResult compare_result = ucol_strcollIter(collator, &iter1, &iter2, &status);
|
||||
|
||||
if (status != U_ZERO_ERROR)
|
||||
throw DB::Exception("ICU collation comparison failed with error code: " + DB::toString<int>(status),
|
||||
if (U_FAILURE(status))
|
||||
throw DB::Exception("ICU collation comparison failed with error code: " + std::string(u_errorName(status)),
|
||||
DB::ErrorCodes::COLLATION_COMPARISON_FAILED);
|
||||
|
||||
/** Values of enum UCollationResult are exactly what we need:
|
||||
@ -83,14 +146,3 @@ const std::string & Collator::getLocale() const
|
||||
{
|
||||
return locale;
|
||||
}
|
||||
|
||||
/// Collects the names of all collation locales reported by ICU.
/// The returned vector is empty when built without ICU.
std::vector<std::string> Collator::getAvailableCollations()
{
    std::vector<std::string> locale_names;
#if USE_ICU
    const size_t total = ucol_countAvailable();
    size_t index = 0;
    while (index < total)
    {
        locale_names.push_back(ucol_getAvailable(index));
        ++index;
    }
#endif
    return locale_names;
}
|
||||
|
@ -3,9 +3,38 @@
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <boost/noncopyable.hpp>
|
||||
#include <memory>
|
||||
#include <mutex>
|
||||
|
||||
struct UCollator;
|
||||
|
||||
/// Class represents available locales for collations.
|
||||
class AvailableCollationLocales : private boost::noncopyable
|
||||
{
|
||||
public:
|
||||
|
||||
struct LocaleAndLanguage
|
||||
{
|
||||
std::string locale_name; /// in ISO format
|
||||
std::string language; /// in English
|
||||
};
|
||||
|
||||
static AvailableCollationLocales & instance();
|
||||
|
||||
/// Get all collations with names
|
||||
const std::vector<LocaleAndLanguage> & getAvailableCollations() const;
|
||||
|
||||
/// Check that collation is supported
|
||||
bool isCollationSupported(const std::string & s) const;
|
||||
|
||||
private:
|
||||
static std::once_flag init_flag;
|
||||
static std::unique_ptr<AvailableCollationLocales> instance_impl;
|
||||
static void init();
|
||||
private:
|
||||
std::vector<LocaleAndLanguage> available_collation_locales;
|
||||
};
|
||||
|
||||
class Collator : private boost::noncopyable
|
||||
{
|
||||
public:
|
||||
@ -15,10 +44,8 @@ public:
|
||||
int compare(const char * str1, size_t length1, const char * str2, size_t length2) const;
|
||||
|
||||
const std::string & getLocale() const;
|
||||
|
||||
static std::vector<std::string> getAvailableCollations();
|
||||
|
||||
private:
|
||||
|
||||
std::string locale;
|
||||
UCollator * collator;
|
||||
};
|
||||
|
@ -1,6 +1,7 @@
|
||||
#include <Interpreters/sortBlock.h>
|
||||
|
||||
#include <Columns/ColumnString.h>
|
||||
#include <Columns/ColumnConst.h>
|
||||
#include <Common/typeid_cast.h>
|
||||
|
||||
#include <pdqsort.h>
|
||||
@ -14,15 +15,21 @@ namespace ErrorCodes
|
||||
}
|
||||
|
||||
|
||||
static inline bool needCollation(const IColumn * column, const SortColumnDescription & description)
|
||||
static inline const IColumn * needCollation(const IColumn * column, const SortColumnDescription & description)
|
||||
{
|
||||
if (!description.collator)
|
||||
return false;
|
||||
return nullptr;
|
||||
|
||||
if (!typeid_cast<const ColumnString *>(column)) /// TODO Nullable(String)
|
||||
throw Exception("Collations could be specified only for String columns.", ErrorCodes::BAD_COLLATION);
|
||||
auto column_result = column;
|
||||
if (auto const_column = typeid_cast<const ColumnConst *>(column))
|
||||
column_result = &const_column->getDataColumn();
|
||||
|
||||
return true;
|
||||
if (typeid_cast<const ColumnString *>(column_result))
|
||||
return column_result;
|
||||
|
||||
/// TODO Nullable(String)
|
||||
|
||||
throw Exception("Collations could be specified only for String columns.", ErrorCodes::BAD_COLLATION);
|
||||
}
|
||||
|
||||
|
||||
@ -77,9 +84,9 @@ struct PartialSortingLessWithCollation
|
||||
for (ColumnsWithSortDescriptions::const_iterator it = columns.begin(); it != columns.end(); ++it)
|
||||
{
|
||||
int res;
|
||||
if (needCollation(it->first, it->second))
|
||||
if (auto column_string_ptr = needCollation(it->first, it->second))
|
||||
{
|
||||
const ColumnString & column_string = typeid_cast<const ColumnString &>(*it->first);
|
||||
const ColumnString & column_string = typeid_cast<const ColumnString &>(*column_string_ptr);
|
||||
res = column_string.compareAtWithCollation(a, b, *it->first, *it->second.collator);
|
||||
}
|
||||
else
|
||||
@ -110,9 +117,9 @@ void sortBlock(Block & block, const SortDescription & description, UInt64 limit)
|
||||
: block.safeGetByPosition(description[0].column_number).column.get();
|
||||
|
||||
IColumn::Permutation perm;
|
||||
if (needCollation(column, description[0]))
|
||||
if (auto column_string_ptr = needCollation(column, description[0]))
|
||||
{
|
||||
const ColumnString & column_string = typeid_cast<const ColumnString &>(*column);
|
||||
const ColumnString & column_string = typeid_cast<const ColumnString &>(*column_string_ptr);
|
||||
column_string.getPermutationWithCollation(*description[0].collator, reverse, limit, perm);
|
||||
}
|
||||
else
|
||||
|
@ -8,13 +8,17 @@ NamesAndTypesList StorageSystemCollations::getNamesAndTypes()
|
||||
{
|
||||
return {
|
||||
{"name", std::make_shared<DataTypeString>()},
|
||||
{"language", std::make_shared<DataTypeString>()},
|
||||
};
|
||||
}
|
||||
|
||||
void StorageSystemCollations::fillData(MutableColumns & res_columns, const Context &, const SelectQueryInfo &) const
|
||||
{
|
||||
for (const auto & collation_name : Collator::getAvailableCollations())
|
||||
res_columns[0]->insert(collation_name);
|
||||
for (const auto & [locale, lang]: AvailableCollationLocales::instance().getAvailableCollations())
|
||||
{
|
||||
res_columns[0]->insert(locale);
|
||||
res_columns[1]->insert(lang);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -1,15 +1,18 @@
|
||||
Русский (default)
|
||||
Ё
|
||||
А
|
||||
Я
|
||||
а
|
||||
я
|
||||
ё
|
||||
Русский (ru)
|
||||
а
|
||||
А
|
||||
ё
|
||||
Ё
|
||||
я
|
||||
Я
|
||||
Русский (ru distributed)
|
||||
а
|
||||
а
|
||||
А
|
||||
@ -22,6 +25,7 @@
|
||||
я
|
||||
Я
|
||||
Я
|
||||
Türk (default)
|
||||
A
|
||||
A
|
||||
B
|
||||
@ -132,6 +136,7 @@ z
|
||||
ı
|
||||
Ş
|
||||
ş
|
||||
Türk (tr)
|
||||
a
|
||||
a
|
||||
A
|
||||
@ -242,9 +247,49 @@ z
|
||||
z
|
||||
Z
|
||||
Z
|
||||
english (default)
|
||||
A
|
||||
Q
|
||||
Z
|
||||
c
|
||||
e
|
||||
english (en_US)
|
||||
A
|
||||
c
|
||||
e
|
||||
Q
|
||||
Z
|
||||
english (en)
|
||||
A
|
||||
c
|
||||
e
|
||||
Q
|
||||
Z
|
||||
español (default)
|
||||
F
|
||||
J
|
||||
z
|
||||
Ñ
|
||||
español (es)
|
||||
F
|
||||
J
|
||||
Ñ
|
||||
z
|
||||
Український (default)
|
||||
І
|
||||
Б
|
||||
ї
|
||||
ґ
|
||||
Український (uk)
|
||||
Б
|
||||
ґ
|
||||
І
|
||||
ї
|
||||
Русский (ru group by)
|
||||
а 1
|
||||
А 4
|
||||
ё 3
|
||||
Ё 6
|
||||
я 2
|
||||
Я 5
|
||||
ζ
|
||||
|
@ -1,6 +1,44 @@
|
||||
SELECT 'Русский (default)';
|
||||
SELECT arrayJoin(['а', 'я', 'ё', 'А', 'Я', 'Ё']) AS x ORDER BY x;
|
||||
|
||||
SELECT 'Русский (ru)';
|
||||
SELECT arrayJoin(['а', 'я', 'ё', 'А', 'Я', 'Ё']) AS x ORDER BY x COLLATE 'ru';
|
||||
|
||||
SELECT 'Русский (ru distributed)';
|
||||
SELECT arrayJoin(['а', 'я', 'ё', 'А', 'Я', 'Ё']) AS x FROM remote('127.0.0.{2,3}', system, one) ORDER BY x COLLATE 'ru';
|
||||
|
||||
SELECT 'Türk (default)';
|
||||
SELECT arrayJoin(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'ç', 'd', 'e', 'f', 'g', 'ğ', 'h', 'ı', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'ö', 'p', 'r', 's', 'ş', 't', 'u', 'ü', 'v', 'y', 'z', 'A', 'B', 'C', 'Ç', 'D', 'E', 'F', 'G', 'Ğ', 'H', 'I', 'İ', 'J', 'K', 'L', 'M', 'N', 'O', 'Ö', 'P', 'R', 'S', 'Ş', 'T', 'U', 'Ü', 'V', 'Y', 'Z']) AS x ORDER BY x;
|
||||
|
||||
SELECT 'Türk (tr)';
|
||||
SELECT arrayJoin(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'ç', 'd', 'e', 'f', 'g', 'ğ', 'h', 'ı', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'ö', 'p', 'r', 's', 'ş', 't', 'u', 'ü', 'v', 'y', 'z', 'A', 'B', 'C', 'Ç', 'D', 'E', 'F', 'G', 'Ğ', 'H', 'I', 'İ', 'J', 'K', 'L', 'M', 'N', 'O', 'Ö', 'P', 'R', 'S', 'Ş', 'T', 'U', 'Ü', 'V', 'Y', 'Z']) AS x ORDER BY x COLLATE 'tr';
|
||||
|
||||
SELECT 'english (default)';
|
||||
SELECT arrayJoin(['A', 'c', 'Z', 'Q', 'e']) AS x ORDER BY x;
|
||||
SELECT 'english (en_US)';
|
||||
SELECT arrayJoin(['A', 'c', 'Z', 'Q', 'e']) AS x ORDER BY x COLLATE 'en_US';
|
||||
SELECT 'english (en)';
|
||||
SELECT arrayJoin(['A', 'c', 'Z', 'Q', 'e']) AS x ORDER BY x COLLATE 'en';
|
||||
|
||||
SELECT 'español (default)';
|
||||
SELECT arrayJoin(['F', 'z', 'J', 'Ñ']) as x ORDER BY x;
|
||||
SELECT 'español (es)';
|
||||
SELECT arrayJoin(['F', 'z', 'J', 'Ñ']) as x ORDER BY x COLLATE 'es';
|
||||
|
||||
SELECT 'Український (default)';
|
||||
SELECT arrayJoin(['ґ', 'ї', 'І', 'Б']) as x ORDER BY x;
|
||||
SELECT 'Український (uk)';
|
||||
SELECT arrayJoin(['ґ', 'ї', 'І', 'Б']) as x ORDER BY x COLLATE 'uk';
|
||||
|
||||
SELECT 'Русский (ru group by)';
|
||||
SELECT x, n FROM (SELECT ['а', 'я', 'ё', 'А', 'Я', 'Ё'] AS arr) ARRAY JOIN arr AS x, arrayEnumerate(arr) AS n ORDER BY x COLLATE 'ru', n;
|
||||
|
||||
--- Const expression
|
||||
SELECT 'ζ' as x ORDER BY x COLLATE 'el';
|
||||
|
||||
|
||||
--- Trash locales
|
||||
SELECT '' as x ORDER BY x COLLATE 'qq'; --{serverError 186}
|
||||
SELECT '' as x ORDER BY x COLLATE 'qwe'; --{serverError 186}
|
||||
SELECT '' as x ORDER BY x COLLATE 'some_non_existing_locale'; --{serverError 186}
|
||||
SELECT '' as x ORDER BY x COLLATE 'ру'; --{serverError 186}
|
||||
|
Loading…
Reference in New Issue
Block a user