Fix collation locales

This commit is contained in:
alesapin 2019-12-05 18:31:45 +03:00
parent 852e891499
commit 772bb0b70b
6 changed files with 202 additions and 29 deletions

View File

@ -4,6 +4,9 @@
#if USE_ICU #if USE_ICU
#include <unicode/ucol.h> #include <unicode/ucol.h>
#include <unicode/unistr.h>
#include <unicode/locid.h>
#include <unicode/ucnv.h>
#else #else
#ifdef __clang__ #ifdef __clang__
#pragma clang diagnostic ignored "-Wunused-private-field" #pragma clang diagnostic ignored "-Wunused-private-field"
@ -14,6 +17,7 @@
#include <Common/Exception.h> #include <Common/Exception.h>
#include <IO/WriteHelpers.h> #include <IO/WriteHelpers.h>
#include <Poco/String.h> #include <Poco/String.h>
#include <algorithm>
namespace DB namespace DB
@ -26,16 +30,75 @@ namespace DB
} }
} }
std::unique_ptr<AvailableCollationLocales> AvailableCollationLocales::instance_impl;
std::once_flag AvailableCollationLocales::init_flag;
/// Creates the singleton stored in instance_impl and fills it with every collation
/// locale reported by ICU. For each locale the English display name of its language
/// is stored; if ICU reports a failure for that lookup, the language is recorded as
/// "unknown". The final list is sorted by locale name.
/// When built without ICU (USE_ICU is false) the list is left empty.
void AvailableCollationLocales::init()
{
    instance_impl = std::make_unique<AvailableCollationLocales>();
#if USE_ICU
    /// Capacity shared by both scratch buffers; 128 is enough for language name
    static constexpr int32_t max_lang_length = 128;

    auto & locales = instance_impl->available_collation_locales;

    /// ICU counts and indexes locales with int32_t, so the same type is used here
    /// instead of size_t to avoid implicit narrowing on every call.
    const int32_t available_locales_count = ucol_countAvailable();
    for (int32_t i = 0; i < available_locales_count; ++i)
    {
        /// Build the entry in place; the language stays "unknown" unless the lookup succeeds.
        LocaleAndLanguage & entry = locales.emplace_back();
        entry.locale_name = ucol_getAvailable(i);
        entry.language = "unknown";

        UChar lang_buffer[max_lang_length];
        char normal_buf[max_lang_length];
        UErrorCode status = U_ZERO_ERROR;

        /// All names will be in English language
        const int32_t lang_length = uloc_getDisplayLanguage(entry.locale_name.c_str(), "en", lang_buffer, max_lang_length, &status);
        if (U_FAILURE(status))
            continue;

        /// Narrow the UChar name into a plain char buffer (English names are expected to fit in char).
        u_UCharsToChars(lang_buffer, normal_buf, lang_length);
        entry.language.assign(normal_buf, static_cast<size_t>(lang_length));
    }

    std::sort(
        locales.begin(),
        locales.end(),
        [](const LocaleAndLanguage & lhs, const LocaleAndLanguage & rhs) { return lhs.locale_name < rhs.locale_name; });
#endif
}
/// Returns the shared registry of collation locales, building it on first use.
AvailableCollationLocales & AvailableCollationLocales::instance()
{
    /// init() is run through std::call_once on init_flag and is what populates instance_impl.
    std::call_once(init_flag, [] { init(); });
    return *instance_impl;
}
/// Gives read-only access to the stored (locale name, language) entries.
const std::vector<AvailableCollationLocales::LocaleAndLanguage> & AvailableCollationLocales::getAvailableCollations() const
{
    const auto & locales = available_collation_locales;
    return locales;
}
bool AvailableCollationLocales::isCollationSupported(const std::string & s) const
{
std::string lower = Poco::toLower(s);
for (const auto & locale_and_lang : available_collation_locales)
{
if (lower == Poco::toLower(locale_and_lang.locale_name))
return true;
}
return false;
}
Collator::Collator(const std::string & locale_) : locale(Poco::toLower(locale_)) Collator::Collator(const std::string & locale_) : locale(Poco::toLower(locale_))
{ {
#if USE_ICU #if USE_ICU
/// We check it here, because ucol_open will fall back to the default locale for
/// almost any random name.
if (!AvailableCollationLocales::instance().isCollationSupported(locale))
throw DB::Exception("Unsupported collation locale: " + locale, DB::ErrorCodes::UNSUPPORTED_COLLATION_LOCALE);
UErrorCode status = U_ZERO_ERROR; UErrorCode status = U_ZERO_ERROR;
collator = ucol_open(locale.c_str(), &status); collator = ucol_open(locale.c_str(), &status);
if (status != U_ZERO_ERROR) if (U_FAILURE(status))
{ {
ucol_close(collator); ucol_close(collator);
throw DB::Exception("Unsupported collation locale: " + locale, DB::ErrorCodes::UNSUPPORTED_COLLATION_LOCALE); throw DB::Exception("Failed to open locale: " + locale + " with error: " + u_errorName(status), DB::ErrorCodes::UNSUPPORTED_COLLATION_LOCALE);
} }
#else #else
throw DB::Exception("Collations support is disabled, because ClickHouse was built without ICU library", DB::ErrorCodes::SUPPORT_IS_DISABLED); throw DB::Exception("Collations support is disabled, because ClickHouse was built without ICU library", DB::ErrorCodes::SUPPORT_IS_DISABLED);
@ -60,8 +123,8 @@ int Collator::compare(const char * str1, size_t length1, const char * str2, size
UErrorCode status = U_ZERO_ERROR; UErrorCode status = U_ZERO_ERROR;
UCollationResult compare_result = ucol_strcollIter(collator, &iter1, &iter2, &status); UCollationResult compare_result = ucol_strcollIter(collator, &iter1, &iter2, &status);
if (status != U_ZERO_ERROR) if (U_FAILURE(status))
throw DB::Exception("ICU collation comparison failed with error code: " + DB::toString<int>(status), throw DB::Exception("ICU collation comparison failed with error code: " + std::string(u_errorName(status)),
DB::ErrorCodes::COLLATION_COMPARISON_FAILED); DB::ErrorCodes::COLLATION_COMPARISON_FAILED);
/** Values of enum UCollationResult are equals to what exactly we need: /** Values of enum UCollationResult are equals to what exactly we need:
@ -83,14 +146,3 @@ const std::string & Collator::getLocale() const
{ {
return locale; return locale;
} }
std::vector<std::string> Collator::getAvailableCollations()
{
std::vector<std::string> result;
#if USE_ICU
size_t available_locales_count = ucol_countAvailable();
for (size_t i = 0; i < available_locales_count; ++i)
result.push_back(ucol_getAvailable(i));
#endif
return result;
}

View File

@ -3,9 +3,38 @@
#include <string> #include <string>
#include <vector> #include <vector>
#include <boost/noncopyable.hpp> #include <boost/noncopyable.hpp>
#include <memory>
#include <mutex>
struct UCollator; struct UCollator;
/// Registry of the locales that are available for collations.
/// Obtained through instance(); the class is non-copyable.
class AvailableCollationLocales : private boost::noncopyable
{
public:
    /// One available locale together with the name of its language.
    struct LocaleAndLanguage
    {
        std::string locale_name; /// in ISO format
        std::string language; /// in English
    };

    /// Access point to the shared instance.
    static AvailableCollationLocales & instance();

    /// Get all collations with names
    const std::vector<LocaleAndLanguage> & getAvailableCollations() const;

    /// Check that collation is supported
    bool isCollationSupported(const std::string & s) const;

private:
    /// Creates and fills instance_impl.
    static void init();

    /// Storage for the shared instance and the flag paired with it for one-time setup.
    static std::unique_ptr<AvailableCollationLocales> instance_impl;
    static std::once_flag init_flag;

    /// Entries exposed by getAvailableCollations().
    std::vector<LocaleAndLanguage> available_collation_locales;
};
class Collator : private boost::noncopyable class Collator : private boost::noncopyable
{ {
public: public:
@ -15,10 +44,8 @@ public:
int compare(const char * str1, size_t length1, const char * str2, size_t length2) const; int compare(const char * str1, size_t length1, const char * str2, size_t length2) const;
const std::string & getLocale() const; const std::string & getLocale() const;
static std::vector<std::string> getAvailableCollations();
private: private:
std::string locale; std::string locale;
UCollator * collator; UCollator * collator;
}; };

View File

@ -1,6 +1,7 @@
#include <Interpreters/sortBlock.h> #include <Interpreters/sortBlock.h>
#include <Columns/ColumnString.h> #include <Columns/ColumnString.h>
#include <Columns/ColumnConst.h>
#include <Common/typeid_cast.h> #include <Common/typeid_cast.h>
#include <pdqsort.h> #include <pdqsort.h>
@ -14,15 +15,21 @@ namespace ErrorCodes
} }
static inline bool needCollation(const IColumn * column, const SortColumnDescription & description) static inline const IColumn * needCollation(const IColumn * column, const SortColumnDescription & description)
{ {
if (!description.collator) if (!description.collator)
return false; return nullptr;
auto column_result = column;
if (auto const_column = typeid_cast<const ColumnConst *>(column))
column_result = &const_column->getDataColumn();
if (typeid_cast<const ColumnString *>(column_result))
return column_result;
/// TODO Nullable(String)
if (!typeid_cast<const ColumnString *>(column)) /// TODO Nullable(String)
throw Exception("Collations could be specified only for String columns.", ErrorCodes::BAD_COLLATION); throw Exception("Collations could be specified only for String columns.", ErrorCodes::BAD_COLLATION);
return true;
} }
@ -77,9 +84,9 @@ struct PartialSortingLessWithCollation
for (ColumnsWithSortDescriptions::const_iterator it = columns.begin(); it != columns.end(); ++it) for (ColumnsWithSortDescriptions::const_iterator it = columns.begin(); it != columns.end(); ++it)
{ {
int res; int res;
if (needCollation(it->first, it->second)) if (auto column_string_ptr = needCollation(it->first, it->second))
{ {
const ColumnString & column_string = typeid_cast<const ColumnString &>(*it->first); const ColumnString & column_string = typeid_cast<const ColumnString &>(*column_string_ptr);
res = column_string.compareAtWithCollation(a, b, *it->first, *it->second.collator); res = column_string.compareAtWithCollation(a, b, *it->first, *it->second.collator);
} }
else else
@ -110,9 +117,9 @@ void sortBlock(Block & block, const SortDescription & description, UInt64 limit)
: block.safeGetByPosition(description[0].column_number).column.get(); : block.safeGetByPosition(description[0].column_number).column.get();
IColumn::Permutation perm; IColumn::Permutation perm;
if (needCollation(column, description[0])) if (auto column_string_ptr = needCollation(column, description[0]))
{ {
const ColumnString & column_string = typeid_cast<const ColumnString &>(*column); const ColumnString & column_string = typeid_cast<const ColumnString &>(*column_string_ptr);
column_string.getPermutationWithCollation(*description[0].collator, reverse, limit, perm); column_string.getPermutationWithCollation(*description[0].collator, reverse, limit, perm);
} }
else else

View File

@ -8,13 +8,17 @@ NamesAndTypesList StorageSystemCollations::getNamesAndTypes()
{ {
return { return {
{"name", std::make_shared<DataTypeString>()}, {"name", std::make_shared<DataTypeString>()},
{"language", std::make_shared<DataTypeString>()},
}; };
} }
void StorageSystemCollations::fillData(MutableColumns & res_columns, const Context &, const SelectQueryInfo &) const void StorageSystemCollations::fillData(MutableColumns & res_columns, const Context &, const SelectQueryInfo &) const
{ {
for (const auto & collation_name : Collator::getAvailableCollations()) for (const auto & [locale, lang]: AvailableCollationLocales::instance().getAvailableCollations())
res_columns[0]->insert(collation_name); {
res_columns[0]->insert(locale);
res_columns[1]->insert(lang);
}
} }
} }

View File

@ -1,15 +1,18 @@
Русский (default)
Ё Ё
А А
Я Я
а а
я я
ё ё
Русский (ru)
а а
А А
ё ё
Ё Ё
я я
Я Я
Русский (ru distributed)
а а
а а
А А
@ -22,6 +25,7 @@
я я
Я Я
Я Я
Türk (default)
A A
A A
B B
@ -132,6 +136,7 @@ z
ı ı
Ş Ş
ş ş
Türk (tr)
a a
a a
A A
@ -242,9 +247,49 @@ z
z z
Z Z
Z Z
english (default)
A
Q
Z
c
e
english (en_US)
A
c
e
Q
Z
english (en)
A
c
e
Q
Z
español (default)
F
J
z
Ñ
español (es)
F
J
Ñ
z
Український (default)
І
Б
ї
ґ
Український (uk)
Б
ґ
І
ї
Русский (ru group by)
а 1 а 1
А 4 А 4
ё 3 ё 3
Ё 6 Ё 6
я 2 я 2
Я 5 Я 5
ζ

View File

@ -1,6 +1,44 @@
SELECT 'Русский (default)';
SELECT arrayJoin(['а', 'я', 'ё', 'А', 'Я', 'Ё']) AS x ORDER BY x; SELECT arrayJoin(['а', 'я', 'ё', 'А', 'Я', 'Ё']) AS x ORDER BY x;
SELECT 'Русский (ru)';
SELECT arrayJoin(['а', 'я', 'ё', 'А', 'Я', 'Ё']) AS x ORDER BY x COLLATE 'ru'; SELECT arrayJoin(['а', 'я', 'ё', 'А', 'Я', 'Ё']) AS x ORDER BY x COLLATE 'ru';
SELECT 'Русский (ru distributed)';
SELECT arrayJoin(['а', 'я', 'ё', 'А', 'Я', 'Ё']) AS x FROM remote('127.0.0.{2,3}', system, one) ORDER BY x COLLATE 'ru'; SELECT arrayJoin(['а', 'я', 'ё', 'А', 'Я', 'Ё']) AS x FROM remote('127.0.0.{2,3}', system, one) ORDER BY x COLLATE 'ru';
SELECT 'Türk (default)';
SELECT arrayJoin(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'ç', 'd', 'e', 'f', 'g', 'ğ', 'h', 'ı', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'ö', 'p', 'r', 's', 'ş', 't', 'u', 'ü', 'v', 'y', 'z', 'A', 'B', 'C', 'Ç', 'D', 'E', 'F', 'G', 'Ğ', 'H', 'I', 'İ', 'J', 'K', 'L', 'M', 'N', 'O', 'Ö', 'P', 'R', 'S', 'Ş', 'T', 'U', 'Ü', 'V', 'Y', 'Z']) AS x ORDER BY x; SELECT arrayJoin(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'ç', 'd', 'e', 'f', 'g', 'ğ', 'h', 'ı', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'ö', 'p', 'r', 's', 'ş', 't', 'u', 'ü', 'v', 'y', 'z', 'A', 'B', 'C', 'Ç', 'D', 'E', 'F', 'G', 'Ğ', 'H', 'I', 'İ', 'J', 'K', 'L', 'M', 'N', 'O', 'Ö', 'P', 'R', 'S', 'Ş', 'T', 'U', 'Ü', 'V', 'Y', 'Z']) AS x ORDER BY x;
SELECT 'Türk (tr)';
SELECT arrayJoin(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'ç', 'd', 'e', 'f', 'g', 'ğ', 'h', 'ı', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'ö', 'p', 'r', 's', 'ş', 't', 'u', 'ü', 'v', 'y', 'z', 'A', 'B', 'C', 'Ç', 'D', 'E', 'F', 'G', 'Ğ', 'H', 'I', 'İ', 'J', 'K', 'L', 'M', 'N', 'O', 'Ö', 'P', 'R', 'S', 'Ş', 'T', 'U', 'Ü', 'V', 'Y', 'Z']) AS x ORDER BY x COLLATE 'tr'; SELECT arrayJoin(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'ç', 'd', 'e', 'f', 'g', 'ğ', 'h', 'ı', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'ö', 'p', 'r', 's', 'ş', 't', 'u', 'ü', 'v', 'y', 'z', 'A', 'B', 'C', 'Ç', 'D', 'E', 'F', 'G', 'Ğ', 'H', 'I', 'İ', 'J', 'K', 'L', 'M', 'N', 'O', 'Ö', 'P', 'R', 'S', 'Ş', 'T', 'U', 'Ü', 'V', 'Y', 'Z']) AS x ORDER BY x COLLATE 'tr';
SELECT 'english (default)';
SELECT arrayJoin(['A', 'c', 'Z', 'Q', 'e']) AS x ORDER BY x;
SELECT 'english (en_US)';
SELECT arrayJoin(['A', 'c', 'Z', 'Q', 'e']) AS x ORDER BY x COLLATE 'en_US';
SELECT 'english (en)';
SELECT arrayJoin(['A', 'c', 'Z', 'Q', 'e']) AS x ORDER BY x COLLATE 'en';
SELECT 'español (default)';
SELECT arrayJoin(['F', 'z', 'J', 'Ñ']) as x ORDER BY x;
SELECT 'español (es)';
SELECT arrayJoin(['F', 'z', 'J', 'Ñ']) as x ORDER BY x COLLATE 'es';
SELECT 'Український (default)';
SELECT arrayJoin(['ґ', 'ї', 'І', 'Б']) as x ORDER BY x;
SELECT 'Український (uk)';
SELECT arrayJoin(['ґ', 'ї', 'І', 'Б']) as x ORDER BY x COLLATE 'uk';
SELECT 'Русский (ru group by)';
SELECT x, n FROM (SELECT ['а', 'я', 'ё', 'А', 'Я', 'Ё'] AS arr) ARRAY JOIN arr AS x, arrayEnumerate(arr) AS n ORDER BY x COLLATE 'ru', n; SELECT x, n FROM (SELECT ['а', 'я', 'ё', 'А', 'Я', 'Ё'] AS arr) ARRAY JOIN arr AS x, arrayEnumerate(arr) AS n ORDER BY x COLLATE 'ru', n;
--- Const expression
SELECT 'ζ' as x ORDER BY x COLLATE 'el';
--- Trash locales
SELECT '' as x ORDER BY x COLLATE 'qq'; --{serverError 186}
SELECT '' as x ORDER BY x COLLATE 'qwe'; --{serverError 186}
SELECT '' as x ORDER BY x COLLATE 'some_non_existing_locale'; --{serverError 186}
SELECT '' as x ORDER BY x COLLATE 'ру'; --{serverError 186}