2017-11-20 06:01:05 +00:00
|
|
|
#include <Columns/Collator.h>
|
2017-01-25 18:19:15 +00:00
|
|
|
|
2022-09-28 13:29:29 +00:00
|
|
|
#include "config.h"
|
2017-07-19 11:22:39 +00:00
|
|
|
|
|
|
|
#if USE_ICU
|
2020-04-16 12:31:57 +00:00
|
|
|
# include <unicode/locid.h>
|
|
|
|
# include <unicode/ucnv.h>
|
|
|
|
# include <unicode/ucol.h>
|
|
|
|
# include <unicode/unistr.h>
|
2017-07-21 17:59:17 +00:00
|
|
|
#else
|
2020-04-16 12:31:57 +00:00
|
|
|
# if defined(__clang__)
|
|
|
|
# pragma clang diagnostic ignored "-Wunused-private-field"
|
|
|
|
# pragma clang diagnostic ignored "-Wmissing-noreturn"
|
|
|
|
# endif
|
2017-07-19 11:22:39 +00:00
|
|
|
#endif
|
|
|
|
|
2017-04-01 09:19:00 +00:00
|
|
|
#include <Common/Exception.h>
|
2017-01-25 18:19:15 +00:00
|
|
|
#include <Poco/String.h>
|
2019-12-05 15:31:45 +00:00
|
|
|
#include <algorithm>
|
2022-01-30 19:49:48 +00:00
|
|
|
#include <base/sort.h>
|
2017-01-25 18:19:15 +00:00
|
|
|
|
2017-04-19 01:06:29 +00:00
|
|
|
|
|
|
|
/// Error codes used by this file. They are only declared here; the definitions live elsewhere.
namespace DB::ErrorCodes
{
    extern const int UNSUPPORTED_COLLATION_LOCALE;
    extern const int COLLATION_COMPARISON_FAILED;
    extern const int SUPPORT_IS_DISABLED;
}
|
|
|
|
|
2019-12-05 15:31:45 +00:00
|
|
|
|
2019-12-06 22:22:12 +00:00
|
|
|
/// Fills locales_map with every collation locale reported by ICU, keyed by the lower-cased locale name.
/// When USE_ICU is not set the body is compiled out and nothing is added to the map.
AvailableCollationLocales::AvailableCollationLocales()
{
#if USE_ICU
    static const size_t MAX_LANG_LENGTH = 128;

    const size_t locales_count = ucol_countAvailable();
    for (size_t locale_idx = 0; locale_idx < locales_count; ++locale_idx)
    {
        const std::string icu_name = ucol_getAvailable(locale_idx);

        UChar wide_language[MAX_LANG_LENGTH];
        UErrorCode error = U_ZERO_ERROR;

        /// The display name is always requested in English ("en"), whatever locale is being described.
        const size_t language_length = uloc_getDisplayLanguage(
            icu_name.c_str(), "en", wide_language, MAX_LANG_LENGTH, &error);

        /// Stays empty when ICU reported a failure for this locale.
        std::optional<std::string> language;
        if (!U_FAILURE(error))
        {
            /// Narrow the UChar array to a plain char array.
            /// The name is in English, so every UChar is expected to fit into a single char.
            char narrow_language[MAX_LANG_LENGTH];
            u_UCharsToChars(wide_language, narrow_language, language_length);
            language.emplace(narrow_language, language_length);
        }

        locales_map.emplace(Poco::toLower(icu_name), LocaleAndLanguage{icu_name, language});
    }
#endif
}
|
|
|
|
|
2019-12-06 22:22:12 +00:00
|
|
|
/// Returns the shared AvailableCollationLocales object.
/// It is a function-local static, so it is constructed the first time this function is called.
const AvailableCollationLocales & AvailableCollationLocales::instance()
{
    static AvailableCollationLocales shared_locales;
    return shared_locales;
}
|
|
|
|
|
2019-12-06 22:22:12 +00:00
|
|
|
/// Returns a copy of every entry stored in locales_map, sorted by locale_name in ascending order.
AvailableCollationLocales::LocalesVector AvailableCollationLocales::getAvailableCollations() const
{
    LocalesVector result;

    /// The final size is known in advance: allocate once instead of regrowing on every push_back.
    result.reserve(locales_map.size());
    for (const auto & name_and_locale : locales_map)
        result.push_back(name_and_locale.second);

    /// The order of the map is not relied upon; the result is explicitly ordered by locale name.
    auto by_locale_name = [] (const LocaleAndLanguage & lhs, const LocaleAndLanguage & rhs)
    {
        return lhs.locale_name < rhs.locale_name;
    };
    ::sort(result.begin(), result.end(), by_locale_name);

    return result;
}
|
|
|
|
|
2019-12-06 22:22:12 +00:00
|
|
|
bool AvailableCollationLocales::isCollationSupported(const std::string & locale_name) const
|
2019-12-05 15:31:45 +00:00
|
|
|
{
|
2019-12-06 22:22:12 +00:00
|
|
|
/// We support locale names in any case, so we have to convert all to lower case
|
2022-04-18 10:18:43 +00:00
|
|
|
return locales_map.contains(Poco::toLower(locale_name));
|
2019-12-05 15:31:45 +00:00
|
|
|
}
|
|
|
|
|
2019-12-06 22:22:12 +00:00
|
|
|
/// Opens an ICU collator for the given locale; the locale name is stored lower-cased.
/// Throws UNSUPPORTED_COLLATION_LOCALE if the locale is not in the list of available collations
/// or if ICU fails to open it, and SUPPORT_IS_DISABLED when built without ICU.
Collator::Collator(const std::string & locale_)
    : locale(Poco::toLower(locale_))
{
#if USE_ICU
    /// The check is done up front, because ucol_open will fallback to default locale for
    /// almost all random names.
    const bool is_supported = AvailableCollationLocales::instance().isCollationSupported(locale);
    if (!is_supported)
        throw DB::Exception("Unsupported collation locale: " + locale, DB::ErrorCodes::UNSUPPORTED_COLLATION_LOCALE);

    UErrorCode error = U_ZERO_ERROR;
    collator = ucol_open(locale.c_str(), &error);

    if (U_FAILURE(error))
    {
        /// The destructor does not run for an object whose constructor throws, so release the handle here.
        ucol_close(collator);
        throw DB::Exception("Failed to open locale: " + locale + " with error: " + u_errorName(error), DB::ErrorCodes::UNSUPPORTED_COLLATION_LOCALE);
    }
#else
    throw DB::Exception("Collations support is disabled, because ClickHouse was built without ICU library", DB::ErrorCodes::SUPPORT_IS_DISABLED);
#endif
}
|
|
|
|
|
2017-04-19 01:06:29 +00:00
|
|
|
|
2020-03-11 17:56:57 +00:00
|
|
|
/// Closes the ICU collator handle. Without ICU (USE_ICU is not set) the body is empty.
Collator::~Collator() // NOLINT
{
#if USE_ICU
    /// Release the handle stored in the `collator` member.
    ucol_close(collator);
#endif
}
|
|
|
|
|
|
|
|
int Collator::compare(const char * str1, size_t length1, const char * str2, size_t length2) const
|
|
|
|
{
|
2017-04-19 01:06:29 +00:00
|
|
|
#if USE_ICU
|
2017-01-25 18:19:15 +00:00
|
|
|
UCharIterator iter1, iter2;
|
|
|
|
uiter_setUTF8(&iter1, str1, length1);
|
|
|
|
uiter_setUTF8(&iter2, str2, length2);
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2017-01-25 18:19:15 +00:00
|
|
|
UErrorCode status = U_ZERO_ERROR;
|
|
|
|
UCollationResult compare_result = ucol_strcollIter(collator, &iter1, &iter2, &status);
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2019-12-05 15:31:45 +00:00
|
|
|
if (U_FAILURE(status))
|
|
|
|
throw DB::Exception("ICU collation comparison failed with error code: " + std::string(u_errorName(status)),
|
2017-01-25 18:19:15 +00:00
|
|
|
DB::ErrorCodes::COLLATION_COMPARISON_FAILED);
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2017-01-25 18:19:15 +00:00
|
|
|
/** Values of enum UCollationResult are equals to what exactly we need:
|
|
|
|
* UCOL_EQUAL = 0
|
|
|
|
* UCOL_GREATER = 1
|
|
|
|
* UCOL_LESS = -1
|
|
|
|
*/
|
|
|
|
return compare_result;
|
2017-04-19 01:06:29 +00:00
|
|
|
#else
|
2017-12-09 19:47:39 +00:00
|
|
|
(void)str1;
|
|
|
|
(void)length1;
|
|
|
|
(void)str2;
|
|
|
|
(void)length2;
|
2017-04-19 01:06:29 +00:00
|
|
|
return 0;
|
|
|
|
#endif
|
2017-01-25 18:19:15 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
const std::string & Collator::getLocale() const
|
|
|
|
{
|
|
|
|
return locale;
|
|
|
|
}
|