ClickHouse/src/Columns/Collator.cpp

Ignoring revisions in .git-blame-ignore-revs. Click here to bypass and see the normal blame view.

156 lines
4.5 KiB
C++
Raw Normal View History

2017-11-20 06:01:05 +00:00
#include <Columns/Collator.h>
#include "config.h"
#if USE_ICU
# include <unicode/locid.h>
# include <unicode/ucnv.h>
# include <unicode/ucol.h>
# include <unicode/unistr.h>
2017-07-21 17:59:17 +00:00
#else
# if defined(__clang__)
# pragma clang diagnostic ignored "-Wunused-private-field"
# pragma clang diagnostic ignored "-Wmissing-noreturn"
# endif
#endif
#include <Common/Exception.h>
#include <Poco/String.h>
2019-12-05 15:31:45 +00:00
#include <algorithm>
2022-01-30 19:49:48 +00:00
#include <base/sort.h>
/// Error codes used in this file; declared here, defined elsewhere.
namespace DB::ErrorCodes
{
    extern const int UNSUPPORTED_COLLATION_LOCALE;
    extern const int COLLATION_COMPARISON_FAILED;
    extern const int SUPPORT_IS_DISABLED;
}
2019-12-05 15:31:45 +00:00
2019-12-06 22:22:12 +00:00
/// Fills `locales_map` with every collation locale reported by ICU.
/// The key is the lower-cased locale name; the value holds the original
/// locale name and, when ICU can provide it, the English display name of
/// the locale's language (UTF-8 encoded).
/// When built without ICU the map is left empty.
AvailableCollationLocales::AvailableCollationLocales()
{
#if USE_ICU
    /// ICU APIs take and return int32_t for counts, indexes and lengths.
    static constexpr int32_t MAX_LANG_LENGTH = 128;

    const int32_t available_locales_count = ucol_countAvailable();
    for (int32_t i = 0; i < available_locales_count; ++i)
    {
        const std::string locale_name = ucol_getAvailable(i);

        UChar lang_buffer[MAX_LANG_LENGTH];
        UErrorCode status = U_ZERO_ERROR;

        /// All names will be in English language
        const int32_t lang_length = uloc_getDisplayLanguage(
            locale_name.c_str(), "en", lang_buffer, MAX_LANG_LENGTH, &status);

        std::optional<std::string> lang;
        if (!U_FAILURE(status))
        {
            /// Convert the UTF-16 display name to UTF-8.
            /// u_UCharsToChars must not be used here: it is only defined for ICU
            /// "invariant" characters, and English display names may contain others
            /// (e.g. "Norwegian Bokmål"), which it would turn into NUL bytes.
            /// On success lang_length never exceeds MAX_LANG_LENGTH, because a truncated
            /// result is reported as U_BUFFER_OVERFLOW_ERROR (a failure status).
            lang.emplace();
            icu::UnicodeString(lang_buffer, lang_length).toUTF8String(*lang);
        }

        locales_map.emplace(Poco::toLower(locale_name), LocaleAndLanguage{locale_name, lang});
    }
#endif
}
2019-12-06 22:22:12 +00:00
/// Returns the single shared registry of collation locales.
/// The registry is a function-local static, so it is built on the first call.
const AvailableCollationLocales & AvailableCollationLocales::instance()
{
    static AvailableCollationLocales locales_registry;
    return locales_registry;
}
2019-12-06 22:22:12 +00:00
/// Returns a copy of all known locale entries, ordered by their
/// (original-case) locale name.
AvailableCollationLocales::LocalesVector AvailableCollationLocales::getAvailableCollations() const
{
    LocalesVector sorted_locales;
    for (const auto & [lowercase_name, entry] : locales_map)
        sorted_locales.push_back(entry);

    ::sort(sorted_locales.begin(), sorted_locales.end(),
        [] (const LocaleAndLanguage & lhs, const LocaleAndLanguage & rhs)
        {
            return lhs.locale_name < rhs.locale_name;
        });

    return sorted_locales;
}
2019-12-06 22:22:12 +00:00
bool AvailableCollationLocales::isCollationSupported(const std::string & locale_name) const
2019-12-05 15:31:45 +00:00
{
2019-12-06 22:22:12 +00:00
/// We support locale names in any case, so we have to convert all to lower case
return locales_map.contains(Poco::toLower(locale_name));
2019-12-05 15:31:45 +00:00
}
2019-12-06 22:22:12 +00:00
/// Creates a collator for the given locale name (matched case-insensitively).
/// Throws UNSUPPORTED_COLLATION_LOCALE if the locale is unknown or ICU fails to
/// open it, and SUPPORT_IS_DISABLED when built without ICU.
Collator::Collator(const std::string & locale_)
    : locale(Poco::toLower(locale_))
{
#if USE_ICU
    /// Validate the name ourselves first: ucol_open silently falls back to the
    /// default locale for almost any unknown name.
    const AvailableCollationLocales & known_locales = AvailableCollationLocales::instance();
    if (!known_locales.isCollationSupported(locale))
        throw DB::Exception(DB::ErrorCodes::UNSUPPORTED_COLLATION_LOCALE, "Unsupported collation locale: {}", locale);

    UErrorCode status = U_ZERO_ERROR;
    UCollator * opened_collator = ucol_open(locale.c_str(), &status);

    if (U_FAILURE(status))
    {
        /// The destructor does not run when the constructor throws, so release the handle here.
        ucol_close(opened_collator);
        throw DB::Exception(DB::ErrorCodes::UNSUPPORTED_COLLATION_LOCALE, "Failed to open locale: {} with error: {}", locale, u_errorName(status));
    }

    collator = opened_collator;
#else
    throw DB::Exception(DB::ErrorCodes::SUPPORT_IS_DISABLED,
                        "Collations support is disabled, because ClickHouse was built without ICU library");
#endif
}
2020-03-11 17:56:57 +00:00
/// Closes the ICU collator handle that the constructor opened.
/// Does nothing when built without ICU.
Collator::~Collator() // NOLINT
{
#if USE_ICU
ucol_close(collator);
#endif
}
/// Compares two UTF-8 byte ranges using this collator.
/// Returns a negative value, zero or a positive value when the first string
/// sorts before, equal to or after the second one.
/// Throws COLLATION_COMPARISON_FAILED if ICU reports an error.
/// When built without ICU the arguments are ignored and 0 is returned.
int Collator::compare(const char * str1, size_t length1, const char * str2, size_t length2) const
{
#if USE_ICU
    UCharIterator lhs_iter;
    UCharIterator rhs_iter;
    uiter_setUTF8(&lhs_iter, str1, length1);
    uiter_setUTF8(&rhs_iter, str2, length2);

    UErrorCode status = U_ZERO_ERROR;
    const UCollationResult order = ucol_strcollIter(collator, &lhs_iter, &rhs_iter, &status);

    if (U_FAILURE(status))
        throw DB::Exception(DB::ErrorCodes::COLLATION_COMPARISON_FAILED, "ICU collation comparison failed with error code: {}",
                            std::string(u_errorName(status)));

    /** The values of enum UCollationResult are exactly what we need to return:
      *     UCOL_EQUAL = 0
      *     UCOL_GREATER = 1
      *     UCOL_LESS = -1
      */
    return order;
#else
    static_cast<void>(str1);
    static_cast<void>(length1);
    static_cast<void>(str2);
    static_cast<void>(length2);
    return 0;
#endif
}
/// Returns the locale name this collator was created with, in the lower-cased
/// form stored by the constructor.
const std::string & Collator::getLocale() const
{
return locale;
}