ClickHouse/src/Columns/Collator.cpp
Alexander Tokmakov 70d1adfe4b
Better formatting for exception messages (#45449)
* save format string for NetException

* format exceptions

* format exceptions 2

* format exceptions 3

* format exceptions 4

* format exceptions 5

* format exceptions 6

* fix

* format exceptions 7

* format exceptions 8

* Update MergeTreeIndexGin.cpp

* Update AggregateFunctionMap.cpp

* Update AggregateFunctionMap.cpp

* fix
2023-01-24 00:13:58 +03:00

156 lines
4.5 KiB
C++

#include <Columns/Collator.h>
#include "config.h"
#if USE_ICU
# include <unicode/locid.h>
# include <unicode/ucnv.h>
# include <unicode/ucol.h>
# include <unicode/unistr.h>
#else
# if defined(__clang__)
# pragma clang diagnostic ignored "-Wunused-private-field"
# pragma clang diagnostic ignored "-Wmissing-noreturn"
# endif
#endif
#include <Common/Exception.h>
#include <Poco/String.h>
#include <algorithm>
#include <base/sort.h>
/// Error codes used in this file; this block only declares them (`extern`).
namespace DB::ErrorCodes
{
    extern const int UNSUPPORTED_COLLATION_LOCALE;
    extern const int COLLATION_COMPARISON_FAILED;
    extern const int SUPPORT_IS_DISABLED;
}
/// Builds the map of collation locales known to ICU.
/// Each entry is keyed by the lower-cased locale name and stores the original locale name
/// together with its English display language (if ICU could provide one).
/// Without ICU the map stays empty.
AvailableCollationLocales::AvailableCollationLocales()
{
#if USE_ICU
    /// Capacity (in UChars) of the buffer that receives the display language name.
    static constexpr int32_t MAX_LANG_LENGTH = 128;

    /// ICU counts and indexes locales with int32_t, so use the same type for the loop.
    const int32_t available_locales_count = ucol_countAvailable();
    for (int32_t i = 0; i < available_locales_count; ++i)
    {
        std::string locale_name = ucol_getAvailable(i);
        UChar lang_buffer[MAX_LANG_LENGTH];
        UErrorCode status = U_ZERO_ERROR;

        /// All names will be in English language
        const int32_t lang_length = uloc_getDisplayLanguage(
            locale_name.c_str(), "en", lang_buffer, MAX_LANG_LENGTH, &status);

        std::optional<std::string> lang;
        if (!U_FAILURE(status))
        {
            /// Convert the UTF-16 name to UTF-8. English display names are not guaranteed to
            /// consist of ICU "invariant" characters only (e.g. "Norwegian Bokmål"), and
            /// u_UCharsToChars replaces every non-invariant character with '\0'.
            /// A failure status (including buffer overflow) is filtered out above, so
            /// lang_length never exceeds the buffer capacity here.
            lang.emplace();
            icu::UnicodeString(lang_buffer, lang_length).toUTF8String(*lang);
        }

        locales_map.emplace(Poco::toLower(locale_name), LocaleAndLanguage{locale_name, lang});
    }
#endif
}
/// Returns the shared AvailableCollationLocales object.
/// It is a function-local static, so it is constructed on the first call.
const AvailableCollationLocales & AvailableCollationLocales::instance()
{
    static AvailableCollationLocales available_locales;
    return available_locales;
}
/// Returns every known collation locale, sorted by locale name in ascending order.
AvailableCollationLocales::LocalesVector AvailableCollationLocales::getAvailableCollations() const
{
    LocalesVector result;
    /// The final size is known up front: allocate once instead of growing on every push_back.
    result.reserve(locales_map.size());
    for (const auto & name_and_locale : locales_map)
        result.push_back(name_and_locale.second);

    auto comparator = [] (const LocaleAndLanguage & f, const LocaleAndLanguage & s)
    {
        return f.locale_name < s.locale_name;
    };
    ::sort(result.begin(), result.end(), comparator);

    return result;
}
/// Checks whether a collation locale with the given name is known.
/// The check is case-insensitive.
bool AvailableCollationLocales::isCollationSupported(const std::string & locale_name) const
{
    /// Locale names are accepted in any letter case, so the requested name is lower-cased before the lookup.
    const std::string normalized_name = Poco::toLower(locale_name);
    return locales_map.contains(normalized_name);
}
/// Creates a collator for the given locale name (matched case-insensitively).
/// Throws UNSUPPORTED_COLLATION_LOCALE if the locale is unknown or ICU fails to open it,
/// and SUPPORT_IS_DISABLED if the binary was built without ICU.
Collator::Collator(const std::string & locale_)
    : locale(Poco::toLower(locale_))
{
#if !USE_ICU
    throw DB::Exception(DB::ErrorCodes::SUPPORT_IS_DISABLED,
        "Collations support is disabled, because ClickHouse was built without ICU library");
#else
    /// Validate the name ourselves: ucol_open silently falls back to the default locale
    /// for almost any random name instead of reporting an error.
    const auto & known_locales = AvailableCollationLocales::instance();
    if (!known_locales.isCollationSupported(locale))
        throw DB::Exception(DB::ErrorCodes::UNSUPPORTED_COLLATION_LOCALE, "Unsupported collation locale: {}", locale);

    UErrorCode open_status = U_ZERO_ERROR;
    collator = ucol_open(locale.c_str(), &open_status);
    if (U_FAILURE(open_status))
    {
        /// The destructor does not run when the constructor throws, so release the handle here.
        ucol_close(collator);
        throw DB::Exception(DB::ErrorCodes::UNSUPPORTED_COLLATION_LOCALE, "Failed to open locale: {} with error: {}", locale, u_errorName(open_status));
    }
#endif
}
/// Closes the ICU collator handle that the constructor opened with ucol_open.
/// Does nothing when built without ICU.
Collator::~Collator() // NOLINT
{
#if USE_ICU
ucol_close(collator);
#endif
}
/// Compares two UTF-8 byte ranges according to the rules of this collator.
/// Returns -1, 0 or 1 if the first string is less than, equal to or greater than the second one.
/// Throws COLLATION_COMPARISON_FAILED if ICU reports an error.
int Collator::compare(const char * str1, size_t length1, const char * str2, size_t length2) const
{
#if !USE_ICU
    /// Built without ICU: the arguments are unused and 0 is returned.
    /// The constructor throws in this configuration.
    (void)str1;
    (void)length1;
    (void)str2;
    (void)length2;
    return 0;
#else
    UCharIterator lhs_iter;
    UCharIterator rhs_iter;
    uiter_setUTF8(&lhs_iter, str1, length1);
    uiter_setUTF8(&rhs_iter, str2, length2);

    UErrorCode error_code = U_ZERO_ERROR;
    const UCollationResult order = ucol_strcollIter(collator, &lhs_iter, &rhs_iter, &error_code);

    if (U_FAILURE(error_code))
        throw DB::Exception(DB::ErrorCodes::COLLATION_COMPARISON_FAILED, "ICU collation comparison failed with error code: {}",
            std::string(u_errorName(error_code)));

    /** The numeric values of UCollationResult already match the contract of this function:
      *     UCOL_EQUAL = 0
      *     UCOL_GREATER = 1
      *     UCOL_LESS = -1
      * so the enum value is returned as is.
      */
    return order;
#endif
}
/// Returns the locale name of this collator.
/// The name is stored lower-cased by the constructor.
const std::string & Collator::getLocale() const
{
return locale;
}