2022-09-28 13:29:29 +00:00
|
|
|
#include "config.h"
|
2017-04-19 01:06:29 +00:00
|
|
|
|
2020-04-16 12:31:57 +00:00
|
|
|
#if USE_ICU
|
|
|
|
# include <Columns/ColumnConst.h>
|
|
|
|
# include <Columns/ColumnString.h>
|
|
|
|
# include <DataTypes/DataTypeString.h>
|
|
|
|
# include <Functions/FunctionFactory.h>
|
|
|
|
# include <Functions/FunctionHelpers.h>
|
|
|
|
# include <Functions/IFunction.h>
|
|
|
|
# include <IO/WriteHelpers.h>
|
|
|
|
# include <Common/ObjectPool.h>
|
|
|
|
# include <Common/typeid_cast.h>
|
2021-10-02 07:13:14 +00:00
|
|
|
# include <base/range.h>
|
2016-08-25 21:44:47 +00:00
|
|
|
|
2020-04-16 12:31:57 +00:00
|
|
|
# include <memory>
|
|
|
|
# include <string>
|
|
|
|
# include <unicode/ucnv.h>
|
2016-08-25 21:44:47 +00:00
|
|
|
|
|
|
|
|
|
|
|
namespace DB
|
|
|
|
{
|
|
|
|
|
|
|
|
/// Error codes used by convertCharset; they are only declared here (extern), the definitions live elsewhere.
namespace ErrorCodes
{
    /// Argument / column validation.
    extern const int ILLEGAL_TYPE_OF_ARGUMENT;
    extern const int ILLEGAL_COLUMN;

    /// ICU converter failures.
    extern const int CANNOT_CREATE_CHARSET_CONVERTER;
    extern const int CANNOT_CONVERT_CHARSET;
}
|
|
|
|
|
2020-09-07 18:00:37 +00:00
|
|
|
namespace
|
|
|
|
{
|
2016-08-25 21:44:47 +00:00
|
|
|
|
|
|
|
/** convertCharset(s, from, to)
|
|
|
|
*
|
|
|
|
* Assuming string 's' contains bytes in charset 'from',
|
|
|
|
* returns another string with bytes, representing same content in charset 'to'.
|
|
|
|
* from and to must be constants.
|
|
|
|
*
|
|
|
|
* When bytes are illegal in 'from' charset or are not representable in 'to' charset,
|
|
|
|
* behavior is implementation specific.
|
|
|
|
*/
|
|
|
|
class FunctionConvertCharset : public IFunction
|
|
|
|
{
|
|
|
|
private:
|
2017-03-11 04:36:14 +00:00
|
|
|
struct Converter : private boost::noncopyable
|
2016-08-25 21:44:47 +00:00
|
|
|
{
|
2017-03-11 04:36:14 +00:00
|
|
|
UConverter * impl;
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2017-09-07 21:04:48 +00:00
|
|
|
explicit Converter(const String & charset)
|
2016-08-25 21:44:47 +00:00
|
|
|
{
|
2017-03-11 04:36:14 +00:00
|
|
|
UErrorCode status = U_ZERO_ERROR;
|
|
|
|
impl = ucnv_open(charset.data(), &status);
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2017-03-11 04:36:14 +00:00
|
|
|
if (U_SUCCESS(status))
|
|
|
|
ucnv_setToUCallBack(impl,
|
|
|
|
UCNV_TO_U_CALLBACK_SUBSTITUTE,
|
|
|
|
nullptr,
|
|
|
|
nullptr, nullptr,
|
|
|
|
&status);
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2017-03-11 04:36:14 +00:00
|
|
|
if (U_SUCCESS(status))
|
|
|
|
ucnv_setFromUCallBack(impl,
|
|
|
|
UCNV_FROM_U_CALLBACK_SUBSTITUTE,
|
|
|
|
nullptr,
|
|
|
|
nullptr, nullptr,
|
|
|
|
&status);
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2017-03-11 04:36:14 +00:00
|
|
|
if (!U_SUCCESS(status))
|
2023-01-23 21:13:58 +00:00
|
|
|
throw Exception(ErrorCodes::CANNOT_CREATE_CHARSET_CONVERTER, "Cannot create UConverter with charset {}, error: {}",
|
|
|
|
charset, String(u_errorName(status)));
|
2016-08-25 21:44:47 +00:00
|
|
|
}
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2017-03-11 04:36:14 +00:00
|
|
|
~Converter()
|
2016-08-25 21:44:47 +00:00
|
|
|
{
|
2017-03-11 04:36:14 +00:00
|
|
|
ucnv_close(impl);
|
2016-08-25 21:44:47 +00:00
|
|
|
}
|
|
|
|
};
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2016-08-25 21:44:47 +00:00
|
|
|
/// Separate converter is created for each thread.
|
2017-03-11 04:36:14 +00:00
|
|
|
using Pool = ObjectPoolMap<Converter, String>;
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2020-03-18 00:57:00 +00:00
|
|
|
static Pool::Pointer getConverter(const String & charset)
|
2016-08-25 21:44:47 +00:00
|
|
|
{
|
|
|
|
static Pool pool;
|
2017-03-11 04:36:14 +00:00
|
|
|
return pool.get(charset, [&charset] { return new Converter(charset); });
|
2016-08-25 21:44:47 +00:00
|
|
|
}
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2020-03-18 00:57:00 +00:00
|
|
|
static void convert(const String & from_charset, const String & to_charset,
|
2018-11-25 00:08:50 +00:00
|
|
|
const ColumnString::Chars & from_chars, const ColumnString::Offsets & from_offsets,
|
|
|
|
ColumnString::Chars & to_chars, ColumnString::Offsets & to_offsets)
|
2016-08-25 21:44:47 +00:00
|
|
|
{
|
2017-03-11 04:36:14 +00:00
|
|
|
auto converter_from = getConverter(from_charset);
|
|
|
|
auto converter_to = getConverter(to_charset);
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2017-12-15 21:32:25 +00:00
|
|
|
ColumnString::Offset current_from_offset = 0;
|
|
|
|
ColumnString::Offset current_to_offset = 0;
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2016-08-25 21:44:47 +00:00
|
|
|
size_t size = from_offsets.size();
|
2017-03-11 04:36:14 +00:00
|
|
|
to_offsets.resize(size);
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2017-03-11 04:36:14 +00:00
|
|
|
PODArray<UChar> uchars;
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2016-08-25 21:44:47 +00:00
|
|
|
for (size_t i = 0; i < size; ++i)
|
|
|
|
{
|
|
|
|
size_t from_string_size = from_offsets[i] - current_from_offset - 1;
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2016-08-25 21:44:47 +00:00
|
|
|
/// We assume that empty string is empty in every charset.
|
|
|
|
if (0 != from_string_size)
|
|
|
|
{
|
2017-03-11 04:36:14 +00:00
|
|
|
/// reset state of converter
|
|
|
|
ucnv_reset(converter_from->impl);
|
|
|
|
ucnv_reset(converter_to->impl);
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2017-03-11 04:36:14 +00:00
|
|
|
/// maximum number of code points is number of bytes in input string plus one for terminating zero
|
|
|
|
uchars.resize(from_string_size + 1);
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2017-03-11 04:36:14 +00:00
|
|
|
UErrorCode status = U_ZERO_ERROR;
|
|
|
|
int32_t res = ucnv_toUChars(
|
|
|
|
converter_from->impl,
|
|
|
|
uchars.data(), uchars.size(),
|
|
|
|
reinterpret_cast<const char *>(&from_chars[current_from_offset]), from_string_size,
|
|
|
|
&status);
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2017-03-11 04:36:14 +00:00
|
|
|
if (!U_SUCCESS(status))
|
2023-01-23 21:13:58 +00:00
|
|
|
throw Exception(ErrorCodes::CANNOT_CONVERT_CHARSET, "Cannot convert from charset {}, error: {}",
|
|
|
|
from_charset, String(u_errorName(status)));
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2017-03-11 04:36:14 +00:00
|
|
|
auto max_to_char_size = ucnv_getMaxCharSize(converter_to->impl);
|
|
|
|
auto max_to_size = UCNV_GET_MAX_BYTES_FOR_STRING(res, max_to_char_size);
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2017-03-11 04:36:14 +00:00
|
|
|
to_chars.resize(current_to_offset + max_to_size);
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2017-03-11 04:36:14 +00:00
|
|
|
res = ucnv_fromUChars(
|
|
|
|
converter_to->impl,
|
|
|
|
reinterpret_cast<char *>(&to_chars[current_to_offset]), max_to_size,
|
|
|
|
uchars.data(), res,
|
|
|
|
&status);
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2017-03-11 04:36:14 +00:00
|
|
|
if (!U_SUCCESS(status))
|
2023-01-23 21:13:58 +00:00
|
|
|
throw Exception(ErrorCodes::CANNOT_CONVERT_CHARSET, "Cannot convert to charset {}, error: {}",
|
|
|
|
to_charset, String(u_errorName(status)));
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2017-03-11 04:36:14 +00:00
|
|
|
current_to_offset += res;
|
2016-08-25 21:44:47 +00:00
|
|
|
}
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2016-08-25 21:44:47 +00:00
|
|
|
if (to_chars.size() < current_to_offset + 1)
|
|
|
|
to_chars.resize(current_to_offset + 1);
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2016-08-25 21:44:47 +00:00
|
|
|
to_chars[current_to_offset] = 0;
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2016-08-25 21:44:47 +00:00
|
|
|
++current_to_offset;
|
|
|
|
to_offsets[i] = current_to_offset;
|
2016-08-26 02:24:17 +00:00
|
|
|
|
|
|
|
current_from_offset = from_offsets[i];
|
2016-08-25 21:44:47 +00:00
|
|
|
}
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2016-08-25 21:44:47 +00:00
|
|
|
to_chars.resize(current_to_offset);
|
|
|
|
}
|
|
|
|
|
|
|
|
public:
|
|
|
|
static constexpr auto name = "convertCharset";
|
2021-06-01 12:20:52 +00:00
|
|
|
static FunctionPtr create(ContextPtr) { return std::make_shared<FunctionConvertCharset>(); }
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2016-08-25 21:44:47 +00:00
|
|
|
String getName() const override
|
|
|
|
{
|
|
|
|
return name;
|
|
|
|
}
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2016-12-29 19:38:10 +00:00
|
|
|
size_t getNumberOfArguments() const override { return 3; }
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2021-06-22 16:21:23 +00:00
|
|
|
bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return true; }
|
2021-04-29 14:48:26 +00:00
|
|
|
|
2016-10-19 15:00:56 +00:00
|
|
|
DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override
|
2016-08-25 21:44:47 +00:00
|
|
|
{
|
2021-06-15 19:55:21 +00:00
|
|
|
for (size_t i : collections::range(0, 3))
|
2018-09-07 14:37:26 +00:00
|
|
|
if (!isString(arguments[i]))
|
2023-01-23 21:13:58 +00:00
|
|
|
throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Illegal type {} of argument of function {}, must be String",
|
|
|
|
arguments[i]->getName(), getName());
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2016-08-25 21:44:47 +00:00
|
|
|
return std::make_shared<DataTypeString>();
|
|
|
|
}
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2017-07-23 08:40:43 +00:00
|
|
|
bool useDefaultImplementationForConstants() const override { return true; }
|
|
|
|
ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {1, 2}; }
|
|
|
|
|
2020-11-17 13:24:45 +00:00
|
|
|
ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t /*input_rows_count*/) const override
|
2016-08-25 21:44:47 +00:00
|
|
|
{
|
2020-10-17 16:48:53 +00:00
|
|
|
const ColumnWithTypeAndName & arg_from = arguments[0];
|
|
|
|
const ColumnWithTypeAndName & arg_charset_from = arguments[1];
|
|
|
|
const ColumnWithTypeAndName & arg_charset_to = arguments[2];
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2017-07-21 06:35:58 +00:00
|
|
|
const ColumnConst * col_charset_from = checkAndGetColumnConstStringOrFixedString(arg_charset_from.column.get());
|
|
|
|
const ColumnConst * col_charset_to = checkAndGetColumnConstStringOrFixedString(arg_charset_to.column.get());
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2016-08-25 21:44:47 +00:00
|
|
|
if (!col_charset_from || !col_charset_to)
|
2023-01-23 21:13:58 +00:00
|
|
|
throw Exception(ErrorCodes::ILLEGAL_COLUMN,
|
|
|
|
"2nd and 3rd arguments of function {} (source charset and destination charset) must "
|
|
|
|
"be constant strings.", getName());
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2017-07-21 06:35:58 +00:00
|
|
|
String charset_from = col_charset_from->getValue<String>();
|
|
|
|
String charset_to = col_charset_to->getValue<String>();
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2017-07-21 06:35:58 +00:00
|
|
|
if (const ColumnString * col_from = checkAndGetColumn<ColumnString>(arg_from.column.get()))
|
2016-08-25 21:44:47 +00:00
|
|
|
{
|
2017-12-14 01:43:19 +00:00
|
|
|
auto col_to = ColumnString::create();
|
2016-08-25 21:44:47 +00:00
|
|
|
convert(charset_from, charset_to, col_from->getChars(), col_from->getOffsets(), col_to->getChars(), col_to->getOffsets());
|
2020-10-17 16:48:53 +00:00
|
|
|
return col_to;
|
2016-08-25 21:44:47 +00:00
|
|
|
}
|
|
|
|
else
|
2023-01-23 21:13:58 +00:00
|
|
|
throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Illegal column passed as first argument of function {} (must be ColumnString).", getName());
|
2016-08-25 21:44:47 +00:00
|
|
|
}
|
|
|
|
};
|
|
|
|
|
2020-09-07 18:00:37 +00:00
|
|
|
}
|
2016-08-25 21:44:47 +00:00
|
|
|
|
2022-07-04 07:01:39 +00:00
|
|
|
/// Registers convertCharset in the function factory.
REGISTER_FUNCTION(ConvertCharset)
{
    factory.registerFunction<FunctionConvertCharset>();
}
|
|
|
|
|
|
|
|
}
|
2017-04-19 01:06:29 +00:00
|
|
|
|
|
|
|
#endif
|