mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-11-24 08:32:02 +00:00
Using ICU instead of iconv. Improved performance of 'convertCharset' function [#CLICKHOUSE-2879].
This commit is contained in:
parent
bd70653fc6
commit
93bb49cecd
@ -348,7 +348,7 @@ namespace ErrorCodes
|
||||
extern const int METADATA_MISMATCH = 342;
|
||||
extern const int SUPPORT_IS_DISABLED = 344;
|
||||
extern const int TABLE_DIFFERS_TOO_MUCH = 345;
|
||||
extern const int CANNOT_ICONV = 346;
|
||||
extern const int CANNOT_CONVERT_CHARSET = 346;
|
||||
extern const int CANNOT_LOAD_CONFIG = 347;
|
||||
extern const int RESHARDING_NULLABLE_SHARDING_KEY = 348;
|
||||
extern const int CANNOT_INSERT_NULL_IN_ORDINARY_COLUMN = 349;
|
||||
@ -362,6 +362,7 @@ namespace ErrorCodes
|
||||
extern const int RESERVED_IDENTIFIER_NAME = 357;
|
||||
extern const int INTO_OUTFILE_NOT_ALLOWED = 358;
|
||||
extern const int TABLE_SIZE_EXCEEDS_MAX_DROP_SIZE_LIMIT = 359;
|
||||
extern const int CANNOT_CREATE_CHARSET_CONVERTER = 360;
|
||||
|
||||
extern const int KEEPER_EXCEPTION = 999;
|
||||
extern const int POCO_EXCEPTION = 1000;
|
||||
|
@ -7,7 +7,7 @@
|
||||
#include <DB/Columns/ColumnConst.h>
|
||||
#include <ext/range.hpp>
|
||||
|
||||
#include <iconv.h>
|
||||
#include <unicode/ucnv.h>
|
||||
|
||||
|
||||
namespace DB
|
||||
@ -17,7 +17,8 @@ namespace ErrorCodes
|
||||
{
|
||||
extern const int BAD_ARGUMENTS;
|
||||
extern const int LOGICAL_ERROR;
|
||||
extern const int CANNOT_ICONV;
|
||||
extern const int CANNOT_CREATE_CHARSET_CONVERTER;
|
||||
extern const int CANNOT_CONVERT_CHARSET;
|
||||
}
|
||||
|
||||
|
||||
@ -33,50 +34,63 @@ namespace ErrorCodes
|
||||
class FunctionConvertCharset : public IFunction
|
||||
{
|
||||
private:
|
||||
using CharsetsFromTo = std::pair<String, String>;
|
||||
|
||||
struct IConv
|
||||
struct Converter : private boost::noncopyable
|
||||
{
|
||||
iconv_t impl;
|
||||
UConverter * impl;
|
||||
|
||||
IConv(const CharsetsFromTo & charsets)
|
||||
Converter(const String & charset)
|
||||
{
|
||||
impl = iconv_open(charsets.second.data(), charsets.first.data());
|
||||
if (impl == reinterpret_cast<iconv_t>(-1))
|
||||
throwFromErrno("Cannot iconv_open with charsets " + charsets.first + " and " + charsets.second,
|
||||
ErrorCodes::BAD_ARGUMENTS);
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
impl = ucnv_open(charset.data(), &status);
|
||||
|
||||
if (U_SUCCESS(status))
|
||||
ucnv_setToUCallBack(impl,
|
||||
UCNV_TO_U_CALLBACK_SUBSTITUTE,
|
||||
nullptr,
|
||||
nullptr, nullptr,
|
||||
&status);
|
||||
|
||||
if (U_SUCCESS(status))
|
||||
ucnv_setFromUCallBack(impl,
|
||||
UCNV_FROM_U_CALLBACK_SUBSTITUTE,
|
||||
nullptr,
|
||||
nullptr, nullptr,
|
||||
&status);
|
||||
|
||||
if (!U_SUCCESS(status))
|
||||
throw Exception("Cannot create UConverter with charset " + charset + ", error: " + String(u_errorName(status)),
|
||||
ErrorCodes::CANNOT_CREATE_CHARSET_CONVERTER);
|
||||
}
|
||||
|
||||
~IConv()
|
||||
~Converter()
|
||||
{
|
||||
if (-1 == iconv_close(impl))
|
||||
std::terminate();
|
||||
ucnv_close(impl);
|
||||
}
|
||||
};
|
||||
|
||||
/// Separate converter is created for each thread.
|
||||
using Pool = ObjectPoolMap<IConv, CharsetsFromTo>;
|
||||
using Pool = ObjectPoolMap<Converter, String>;
|
||||
|
||||
Pool::Pointer getConverter(const CharsetsFromTo & charsets)
|
||||
Pool::Pointer getConverter(const String & charset)
|
||||
{
|
||||
static Pool pool;
|
||||
return pool.get(charsets, [&charsets] { return new IConv(charsets); });
|
||||
return pool.get(charset, [&charset] { return new Converter(charset); });
|
||||
}
|
||||
|
||||
void convert(const String & from_charset, const String & to_charset,
|
||||
const ColumnString::Chars_t & from_chars, const ColumnString::Offsets_t & from_offsets,
|
||||
ColumnString::Chars_t & to_chars, ColumnString::Offsets_t & to_offsets)
|
||||
{
|
||||
auto converter = getConverter(CharsetsFromTo(from_charset, to_charset));
|
||||
iconv_t iconv_state = converter->impl;
|
||||
|
||||
to_chars.resize(from_chars.size());
|
||||
to_offsets.resize(from_offsets.size());
|
||||
auto converter_from = getConverter(from_charset);
|
||||
auto converter_to = getConverter(to_charset);
|
||||
|
||||
ColumnString::Offset_t current_from_offset = 0;
|
||||
ColumnString::Offset_t current_to_offset = 0;
|
||||
|
||||
size_t size = from_offsets.size();
|
||||
to_offsets.resize(size);
|
||||
|
||||
PODArray<UChar> uchars;
|
||||
|
||||
for (size_t i = 0; i < size; ++i)
|
||||
{
|
||||
@ -85,37 +99,40 @@ private:
|
||||
/// We assume that empty string is empty in every charset.
|
||||
if (0 != from_string_size)
|
||||
{
|
||||
/// reset state of iconv
|
||||
size_t res = iconv(iconv_state, nullptr, nullptr, nullptr, nullptr);
|
||||
if (static_cast<size_t>(-1) == res)
|
||||
throwFromErrno("Cannot reset iconv", ErrorCodes::CANNOT_ICONV);
|
||||
/// reset state of converter
|
||||
ucnv_reset(converter_from->impl);
|
||||
ucnv_reset(converter_to->impl);
|
||||
|
||||
/// perform conversion; resize output buffer and continue if required
|
||||
/// maximum number of code points is number of bytes in input string plus one for terminating zero
|
||||
uchars.resize(from_string_size + 1);
|
||||
|
||||
char * in_buf = const_cast<char *>(reinterpret_cast<const char *>(&from_chars[current_from_offset]));
|
||||
size_t in_bytes_left = from_string_size;
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
int32_t res = ucnv_toUChars(
|
||||
converter_from->impl,
|
||||
uchars.data(), uchars.size(),
|
||||
reinterpret_cast<const char *>(&from_chars[current_from_offset]), from_string_size,
|
||||
&status);
|
||||
|
||||
char * out_buf = reinterpret_cast<char *>(&to_chars[current_to_offset]);
|
||||
size_t out_bytes_left = to_chars.size() - current_to_offset;
|
||||
if (!U_SUCCESS(status))
|
||||
throw Exception("Cannot convert from charset " + from_charset + ", error: " + String(u_errorName(status)),
|
||||
ErrorCodes::CANNOT_CONVERT_CHARSET);
|
||||
|
||||
while (in_bytes_left)
|
||||
{
|
||||
size_t res = iconv(iconv_state, &in_buf, &in_bytes_left, &out_buf, &out_bytes_left);
|
||||
current_to_offset = to_chars.size() - out_bytes_left;
|
||||
auto max_to_char_size = ucnv_getMaxCharSize(converter_to->impl);
|
||||
auto max_to_size = UCNV_GET_MAX_BYTES_FOR_STRING(res, max_to_char_size);
|
||||
|
||||
if (static_cast<size_t>(-1) == res)
|
||||
{
|
||||
if (E2BIG == errno)
|
||||
{
|
||||
to_chars.resize(to_chars.size() * 2);
|
||||
out_buf = reinterpret_cast<char *>(&to_chars[current_to_offset]);
|
||||
out_bytes_left = to_chars.size() - current_to_offset;
|
||||
continue;
|
||||
}
|
||||
to_chars.resize(current_to_offset + max_to_size);
|
||||
|
||||
throwFromErrno("Cannot convert charset", ErrorCodes::CANNOT_ICONV);
|
||||
}
|
||||
}
|
||||
res = ucnv_fromUChars(
|
||||
converter_to->impl,
|
||||
reinterpret_cast<char *>(&to_chars[current_to_offset]), max_to_size,
|
||||
uchars.data(), res,
|
||||
&status);
|
||||
|
||||
if (!U_SUCCESS(status))
|
||||
throw Exception("Cannot convert to charset " + to_charset + ", error: " + String(u_errorName(status)),
|
||||
ErrorCodes::CANNOT_CONVERT_CHARSET);
|
||||
|
||||
current_to_offset += res;
|
||||
}
|
||||
|
||||
if (to_chars.size() < current_to_offset + 1)
|
||||
|
@ -0,0 +1,28 @@
|
||||
Row 1:
|
||||
──────
|
||||
orig: абвгдеёжзийклмнопрстуфхцчшщъыьэюяАБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ
|
||||
cp1251_hex: E0E1E2E3E4E5B8E6E7E8E9EAEBECEDEEEFF0F1F2F3F4F5F6F7F8F9FAFBFCFDFEFFC0C1C2C3C4C5A8C6C7C8C9CACBCCCDCECFD0D1D2D3D4D5D6D7D8D9DADBDCDDDEDF
|
||||
utf7_hex: 2B424441454D51517942444D454E415131424645454E675133424467454F5151364244734550415139424434455077524142454545516752444245514552515247424563455341524A42456F455377524D4245304554675250424241454551515342424D45464151564241454546675158424267454751516142427345484151644242344548775167424345454967516A424351454A51516D424363454B41517042436F454B775173424330454C6751762D
|
||||
bocu1_hex: D3E48182838485A1868788898A8B8C8D8E8F909192939495969798999A9B9C9D9E9F60616263646551666768696A6B6C6D6E6F707172737475767778797A7B7C7D7E7F
|
||||
scsu_hex: 12B0B1B2B3B4B5D1B6B7B8B9BABBBCBDBEBFC0C1C2C3C4C5C6C7C8C9CACBCCCDCECF90919293949581969798999A9B9C9D9E9FA0A1A2A3A4A5A6A7A8A9AAABACADAEAF
|
||||
orig2: абвгдеёжзийклмнопрстуфхцчшщъыьэюяАБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ
|
||||
broken1: абвгдеёжзийклмнопрстуфхцчшщъыьэюяАБВГДЕЁЖЗРЙКЛМНОПРСТУФХЦЧШЩЪЫЬРЮЯ
|
||||
broken2: абвгдеÑжзийклмнопÑÑÑÑÑÑ
ÑÑÑÑÑÑÑÑÑÑÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐРСТУФХЦЧШЩЪЫЬÐЮЯ
|
||||
broken3: п╟п╠п╡пЁп╢п╣я▒п╤п╥п╦п╧п╨п╩п╪п╫п╬п©я─я│я┌я┐я└я┘я├я┤я┬я┴я┼я▀я▄я█я▌я▐п░п▒п▓п⌠п■п∙п│п√п≈п≤п≥п п⌡п°п²п·п÷п═п║п╒пёп╓п╔п╕п╖п╗п╘п╙п╚п╛п╜п╝п╞
|
||||
restored1: абвгдеёжзийклмнопрстуфхцчшщъыьэюяАБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ
|
||||
restored2: абвгдеёжзийклмнопрстуфхцчшщъыьэюяАБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ
|
||||
restored3: абвгдеёжзийклмнопрстуфхцчшщъыьэюяАБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ
|
||||
Row 1:
|
||||
──────
|
||||
orig: абвгдеёжзийклмнопрстуфхцчшщъыьэюяАБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ
|
||||
cp1251_hex: E0E1E2E3E4E5B8E6E7E8E9EAEBECEDEEEFF0F1F2F3F4F5F6F7F8F9FAFBFCFDFEFFC0C1C2C3C4C5A8C6C7C8C9CACBCCCDCECFD0D1D2D3D4D5D6D7D8D9DADBDCDDDEDF
|
||||
utf7_hex: 2B424441454D51517942444D454E415131424645454E675133424467454F5151364244734550415139424434455077524142454545516752444245514552515247424563455341524A42456F455377524D4245304554675250424241454551515342424D45464151564241454546675158424267454751516142427345484151644242344548775167424345454967516A424351454A51516D424363454B41517042436F454B775173424330454C6751762D
|
||||
bocu1_hex: D3E48182838485A1868788898A8B8C8D8E8F909192939495969798999A9B9C9D9E9F60616263646551666768696A6B6C6D6E6F707172737475767778797A7B7C7D7E7F
|
||||
scsu_hex: 12B0B1B2B3B4B5D1B6B7B8B9BABBBCBDBEBFC0C1C2C3C4C5C6C7C8C9CACBCCCDCECF90919293949581969798999A9B9C9D9E9FA0A1A2A3A4A5A6A7A8A9AAABACADAEAF
|
||||
orig2: абвгдеёжзийклмнопрстуфхцчшщъыьэюяАБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ
|
||||
broken1: абвгдеёжзийклмнопрстуфхцчшщъыьэюяАБВГДЕЁЖЗРЙКЛМНОПРСТУФХЦЧШЩЪЫЬРЮЯ
|
||||
broken2: абвгдеÑжзийклмнопÑÑÑÑÑÑ
ÑÑÑÑÑÑÑÑÑÑÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐРСТУФХЦЧШЩЪЫЬÐЮЯ
|
||||
broken3: п╟п╠п╡пЁп╢п╣я▒п╤п╥п╦п╧п╨п╩п╪п╫п╬п©я─я│я┌я┐я└я┘я├я┤я┬я┴я┼я▀я▄я█я▌я▐п░п▒п▓п⌠п■п∙п│п√п≈п≤п≥п п⌡п°п²п·п÷п═п║п╒пёп╓п╔п╕п╖п╗п╘п╙п╚п╛п╜п╝п╞
|
||||
restored1: абвгдеёжзийклмнопрстуфхцчшщъыьэюяАБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ
|
||||
restored2: абвгдеёжзийклмнопрстуфхцчшщъыьэюяАБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ
|
||||
restored3: абвгдеёжзийклмнопрстуфхцчшщъыьэюяАБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ
|
29
dbms/tests/queries/0_stateless/00436_convert_charset.sql
Normal file
29
dbms/tests/queries/0_stateless/00436_convert_charset.sql
Normal file
@ -0,0 +1,29 @@
|
||||
SELECT
|
||||
'абвгдеёжзийклмнопрстуфхцчшщъыьэюяАБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ' AS orig,
|
||||
hex(convertCharset(orig, 'utf-8', 'cp1251') AS cp1251) AS cp1251_hex,
|
||||
hex(convertCharset(orig, 'utf-8', 'utf-7')) AS utf7_hex,
|
||||
hex(convertCharset(orig, 'utf-8', 'bocu-1')) AS bocu1_hex,
|
||||
hex(convertCharset(orig, 'utf-8', 'scsu')) AS scsu_hex,
|
||||
convertCharset(cp1251, 'cp1251', 'utf-8') AS orig2,
|
||||
convertCharset(orig, 'cp1251', 'utf8') AS broken1,
|
||||
convertCharset(orig, 'latin1', 'utf8') AS broken2,
|
||||
convertCharset(orig, 'koi8-r', 'utf8') AS broken3,
|
||||
convertCharset(broken1, 'utf-8', 'cp1251') AS restored1,
|
||||
convertCharset(broken2, 'utf-8', 'latin1') AS restored2,
|
||||
convertCharset(broken3, 'utf-8', 'koi8-r') AS restored3
|
||||
FORMAT Vertical;
|
||||
|
||||
SELECT
|
||||
materialize('абвгдеёжзийклмнопрстуфхцчшщъыьэюяАБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ') AS orig,
|
||||
hex(convertCharset(orig, 'utf-8', 'cp1251') AS cp1251) AS cp1251_hex,
|
||||
hex(convertCharset(orig, 'utf-8', 'utf-7')) AS utf7_hex,
|
||||
hex(convertCharset(orig, 'utf-8', 'bocu-1')) AS bocu1_hex,
|
||||
hex(convertCharset(orig, 'utf-8', 'scsu')) AS scsu_hex,
|
||||
convertCharset(cp1251, 'cp1251', 'utf-8') AS orig2,
|
||||
convertCharset(orig, 'cp1251', 'utf8') AS broken1,
|
||||
convertCharset(orig, 'latin1', 'utf8') AS broken2,
|
||||
convertCharset(orig, 'koi8-r', 'utf8') AS broken3,
|
||||
convertCharset(broken1, 'utf-8', 'cp1251') AS restored1,
|
||||
convertCharset(broken2, 'utf-8', 'latin1') AS restored2,
|
||||
convertCharset(broken3, 'utf-8', 'koi8-r') AS restored3
|
||||
FORMAT Vertical;
|
Loading…
Reference in New Issue
Block a user