mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-11-28 02:21:59 +00:00
Using ICU instead of iconv. Improved performance of 'convertCharset' function [#CLICKHOUSE-2879].
This commit is contained in:
parent
bd70653fc6
commit
93bb49cecd
@ -348,7 +348,7 @@ namespace ErrorCodes
|
|||||||
extern const int METADATA_MISMATCH = 342;
|
extern const int METADATA_MISMATCH = 342;
|
||||||
extern const int SUPPORT_IS_DISABLED = 344;
|
extern const int SUPPORT_IS_DISABLED = 344;
|
||||||
extern const int TABLE_DIFFERS_TOO_MUCH = 345;
|
extern const int TABLE_DIFFERS_TOO_MUCH = 345;
|
||||||
extern const int CANNOT_ICONV = 346;
|
extern const int CANNOT_CONVERT_CHARSET = 346;
|
||||||
extern const int CANNOT_LOAD_CONFIG = 347;
|
extern const int CANNOT_LOAD_CONFIG = 347;
|
||||||
extern const int RESHARDING_NULLABLE_SHARDING_KEY = 348;
|
extern const int RESHARDING_NULLABLE_SHARDING_KEY = 348;
|
||||||
extern const int CANNOT_INSERT_NULL_IN_ORDINARY_COLUMN = 349;
|
extern const int CANNOT_INSERT_NULL_IN_ORDINARY_COLUMN = 349;
|
||||||
@ -362,6 +362,7 @@ namespace ErrorCodes
|
|||||||
extern const int RESERVED_IDENTIFIER_NAME = 357;
|
extern const int RESERVED_IDENTIFIER_NAME = 357;
|
||||||
extern const int INTO_OUTFILE_NOT_ALLOWED = 358;
|
extern const int INTO_OUTFILE_NOT_ALLOWED = 358;
|
||||||
extern const int TABLE_SIZE_EXCEEDS_MAX_DROP_SIZE_LIMIT = 359;
|
extern const int TABLE_SIZE_EXCEEDS_MAX_DROP_SIZE_LIMIT = 359;
|
||||||
|
extern const int CANNOT_CREATE_CHARSET_CONVERTER = 360;
|
||||||
|
|
||||||
extern const int KEEPER_EXCEPTION = 999;
|
extern const int KEEPER_EXCEPTION = 999;
|
||||||
extern const int POCO_EXCEPTION = 1000;
|
extern const int POCO_EXCEPTION = 1000;
|
||||||
|
@ -7,7 +7,7 @@
|
|||||||
#include <DB/Columns/ColumnConst.h>
|
#include <DB/Columns/ColumnConst.h>
|
||||||
#include <ext/range.hpp>
|
#include <ext/range.hpp>
|
||||||
|
|
||||||
#include <iconv.h>
|
#include <unicode/ucnv.h>
|
||||||
|
|
||||||
|
|
||||||
namespace DB
|
namespace DB
|
||||||
@ -17,7 +17,8 @@ namespace ErrorCodes
|
|||||||
{
|
{
|
||||||
extern const int BAD_ARGUMENTS;
|
extern const int BAD_ARGUMENTS;
|
||||||
extern const int LOGICAL_ERROR;
|
extern const int LOGICAL_ERROR;
|
||||||
extern const int CANNOT_ICONV;
|
extern const int CANNOT_CREATE_CHARSET_CONVERTER;
|
||||||
|
extern const int CANNOT_CONVERT_CHARSET;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -33,50 +34,63 @@ namespace ErrorCodes
|
|||||||
class FunctionConvertCharset : public IFunction
|
class FunctionConvertCharset : public IFunction
|
||||||
{
|
{
|
||||||
private:
|
private:
|
||||||
using CharsetsFromTo = std::pair<String, String>;
|
struct Converter : private boost::noncopyable
|
||||||
|
|
||||||
struct IConv
|
|
||||||
{
|
{
|
||||||
iconv_t impl;
|
UConverter * impl;
|
||||||
|
|
||||||
IConv(const CharsetsFromTo & charsets)
|
Converter(const String & charset)
|
||||||
{
|
{
|
||||||
impl = iconv_open(charsets.second.data(), charsets.first.data());
|
UErrorCode status = U_ZERO_ERROR;
|
||||||
if (impl == reinterpret_cast<iconv_t>(-1))
|
impl = ucnv_open(charset.data(), &status);
|
||||||
throwFromErrno("Cannot iconv_open with charsets " + charsets.first + " and " + charsets.second,
|
|
||||||
ErrorCodes::BAD_ARGUMENTS);
|
if (U_SUCCESS(status))
|
||||||
|
ucnv_setToUCallBack(impl,
|
||||||
|
UCNV_TO_U_CALLBACK_SUBSTITUTE,
|
||||||
|
nullptr,
|
||||||
|
nullptr, nullptr,
|
||||||
|
&status);
|
||||||
|
|
||||||
|
if (U_SUCCESS(status))
|
||||||
|
ucnv_setFromUCallBack(impl,
|
||||||
|
UCNV_FROM_U_CALLBACK_SUBSTITUTE,
|
||||||
|
nullptr,
|
||||||
|
nullptr, nullptr,
|
||||||
|
&status);
|
||||||
|
|
||||||
|
if (!U_SUCCESS(status))
|
||||||
|
throw Exception("Cannot create UConverter with charset " + charset + ", error: " + String(u_errorName(status)),
|
||||||
|
ErrorCodes::CANNOT_CREATE_CHARSET_CONVERTER);
|
||||||
}
|
}
|
||||||
|
|
||||||
~IConv()
|
~Converter()
|
||||||
{
|
{
|
||||||
if (-1 == iconv_close(impl))
|
ucnv_close(impl);
|
||||||
std::terminate();
|
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
/// Separate converter is created for each thread.
|
/// Separate converter is created for each thread.
|
||||||
using Pool = ObjectPoolMap<IConv, CharsetsFromTo>;
|
using Pool = ObjectPoolMap<Converter, String>;
|
||||||
|
|
||||||
Pool::Pointer getConverter(const CharsetsFromTo & charsets)
|
Pool::Pointer getConverter(const String & charset)
|
||||||
{
|
{
|
||||||
static Pool pool;
|
static Pool pool;
|
||||||
return pool.get(charsets, [&charsets] { return new IConv(charsets); });
|
return pool.get(charset, [&charset] { return new Converter(charset); });
|
||||||
}
|
}
|
||||||
|
|
||||||
void convert(const String & from_charset, const String & to_charset,
|
void convert(const String & from_charset, const String & to_charset,
|
||||||
const ColumnString::Chars_t & from_chars, const ColumnString::Offsets_t & from_offsets,
|
const ColumnString::Chars_t & from_chars, const ColumnString::Offsets_t & from_offsets,
|
||||||
ColumnString::Chars_t & to_chars, ColumnString::Offsets_t & to_offsets)
|
ColumnString::Chars_t & to_chars, ColumnString::Offsets_t & to_offsets)
|
||||||
{
|
{
|
||||||
auto converter = getConverter(CharsetsFromTo(from_charset, to_charset));
|
auto converter_from = getConverter(from_charset);
|
||||||
iconv_t iconv_state = converter->impl;
|
auto converter_to = getConverter(to_charset);
|
||||||
|
|
||||||
to_chars.resize(from_chars.size());
|
|
||||||
to_offsets.resize(from_offsets.size());
|
|
||||||
|
|
||||||
ColumnString::Offset_t current_from_offset = 0;
|
ColumnString::Offset_t current_from_offset = 0;
|
||||||
ColumnString::Offset_t current_to_offset = 0;
|
ColumnString::Offset_t current_to_offset = 0;
|
||||||
|
|
||||||
size_t size = from_offsets.size();
|
size_t size = from_offsets.size();
|
||||||
|
to_offsets.resize(size);
|
||||||
|
|
||||||
|
PODArray<UChar> uchars;
|
||||||
|
|
||||||
for (size_t i = 0; i < size; ++i)
|
for (size_t i = 0; i < size; ++i)
|
||||||
{
|
{
|
||||||
@ -85,37 +99,40 @@ private:
|
|||||||
/// We assume that empty string is empty in every charset.
|
/// We assume that empty string is empty in every charset.
|
||||||
if (0 != from_string_size)
|
if (0 != from_string_size)
|
||||||
{
|
{
|
||||||
/// reset state of iconv
|
/// reset state of converter
|
||||||
size_t res = iconv(iconv_state, nullptr, nullptr, nullptr, nullptr);
|
ucnv_reset(converter_from->impl);
|
||||||
if (static_cast<size_t>(-1) == res)
|
ucnv_reset(converter_to->impl);
|
||||||
throwFromErrno("Cannot reset iconv", ErrorCodes::CANNOT_ICONV);
|
|
||||||
|
|
||||||
/// perform conversion; resize output buffer and continue if required
|
/// maximum number of code points is number of bytes in input string plus one for terminating zero
|
||||||
|
uchars.resize(from_string_size + 1);
|
||||||
|
|
||||||
char * in_buf = const_cast<char *>(reinterpret_cast<const char *>(&from_chars[current_from_offset]));
|
UErrorCode status = U_ZERO_ERROR;
|
||||||
size_t in_bytes_left = from_string_size;
|
int32_t res = ucnv_toUChars(
|
||||||
|
converter_from->impl,
|
||||||
|
uchars.data(), uchars.size(),
|
||||||
|
reinterpret_cast<const char *>(&from_chars[current_from_offset]), from_string_size,
|
||||||
|
&status);
|
||||||
|
|
||||||
char * out_buf = reinterpret_cast<char *>(&to_chars[current_to_offset]);
|
if (!U_SUCCESS(status))
|
||||||
size_t out_bytes_left = to_chars.size() - current_to_offset;
|
throw Exception("Cannot convert from charset " + from_charset + ", error: " + String(u_errorName(status)),
|
||||||
|
ErrorCodes::CANNOT_CONVERT_CHARSET);
|
||||||
|
|
||||||
while (in_bytes_left)
|
auto max_to_char_size = ucnv_getMaxCharSize(converter_to->impl);
|
||||||
{
|
auto max_to_size = UCNV_GET_MAX_BYTES_FOR_STRING(res, max_to_char_size);
|
||||||
size_t res = iconv(iconv_state, &in_buf, &in_bytes_left, &out_buf, &out_bytes_left);
|
|
||||||
current_to_offset = to_chars.size() - out_bytes_left;
|
|
||||||
|
|
||||||
if (static_cast<size_t>(-1) == res)
|
to_chars.resize(current_to_offset + max_to_size);
|
||||||
{
|
|
||||||
if (E2BIG == errno)
|
|
||||||
{
|
|
||||||
to_chars.resize(to_chars.size() * 2);
|
|
||||||
out_buf = reinterpret_cast<char *>(&to_chars[current_to_offset]);
|
|
||||||
out_bytes_left = to_chars.size() - current_to_offset;
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
throwFromErrno("Cannot convert charset", ErrorCodes::CANNOT_ICONV);
|
res = ucnv_fromUChars(
|
||||||
}
|
converter_to->impl,
|
||||||
}
|
reinterpret_cast<char *>(&to_chars[current_to_offset]), max_to_size,
|
||||||
|
uchars.data(), res,
|
||||||
|
&status);
|
||||||
|
|
||||||
|
if (!U_SUCCESS(status))
|
||||||
|
throw Exception("Cannot convert to charset " + to_charset + ", error: " + String(u_errorName(status)),
|
||||||
|
ErrorCodes::CANNOT_CONVERT_CHARSET);
|
||||||
|
|
||||||
|
current_to_offset += res;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (to_chars.size() < current_to_offset + 1)
|
if (to_chars.size() < current_to_offset + 1)
|
||||||
|
@ -0,0 +1,28 @@
|
|||||||
|
Row 1:
|
||||||
|
──────
|
||||||
|
orig: абвгдеёжзийклмнопрстуфхцчшщъыьэюяАБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ
|
||||||
|
cp1251_hex: E0E1E2E3E4E5B8E6E7E8E9EAEBECEDEEEFF0F1F2F3F4F5F6F7F8F9FAFBFCFDFEFFC0C1C2C3C4C5A8C6C7C8C9CACBCCCDCECFD0D1D2D3D4D5D6D7D8D9DADBDCDDDEDF
|
||||||
|
utf7_hex: 2B424441454D51517942444D454E415131424645454E675133424467454F5151364244734550415139424434455077524142454545516752444245514552515247424563455341524A42456F455377524D4245304554675250424241454551515342424D45464151564241454546675158424267454751516142427345484151644242344548775167424345454967516A424351454A51516D424363454B41517042436F454B775173424330454C6751762D
|
||||||
|
bocu1_hex: D3E48182838485A1868788898A8B8C8D8E8F909192939495969798999A9B9C9D9E9F60616263646551666768696A6B6C6D6E6F707172737475767778797A7B7C7D7E7F
|
||||||
|
scsu_hex: 12B0B1B2B3B4B5D1B6B7B8B9BABBBCBDBEBFC0C1C2C3C4C5C6C7C8C9CACBCCCDCECF90919293949581969798999A9B9C9D9E9FA0A1A2A3A4A5A6A7A8A9AAABACADAEAF
|
||||||
|
orig2: абвгдеёжзийклмнопрстуфхцчшщъыьэюяАБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ
|
||||||
|
broken1: абвгдеёжзийклмнопрстуфхцчшщъыьэюяАБВГДЕЁЖЗРЙКЛМНОПРСТУФХЦЧШЩЪЫЬРЮЯ
|
||||||
|
broken2: абвгдеÑжзийклмнопÑÑÑÑÑÑ
ÑÑÑÑÑÑÑÑÑÑÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐРСТУФХЦЧШЩЪЫЬÐЮЯ
|
||||||
|
broken3: п╟п╠п╡пЁп╢п╣я▒п╤п╥п╦п╧п╨п╩п╪п╫п╬п©я─я│я┌я┐я└я┘я├я┤я┬я┴я┼я▀я▄я█я▌я▐п░п▒п▓п⌠п■п∙п│п√п≈п≤п≥п п⌡п°п²п·п÷п═п║п╒пёп╓п╔п╕п╖п╗п╘п╙п╚п╛п╜п╝п╞
|
||||||
|
restored1: абвгдеёжзийклмнопрстуфхцчшщъыьэюяАБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ
|
||||||
|
restored2: абвгдеёжзийклмнопрстуфхцчшщъыьэюяАБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ
|
||||||
|
restored3: абвгдеёжзийклмнопрстуфхцчшщъыьэюяАБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ
|
||||||
|
Row 1:
|
||||||
|
──────
|
||||||
|
orig: абвгдеёжзийклмнопрстуфхцчшщъыьэюяАБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ
|
||||||
|
cp1251_hex: E0E1E2E3E4E5B8E6E7E8E9EAEBECEDEEEFF0F1F2F3F4F5F6F7F8F9FAFBFCFDFEFFC0C1C2C3C4C5A8C6C7C8C9CACBCCCDCECFD0D1D2D3D4D5D6D7D8D9DADBDCDDDEDF
|
||||||
|
utf7_hex: 2B424441454D51517942444D454E415131424645454E675133424467454F5151364244734550415139424434455077524142454545516752444245514552515247424563455341524A42456F455377524D4245304554675250424241454551515342424D45464151564241454546675158424267454751516142427345484151644242344548775167424345454967516A424351454A51516D424363454B41517042436F454B775173424330454C6751762D
|
||||||
|
bocu1_hex: D3E48182838485A1868788898A8B8C8D8E8F909192939495969798999A9B9C9D9E9F60616263646551666768696A6B6C6D6E6F707172737475767778797A7B7C7D7E7F
|
||||||
|
scsu_hex: 12B0B1B2B3B4B5D1B6B7B8B9BABBBCBDBEBFC0C1C2C3C4C5C6C7C8C9CACBCCCDCECF90919293949581969798999A9B9C9D9E9FA0A1A2A3A4A5A6A7A8A9AAABACADAEAF
|
||||||
|
orig2: абвгдеёжзийклмнопрстуфхцчшщъыьэюяАБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ
|
||||||
|
broken1: абвгдеёжзийклмнопрстуфхцчшщъыьэюяАБВГДЕЁЖЗРЙКЛМНОПРСТУФХЦЧШЩЪЫЬРЮЯ
|
||||||
|
broken2: абвгдеÑжзийклмнопÑÑÑÑÑÑ
ÑÑÑÑÑÑÑÑÑÑÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐÐРСТУФХЦЧШЩЪЫЬÐЮЯ
|
||||||
|
broken3: п╟п╠п╡пЁп╢п╣я▒п╤п╥п╦п╧п╨п╩п╪п╫п╬п©я─я│я┌я┐я└я┘я├я┤я┬я┴я┼я▀я▄я█я▌я▐п░п▒п▓п⌠п■п∙п│п√п≈п≤п≥п п⌡п°п²п·п÷п═п║п╒пёп╓п╔п╕п╖п╗п╘п╙п╚п╛п╜п╝п╞
|
||||||
|
restored1: абвгдеёжзийклмнопрстуфхцчшщъыьэюяАБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ
|
||||||
|
restored2: абвгдеёжзийклмнопрстуфхцчшщъыьэюяАБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ
|
||||||
|
restored3: абвгдеёжзийклмнопрстуфхцчшщъыьэюяАБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ
|
29
dbms/tests/queries/0_stateless/00436_convert_charset.sql
Normal file
29
dbms/tests/queries/0_stateless/00436_convert_charset.sql
Normal file
@ -0,0 +1,29 @@
|
|||||||
|
SELECT
|
||||||
|
'абвгдеёжзийклмнопрстуфхцчшщъыьэюяАБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ' AS orig,
|
||||||
|
hex(convertCharset(orig, 'utf-8', 'cp1251') AS cp1251) AS cp1251_hex,
|
||||||
|
hex(convertCharset(orig, 'utf-8', 'utf-7')) AS utf7_hex,
|
||||||
|
hex(convertCharset(orig, 'utf-8', 'bocu-1')) AS bocu1_hex,
|
||||||
|
hex(convertCharset(orig, 'utf-8', 'scsu')) AS scsu_hex,
|
||||||
|
convertCharset(cp1251, 'cp1251', 'utf-8') AS orig2,
|
||||||
|
convertCharset(orig, 'cp1251', 'utf8') AS broken1,
|
||||||
|
convertCharset(orig, 'latin1', 'utf8') AS broken2,
|
||||||
|
convertCharset(orig, 'koi8-r', 'utf8') AS broken3,
|
||||||
|
convertCharset(broken1, 'utf-8', 'cp1251') AS restored1,
|
||||||
|
convertCharset(broken2, 'utf-8', 'latin1') AS restored2,
|
||||||
|
convertCharset(broken3, 'utf-8', 'koi8-r') AS restored3
|
||||||
|
FORMAT Vertical;
|
||||||
|
|
||||||
|
SELECT
|
||||||
|
materialize('абвгдеёжзийклмнопрстуфхцчшщъыьэюяАБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ') AS orig,
|
||||||
|
hex(convertCharset(orig, 'utf-8', 'cp1251') AS cp1251) AS cp1251_hex,
|
||||||
|
hex(convertCharset(orig, 'utf-8', 'utf-7')) AS utf7_hex,
|
||||||
|
hex(convertCharset(orig, 'utf-8', 'bocu-1')) AS bocu1_hex,
|
||||||
|
hex(convertCharset(orig, 'utf-8', 'scsu')) AS scsu_hex,
|
||||||
|
convertCharset(cp1251, 'cp1251', 'utf-8') AS orig2,
|
||||||
|
convertCharset(orig, 'cp1251', 'utf8') AS broken1,
|
||||||
|
convertCharset(orig, 'latin1', 'utf8') AS broken2,
|
||||||
|
convertCharset(orig, 'koi8-r', 'utf8') AS broken3,
|
||||||
|
convertCharset(broken1, 'utf-8', 'cp1251') AS restored1,
|
||||||
|
convertCharset(broken2, 'utf-8', 'latin1') AS restored2,
|
||||||
|
convertCharset(broken3, 'utf-8', 'koi8-r') AS restored3
|
||||||
|
FORMAT Vertical;
|
Loading…
Reference in New Issue
Block a user