Using ICU instead of iconv. Improved performance of 'convertCharset' function [#CLICKHOUSE-2879].

This commit is contained in:
Alexey Milovidov 2017-03-11 07:36:14 +03:00
parent bd70653fc6
commit 93bb49cecd
4 changed files with 123 additions and 48 deletions

View File

@ -348,7 +348,7 @@ namespace ErrorCodes
extern const int METADATA_MISMATCH = 342;
extern const int SUPPORT_IS_DISABLED = 344;
extern const int TABLE_DIFFERS_TOO_MUCH = 345;
extern const int CANNOT_ICONV = 346;
extern const int CANNOT_CONVERT_CHARSET = 346;
extern const int CANNOT_LOAD_CONFIG = 347;
extern const int RESHARDING_NULLABLE_SHARDING_KEY = 348;
extern const int CANNOT_INSERT_NULL_IN_ORDINARY_COLUMN = 349;
@ -362,6 +362,7 @@ namespace ErrorCodes
extern const int RESERVED_IDENTIFIER_NAME = 357;
extern const int INTO_OUTFILE_NOT_ALLOWED = 358;
extern const int TABLE_SIZE_EXCEEDS_MAX_DROP_SIZE_LIMIT = 359;
extern const int CANNOT_CREATE_CHARSET_CONVERTER = 360;
extern const int KEEPER_EXCEPTION = 999;
extern const int POCO_EXCEPTION = 1000;

View File

@ -7,7 +7,7 @@
#include <DB/Columns/ColumnConst.h>
#include <ext/range.hpp>
#include <iconv.h>
#include <unicode/ucnv.h>
namespace DB
@ -17,7 +17,8 @@ namespace ErrorCodes
{
extern const int BAD_ARGUMENTS;
extern const int LOGICAL_ERROR;
extern const int CANNOT_ICONV;
extern const int CANNOT_CREATE_CHARSET_CONVERTER;
extern const int CANNOT_CONVERT_CHARSET;
}
@ -33,50 +34,63 @@ namespace ErrorCodes
class FunctionConvertCharset : public IFunction
{
private:
using CharsetsFromTo = std::pair<String, String>;
/// Owns one charset-converter handle.
/// This view interleaves the removed iconv-based lines (IConv, iconv_t, iconv_open, iconv_close)
/// with the added ICU-based lines (Converter, UConverter, ucnv_open, ucnv_close).
/// The ICU version is keyed by a single charset name instead of a (from, to) pair.
struct IConv
struct Converter : private boost::noncopyable
{
iconv_t impl;
/// Raw ICU converter handle: opened in the constructor, closed in the destructor.
UConverter * impl;
IConv(const CharsetsFromTo & charsets)
/// Opens an ICU converter for `charset` and installs the SUBSTITUTE callbacks for both directions.
/// Throws Exception with CANNOT_CREATE_CHARSET_CONVERTER if any of the three ICU calls reports failure.
Converter(const String & charset)
{
impl = iconv_open(charsets.second.data(), charsets.first.data());
if (impl == reinterpret_cast<iconv_t>(-1))
throwFromErrno("Cannot iconv_open with charsets " + charsets.first + " and " + charsets.second,
ErrorCodes::BAD_ARGUMENTS);
/// `status` is threaded through all ICU calls; each later step runs only if the previous one succeeded.
UErrorCode status = U_ZERO_ERROR;
impl = ucnv_open(charset.data(), &status);
/// To-Unicode direction: use the SUBSTITUTE callback (previous callback and context are not requested).
if (U_SUCCESS(status))
ucnv_setToUCallBack(impl,
UCNV_TO_U_CALLBACK_SUBSTITUTE,
nullptr,
nullptr, nullptr,
&status);
/// From-Unicode direction: same SUBSTITUTE policy.
if (U_SUCCESS(status))
ucnv_setFromUCallBack(impl,
UCNV_FROM_U_CALLBACK_SUBSTITUTE,
nullptr,
nullptr, nullptr,
&status);
/// NOTE(review): if ucnv_open succeeded but a callback setter failed, throwing here skips ~Converter
/// (the object is not fully constructed), so `impl` looks like it is never closed - confirm and close it before throwing.
if (!U_SUCCESS(status))
throw Exception("Cannot create UConverter with charset " + charset + ", error: " + String(u_errorName(status)),
ErrorCodes::CANNOT_CREATE_CHARSET_CONVERTER);
}
~IConv()
/// Releases the ICU handle. Unlike the removed iconv destructor, no result is checked here.
~Converter()
{
if (-1 == iconv_close(impl))
std::terminate();
ucnv_close(impl);
}
};
/// Separate converter is created for each thread.
using Pool = ObjectPoolMap<IConv, CharsetsFromTo>;
using Pool = ObjectPoolMap<Converter, String>;
/// Returns a pooled converter for the given key from a function-local static pool.
/// The lambda passed to pool.get builds a new object on demand
/// (presumably only when the pool has no free one for that key - ObjectPoolMap is not shown here; verify).
/// This view interleaves the removed variant (keyed by a CharsetsFromTo pair, creating IConv)
/// with the added variant (keyed by one charset name, creating Converter).
Pool::Pointer getConverter(const CharsetsFromTo & charsets)
Pool::Pointer getConverter(const String & charset)
{
/// A single pool instance is shared by every call of this function.
static Pool pool;
return pool.get(charsets, [&charsets] { return new IConv(charsets); });
return pool.get(charset, [&charset] { return new Converter(charset); });
}
void convert(const String & from_charset, const String & to_charset,
const ColumnString::Chars_t & from_chars, const ColumnString::Offsets_t & from_offsets,
ColumnString::Chars_t & to_chars, ColumnString::Offsets_t & to_offsets)
{
auto converter = getConverter(CharsetsFromTo(from_charset, to_charset));
iconv_t iconv_state = converter->impl;
to_chars.resize(from_chars.size());
to_offsets.resize(from_offsets.size());
auto converter_from = getConverter(from_charset);
auto converter_to = getConverter(to_charset);
ColumnString::Offset_t current_from_offset = 0;
ColumnString::Offset_t current_to_offset = 0;
size_t size = from_offsets.size();
to_offsets.resize(size);
PODArray<UChar> uchars;
for (size_t i = 0; i < size; ++i)
{
@ -85,37 +99,40 @@ private:
/// We assume that empty string is empty in every charset.
if (0 != from_string_size)
{
/// reset state of iconv
size_t res = iconv(iconv_state, nullptr, nullptr, nullptr, nullptr);
if (static_cast<size_t>(-1) == res)
throwFromErrno("Cannot reset iconv", ErrorCodes::CANNOT_ICONV);
/// reset state of converter
ucnv_reset(converter_from->impl);
ucnv_reset(converter_to->impl);
/// perform conversion; resize output buffer and continue if required
/// maximum number of code points is number of bytes in input string plus one for terminating zero
uchars.resize(from_string_size + 1);
char * in_buf = const_cast<char *>(reinterpret_cast<const char *>(&from_chars[current_from_offset]));
size_t in_bytes_left = from_string_size;
UErrorCode status = U_ZERO_ERROR;
int32_t res = ucnv_toUChars(
converter_from->impl,
uchars.data(), uchars.size(),
reinterpret_cast<const char *>(&from_chars[current_from_offset]), from_string_size,
&status);
char * out_buf = reinterpret_cast<char *>(&to_chars[current_to_offset]);
size_t out_bytes_left = to_chars.size() - current_to_offset;
if (!U_SUCCESS(status))
throw Exception("Cannot convert from charset " + from_charset + ", error: " + String(u_errorName(status)),
ErrorCodes::CANNOT_CONVERT_CHARSET);
while (in_bytes_left)
{
size_t res = iconv(iconv_state, &in_buf, &in_bytes_left, &out_buf, &out_bytes_left);
current_to_offset = to_chars.size() - out_bytes_left;
auto max_to_char_size = ucnv_getMaxCharSize(converter_to->impl);
auto max_to_size = UCNV_GET_MAX_BYTES_FOR_STRING(res, max_to_char_size);
if (static_cast<size_t>(-1) == res)
{
if (E2BIG == errno)
{
to_chars.resize(to_chars.size() * 2);
out_buf = reinterpret_cast<char *>(&to_chars[current_to_offset]);
out_bytes_left = to_chars.size() - current_to_offset;
continue;
}
to_chars.resize(current_to_offset + max_to_size);
throwFromErrno("Cannot convert charset", ErrorCodes::CANNOT_ICONV);
}
}
res = ucnv_fromUChars(
converter_to->impl,
reinterpret_cast<char *>(&to_chars[current_to_offset]), max_to_size,
uchars.data(), res,
&status);
if (!U_SUCCESS(status))
throw Exception("Cannot convert to charset " + to_charset + ", error: " + String(u_errorName(status)),
ErrorCodes::CANNOT_CONVERT_CHARSET);
current_to_offset += res;
}
if (to_chars.size() < current_to_offset + 1)

View File

@ -0,0 +1,28 @@
Row 1:
──────
orig: абвгдеёжзийклмнопрстуфхцчшщъыьэюяАБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ
cp1251_hex: E0E1E2E3E4E5B8E6E7E8E9EAEBECEDEEEFF0F1F2F3F4F5F6F7F8F9FAFBFCFDFEFFC0C1C2C3C4C5A8C6C7C8C9CACBCCCDCECFD0D1D2D3D4D5D6D7D8D9DADBDCDDDEDF
utf7_hex: 2B424441454D51517942444D454E415131424645454E675133424467454F5151364244734550415139424434455077524142454545516752444245514552515247424563455341524A42456F455377524D4245304554675250424241454551515342424D45464151564241454546675158424267454751516142427345484151644242344548775167424345454967516A424351454A51516D424363454B41517042436F454B775173424330454C6751762D
bocu1_hex: D3E48182838485A1868788898A8B8C8D8E8F909192939495969798999A9B9C9D9E9F60616263646551666768696A6B6C6D6E6F707172737475767778797A7B7C7D7E7F
scsu_hex: 12B0B1B2B3B4B5D1B6B7B8B9BABBBCBDBEBFC0C1C2C3C4C5C6C7C8C9CACBCCCDCECF90919293949581969798999A9B9C9D9E9FA0A1A2A3A4A5A6A7A8A9AAABACADAEAF
orig2: абвгдеёжзийклмнопрстуфхцчшщъыьэюяАБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ
broken1: абвгдеёжзийклмнопрстуфхцчшщъыьэюяАБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ
broken2: абвгдеёжзийклмнопрстуфхцчшщъыьэюяАБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ
broken3: п╟п╠п╡пЁп╢п╣я▒п╤п╥п╦п╧п╨п╩п╪п╫п╬п©я─я│я┌я┐я└я┘я├я┤я┬я┴я┼я▀я▄я█я▌я▐п░п▒п▓п⌠п■п∙п│п√п≈п≤п≥п п⌡п°п²п·п÷п═п║п╒пёп╓п╔п╕п╖п╗п╘п╙п╚п╛п╜п╝п╞
restored1: абвгдеёжзийклмнопрстуфхцчшщъыьэюяАБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ
restored2: абвгдеёжзийклмнопрстуфхцчшщъыьэюяАБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ
restored3: абвгдеёжзийклмнопрстуфхцчшщъыьэюяАБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ
Row 1:
──────
orig: абвгдеёжзийклмнопрстуфхцчшщъыьэюяАБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ
cp1251_hex: E0E1E2E3E4E5B8E6E7E8E9EAEBECEDEEEFF0F1F2F3F4F5F6F7F8F9FAFBFCFDFEFFC0C1C2C3C4C5A8C6C7C8C9CACBCCCDCECFD0D1D2D3D4D5D6D7D8D9DADBDCDDDEDF
utf7_hex: 2B424441454D51517942444D454E415131424645454E675133424467454F5151364244734550415139424434455077524142454545516752444245514552515247424563455341524A42456F455377524D4245304554675250424241454551515342424D45464151564241454546675158424267454751516142427345484151644242344548775167424345454967516A424351454A51516D424363454B41517042436F454B775173424330454C6751762D
bocu1_hex: D3E48182838485A1868788898A8B8C8D8E8F909192939495969798999A9B9C9D9E9F60616263646551666768696A6B6C6D6E6F707172737475767778797A7B7C7D7E7F
scsu_hex: 12B0B1B2B3B4B5D1B6B7B8B9BABBBCBDBEBFC0C1C2C3C4C5C6C7C8C9CACBCCCDCECF90919293949581969798999A9B9C9D9E9FA0A1A2A3A4A5A6A7A8A9AAABACADAEAF
orig2: абвгдеёжзийклмнопрстуфхцчшщъыьэюяАБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ
broken1: абвгдеёжзийклмнопрстуфхцчшщъыьэюяАБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ
broken2: абвгдеёжзийклмнопрстуфхцчшщъыьэюяАБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ
broken3: п╟п╠п╡пЁп╢п╣я▒п╤п╥п╦п╧п╨п╩п╪п╫п╬п©я─я│я┌я┐я└я┘я├я┤я┬я┴я┼я▀я▄я█я▌я▐п░п▒п▓п⌠п■п∙п│п√п≈п≤п≥п п⌡п°п²п·п÷п═п║п╒пёп╓п╔п╕п╖п╗п╘п╙п╚п╛п╜п╝п╞
restored1: абвгдеёжзийклмнопрстуфхцчшщъыьэюяАБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ
restored2: абвгдеёжзийклмнопрстуфхцчшщъыьэюяАБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ
restored3: абвгдеёжзийклмнопрстуфхцчшщъыьэюяАБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ

View File

@ -0,0 +1,29 @@
-- Exercises convertCharset on a string literal holding the lowercase and uppercase Russian alphabet.
--   cp1251_hex, utf7_hex, bocu1_hex, scsu_hex: hex dump of the literal converted from utf-8 to each
--     target charset; the inner alias `cp1251` keeps the converted bytes for reuse in orig2.
--   orig2: converts the cp1251 bytes back to utf-8.
--   broken1..3: pass the same input but declare its source charset as cp1251 / latin1 / koi8-r
--     instead of utf-8, converting to utf8.
--   restored1..3: apply the opposite conversion to each broken value.
-- FORMAT Vertical prints one "name: value" line per column.
SELECT
'абвгдеёжзийклмнопрстуфхцчшщъыьэюяАБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ' AS orig,
hex(convertCharset(orig, 'utf-8', 'cp1251') AS cp1251) AS cp1251_hex,
hex(convertCharset(orig, 'utf-8', 'utf-7')) AS utf7_hex,
hex(convertCharset(orig, 'utf-8', 'bocu-1')) AS bocu1_hex,
hex(convertCharset(orig, 'utf-8', 'scsu')) AS scsu_hex,
convertCharset(cp1251, 'cp1251', 'utf-8') AS orig2,
convertCharset(orig, 'cp1251', 'utf8') AS broken1,
convertCharset(orig, 'latin1', 'utf8') AS broken2,
convertCharset(orig, 'koi8-r', 'utf8') AS broken3,
convertCharset(broken1, 'utf-8', 'cp1251') AS restored1,
convertCharset(broken2, 'utf-8', 'latin1') AS restored2,
convertCharset(broken3, 'utf-8', 'koi8-r') AS restored3
FORMAT Vertical;
-- Exercises convertCharset on the Russian-alphabet string wrapped in materialize(...)
-- (presumably so the function receives a full column rather than a constant - confirm).
--   cp1251_hex, utf7_hex, bocu1_hex, scsu_hex: hex dump of the input converted from utf-8 to each
--     target charset; the inner alias `cp1251` keeps the converted bytes for reuse in orig2.
--   orig2: converts the cp1251 bytes back to utf-8.
--   broken1..3: pass the same input but declare its source charset as cp1251 / latin1 / koi8-r
--     instead of utf-8, converting to utf8.
--   restored1..3: apply the opposite conversion to each broken value.
-- FORMAT Vertical prints one "name: value" line per column.
SELECT
materialize('абвгдеёжзийклмнопрстуфхцчшщъыьэюяАБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ') AS orig,
hex(convertCharset(orig, 'utf-8', 'cp1251') AS cp1251) AS cp1251_hex,
hex(convertCharset(orig, 'utf-8', 'utf-7')) AS utf7_hex,
hex(convertCharset(orig, 'utf-8', 'bocu-1')) AS bocu1_hex,
hex(convertCharset(orig, 'utf-8', 'scsu')) AS scsu_hex,
convertCharset(cp1251, 'cp1251', 'utf-8') AS orig2,
convertCharset(orig, 'cp1251', 'utf8') AS broken1,
convertCharset(orig, 'latin1', 'utf8') AS broken2,
convertCharset(orig, 'koi8-r', 'utf8') AS broken3,
convertCharset(broken1, 'utf-8', 'cp1251') AS restored1,
convertCharset(broken2, 'utf-8', 'latin1') AS restored2,
convertCharset(broken3, 'utf-8', 'koi8-r') AS restored3
FORMAT Vertical;