From ebbe877a701bb402a504520c28c844b05854ad61 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sat, 29 Oct 2016 20:13:35 +0300 Subject: [PATCH] Preparation [#METR-23332]. --- dbms/include/DB/IO/WriteBufferValidUTF8.h | 3 -- dbms/src/IO/WriteBufferValidUTF8.cpp | 63 ++++++++++++----------- 2 files changed, 34 insertions(+), 32 deletions(-) diff --git a/dbms/include/DB/IO/WriteBufferValidUTF8.h b/dbms/include/DB/IO/WriteBufferValidUTF8.h index b714e5f32f0..942ad0431b7 100644 --- a/dbms/include/DB/IO/WriteBufferValidUTF8.h +++ b/dbms/include/DB/IO/WriteBufferValidUTF8.h @@ -20,9 +20,6 @@ private: bool just_put_replacement = false; std::string replacement; - /// Таблица взята из ConvertUTF.c от Unicode, Inc. Позволяет узнать длину последовательности по первому байту. - static const char trailingBytesForUTF8[256]; - void putReplacement(); void putValid(char * data, size_t len); diff --git a/dbms/src/IO/WriteBufferValidUTF8.cpp b/dbms/src/IO/WriteBufferValidUTF8.cpp index 277a01d3c49..ed2fdebb500 100644 --- a/dbms/src/IO/WriteBufferValidUTF8.cpp +++ b/dbms/src/IO/WriteBufferValidUTF8.cpp @@ -1,5 +1,6 @@ #include #include +#include #ifdef __x86_64__ #include @@ -11,28 +12,31 @@ namespace DB const size_t WriteBufferValidUTF8::DEFAULT_SIZE = 4096; -/** Index into the table below with the first byte of a UTF-8 sequence to - * get the number of trailing bytes that are supposed to follow it. - * Note that *legal* UTF-8 values can't have 4 or 5-bytes. The table is - * left as-is for anyone who may want to do such conversion, which was - * allowed in earlier algorithms. - */ -const char WriteBufferValidUTF8::trailingBytesForUTF8[256] = +namespace { - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 -}; + /** Index into the table below with the first byte of a UTF-8 sequence to + * get the number of trailing bytes that are supposed to follow it. + * Note that *legal* UTF-8 values can't have 4 or 5-bytes. The table is + * left as-is for anyone who may want to do such conversion, which was + * allowed in earlier algorithms. + */ + const UInt8 length_of_utf8_sequence[256] = + { + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4,5,5,5,5,6,6,6,6 + }; +} WriteBufferValidUTF8::WriteBufferValidUTF8( WriteBuffer & output_buffer, bool group_replacements, const char * replacement, size_t size) - : BufferWithOwnMemory(std::max(4LU, size)), output_buffer(output_buffer), + : BufferWithOwnMemory(std::max(32LU, size)), output_buffer(output_buffer), group_replacements(group_replacements), replacement(replacement) { } @@ -60,13 +64,13 @@ inline void WriteBufferValidUTF8::putValid(char *data, size_t len) void WriteBufferValidUTF8::nextImpl() { - char *p = memory.data(); - char *valid_start = p; + char * p = memory.data(); + char * valid_start = p; while (p < pos) { #ifdef __x86_64__ - /// Быстрый пропуск ASCII + /// Fast skip of ASCII static constexpr size_t SIMD_BYTES = 16; const char * simd_end = p + (pos - p) / SIMD_BYTES * SIMD_BYTES; @@ -77,11 +81,11 @@ void WriteBufferValidUTF8::nextImpl() break; #endif - size_t len = 1 + static_cast(trailingBytesForUTF8[static_cast(*p)]); + size_t len = length_of_utf8_sequence[static_cast(*p)]; if (len > 4) { - /// Невалидное начало последовательности. Пропустим один байт. + /// Invalid start of sequence. Skip one byte. putValid(valid_start, p - valid_start); putReplacement(); ++p; @@ -89,17 +93,17 @@ void WriteBufferValidUTF8::nextImpl() } else if (p + len > pos) { - /// Еще не вся последовательность записана. + /// Sequence was not fully written to this buffer. break; } - else if (Poco::UTF8Encoding::isLegal(reinterpret_cast(p), len)) + else if (Poco::UTF8Encoding::isLegal(reinterpret_cast(p), len)) { - /// Валидная последовательность. + /// Valid sequence. p += len; } else { - /// Невалидная последовательность. Пропустим только первый байт. + /// Invalid sequence. Skip just first byte. putValid(valid_start, p - valid_start); putReplacement(); ++p; @@ -110,7 +114,8 @@ void WriteBufferValidUTF8::nextImpl() putValid(valid_start, p - valid_start); size_t cnt = pos - p; - /// Сдвинем незаконченную последовательность в начало буфера. + + /// Shift unfinished sequence to start of buffer. for (size_t i = 0; i < cnt; ++i) memory[i] = p[i]; @@ -120,10 +125,10 @@ void WriteBufferValidUTF8::nextImpl() void WriteBufferValidUTF8::finish() { - /// Выпишем все полные последовательности из буфера. + /// Write all complete sequences from buffer. nextImpl(); - /// Если осталась незаконченная последовательность, запишем replacement. + /// If unfinished sequence at end, then write replacement. if (working_buffer.begin() != memory.data()) putReplacement(); }