Preparation [#METR-23332].

This commit is contained in:
Alexey Milovidov 2016-10-29 20:13:35 +03:00
parent f9eb536825
commit ebbe877a70
2 changed files with 34 additions and 32 deletions

View File

@ -20,9 +20,6 @@ private:
bool just_put_replacement = false; bool just_put_replacement = false;
std::string replacement; std::string replacement;
/// Таблица взята из ConvertUTF.c от Unicode, Inc. Позволяет узнать длину последовательности по первому байту.
static const char trailingBytesForUTF8[256];
void putReplacement(); void putReplacement();
void putValid(char * data, size_t len); void putValid(char * data, size_t len);

View File

@ -1,5 +1,6 @@
#include <Poco/UTF8Encoding.h> #include <Poco/UTF8Encoding.h>
#include <DB/IO/WriteBufferValidUTF8.h> #include <DB/IO/WriteBufferValidUTF8.h>
#include <DB/Core/Types.h>
#ifdef __x86_64__ #ifdef __x86_64__
#include <emmintrin.h> #include <emmintrin.h>
@ -11,28 +12,31 @@ namespace DB
const size_t WriteBufferValidUTF8::DEFAULT_SIZE = 4096; const size_t WriteBufferValidUTF8::DEFAULT_SIZE = 4096;
/** Index into the table below with the first byte of a UTF-8 sequence to namespace
{
/** Index into the table below with the first byte of a UTF-8 sequence to
* get the number of trailing bytes that are supposed to follow it. * get the number of trailing bytes that are supposed to follow it.
* Note that *legal* UTF-8 values can't have 4 or 5-bytes. The table is * Note that *legal* UTF-8 values can't have 4 or 5-bytes. The table is
* left as-is for anyone who may want to do such conversion, which was * left as-is for anyone who may want to do such conversion, which was
* allowed in earlier algorithms. * allowed in earlier algorithms.
*/ */
const char WriteBufferValidUTF8::trailingBytesForUTF8[256] = const UInt8 length_of_utf8_sequence[256] =
{ {
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
}; 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4,5,5,5,5,6,6,6,6
};
}
WriteBufferValidUTF8::WriteBufferValidUTF8( WriteBufferValidUTF8::WriteBufferValidUTF8(
WriteBuffer & output_buffer, bool group_replacements, const char * replacement, size_t size) WriteBuffer & output_buffer, bool group_replacements, const char * replacement, size_t size)
: BufferWithOwnMemory<WriteBuffer>(std::max(4LU, size)), output_buffer(output_buffer), : BufferWithOwnMemory<WriteBuffer>(std::max(32LU, size)), output_buffer(output_buffer),
group_replacements(group_replacements), replacement(replacement) group_replacements(group_replacements), replacement(replacement)
{ {
} }
@ -60,13 +64,13 @@ inline void WriteBufferValidUTF8::putValid(char *data, size_t len)
void WriteBufferValidUTF8::nextImpl() void WriteBufferValidUTF8::nextImpl()
{ {
char *p = memory.data(); char * p = memory.data();
char *valid_start = p; char * valid_start = p;
while (p < pos) while (p < pos)
{ {
#ifdef __x86_64__ #ifdef __x86_64__
/// Быстрый пропуск ASCII /// Fast skip of ASCII
static constexpr size_t SIMD_BYTES = 16; static constexpr size_t SIMD_BYTES = 16;
const char * simd_end = p + (pos - p) / SIMD_BYTES * SIMD_BYTES; const char * simd_end = p + (pos - p) / SIMD_BYTES * SIMD_BYTES;
@ -77,11 +81,11 @@ void WriteBufferValidUTF8::nextImpl()
break; break;
#endif #endif
size_t len = 1 + static_cast<size_t>(trailingBytesForUTF8[static_cast<unsigned char>(*p)]); size_t len = length_of_utf8_sequence[static_cast<unsigned char>(*p)];
if (len > 4) if (len > 4)
{ {
/// Невалидное начало последовательности. Пропустим один байт. /// Invalid start of sequence. Skip one byte.
putValid(valid_start, p - valid_start); putValid(valid_start, p - valid_start);
putReplacement(); putReplacement();
++p; ++p;
@ -89,17 +93,17 @@ void WriteBufferValidUTF8::nextImpl()
} }
else if (p + len > pos) else if (p + len > pos)
{ {
/// Еще не вся последовательность записана. /// Sequence was not fully written to this buffer.
break; break;
} }
else if (Poco::UTF8Encoding::isLegal(reinterpret_cast<unsigned char*>(p), len)) else if (Poco::UTF8Encoding::isLegal(reinterpret_cast<unsigned char *>(p), len))
{ {
/// Валидная последовательность. /// Valid sequence.
p += len; p += len;
} }
else else
{ {
/// Невалидная последовательность. Пропустим только первый байт. /// Invalid sequence. Skip just first byte.
putValid(valid_start, p - valid_start); putValid(valid_start, p - valid_start);
putReplacement(); putReplacement();
++p; ++p;
@ -110,7 +114,8 @@ void WriteBufferValidUTF8::nextImpl()
putValid(valid_start, p - valid_start); putValid(valid_start, p - valid_start);
size_t cnt = pos - p; size_t cnt = pos - p;
/// Сдвинем незаконченную последовательность в начало буфера.
/// Shift unfinished sequence to start of buffer.
for (size_t i = 0; i < cnt; ++i) for (size_t i = 0; i < cnt; ++i)
memory[i] = p[i]; memory[i] = p[i];
@ -120,10 +125,10 @@ void WriteBufferValidUTF8::nextImpl()
void WriteBufferValidUTF8::finish() void WriteBufferValidUTF8::finish()
{ {
/// Выпишем все полные последовательности из буфера. /// Write all complete sequences from buffer.
nextImpl(); nextImpl();
/// Если осталась незаконченная последовательность, запишем replacement. /// If unfinished sequence at end, then write replacement.
if (working_buffer.begin() != memory.data()) if (working_buffer.begin() != memory.data())
putReplacement(); putReplacement();
} }