2016-08-19 01:54:23 +00:00
|
|
|
#include <Poco/UTF8Encoding.h>
|
2017-04-01 09:19:00 +00:00
|
|
|
#include <IO/WriteBufferValidUTF8.h>
|
Do not catch exceptions during final flush in writers' destructors
Catching them hides real problems: the destructor does the final flush, and if
it fails, the data will be lost.
One such example is the MEMORY_LIMIT_EXCEEDED exception, so lock
exceptions from destructors by using
MemoryTracker::LockExceptionInThread to block these exceptions, and allow
others (so std::terminate will be called, since this is C++11 with
noexcept for destructors by default).
Here is an example that leads to an empty block in the distributed batch:
2021.01.21 12:43:18.619739 [ 46468 ] {7bd60d75-ebcb-45d2-874d-260df9a4ddac} <Error> virtual DB::CompressedWriteBuffer::~CompressedWriteBuffer(): Code: 241, e.displayText() = DB::Exception: Memory limit (for user) exceeded: would use 332.07 GiB (attempt to allocate chunk of 4355342 bytes), maximum: 256.00 GiB, Stack trace (when copying this message, always include the lines below):
0. DB::Exception::Exception<>() @ 0x86f7b88 in /usr/bin/clickhouse
...
4. void DB::PODArrayBase<>::resize<>(unsigned long) @ 0xe9e878d in /usr/bin/clickhouse
5. DB::CompressedWriteBuffer::nextImpl() @ 0xe9f0296 in /usr/bin/clickhouse
6. DB::CompressedWriteBuffer::~CompressedWriteBuffer() @ 0xe9f0415 in /usr/bin/clickhouse
7. DB::DistributedBlockOutputStream::writeToShard() @ 0xf6bed4a in /usr/bin/clickhouse
2021-01-22 18:56:50 +00:00
|
|
|
#include <Common/MemoryTracker.h>
|
2021-10-02 07:13:14 +00:00
|
|
|
#include <base/types.h>
|
2012-11-14 09:49:51 +00:00
|
|
|
|
2019-01-04 12:10:00 +00:00
|
|
|
#ifdef __SSE2__
|
2017-04-01 07:20:54 +00:00
|
|
|
#include <emmintrin.h>
|
2016-02-14 04:13:12 +00:00
|
|
|
#endif
|
|
|
|
|
2012-11-14 09:49:51 +00:00
|
|
|
|
|
|
|
namespace DB
|
|
|
|
{
|
2016-02-14 04:07:55 +00:00
|
|
|
|
|
|
|
/// Out-of-class definition of the static constant (4096).
/// NOTE(review): presumably the default for the constructor's `size` argument - confirm in the header.
const size_t WriteBufferValidUTF8::DEFAULT_SIZE{4096};
|
|
|
|
|
2019-05-17 12:55:21 +00:00
|
|
|
/** Index into the table below with the first byte of a UTF-8 sequence to
|
|
|
|
* get the number of trailing bytes that are supposed to follow it.
|
|
|
|
* Note that *legal* UTF-8 values can't have 4 or 5-bytes. The table is
|
|
|
|
* left as-is for anyone who may want to do such conversion, which was
|
|
|
|
* allowed in earlier algorithms.
|
|
|
|
*/
|
|
|
|
extern const UInt8 length_of_utf8_sequence[256] =
|
2016-02-14 04:07:55 +00:00
|
|
|
{
|
2019-05-17 12:55:21 +00:00
|
|
|
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
|
|
|
|
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
|
|
|
|
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
|
|
|
|
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
|
|
|
|
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
|
|
|
|
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
|
|
|
|
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
|
|
|
|
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4,5,5,5,5,6,6,6,6
|
|
|
|
};
|
2012-11-14 09:49:51 +00:00
|
|
|
|
2016-02-14 04:07:55 +00:00
|
|
|
|
|
|
|
/** Creates a buffer that writes into `output_buffer_`.
  *
  * output_buffer_      - destination buffer; kept by reference.
  * group_replacements_ - stored as-is; consulted by putReplacement().
  * replacement_        - text emitted by putReplacement().
  * size                - requested size of the internal memory; at least 32 bytes are always allocated.
  */
WriteBufferValidUTF8::WriteBufferValidUTF8(
    WriteBuffer & output_buffer_, bool group_replacements_, const char * replacement_, size_t size)
    : BufferWithOwnMemory<WriteBuffer>(std::max(size, static_cast<size_t>(32)))
    , output_buffer(output_buffer_)
    , group_replacements(group_replacements_)
    , replacement(replacement_)
{
}
|
|
|
|
|
|
|
|
|
|
|
|
inline void WriteBufferValidUTF8::putReplacement()
|
|
|
|
{
|
2017-04-01 07:20:54 +00:00
|
|
|
if (replacement.empty() || (group_replacements && just_put_replacement))
|
|
|
|
return;
|
2016-02-14 04:07:55 +00:00
|
|
|
|
2017-04-01 07:20:54 +00:00
|
|
|
just_put_replacement = true;
|
|
|
|
output_buffer.write(replacement.data(), replacement.size());
|
2016-02-14 04:07:55 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
inline void WriteBufferValidUTF8::putValid(char *data, size_t len)
|
|
|
|
{
|
2017-04-01 07:20:54 +00:00
|
|
|
if (len == 0)
|
|
|
|
return;
|
2016-02-14 04:07:55 +00:00
|
|
|
|
2017-04-01 07:20:54 +00:00
|
|
|
just_put_replacement = false;
|
|
|
|
output_buffer.write(data, len);
|
2016-02-14 04:07:55 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
void WriteBufferValidUTF8::nextImpl()
|
|
|
|
{
|
2017-04-01 07:20:54 +00:00
|
|
|
char * p = memory.data();
|
|
|
|
char * valid_start = p;
|
2016-02-14 04:07:55 +00:00
|
|
|
|
2017-04-01 07:20:54 +00:00
|
|
|
while (p < pos)
|
|
|
|
{
|
2019-01-04 12:10:00 +00:00
|
|
|
#ifdef __SSE2__
|
2017-04-01 07:20:54 +00:00
|
|
|
/// Fast skip of ASCII
|
|
|
|
static constexpr size_t SIMD_BYTES = 16;
|
|
|
|
const char * simd_end = p + (pos - p) / SIMD_BYTES * SIMD_BYTES;
|
2016-02-14 04:07:55 +00:00
|
|
|
|
2017-04-01 07:20:54 +00:00
|
|
|
while (p < simd_end && !_mm_movemask_epi8(_mm_loadu_si128(reinterpret_cast<const __m128i*>(p))))
|
|
|
|
p += SIMD_BYTES;
|
2016-02-14 04:07:55 +00:00
|
|
|
|
2017-04-01 07:20:54 +00:00
|
|
|
if (!(p < pos))
|
|
|
|
break;
|
2016-02-14 04:07:55 +00:00
|
|
|
#endif
|
|
|
|
|
2017-04-01 07:20:54 +00:00
|
|
|
size_t len = length_of_utf8_sequence[static_cast<unsigned char>(*p)];
|
|
|
|
|
|
|
|
if (len > 4)
|
2020-03-18 00:57:00 +00:00
|
|
|
{ // NOLINT
|
2017-04-01 07:20:54 +00:00
|
|
|
/// Invalid start of sequence. Skip one byte.
|
|
|
|
putValid(valid_start, p - valid_start);
|
|
|
|
putReplacement();
|
|
|
|
++p;
|
|
|
|
valid_start = p;
|
|
|
|
}
|
|
|
|
else if (p + len > pos)
|
|
|
|
{
|
|
|
|
/// Sequence was not fully written to this buffer.
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
else if (Poco::UTF8Encoding::isLegal(reinterpret_cast<unsigned char *>(p), len))
|
|
|
|
{
|
|
|
|
/// Valid sequence.
|
|
|
|
p += len;
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
/// Invalid sequence. Skip just first byte.
|
|
|
|
putValid(valid_start, p - valid_start);
|
|
|
|
putReplacement();
|
|
|
|
++p;
|
|
|
|
valid_start = p;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
putValid(valid_start, p - valid_start);
|
|
|
|
|
|
|
|
size_t cnt = pos - p;
|
|
|
|
|
|
|
|
/// Shift unfinished sequence to start of buffer.
|
|
|
|
for (size_t i = 0; i < cnt; ++i)
|
|
|
|
memory[i] = p[i];
|
|
|
|
|
2018-09-02 03:00:04 +00:00
|
|
|
working_buffer = Buffer(&memory[cnt], memory.data() + memory.size());
|
2019-06-05 11:30:29 +00:00
|
|
|
|
|
|
|
/// Propagate next() to the output buffer
|
|
|
|
output_buffer.next();
|
2016-02-14 04:07:55 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
void WriteBufferValidUTF8::finish()
|
|
|
|
{
|
2017-04-01 07:20:54 +00:00
|
|
|
/// Write all complete sequences from buffer.
|
|
|
|
nextImpl();
|
2016-02-14 04:07:55 +00:00
|
|
|
|
2017-04-01 07:20:54 +00:00
|
|
|
/// If unfinished sequence at end, then write replacement.
|
|
|
|
if (working_buffer.begin() != memory.data())
|
|
|
|
putReplacement();
|
2016-02-14 04:07:55 +00:00
|
|
|
}
|
|
|
|
|
2019-12-24 17:12:08 +00:00
|
|
|
|
|
|
|
/// Performs the final flush on destruction.
WriteBufferValidUTF8::~WriteBufferValidUTF8()
{
    /// FIXME move final flush into the caller

    /// Per the change that introduced this lock: memory-limit exceptions are blocked for the
    /// duration of the final flush so that data is not lost; other exceptions are not caught
    /// here and end in std::terminate, since destructors are noexcept by default.
    MemoryTracker::LockExceptionInThread memory_exception_lock(VariableContext::Global);

    finish();
}
|
|
|
|
|
2012-11-14 09:49:51 +00:00
|
|
|
}
|