2016-08-19 01:54:23 +00:00
|
|
|
#include <Poco/UTF8Encoding.h>
|
2017-04-01 09:19:00 +00:00
|
|
|
#include <IO/WriteBufferValidUTF8.h>
|
2021-10-02 07:13:14 +00:00
|
|
|
#include <base/types.h>
|
2012-11-14 09:49:51 +00:00
|
|
|
|
2019-01-04 12:10:00 +00:00
|
|
|
#ifdef __SSE2__
|
2017-04-01 07:20:54 +00:00
|
|
|
#include <emmintrin.h>
|
2016-02-14 04:13:12 +00:00
|
|
|
#endif
|
|
|
|
|
2022-06-15 13:19:29 +00:00
|
|
|
#if defined(__aarch64__) && defined(__ARM_NEON)
|
|
|
|
# include <arm_neon.h>
|
|
|
|
# ifdef HAS_RESERVED_IDENTIFIER
|
|
|
|
# pragma clang diagnostic ignored "-Wreserved-identifier"
|
|
|
|
# endif
|
|
|
|
#endif
|
2012-11-14 09:49:51 +00:00
|
|
|
|
|
|
|
namespace DB
|
|
|
|
{
|
2016-02-14 04:07:55 +00:00
|
|
|
|
2022-06-15 13:19:29 +00:00
|
|
|
#if defined(__aarch64__) && defined(__ARM_NEON)
/// Packs a 16-byte vector into 64 bits, 4 bits per input byte:
/// every 16-bit lane is shifted right by 4 and narrowed to 8 bits.
/// For a comparison result (each byte 0x00 or 0xFF) nibble i of the return value
/// is 0xF exactly when byte i was 0xFF.
inline uint64_t getNibbleMask(uint8x16_t res)
{
    const uint16x8_t lanes_u16 = vreinterpretq_u16_u8(res);
    const uint8x8_t narrowed = vshrn_n_u16(lanes_u16, 4);
    return vget_lane_u64(vreinterpret_u64_u8(narrowed), 0);
}

/// True when none of the 16 bytes is >= 0x80, i.e. all of them are 7-bit ASCII.
inline bool onlyASCII(uint8x16_t input)
{
    const uint8x16_t is_non_ascii = vcgeq_u8(input, vdupq_n_u8(0x80));
    return getNibbleMask(is_non_ascii) == 0;
}
#endif
|
|
|
|
|
2016-02-14 04:07:55 +00:00
|
|
|
/// Out-of-class definition of the static constant WriteBufferValidUTF8::DEFAULT_SIZE.
/// NOTE(review): presumably the default internal buffer size in bytes for the constructor's
/// `size` argument - the declaration is not in this file, confirm against the header.
const size_t WriteBufferValidUTF8::DEFAULT_SIZE = 4096;
|
|
|
|
|
2019-05-17 12:55:21 +00:00
|
|
|
/** Index into the table below with the first byte of a UTF-8 sequence to
|
|
|
|
* get the number of trailing bytes that are supposed to follow it.
|
|
|
|
* Note that *legal* UTF-8 values can't have 4 or 5-bytes. The table is
|
|
|
|
* left as-is for anyone who may want to do such conversion, which was
|
|
|
|
* allowed in earlier algorithms.
|
|
|
|
*/
|
|
|
|
extern const UInt8 length_of_utf8_sequence[256] =
|
2016-02-14 04:07:55 +00:00
|
|
|
{
|
2019-05-17 12:55:21 +00:00
|
|
|
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
|
|
|
|
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
|
|
|
|
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
|
|
|
|
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
|
|
|
|
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
|
|
|
|
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
|
|
|
|
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
|
|
|
|
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4,5,5,5,5,6,6,6,6
|
|
|
|
};
|
2012-11-14 09:49:51 +00:00
|
|
|
|
2016-02-14 04:07:55 +00:00
|
|
|
|
|
|
|
/** Creates a validating buffer in front of `output_buffer_`.
  *
  * output_buffer_      - destination for valid data and replacements.
  * group_replacements_ - when true, consecutive invalid bytes produce a single replacement.
  * replacement_        - text written in place of invalid bytes; if empty, nothing is written for them.
  * size                - requested size of the internal buffer; at least 32 bytes are always used.
  */
WriteBufferValidUTF8::WriteBufferValidUTF8(
    WriteBuffer & output_buffer_, bool group_replacements_, const char * replacement_, size_t size)
    : BufferWithOwnMemory<WriteBuffer>(std::max<size_t>(32, size))
    , output_buffer(output_buffer_)
    , group_replacements(group_replacements_)
    , replacement(replacement_)
{
}
|
|
|
|
|
|
|
|
|
|
|
|
inline void WriteBufferValidUTF8::putReplacement()
|
|
|
|
{
|
2017-04-01 07:20:54 +00:00
|
|
|
if (replacement.empty() || (group_replacements && just_put_replacement))
|
|
|
|
return;
|
2016-02-14 04:07:55 +00:00
|
|
|
|
2017-04-01 07:20:54 +00:00
|
|
|
just_put_replacement = true;
|
|
|
|
output_buffer.write(replacement.data(), replacement.size());
|
2016-02-14 04:07:55 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
inline void WriteBufferValidUTF8::putValid(char *data, size_t len)
|
|
|
|
{
|
2017-04-01 07:20:54 +00:00
|
|
|
if (len == 0)
|
|
|
|
return;
|
2016-02-14 04:07:55 +00:00
|
|
|
|
2017-04-01 07:20:54 +00:00
|
|
|
just_put_replacement = false;
|
|
|
|
output_buffer.write(data, len);
|
2016-02-14 04:07:55 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
void WriteBufferValidUTF8::nextImpl()
|
|
|
|
{
|
2017-04-01 07:20:54 +00:00
|
|
|
char * p = memory.data();
|
|
|
|
char * valid_start = p;
|
2016-02-14 04:07:55 +00:00
|
|
|
|
2017-04-01 07:20:54 +00:00
|
|
|
while (p < pos)
|
|
|
|
{
|
2019-01-04 12:10:00 +00:00
|
|
|
#ifdef __SSE2__
|
2022-06-15 13:19:29 +00:00
|
|
|
/// Fast skip of ASCII for x86.
|
2017-04-01 07:20:54 +00:00
|
|
|
static constexpr size_t SIMD_BYTES = 16;
|
|
|
|
const char * simd_end = p + (pos - p) / SIMD_BYTES * SIMD_BYTES;
|
2016-02-14 04:07:55 +00:00
|
|
|
|
2017-04-01 07:20:54 +00:00
|
|
|
while (p < simd_end && !_mm_movemask_epi8(_mm_loadu_si128(reinterpret_cast<const __m128i*>(p))))
|
|
|
|
p += SIMD_BYTES;
|
2016-02-14 04:07:55 +00:00
|
|
|
|
2022-06-15 13:19:29 +00:00
|
|
|
if (!(p < pos))
|
|
|
|
break;
|
|
|
|
#elif defined(__aarch64__) && defined(__ARM_NEON)
|
|
|
|
/// Fast skip of ASCII for aarch64.
|
|
|
|
static constexpr size_t SIMD_BYTES = 16;
|
|
|
|
const char * simd_end = p + (pos - p) / SIMD_BYTES * SIMD_BYTES;
|
|
|
|
|
|
|
|
/// Other options include
|
|
|
|
/// vmaxvq_u8(input) < 0b10000000;
|
|
|
|
/// Used by SIMDJSON, has latency 3 for M1, 6 for everything else
|
|
|
|
/// SIMDJSON uses it for 64 byte masks, so it's a little different.
|
|
|
|
/// vmaxvq_u32(vandq_u32(input, vdupq_n_u32(0x80808080))) // u32 version has latency 3
|
|
|
|
/// shrn version has universally <=3 cycles, on servers 2 cycles.
|
|
|
|
while (p < simd_end && onlyASCII(vld1q_u8(reinterpret_cast<const uint8_t *>(p))))
|
|
|
|
p += SIMD_BYTES;
|
|
|
|
|
2017-04-01 07:20:54 +00:00
|
|
|
if (!(p < pos))
|
|
|
|
break;
|
2016-02-14 04:07:55 +00:00
|
|
|
#endif
|
|
|
|
|
2017-04-01 07:20:54 +00:00
|
|
|
size_t len = length_of_utf8_sequence[static_cast<unsigned char>(*p)];
|
|
|
|
|
|
|
|
if (len > 4)
|
2020-03-18 00:57:00 +00:00
|
|
|
{ // NOLINT
|
2017-04-01 07:20:54 +00:00
|
|
|
/// Invalid start of sequence. Skip one byte.
|
|
|
|
putValid(valid_start, p - valid_start);
|
|
|
|
putReplacement();
|
|
|
|
++p;
|
|
|
|
valid_start = p;
|
|
|
|
}
|
|
|
|
else if (p + len > pos)
|
|
|
|
{
|
|
|
|
/// Sequence was not fully written to this buffer.
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
else if (Poco::UTF8Encoding::isLegal(reinterpret_cast<unsigned char *>(p), len))
|
|
|
|
{
|
|
|
|
/// Valid sequence.
|
|
|
|
p += len;
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
/// Invalid sequence. Skip just first byte.
|
|
|
|
putValid(valid_start, p - valid_start);
|
|
|
|
putReplacement();
|
|
|
|
++p;
|
|
|
|
valid_start = p;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
putValid(valid_start, p - valid_start);
|
|
|
|
|
|
|
|
size_t cnt = pos - p;
|
|
|
|
|
|
|
|
/// Shift unfinished sequence to start of buffer.
|
|
|
|
for (size_t i = 0; i < cnt; ++i)
|
|
|
|
memory[i] = p[i];
|
|
|
|
|
2018-09-02 03:00:04 +00:00
|
|
|
working_buffer = Buffer(&memory[cnt], memory.data() + memory.size());
|
2016-02-14 04:07:55 +00:00
|
|
|
}
|
|
|
|
|
2021-11-10 22:58:56 +00:00
|
|
|
/// Finalizes the buffer when the object is destroyed.
/// NOTE(review): finalize() is declared outside this file; presumably it ends up
/// calling finalizeImpl() - confirm against the base class.
WriteBufferValidUTF8::~WriteBufferValidUTF8()
{
    finalize();
}
|
2016-02-14 04:07:55 +00:00
|
|
|
|
2021-11-10 22:58:56 +00:00
|
|
|
void WriteBufferValidUTF8::finalizeImpl()
|
2016-02-14 04:07:55 +00:00
|
|
|
{
|
2017-04-01 07:20:54 +00:00
|
|
|
/// Write all complete sequences from buffer.
|
|
|
|
nextImpl();
|
2016-02-14 04:07:55 +00:00
|
|
|
|
2017-04-01 07:20:54 +00:00
|
|
|
/// If unfinished sequence at end, then write replacement.
|
|
|
|
if (working_buffer.begin() != memory.data())
|
|
|
|
putReplacement();
|
2016-02-14 04:07:55 +00:00
|
|
|
}
|
|
|
|
|
2012-11-14 09:49:51 +00:00
|
|
|
}
|