diff --git a/src/IO/WriteBufferValidUTF8.cpp b/src/IO/WriteBufferValidUTF8.cpp
index d611befac37..2a86f8c2801 100644
--- a/src/IO/WriteBufferValidUTF8.cpp
+++ b/src/IO/WriteBufferValidUTF8.cpp
@@ -54,7 +54,7 @@ inline void WriteBufferValidUTF8::putReplacement()
 }
 
 
-inline void WriteBufferValidUTF8::putValid(char *data, size_t len)
+inline void WriteBufferValidUTF8::putValid(const char *data, size_t len)
 {
     if (len == 0)
         return;
@@ -149,9 +149,34 @@ void WriteBufferValidUTF8::finalizeImpl()
     /// Write all complete sequences from buffer.
     nextImpl();
 
-    /// If unfinished sequence at end, then write replacement.
+    /// Handle remaining bytes if we have an incomplete sequence
     if (working_buffer.begin() != memory.data())
-        putReplacement();
+    {
+        const char * p = memory.data();
+
+        while (p < pos)
+        {
+            UInt8 len = length_of_utf8_sequence[static_cast<unsigned char>(*p)];
+            if (p + len > pos)
+            {
+                /// Incomplete sequence. Skip one byte.
+                putReplacement();
+                ++p;
+            }
+            else if (Poco::UTF8Encoding::isLegal(reinterpret_cast<const unsigned char *>(p), len))
+            {
+                /// Valid sequence
+                putValid(p, len);
+                p += len;
+            }
+            else
+            {
+                /// Invalid sequence, skip first byte.
+                putReplacement();
+                ++p;
+            }
+        }
+    }
 }
 
 }
diff --git a/src/IO/WriteBufferValidUTF8.h b/src/IO/WriteBufferValidUTF8.h
index daaf0427f88..a398b8ded01 100644
--- a/src/IO/WriteBufferValidUTF8.h
+++ b/src/IO/WriteBufferValidUTF8.h
@@ -26,7 +26,7 @@ public:
 
 private:
     void putReplacement();
-    void putValid(char * data, size_t len);
+    void putValid(const char * data, size_t len);
 
     void nextImpl() override;
     void finalizeImpl() override;
diff --git a/tests/queries/0_stateless/03221_incomplete_utf8_sequence.reference b/tests/queries/0_stateless/03221_incomplete_utf8_sequence.reference
new file mode 100644
index 00000000000..4577427251d
--- /dev/null
+++ b/tests/queries/0_stateless/03221_incomplete_utf8_sequence.reference
@@ -0,0 +1,16 @@
+{
+	"meta":
+	[
+		{
+			"name": "unhex('f0')",
+			"type": "String"
+		}
+	],
+
+	"data":
+	[
+		["�"]
+	],
+
+	"rows": 1
+}
diff --git a/tests/queries/0_stateless/03221_incomplete_utf8_sequence.sql b/tests/queries/0_stateless/03221_incomplete_utf8_sequence.sql
new file mode 100644
index 00000000000..ee4f25f3b4a
--- /dev/null
+++ b/tests/queries/0_stateless/03221_incomplete_utf8_sequence.sql
@@ -0,0 +1,2 @@
+SET output_format_write_statistics = 0;
+SELECT unhex('f0') FORMAT JSONCompact;