Merge pull request #67938 from mwoenker/incomplete-utf8-sequence

Handle incomplete sequences at end of input
This commit is contained in:
Pablo Marcos 2024-09-10 14:04:58 +00:00 committed by GitHub
commit a34a544f4a
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 47 additions and 4 deletions

View File

@ -54,7 +54,7 @@ inline void WriteBufferValidUTF8::putReplacement()
} }
inline void WriteBufferValidUTF8::putValid(char *data, size_t len) inline void WriteBufferValidUTF8::putValid(const char *data, size_t len)
{ {
if (len == 0) if (len == 0)
return; return;
@ -149,9 +149,34 @@ void WriteBufferValidUTF8::finalizeImpl()
/// Write all complete sequences from buffer. /// Write all complete sequences from buffer.
nextImpl(); nextImpl();
/// If unfinished sequence at end, then write replacement. /// Handle remaining bytes if we have an incomplete sequence
if (working_buffer.begin() != memory.data()) if (working_buffer.begin() != memory.data())
putReplacement(); {
const char * p = memory.data();
while (p < pos)
{
UInt8 len = length_of_utf8_sequence[static_cast<const unsigned char>(*p)];
if (p + len > pos)
{
/// Incomplete sequence. Skip one byte.
putReplacement();
++p;
}
else if (Poco::UTF8Encoding::isLegal(reinterpret_cast<const unsigned char *>(p), len))
{
/// Valid sequence
putValid(p, len);
p += len;
}
else
{
/// Invalid sequence, skip first byte.
putReplacement();
++p;
}
}
}
} }
} }

View File

@ -26,7 +26,7 @@ public:
private: private:
void putReplacement(); void putReplacement();
void putValid(char * data, size_t len); void putValid(const char * data, size_t len);
void nextImpl() override; void nextImpl() override;
void finalizeImpl() override; void finalizeImpl() override;

View File

@ -0,0 +1,16 @@
{
"meta":
[
{
"name": "unhex('f0')",
"type": "String"
}
],
"data":
[
["<22>"]
],
"rows": 1
}

View File

@ -0,0 +1,2 @@
-- Regression test for an incomplete UTF-8 sequence at the end of the input:
-- 0xF0 is the lead byte of a 4-byte UTF-8 sequence, and here it is emitted alone,
-- with none of its continuation bytes.
-- Statistics are switched off, presumably so the JSONCompact output stays deterministic.
SET output_format_write_statistics = 0;
SELECT unhex('f0') FORMAT JSONCompact;