mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-11-22 15:42:02 +00:00
Merge pull request #67938 from mwoenker/incomplete-utf8-sequence
Handle incomplete sequences at end of input
This commit is contained in:
commit
a34a544f4a
@ -54,7 +54,7 @@ inline void WriteBufferValidUTF8::putReplacement()
|
||||
}
|
||||
|
||||
|
||||
inline void WriteBufferValidUTF8::putValid(char *data, size_t len)
|
||||
inline void WriteBufferValidUTF8::putValid(const char *data, size_t len)
|
||||
{
|
||||
if (len == 0)
|
||||
return;
|
||||
@ -149,9 +149,34 @@ void WriteBufferValidUTF8::finalizeImpl()
|
||||
/// Write all complete sequences from buffer.
|
||||
nextImpl();
|
||||
|
||||
/// If unfinished sequence at end, then write replacement.
|
||||
/// Handle remaining bytes if we have an incomplete sequence
|
||||
if (working_buffer.begin() != memory.data())
|
||||
putReplacement();
|
||||
{
|
||||
const char * p = memory.data();
|
||||
|
||||
while (p < pos)
|
||||
{
|
||||
UInt8 len = length_of_utf8_sequence[static_cast<const unsigned char>(*p)];
|
||||
if (p + len > pos)
|
||||
{
|
||||
/// Incomplete sequence. Skip one byte.
|
||||
putReplacement();
|
||||
++p;
|
||||
}
|
||||
else if (Poco::UTF8Encoding::isLegal(reinterpret_cast<const unsigned char *>(p), len))
|
||||
{
|
||||
/// Valid sequence
|
||||
putValid(p, len);
|
||||
p += len;
|
||||
}
|
||||
else
|
||||
{
|
||||
/// Invalid sequence, skip first byte.
|
||||
putReplacement();
|
||||
++p;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -26,7 +26,7 @@ public:
|
||||
|
||||
private:
|
||||
void putReplacement();
|
||||
void putValid(char * data, size_t len);
|
||||
void putValid(const char * data, size_t len);
|
||||
|
||||
void nextImpl() override;
|
||||
void finalizeImpl() override;
|
||||
|
@ -0,0 +1,16 @@
|
||||
{
|
||||
"meta":
|
||||
[
|
||||
{
|
||||
"name": "unhex('f0')",
|
||||
"type": "String"
|
||||
}
|
||||
],
|
||||
|
||||
"data":
|
||||
[
|
||||
["<22>"]
|
||||
],
|
||||
|
||||
"rows": 1
|
||||
}
|
@ -0,0 +1,2 @@
|
||||
SET output_format_write_statistics = 0;
|
||||
SELECT unhex('f0') FORMAT JSONCompact;
|
Loading…
Reference in New Issue
Block a user