mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-11-21 23:21:59 +00:00
Text formats: detecting and skipping UTF-8 BOM when possible [#METR-21817].
This commit is contained in:
parent
590a7b5d73
commit
df538babc8
@ -265,6 +265,7 @@ add_library (dbms
|
||||
include/DB/DataStreams/MarkInCompressedFile.h
|
||||
include/DB/DataStreams/CSVRowOutputStream.h
|
||||
include/DB/DataStreams/CSVRowInputStream.h
|
||||
include/DB/DataStreams/verbosePrintString.h
|
||||
include/DB/DataTypes/IDataType.h
|
||||
include/DB/DataTypes/IDataTypeDummy.h
|
||||
include/DB/DataTypes/DataTypeSet.h
|
||||
@ -742,6 +743,7 @@ add_library (dbms
|
||||
src/DataStreams/DistinctBlockInputStream.cpp
|
||||
src/DataStreams/RemoteBlockInputStream.cpp
|
||||
src/DataStreams/BlockIO.cpp
|
||||
src/DataStreams/verbosePrintString.cpp
|
||||
|
||||
src/DataTypes/DataTypeString.cpp
|
||||
src/DataTypes/DataTypeFixedString.cpp
|
||||
|
12
dbms/include/DB/DataStreams/verbosePrintString.h
Normal file
12
dbms/include/DB/DataStreams/verbosePrintString.h
Normal file
@ -0,0 +1,12 @@
|
||||
#pragma once
|
||||
|
||||
#include <DB/IO/WriteBuffer.h>
|
||||
|
||||
namespace DB
|
||||
{
|
||||
|
||||
/** Print the string in double quotes, with control characters shown in "<NAME>" form - used to output diagnostic info to the user.
|
||||
*/
|
||||
void verbosePrintString(BufferBase::Position begin, BufferBase::Position end, WriteBuffer & out);
|
||||
|
||||
}
|
@ -879,4 +879,21 @@ inline T parse(const String & s)
|
||||
return parse<T>(s.data(), s.size());
|
||||
}
|
||||
|
||||
|
||||
/** Skip UTF-8 BOM if it is under cursor.
|
||||
* As BOM is usually located at start of stream, and buffer size is usually larger than three bytes,
|
||||
* the function expects, that all three bytes of BOM is fully in buffer (otherwise it don't skip anything).
|
||||
*/
|
||||
inline void skipBOMIfExists(ReadBuffer & buf)
|
||||
{
|
||||
if (!buf.eof()
|
||||
&& buf.position() + 3 < buf.buffer().end()
|
||||
&& buf.position()[0] == '\xEF'
|
||||
&& buf.position()[1] == '\xBB'
|
||||
&& buf.position()[2] == '\xBF')
|
||||
{
|
||||
buf.position() += 3;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -1,6 +1,7 @@
|
||||
#include <DB/IO/ReadHelpers.h>
|
||||
#include <DB/IO/Operators.h>
|
||||
|
||||
#include <DB/DataStreams/verbosePrintString.h>
|
||||
#include <DB/DataStreams/CSVRowInputStream.h>
|
||||
#include <DB/DataTypes/DataTypesNumberFixed.h>
|
||||
|
||||
@ -96,6 +97,10 @@ static void skipRow(ReadBuffer & istr, const char delimiter, size_t columns)
|
||||
|
||||
void CSVRowInputStream::readPrefix()
|
||||
{
|
||||
/// In this format, we assume, that if first string field contain BOM as value, it will be written in quotes,
|
||||
/// so BOM at beginning of stream cannot be confused with BOM in first string value, and it is safe to skip it.
|
||||
skipBOMIfExists(istr);
|
||||
|
||||
size_t columns = sample.columns();
|
||||
String tmp;
|
||||
|
||||
@ -190,65 +195,6 @@ void CSVRowInputStream::printDiagnosticInfo(Block & block, WriteBuffer & out)
|
||||
}
|
||||
|
||||
|
||||
/** Write the bytes in [begin, end) to `out` in a form readable in diagnostic messages.
  * An empty range is written as <EMPTY> (without quotes).
  * Otherwise the output is wrapped in double quotes; NUL, backspace, form feed, line feed,
  * carriage return, tab, backslash and both kinds of quotes are replaced by a name in angle brackets,
  * other bytes with codes below 32 are written as <0xHH>, and everything else is written unchanged.
  */
static void verbosePrintString(BufferBase::Position begin, BufferBase::Position end, WriteBuffer & out)
{
    if (begin == end)
    {
        out << "<EMPTY>";
        return;
    }

    static const char * hex_digits = "0123456789ABCDEF";

    out << "\"";

    auto cur = begin;
    while (cur < end)
    {
        const auto c = *cur;
        ++cur;

        if (c == '\0')
            out << "<ASCII NUL>";
        else if (c == '\b')
            out << "<BACKSPACE>";
        else if (c == '\f')
            out << "<FORM FEED>";
        else if (c == '\n')
            out << "<LINE FEED>";
        else if (c == '\r')
            out << "<CARRIAGE RETURN>";
        else if (c == '\t')
            out << "<TAB>";
        else if (c == '\\')
            out << "<BACKSLASH>";
        else if (c == '"')
            out << "<DOUBLE QUOTE>";
        else if (c == '\'')
            out << "<SINGLE QUOTE>";
        else if (c >= 0 && c < 32)
        {
            /// Remaining control characters have no name here: print the code as two hex digits.
            out << "<0x" << hex_digits[c / 16] << hex_digits[c % 16] << ">";
        }
        else
            out << c;
    }

    out << "\"";
}
|
||||
|
||||
|
||||
bool CSVRowInputStream::parseRowAndPrintDiagnosticInfo(Block & block,
|
||||
WriteBuffer & out, size_t max_length_of_column_name, size_t max_length_of_data_type_name)
|
||||
{
|
||||
|
@ -15,6 +15,9 @@ namespace ErrorCodes
|
||||
JSONEachRowRowInputStream::JSONEachRowRowInputStream(ReadBuffer & istr_, const Block & sample_)
|
||||
: istr(istr_), sample(sample_), name_map(sample.columns())
|
||||
{
|
||||
/// In this format, BOM at beginning of stream cannot be confused with value, so it is safe to skip it.
|
||||
skipBOMIfExists(istr);
|
||||
|
||||
size_t columns = sample.columns();
|
||||
for (size_t i = 0; i < columns; ++i)
|
||||
name_map[sample.getByPosition(i).name] = i; /// NOTE Можно было бы расположить имена более кэш-локально.
|
||||
|
@ -16,6 +16,10 @@ namespace ErrorCodes
|
||||
TSKVRowInputStream::TSKVRowInputStream(ReadBuffer & istr_, const Block & sample_, bool skip_unknown_)
|
||||
: istr(istr_), sample(sample_), skip_unknown(skip_unknown_), name_map(sample.columns())
|
||||
{
|
||||
/// In this format, we assume that column name cannot contain BOM,
|
||||
/// so BOM at beginning of stream cannot be confused with name of field, and it is safe to skip it.
|
||||
skipBOMIfExists(istr);
|
||||
|
||||
size_t columns = sample.columns();
|
||||
for (size_t i = 0; i < columns; ++i)
|
||||
name_map[sample.getByPosition(i).name] = i; /// NOTE Можно было бы расположить имена более кэш-локально.
|
||||
|
@ -2,6 +2,7 @@
|
||||
#include <DB/IO/Operators.h>
|
||||
|
||||
#include <DB/DataStreams/TabSeparatedRowInputStream.h>
|
||||
#include <DB/DataStreams/verbosePrintString.h>
|
||||
#include <DB/DataTypes/DataTypesNumberFixed.h>
|
||||
|
||||
|
||||
@ -29,6 +30,14 @@ void TabSeparatedRowInputStream::readPrefix()
|
||||
size_t columns = sample.columns();
|
||||
String tmp;
|
||||
|
||||
if (with_names || with_types)
|
||||
{
|
||||
/// In this format, we assume that column name or type cannot contain BOM,
|
||||
/// so, if format has header,
|
||||
/// then BOM at beginning of stream cannot be confused with name or type of field, and it is safe to skip it.
|
||||
skipBOMIfExists(istr);
|
||||
}
|
||||
|
||||
if (with_names)
|
||||
{
|
||||
for (size_t i = 0; i < columns; ++i)
|
||||
@ -155,65 +164,6 @@ void TabSeparatedRowInputStream::printDiagnosticInfo(Block & block, WriteBuffer
|
||||
}
|
||||
|
||||
|
||||
/** Write the bytes in [begin, end) to `out` in a form readable in diagnostic messages.
  * An empty range is written as <EMPTY> (without quotes).
  * Otherwise the output is wrapped in double quotes; NUL, backspace, form feed, line feed,
  * carriage return, tab, backslash and both kinds of quotes are replaced by a name in angle brackets,
  * other bytes with codes below 32 are written as <0xHH>, and everything else is written unchanged.
  */
static void verbosePrintString(BufferBase::Position begin, BufferBase::Position end, WriteBuffer & out)
{
    if (begin == end)
    {
        out << "<EMPTY>";
        return;
    }

    static const char * hex_digits = "0123456789ABCDEF";

    out << "\"";

    auto cur = begin;
    while (cur < end)
    {
        const auto c = *cur;
        ++cur;

        if (c == '\0')
            out << "<ASCII NUL>";
        else if (c == '\b')
            out << "<BACKSPACE>";
        else if (c == '\f')
            out << "<FORM FEED>";
        else if (c == '\n')
            out << "<LINE FEED>";
        else if (c == '\r')
            out << "<CARRIAGE RETURN>";
        else if (c == '\t')
            out << "<TAB>";
        else if (c == '\\')
            out << "<BACKSLASH>";
        else if (c == '"')
            out << "<DOUBLE QUOTE>";
        else if (c == '\'')
            out << "<SINGLE QUOTE>";
        else if (c >= 0 && c < 32)
        {
            /// Remaining control characters have no name here: print the code as two hex digits.
            out << "<0x" << hex_digits[c / 16] << hex_digits[c % 16] << ">";
        }
        else
            out << c;
    }

    out << "\"";
}
|
||||
|
||||
|
||||
bool TabSeparatedRowInputStream::parseRowAndPrintDiagnosticInfo(Block & block,
|
||||
WriteBuffer & out, size_t max_length_of_column_name, size_t max_length_of_data_type_name)
|
||||
{
|
||||
|
@ -26,6 +26,8 @@ namespace ErrorCodes
|
||||
/// Initializes the `istr` and `context` members from the arguments, then skips a leading UTF-8 BOM if one is found.
ValuesRowInputStream::ValuesRowInputStream(ReadBuffer & istr_, const Context & context_)
    : istr(istr_),
    context(context_)
{
    /// A BOM at the beginning of the stream cannot be confused with a value in this format, so skipping it is safe.
    skipBOMIfExists(istr);
}
|
||||
|
||||
|
||||
|
66
dbms/src/DataStreams/verbosePrintString.cpp
Normal file
66
dbms/src/DataStreams/verbosePrintString.cpp
Normal file
@ -0,0 +1,66 @@
|
||||
#include <DB/DataStreams/verbosePrintString.h>
|
||||
#include <DB/IO/Operators.h>
|
||||
|
||||
|
||||
namespace DB
|
||||
{
|
||||
|
||||
/** Write the bytes in [begin, end) to `out` in a form readable in diagnostic messages.
  * An empty range is written as <EMPTY> (without quotes).
  * Otherwise the output is wrapped in double quotes; NUL, backspace, form feed, line feed,
  * carriage return, tab, backslash and both kinds of quotes are replaced by a name in angle brackets,
  * other bytes with codes below 32 are written as <0xHH>, and everything else is written unchanged.
  */
void verbosePrintString(BufferBase::Position begin, BufferBase::Position end, WriteBuffer & out)
{
    if (begin == end)
    {
        out << "<EMPTY>";
        return;
    }

    static const char * hex_digits = "0123456789ABCDEF";

    out << "\"";

    auto cur = begin;
    while (cur < end)
    {
        const auto c = *cur;
        ++cur;

        if (c == '\0')
            out << "<ASCII NUL>";
        else if (c == '\b')
            out << "<BACKSPACE>";
        else if (c == '\f')
            out << "<FORM FEED>";
        else if (c == '\n')
            out << "<LINE FEED>";
        else if (c == '\r')
            out << "<CARRIAGE RETURN>";
        else if (c == '\t')
            out << "<TAB>";
        else if (c == '\\')
            out << "<BACKSLASH>";
        else if (c == '"')
            out << "<DOUBLE QUOTE>";
        else if (c == '\'')
            out << "<SINGLE QUOTE>";
        else if (c >= 0 && c < 32)
        {
            /// Remaining control characters have no name here: print the code as two hex digits.
            out << "<0x" << hex_digits[c / 16] << hex_digits[c % 16] << ">";
        }
        else
            out << c;
    }

    out << "\"";
}
|
||||
|
||||
}
|
Loading…
Reference in New Issue
Block a user