Text formats: detecting and skipping UTF-8 BOM when possible [#METR-21817].

This commit is contained in:
Alexey Milovidov 2016-06-23 22:39:20 +03:00
parent 590a7b5d73
commit df538babc8
9 changed files with 120 additions and 118 deletions

View File

@ -265,6 +265,7 @@ add_library (dbms
include/DB/DataStreams/MarkInCompressedFile.h
include/DB/DataStreams/CSVRowOutputStream.h
include/DB/DataStreams/CSVRowInputStream.h
include/DB/DataStreams/verbosePrintString.h
include/DB/DataTypes/IDataType.h
include/DB/DataTypes/IDataTypeDummy.h
include/DB/DataTypes/DataTypeSet.h
@ -742,6 +743,7 @@ add_library (dbms
src/DataStreams/DistinctBlockInputStream.cpp
src/DataStreams/RemoteBlockInputStream.cpp
src/DataStreams/BlockIO.cpp
src/DataStreams/verbosePrintString.cpp
src/DataTypes/DataTypeString.cpp
src/DataTypes/DataTypeFixedString.cpp

View File

@ -0,0 +1,12 @@
#pragma once
#include <DB/IO/WriteBuffer.h>
namespace DB
{
/** Print the byte range [begin, end) to `out` in a form suitable for diagnostic messages shown to the user.
  * The text is wrapped in double quotes; NUL, backspace, form feed, line feed, carriage return, tab,
  * backslash and both kinds of quote are replaced by their names in "<NAME>" form,
  * and the remaining characters with codes below 32 are printed as "<0xHH>".
  * An empty range is printed as "<EMPTY>" (without quotes).
  */
void verbosePrintString(BufferBase::Position begin, BufferBase::Position end, WriteBuffer & out);
}

View File

@ -879,4 +879,21 @@ inline T parse(const String & s)
return parse<T>(s.data(), s.size());
}
/** Skip the UTF-8 BOM (bytes EF BB BF) if it is under the cursor.
  * As the BOM is usually located at the start of the stream, and the buffer is usually larger than three bytes,
  * the function expects all three bytes of the BOM to be present in the buffer at once;
  * if fewer than three bytes are available in the buffer, nothing is skipped.
  */
inline void skipBOMIfExists(ReadBuffer & buf)
{
    /// Three available bytes are sufficient, so the comparison is ">= 3":
    /// a buffer that holds exactly the BOM and nothing after it must also be handled.
    /// The remaining size is computed by subtraction to avoid forming a pointer beyond the end of the buffer.
    if (!buf.eof()
        && buf.buffer().end() - buf.position() >= 3
        && buf.position()[0] == '\xEF'
        && buf.position()[1] == '\xBB'
        && buf.position()[2] == '\xBF')
    {
        buf.position() += 3;
    }
}
}

View File

@ -1,6 +1,7 @@
#include <DB/IO/ReadHelpers.h>
#include <DB/IO/Operators.h>
#include <DB/DataStreams/verbosePrintString.h>
#include <DB/DataStreams/CSVRowInputStream.h>
#include <DB/DataTypes/DataTypesNumberFixed.h>
@ -96,6 +97,10 @@ static void skipRow(ReadBuffer & istr, const char delimiter, size_t columns)
void CSVRowInputStream::readPrefix()
{
/// In this format, we assume that if the first string field contains a BOM as its value, it will be written in quotes,
/// so a BOM at the beginning of the stream cannot be confused with a BOM in the first string value, and it is safe to skip it.
skipBOMIfExists(istr);
size_t columns = sample.columns();
String tmp;
@ -190,65 +195,6 @@ void CSVRowInputStream::printDiagnosticInfo(Block & block, WriteBuffer & out)
}
/** Writes the range [begin, end) to `out` for diagnostic purposes:
  * the text is enclosed in double quotes, characters that are hard to see or ambiguous
  * are replaced by "<NAME>" markers, other characters with codes below 32 by "<0xHH>".
  * An empty range is written as "<EMPTY>".
  */
static void verbosePrintString(BufferBase::Position begin, BufferBase::Position end, WriteBuffer & out)
{
    /// An empty range gets a dedicated marker instead of a pair of quotes.
    if (begin == end)
    {
        out << "<EMPTY>";
        return;
    }

    out << "\"";

    auto cursor = begin;
    while (cursor < end)
    {
        const auto symbol = *cursor;
        ++cursor;

        /// Characters that have a well-known name.
        if (symbol == '\0')
            out << "<ASCII NUL>";
        else if (symbol == '\b')
            out << "<BACKSPACE>";
        else if (symbol == '\f')
            out << "<FORM FEED>";
        else if (symbol == '\n')
            out << "<LINE FEED>";
        else if (symbol == '\r')
            out << "<CARRIAGE RETURN>";
        else if (symbol == '\t')
            out << "<TAB>";
        else if (symbol == '\\')
            out << "<BACKSLASH>";
        else if (symbol == '"')
            out << "<DOUBLE QUOTE>";
        else if (symbol == '\'')
            out << "<SINGLE QUOTE>";
        else if (symbol >= 0 && symbol < 32)
        {
            /// Remaining control characters are shown by their two-digit hexadecimal code.
            static const char * hex_digits = "0123456789ABCDEF";
            out << "<0x" << hex_digits[symbol / 16] << hex_digits[symbol % 16] << ">";
        }
        else
        {
            /// Everything else is written as is.
            out << symbol;
        }
    }

    out << "\"";
}
bool CSVRowInputStream::parseRowAndPrintDiagnosticInfo(Block & block,
WriteBuffer & out, size_t max_length_of_column_name, size_t max_length_of_data_type_name)
{

View File

@ -15,6 +15,9 @@ namespace ErrorCodes
JSONEachRowRowInputStream::JSONEachRowRowInputStream(ReadBuffer & istr_, const Block & sample_)
: istr(istr_), sample(sample_), name_map(sample.columns())
{
/// In this format, BOM at beginning of stream cannot be confused with value, so it is safe to skip it.
skipBOMIfExists(istr);
size_t columns = sample.columns();
for (size_t i = 0; i < columns; ++i)
name_map[sample.getByPosition(i).name] = i; /// NOTE Можно было бы расположить имена более кэш-локально.

View File

@ -16,6 +16,10 @@ namespace ErrorCodes
TSKVRowInputStream::TSKVRowInputStream(ReadBuffer & istr_, const Block & sample_, bool skip_unknown_)
: istr(istr_), sample(sample_), skip_unknown(skip_unknown_), name_map(sample.columns())
{
/// In this format, we assume that column name cannot contain BOM,
/// so BOM at beginning of stream cannot be confused with name of field, and it is safe to skip it.
skipBOMIfExists(istr);
size_t columns = sample.columns();
for (size_t i = 0; i < columns; ++i)
name_map[sample.getByPosition(i).name] = i; /// NOTE Можно было бы расположить имена более кэш-локально.

View File

@ -2,6 +2,7 @@
#include <DB/IO/Operators.h>
#include <DB/DataStreams/TabSeparatedRowInputStream.h>
#include <DB/DataStreams/verbosePrintString.h>
#include <DB/DataTypes/DataTypesNumberFixed.h>
@ -29,6 +30,14 @@ void TabSeparatedRowInputStream::readPrefix()
size_t columns = sample.columns();
String tmp;
if (with_names || with_types)
{
/// In this format, we assume that column name or type cannot contain BOM,
/// so, if format has header,
/// then BOM at beginning of stream cannot be confused with name or type of field, and it is safe to skip it.
skipBOMIfExists(istr);
}
if (with_names)
{
for (size_t i = 0; i < columns; ++i)
@ -155,65 +164,6 @@ void TabSeparatedRowInputStream::printDiagnosticInfo(Block & block, WriteBuffer
}
/** Writes the range [begin, end) to `out` for diagnostic purposes:
  * the text is enclosed in double quotes, characters that are hard to see or ambiguous
  * are replaced by "<NAME>" markers, other characters with codes below 32 by "<0xHH>".
  * An empty range is written as "<EMPTY>".
  */
static void verbosePrintString(BufferBase::Position begin, BufferBase::Position end, WriteBuffer & out)
{
    /// An empty range gets a dedicated marker instead of a pair of quotes.
    if (begin == end)
    {
        out << "<EMPTY>";
        return;
    }

    out << "\"";

    auto cursor = begin;
    while (cursor < end)
    {
        const auto symbol = *cursor;
        ++cursor;

        /// Characters that have a well-known name.
        if (symbol == '\0')
            out << "<ASCII NUL>";
        else if (symbol == '\b')
            out << "<BACKSPACE>";
        else if (symbol == '\f')
            out << "<FORM FEED>";
        else if (symbol == '\n')
            out << "<LINE FEED>";
        else if (symbol == '\r')
            out << "<CARRIAGE RETURN>";
        else if (symbol == '\t')
            out << "<TAB>";
        else if (symbol == '\\')
            out << "<BACKSLASH>";
        else if (symbol == '"')
            out << "<DOUBLE QUOTE>";
        else if (symbol == '\'')
            out << "<SINGLE QUOTE>";
        else if (symbol >= 0 && symbol < 32)
        {
            /// Remaining control characters are shown by their two-digit hexadecimal code.
            static const char * hex_digits = "0123456789ABCDEF";
            out << "<0x" << hex_digits[symbol / 16] << hex_digits[symbol % 16] << ">";
        }
        else
        {
            /// Everything else is written as is.
            out << symbol;
        }
    }

    out << "\"";
}
bool TabSeparatedRowInputStream::parseRowAndPrintDiagnosticInfo(Block & block,
WriteBuffer & out, size_t max_length_of_column_name, size_t max_length_of_data_type_name)
{

View File

@ -26,6 +26,8 @@ namespace ErrorCodes
/// Initializes the members `istr` and `context` from the arguments and skips a UTF-8 BOM
/// if one is found at the current position of the input buffer.
ValuesRowInputStream::ValuesRowInputStream(ReadBuffer & istr_, const Context & context_)
: istr(istr_), context(context_)
{
/// In this format, a BOM at the beginning of the stream cannot be confused with a value, so it is safe to skip it.
skipBOMIfExists(istr);
}

View File

@ -0,0 +1,66 @@
#include <DB/DataStreams/verbosePrintString.h>
#include <DB/IO/Operators.h>
namespace DB
{

/** Writes the range [begin, end) to `out` for diagnostic purposes:
  * the text is enclosed in double quotes, characters that are hard to see or ambiguous
  * are replaced by "<NAME>" markers, other characters with codes below 32 by "<0xHH>".
  * An empty range is written as "<EMPTY>".
  */
void verbosePrintString(BufferBase::Position begin, BufferBase::Position end, WriteBuffer & out)
{
    /// An empty range gets a dedicated marker instead of a pair of quotes.
    if (begin == end)
    {
        out << "<EMPTY>";
        return;
    }

    out << "\"";

    auto cursor = begin;
    while (cursor < end)
    {
        const auto symbol = *cursor;
        ++cursor;

        /// Characters that have a well-known name.
        if (symbol == '\0')
            out << "<ASCII NUL>";
        else if (symbol == '\b')
            out << "<BACKSPACE>";
        else if (symbol == '\f')
            out << "<FORM FEED>";
        else if (symbol == '\n')
            out << "<LINE FEED>";
        else if (symbol == '\r')
            out << "<CARRIAGE RETURN>";
        else if (symbol == '\t')
            out << "<TAB>";
        else if (symbol == '\\')
            out << "<BACKSLASH>";
        else if (symbol == '"')
            out << "<DOUBLE QUOTE>";
        else if (symbol == '\'')
            out << "<SINGLE QUOTE>";
        else if (symbol >= 0 && symbol < 32)
        {
            /// Remaining control characters are shown by their two-digit hexadecimal code.
            static const char * hex_digits = "0123456789ABCDEF";
            out << "<0x" << hex_digits[symbol / 16] << hex_digits[symbol % 16] << ">";
        }
        else
        {
            /// Everything else is written as is.
            out << symbol;
        }
    }

    out << "\"";
}

}