From df538babc8a507f00b00be65b3f7f73821b50250 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Thu, 23 Jun 2016 22:39:20 +0300 Subject: [PATCH] Text formats: detecting and skipping UTF-8 BOM when possible [#METR-21817]. --- dbms/CMakeLists.txt | 2 + .../DB/DataStreams/verbosePrintString.h | 12 ++++ dbms/include/DB/IO/ReadHelpers.h | 17 +++++ dbms/src/DataStreams/CSVRowInputStream.cpp | 64 ++--------------- .../DataStreams/JSONEachRowRowInputStream.cpp | 3 + dbms/src/DataStreams/TSKVRowInputStream.cpp | 4 ++ .../TabSeparatedRowInputStream.cpp | 68 +++---------------- dbms/src/DataStreams/ValuesRowInputStream.cpp | 2 + dbms/src/DataStreams/verbosePrintString.cpp | 66 ++++++++++++++++++ 9 files changed, 120 insertions(+), 118 deletions(-) create mode 100644 dbms/include/DB/DataStreams/verbosePrintString.h create mode 100644 dbms/src/DataStreams/verbosePrintString.cpp diff --git a/dbms/CMakeLists.txt b/dbms/CMakeLists.txt index e63f21b9dc6..8e0694a8170 100644 --- a/dbms/CMakeLists.txt +++ b/dbms/CMakeLists.txt @@ -265,6 +265,7 @@ add_library (dbms include/DB/DataStreams/MarkInCompressedFile.h include/DB/DataStreams/CSVRowOutputStream.h include/DB/DataStreams/CSVRowInputStream.h + include/DB/DataStreams/verbosePrintString.h include/DB/DataTypes/IDataType.h include/DB/DataTypes/IDataTypeDummy.h include/DB/DataTypes/DataTypeSet.h @@ -742,6 +743,7 @@ add_library (dbms src/DataStreams/DistinctBlockInputStream.cpp src/DataStreams/RemoteBlockInputStream.cpp src/DataStreams/BlockIO.cpp + src/DataStreams/verbosePrintString.cpp src/DataTypes/DataTypeString.cpp src/DataTypes/DataTypeFixedString.cpp diff --git a/dbms/include/DB/DataStreams/verbosePrintString.h b/dbms/include/DB/DataStreams/verbosePrintString.h new file mode 100644 index 00000000000..e006d8f022d --- /dev/null +++ b/dbms/include/DB/DataStreams/verbosePrintString.h @@ -0,0 +1,12 @@ +#pragma once + +#include + +namespace DB +{ + +/** Print string in double quotes and with control characters in "" form - for 
output diagnostic info to user. + */ +void verbosePrintString(BufferBase::Position begin, BufferBase::Position end, WriteBuffer & out); + +} diff --git a/dbms/include/DB/IO/ReadHelpers.h b/dbms/include/DB/IO/ReadHelpers.h index 66bc99e1c59..5671dbe9ca5 100644 --- a/dbms/include/DB/IO/ReadHelpers.h +++ b/dbms/include/DB/IO/ReadHelpers.h @@ -879,4 +879,21 @@ inline T parse(const String & s) return parse(s.data(), s.size()); } + +/** Skip UTF-8 BOM (bytes EF BB BF) if it is under cursor. + * As BOM is usually located at start of stream, and buffer size is usually larger than three bytes, + * the function expects that all three bytes of BOM are already in the buffer (otherwise it doesn't skip anything); a buffer holding exactly those three bytes is sufficient. + */ +inline void skipBOMIfExists(ReadBuffer & buf) +{ + if (!buf.eof() + && buf.position() + 3 <= buf.buffer().end() + && buf.position()[0] == '\xEF' + && buf.position()[1] == '\xBB' + && buf.position()[2] == '\xBF') + { + buf.position() += 3; + } +} + } diff --git a/dbms/src/DataStreams/CSVRowInputStream.cpp b/dbms/src/DataStreams/CSVRowInputStream.cpp index 58183121bfb..bdd514abfaa 100644 --- a/dbms/src/DataStreams/CSVRowInputStream.cpp +++ b/dbms/src/DataStreams/CSVRowInputStream.cpp @@ -1,6 +1,7 @@ #include #include +#include #include #include @@ -96,6 +97,10 @@ static void skipRow(ReadBuffer & istr, const char delimiter, size_t columns) void CSVRowInputStream::readPrefix() { + /// In this format, we assume that if the first string field contains BOM as value, it will be written in quotes, + /// so BOM at beginning of stream cannot be confused with BOM in first string value, and it is safe to skip it. 
+ skipBOMIfExists(istr); + size_t columns = sample.columns(); String tmp; @@ -190,65 +195,6 @@ void CSVRowInputStream::printDiagnosticInfo(Block & block, WriteBuffer & out) } -static void verbosePrintString(BufferBase::Position begin, BufferBase::Position end, WriteBuffer & out) -{ - if (end == begin) - { - out << ""; - return; - } - - out << "\""; - - for (auto pos = begin; pos < end; ++pos) - { - switch (*pos) - { - case '\0': - out << ""; - break; - case '\b': - out << ""; - break; - case '\f': - out << "
"; - break; - case '\n': - out << ""; - break; - case '\r': - out << ""; - break; - case '\t': - out << ""; - break; - case '\\': - out << ""; - break; - case '"': - out << ""; - break; - case '\'': - out << ""; - break; - - default: - { - if (*pos >= 0 && *pos < 32) - { - static const char * hex = "0123456789ABCDEF"; - out << "<0x" << hex[*pos / 16] << hex[*pos % 16] << ">"; - } - else - out << *pos; - } - } - } - - out << "\""; -} - - bool CSVRowInputStream::parseRowAndPrintDiagnosticInfo(Block & block, WriteBuffer & out, size_t max_length_of_column_name, size_t max_length_of_data_type_name) { diff --git a/dbms/src/DataStreams/JSONEachRowRowInputStream.cpp b/dbms/src/DataStreams/JSONEachRowRowInputStream.cpp index 544af1c76a4..ba530c53355 100644 --- a/dbms/src/DataStreams/JSONEachRowRowInputStream.cpp +++ b/dbms/src/DataStreams/JSONEachRowRowInputStream.cpp @@ -15,6 +15,9 @@ namespace ErrorCodes JSONEachRowRowInputStream::JSONEachRowRowInputStream(ReadBuffer & istr_, const Block & sample_) : istr(istr_), sample(sample_), name_map(sample.columns()) { + /// In this format, BOM at beginning of stream cannot be confused with value, so it is safe to skip it. + skipBOMIfExists(istr); + size_t columns = sample.columns(); for (size_t i = 0; i < columns; ++i) name_map[sample.getByPosition(i).name] = i; /// NOTE Можно было бы расположить имена более кэш-локально. 
diff --git a/dbms/src/DataStreams/TSKVRowInputStream.cpp b/dbms/src/DataStreams/TSKVRowInputStream.cpp index f444dee2d47..d8e47ca235b 100644 --- a/dbms/src/DataStreams/TSKVRowInputStream.cpp +++ b/dbms/src/DataStreams/TSKVRowInputStream.cpp @@ -16,6 +16,10 @@ namespace ErrorCodes TSKVRowInputStream::TSKVRowInputStream(ReadBuffer & istr_, const Block & sample_, bool skip_unknown_) : istr(istr_), sample(sample_), skip_unknown(skip_unknown_), name_map(sample.columns()) { + /// In this format, we assume that column name cannot contain BOM, + /// so BOM at beginning of stream cannot be confused with name of field, and it is safe to skip it. + skipBOMIfExists(istr); + size_t columns = sample.columns(); for (size_t i = 0; i < columns; ++i) name_map[sample.getByPosition(i).name] = i; /// NOTE Можно было бы расположить имена более кэш-локально. diff --git a/dbms/src/DataStreams/TabSeparatedRowInputStream.cpp b/dbms/src/DataStreams/TabSeparatedRowInputStream.cpp index 1ed28dab0b4..fbed2fb45d3 100644 --- a/dbms/src/DataStreams/TabSeparatedRowInputStream.cpp +++ b/dbms/src/DataStreams/TabSeparatedRowInputStream.cpp @@ -2,6 +2,7 @@ #include #include +#include #include @@ -29,6 +30,14 @@ void TabSeparatedRowInputStream::readPrefix() size_t columns = sample.columns(); String tmp; + if (with_names || with_types) + { + /// In this format, we assume that column name or type cannot contain BOM, + /// so, if format has header, + /// then BOM at beginning of stream cannot be confused with name or type of field, and it is safe to skip it. 
+ skipBOMIfExists(istr); + } + if (with_names) { for (size_t i = 0; i < columns; ++i) @@ -155,65 +164,6 @@ void TabSeparatedRowInputStream::printDiagnosticInfo(Block & block, WriteBuffer } -static void verbosePrintString(BufferBase::Position begin, BufferBase::Position end, WriteBuffer & out) -{ - if (end == begin) - { - out << ""; - return; - } - - out << "\""; - - for (auto pos = begin; pos < end; ++pos) - { - switch (*pos) - { - case '\0': - out << ""; - break; - case '\b': - out << ""; - break; - case '\f': - out << ""; - break; - case '\n': - out << ""; - break; - case '\r': - out << ""; - break; - case '\t': - out << ""; - break; - case '\\': - out << ""; - break; - case '"': - out << ""; - break; - case '\'': - out << ""; - break; - - default: - { - if (*pos >= 0 && *pos < 32) - { - static const char * hex = "0123456789ABCDEF"; - out << "<0x" << hex[*pos / 16] << hex[*pos % 16] << ">"; - } - else - out << *pos; - } - } - } - - out << "\""; -} - - bool TabSeparatedRowInputStream::parseRowAndPrintDiagnosticInfo(Block & block, WriteBuffer & out, size_t max_length_of_column_name, size_t max_length_of_data_type_name) { diff --git a/dbms/src/DataStreams/ValuesRowInputStream.cpp b/dbms/src/DataStreams/ValuesRowInputStream.cpp index 387b67470a7..11c707756f7 100644 --- a/dbms/src/DataStreams/ValuesRowInputStream.cpp +++ b/dbms/src/DataStreams/ValuesRowInputStream.cpp @@ -26,6 +26,8 @@ namespace ErrorCodes ValuesRowInputStream::ValuesRowInputStream(ReadBuffer & istr_, const Context & context_) : istr(istr_), context(context_) { + /// In this format, BOM at beginning of stream cannot be confused with value, so it is safe to skip it. 
+ skipBOMIfExists(istr); } diff --git a/dbms/src/DataStreams/verbosePrintString.cpp b/dbms/src/DataStreams/verbosePrintString.cpp new file mode 100644 index 00000000000..29b2e770e4e --- /dev/null +++ b/dbms/src/DataStreams/verbosePrintString.cpp @@ -0,0 +1,66 @@ +#include +#include + + +namespace DB +{ +/** Writes the bytes of [begin, end) to 'out' for diagnostics. An empty range produces only the first literal below and returns; otherwise the text is wrapped in double quotes. Each byte listed in the switch is replaced by a fixed literal; any other byte with value 0..31 is written as <0xHH> (two upper-case hex digits); everything else is copied as is. NOTE(review): the replacement literals are empty in this copy of the patch - presumably angle-bracketed names were lost in extraction; confirm against the repository. */ +void verbosePrintString(BufferBase::Position begin, BufferBase::Position end, WriteBuffer & out) +{ + if (end == begin) + { + out << ""; + return; + } + + out << "\""; + + for (auto pos = begin; pos < end; ++pos) + { + switch (*pos) + { + case '\0': + out << ""; + break; + case '\b': + out << ""; + break; + case '\f': + out << ""; + break; + case '\n': + out << ""; + break; + case '\r': + out << ""; + break; + case '\t': + out << ""; + break; + case '\\': + out << ""; + break; + case '"': + out << ""; + break; + case '\'': + out << ""; + break; + + default: + { + if (*pos >= 0 && *pos < 32) /* presumably *pos is a (possibly signed) char, so ">= 0" keeps bytes 0x80..0xFF out of the hex branch - TODO confirm */ + { + static const char * hex = "0123456789ABCDEF"; + out << "<0x" << hex[*pos / 16] << hex[*pos % 16] << ">"; + } + else + out << *pos; + } + } + } + + out << "\""; +} + +}