From 4a336e381497b5b1b254e404c5578e0d2e829e14 Mon Sep 17 00:00:00 2001
From: Alexey Milovidov
Date: Mon, 7 Sep 2020 07:21:55 +0300
Subject: [PATCH] More consistent invocation of skipBOMIfExists

---
 src/Dictionaries/ExecutableDictionarySource.cpp | 4 ++--
 .../Formats/Impl/JSONEachRowRowInputFormat.cpp | 6 +++---
 src/Processors/Formats/Impl/TSKVRowInputFormat.cpp | 12 ++++++++----
 src/Processors/Formats/Impl/TSKVRowInputFormat.h | 1 +
 .../Formats/Impl/ValuesBlockInputFormat.cpp | 11 +++++++++--
 src/Processors/Formats/Impl/ValuesBlockInputFormat.h | 1 +
 6 files changed, 24 insertions(+), 11 deletions(-)

diff --git a/src/Dictionaries/ExecutableDictionarySource.cpp b/src/Dictionaries/ExecutableDictionarySource.cpp
index 46df227dd67..918cf0732ab 100644
--- a/src/Dictionaries/ExecutableDictionarySource.cpp
+++ b/src/Dictionaries/ExecutableDictionarySource.cpp
@@ -101,8 +101,8 @@ BlockInputStreamPtr ExecutableDictionarySource::loadUpdatedAll()
 namespace
 {
     /** A stream, that also runs and waits for background thread
-     * (that will feed data into pipe to be read from the other side of the pipe).
-     */
+      * (that will feed data into pipe to be read from the other side of the pipe).
+      */
     class BlockInputStreamWithBackgroundThread final : public IBlockInputStream
     {
     public:
diff --git a/src/Processors/Formats/Impl/JSONEachRowRowInputFormat.cpp b/src/Processors/Formats/Impl/JSONEachRowRowInputFormat.cpp
index 6350db3b211..96a9417d160 100644
--- a/src/Processors/Formats/Impl/JSONEachRowRowInputFormat.cpp
+++ b/src/Processors/Formats/Impl/JSONEachRowRowInputFormat.cpp
@@ -32,9 +32,6 @@ JSONEachRowRowInputFormat::JSONEachRowRowInputFormat(
     ReadBuffer & in_, const Block & header_, Params params_, const FormatSettings & format_settings_)
     : IRowInputFormat(header_, in_, std::move(params_)), format_settings(format_settings_), name_map(header_.columns())
 {
-    /// In this format, BOM at beginning of stream cannot be confused with value, so it is safe to skip it.
-    skipBOMIfExists(in);
-
     size_t num_columns = getPort().getHeader().columns();
     for (size_t i = 0; i < num_columns; ++i)
     {
@@ -285,6 +282,9 @@ void JSONEachRowRowInputFormat::resetParser()
 
 void JSONEachRowRowInputFormat::readPrefix()
 {
+    /// In this format, BOM at beginning of stream cannot be confused with value, so it is safe to skip it.
+    skipBOMIfExists(in);
+
     skipWhitespaceIfAny(in);
     if (!in.eof() && *in.position() == '[')
     {
diff --git a/src/Processors/Formats/Impl/TSKVRowInputFormat.cpp b/src/Processors/Formats/Impl/TSKVRowInputFormat.cpp
index 86e905344e1..93cd0a623c7 100644
--- a/src/Processors/Formats/Impl/TSKVRowInputFormat.cpp
+++ b/src/Processors/Formats/Impl/TSKVRowInputFormat.cpp
@@ -19,10 +19,6 @@ namespace ErrorCodes
 TSKVRowInputFormat::TSKVRowInputFormat(ReadBuffer & in_, Block header_, Params params_, const FormatSettings & format_settings_)
     : IRowInputFormat(std::move(header_), in_, std::move(params_)), format_settings(format_settings_), name_map(header_.columns())
 {
-    /// In this format, we assume that column name cannot contain BOM,
-    /// so BOM at beginning of stream cannot be confused with name of field, and it is safe to skip it.
-    skipBOMIfExists(in);
-
     const auto & sample_block = getPort().getHeader();
     size_t num_columns = sample_block.columns();
     for (size_t i = 0; i < num_columns; ++i)
@@ -30,6 +26,14 @@ TSKVRowInputFormat::TSKVRowInputFormat(ReadBuffer & in_, Block header_, Params p
 }
 
 
+void TSKVRowInputFormat::readPrefix()
+{
+    /// In this format, we assume that column name cannot contain BOM,
+    /// so BOM at beginning of stream cannot be confused with name of field, and it is safe to skip it.
+    skipBOMIfExists(in);
+}
+
+
 /** Read the field name in the `tskv` format.
   * Return true if the field is followed by an equal sign,
   * otherwise (field with no value) return false.
diff --git a/src/Processors/Formats/Impl/TSKVRowInputFormat.h b/src/Processors/Formats/Impl/TSKVRowInputFormat.h
index d35f2882e6d..bc537158d9b 100644
--- a/src/Processors/Formats/Impl/TSKVRowInputFormat.h
+++ b/src/Processors/Formats/Impl/TSKVRowInputFormat.h
@@ -27,6 +27,7 @@ public:
 
     String getName() const override { return "TSKVRowInputFormat"; }
 
+    void readPrefix() override;
     bool readRow(MutableColumns & columns, RowReadExtension &) override;
     bool allowSyncAfterError() const override { return true; }
     void syncAfterError() override;
diff --git a/src/Processors/Formats/Impl/ValuesBlockInputFormat.cpp b/src/Processors/Formats/Impl/ValuesBlockInputFormat.cpp
index 2e2c98c63d2..de5a1b71580 100644
--- a/src/Processors/Formats/Impl/ValuesBlockInputFormat.cpp
+++ b/src/Processors/Formats/Impl/ValuesBlockInputFormat.cpp
@@ -35,12 +35,13 @@ ValuesBlockInputFormat::ValuesBlockInputFormat(ReadBuffer & in_, const Block & h
           attempts_to_deduce_template(num_columns), attempts_to_deduce_template_cached(num_columns),
           rows_parsed_using_template(num_columns), templates(num_columns), types(header_.getDataTypes())
 {
-    /// In this format, BOM at beginning of stream cannot be confused with value, so it is safe to skip it.
-    skipBOMIfExists(buf);
 }
 
 Chunk ValuesBlockInputFormat::generate()
 {
+    if (total_rows == 0)
+        readPrefix();
+
     const Block & header = getPort().getHeader();
     MutableColumns columns = header.cloneEmptyColumns();
     block_missing_values.clear();
@@ -405,6 +406,12 @@ bool ValuesBlockInputFormat::shouldDeduceNewTemplate(size_t column_idx)
     return false;
 }
 
+void ValuesBlockInputFormat::readPrefix()
+{
+    /// In this format, BOM at beginning of stream cannot be confused with value, so it is safe to skip it.
+    skipBOMIfExists(buf);
+}
+
 void ValuesBlockInputFormat::readSuffix()
 {
     if (buf.hasUnreadData())
diff --git a/src/Processors/Formats/Impl/ValuesBlockInputFormat.h b/src/Processors/Formats/Impl/ValuesBlockInputFormat.h
index f485870fc69..01deb2865bb 100644
--- a/src/Processors/Formats/Impl/ValuesBlockInputFormat.h
+++ b/src/Processors/Formats/Impl/ValuesBlockInputFormat.h
@@ -63,6 +63,7 @@ private:
 
     bool shouldDeduceNewTemplate(size_t column_idx);
 
+    void readPrefix();
     void readSuffix();
 
     bool skipToNextRow(size_t min_chunk_bytes = 0, int balance = 0);